Source code

001package gudusoft.gsqlparser.util;
002
003import gudusoft.gsqlparser.EDbVendor;
004import gudusoft.gsqlparser.TGSqlParser;
005import gudusoft.gsqlparser.util.json.JSON;
006
007import java.io.*;
008import java.util.ArrayList;
009import java.util.LinkedHashMap;
010import java.util.List;
011import java.util.Map;
012
013public class FileSplitter {
014
    /** Number of back-to-back parse windows without a validated statement after which {@code splitFile} skips ahead. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;
    /** Size, in bytes, of the scratch buffers used for copying files and scanning for line ends. */
    private static final int BUFFER_SIZE = 8192;
    /** Logger shared by all static helpers of this class. */
    private static final Logger logger = LoggerFactory.getLogger(FileSplitter.class);
018
    /**
     * Splits a large input file into smaller files inside {@code outputDir}.
     *
     * <p>If the first non-whitespace character of the file is '{', the file is handed to
     * {@code splitJsonFile} and its result is returned as is ({@code dbVendor} is not used on
     * that path). Otherwise the file is treated as SQL text: a window of roughly
     * {@code splitSizeMB} MB is parsed at a time, the chunk is cut at the end of the last line
     * the parser reports as validated, and each chunk is written by {@code splitFileByPosition}.
     *
     * @param inputFile   Input file; must exist
     * @param outputDir   Output directory; created (including parents) when missing
     * @param splitSizeMB Target size of one parse window / split file, in MB
     * @param dbVendor    Database vendor used to configure the SQL parser
     * @return List of split files in the order they were created
     * @throws FileNotFoundException if {@code inputFile} does not exist
     * @throws IOException           if the output directory cannot be created or on any read/write failure
     */
    public static List<File> splitFile(File inputFile, File outputDir, int splitSizeMB, EDbVendor dbVendor) throws IOException {
        List<File> splitFiles = new ArrayList<>();
        if (!inputFile.exists()) {
            throw new FileNotFoundException("Input file does not exist: " + inputFile.getAbsolutePath());
        }

        if (!outputDir.exists() && !outputDir.mkdirs()) {
            throw new IOException("Failed to create output directory: " + outputDir.getAbsolutePath());
        }

        // JSON input takes a completely separate path; nothing below runs for it.
        if (isJsonFile(inputFile)) {
            return splitJsonFile(inputFile, outputDir, splitSizeMB);
        }

        // Regular SQL file: convert the window size from MB to bytes.
        long splitSizeBytes = splitSizeMB * 1024 * 1024L;

        try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) {
            long fileLength = raf.length();
            long currentPos = 0;          // byte offset where the next parse window starts
            long chunkStartPos = 0;       // byte offset where the next output file starts
            int consecutiveFailures = 0;  // windows in a row that produced no validated statement
            int fileIndex = 1;            // 1-based sequence number used in output file names
            long globalStartLineNo = 1;   // line number (in the input file) of the next output file's first line

            while (currentPos < fileLength) {
                // null: nothing could be read; -1: no statement was validated in the window
                // (per the warning logged below); otherwise a line number relative to currentPos.
                Long lastValidatedLineNo = getLastLineFromCurrentPos(raf, currentPos, fileLength, splitSizeBytes, dbVendor);
                if (lastValidatedLineNo == null) break;

                // Retry loop: advance the window start and enlarge the window until a statement
                // validates, the data runs out, or MAX_CONSECUTIVE_FAILURES is reached.
                while (lastValidatedLineNo != null && lastValidatedLineNo == -1) {
                    consecutiveFailures++;
                    logger.warn("No validated statement found at position " + currentPos + "  (consecutive failures: " + consecutiveFailures + ")");

                    long incrementSize = Math.min(splitSizeBytes, fileLength - currentPos);
                    long previousPos = currentPos;
                    // Only currentPos moves; chunkStartPos stays, so the bytes passed over here
                    // are still part of the next file that gets written.
                    currentPos = Math.min(currentPos + incrementSize, fileLength);

                    if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                        logger.error("Reached maximum consecutive failures (" + MAX_CONSECUTIVE_FAILURES + "), skipping this segment and moving to next");
                        long skipSize = splitSizeBytes * (consecutiveFailures + 1);
                        currentPos = Math.min(previousPos + skipSize, fileLength);
                        // Moving chunkStartPos as well means the bytes before the new position
                        // are never written to any output file.
                        // NOTE(review): globalStartLineNo is not advanced for the skipped bytes,
                        // so later line numbers look like they will be too low - confirm.
                        chunkStartPos = currentPos;
                        consecutiveFailures = 0; // Reset consecutive failure counter
                        logger.info("Skipped to position " + currentPos + ", continuing with next segment");
                        break;
                    }

                    // Re-parse from the advanced position with a window that grows with each failure.
                    lastValidatedLineNo = getLastLineFromCurrentPos(raf, currentPos, fileLength, splitSizeBytes * (consecutiveFailures + 1), dbVendor);
                }

                // Nothing more could be read during the retries: stop.
                // NOTE(review): bytes between chunkStartPos and the end of the file are not
                // written in this case (e.g. trailing text with no validated statement) - confirm
                // this is intended.
                if (lastValidatedLineNo == null) {
                    break;
                }

                // Still -1 here only after the skip branch above; restart from the new position.
                if (lastValidatedLineNo == -1) {
                    continue;
                }

                consecutiveFailures = 0;
                // Translate the line number (relative to currentPos) into an absolute byte offset.
                long endPos = findLineEndPosition(raf, currentPos, lastValidatedLineNo);
                // NOTE(review): lastValidatedLineNo is counted from currentPos, but the file
                // written below starts at chunkStartPos. After a retry these differ, so the lines
                // between them do not appear to be included in the line numbering - confirm.
                long globalEndLineNo = globalStartLineNo + lastValidatedLineNo - 1;

                File splitFile = splitFileByPosition(outputDir, inputFile, chunkStartPos, endPos, fileIndex, globalStartLineNo, globalEndLineNo);
                splitFiles.add(splitFile);

                // The next chunk starts right after the line that ended this one.
                chunkStartPos = endPos;
                currentPos = endPos;
                globalStartLineNo = globalEndLineNo + 1;
                fileIndex++;

                if (currentPos >= fileLength) {
                    break;
                }
            }
        }

        return splitFiles;
    }
108
109    /**
110     * Check if file is a JSON file
111     */
112    private static boolean isJsonFile(File file) {
113        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
114            int c;
115            // Skip whitespace characters
116            while ((c = reader.read()) != -1) {
117                if (!Character.isWhitespace(c)) {
118                    // Check if first non-whitespace character is '{'
119                    return c == '{';
120                }
121            }
122        } catch (IOException e) {
123            logger.error("Error reading file to check if it's JSON: " + file.getAbsolutePath(), e);
124        }
125        return false;
126    }
127
128    /**
129     * Get file extension
130     */
131    private static String getFileExtension(String fileName) {
132        int lastDotIndex = fileName.lastIndexOf('.');
133        if (lastDotIndex == -1) {
134            return "";
135        }
136        return fileName.substring(lastDotIndex + 1);
137    }
138
139    /**
140     * Get file name without extension
141     */
142    private static String getFileNameWithoutExtension(String fileName) {
143        int lastDotIndex = fileName.lastIndexOf('.');
144        if (lastDotIndex == -1) {
145            return fileName;
146        }
147        return fileName.substring(0, lastDotIndex);
148    }
149
150    /**
151     * Move file
152     */
153    private static void moveFile(File source, File target) throws IOException {
154        if (!source.exists()) {
155            throw new FileNotFoundException("Source file does not exist: " + source.getAbsolutePath());
156        }
157        if (target.exists() && !target.delete()) {
158            throw new IOException("Failed to delete existing target file: " + target.getAbsolutePath());
159        }
160        if (!source.renameTo(target)) {
161            // If renameTo fails, try copy and delete
162            copyFile(source, target);
163            if (!source.delete()) {
164                throw new IOException("Failed to delete source file after copy: " + source.getAbsolutePath());
165            }
166        }
167    }
168
169    /**
170     * Copy file
171     */
172    private static void copyFile(File source, File target) throws IOException {
173        try (FileInputStream fis = new FileInputStream(source);
174             FileOutputStream fos = new FileOutputStream(target)) {
175            byte[] buffer = new byte[BUFFER_SIZE];
176            int bytesRead;
177            while ((bytesRead = fis.read(buffer)) != -1) {
178                fos.write(buffer, 0, bytesRead);
179            }
180        }
181    }
182
183    /**
184     * Split JSON file
185     */
186    private static List<File> splitJsonFile(File inputFile, File outputDir, int splitSizeMB) throws IOException {
187        List<File> splitFiles = new ArrayList<>();
188
189        // Read file content
190        String jsonContent = readFileContent(inputFile);
191        Map<?, ?> content = (Map<?, ?>) JSON.parseObject(jsonContent);
192        String createdBy = (String) content.get("createdBy");
193
194        if (createdBy == null) {
195            logger.warn("JSON file does not contain 'createdBy' field, treating as regular file");
196            return splitFiles;
197        }
198
199        if (createdBy.toLowerCase().contains("sqldep") || createdBy.toLowerCase().contains("grabit")) {
200            splitFiles.addAll(splitSqldepGrabitJson(inputFile, outputDir, splitSizeMB, content, createdBy));
201        } else if (createdBy.toLowerCase().contains("sqlflow")) {
202            splitFiles.addAll(splitSqlflowJson(inputFile, outputDir, splitSizeMB, content));
203        } else {
204            logger.warn("Unknown JSON file type with createdBy: " + createdBy);
205        }
206
207        return splitFiles;
208    }
209
    /**
     * Returns the content of {@code inputFile} as a string by delegating to
     * {@code SQLUtil.getFileContent}.
     * NOTE(review): charset handling and the behaviour on read errors are decided by
     * {@code SQLUtil} and are not visible here - confirm before relying on them.
     */
    private static String readFileContent(File inputFile) {
        return SQLUtil.getFileContent(inputFile);
    }
213
214    /**
215     * Write JSON file
216     */
217    private static void writeJsonFile(File file, Object content) throws IOException {
218        String jsonString = JSON.toJSONString(content);
219        SQLUtil.writeToFile(file, jsonString);
220    }
221
222    /**
223     * Split sqldep or grabit format JSON file
224     */
225    private static List<File> splitSqldepGrabitJson(File inputFile, File outputDir, int splitSizeMB, Map<?, ?> content, String createdBy) throws IOException {
226        List<File> splitFiles = new ArrayList<>();
227        List<Map<?, ?>> queries = (List<Map<?, ?>>) content.get("queries");
228        if (queries == null || queries.isEmpty()) {
229            logger.warn("No queries found in JSON file");
230            return splitFiles;
231        }
232
233        // Remove queries field, create metadata file
234        @SuppressWarnings("unchecked")
235        Map<String, Object> metadataContent = new LinkedHashMap<>((Map<String, Object>) content);
236        metadataContent.remove("queries");
237
238        String extension = getFileExtension(inputFile.getName());
239        File metadataFile = new File(outputDir, getFileNameWithoutExtension(inputFile.getName()) + "_" + 0 + "_" + 1 + (extension.isEmpty() ? "" : "." + extension));
240        writeJsonFile(metadataFile, metadataContent);
241        splitFiles.add(metadataFile);
242        logger.info("Created metadata file: " + metadataFile.getAbsolutePath());
243
244        // Split queries
245        long splitSizeBytes = splitSizeMB * 1024 * 1024L;
246        int length = 0;
247        List<Map<?, ?>> temp = new ArrayList<>();
248        long startIndex = 1;
249        long endIndex = 1;
250
251        for (Map<?, ?> item : queries) {
252            temp.add(item);
253            String sourceCode = (String) item.get("sourceCode");
254            if (sourceCode != null) {
255                length += sourceCode.length();
256            }
257
258            if (length >= splitSizeBytes) {
259                File queryFile = createQueryFile(outputDir, inputFile, startIndex, endIndex, extension);
260                Map<String, Object> jsonObject = new LinkedHashMap<>();
261                jsonObject.put("createdBy", createdBy);
262                jsonObject.put("dbvendor", content.get("dbvendor"));
263                jsonObject.put("databases", new ArrayList<>());
264                jsonObject.put("queries", temp);
265                writeJsonFile(queryFile, jsonObject);
266                splitFiles.add(queryFile);
267                logger.info("Created query file: " + queryFile.getAbsolutePath());
268
269                temp.clear();
270                length = 0;
271                startIndex = endIndex;
272            }
273            endIndex++;
274        }
275
276        // Process remaining queries
277        if (!temp.isEmpty()) {
278            File queryFile = createQueryFile(outputDir, inputFile, startIndex, endIndex, extension);
279            Map<String, Object> jsonObject = new LinkedHashMap<>();
280            jsonObject.put("createdBy", createdBy);
281            jsonObject.put("dbvendor", content.get("dbvendor"));
282            jsonObject.put("databases", new ArrayList<>());
283            jsonObject.put("queries", temp);
284            writeJsonFile(queryFile, jsonObject);
285            splitFiles.add(queryFile);
286            logger.info("Created query file: " + queryFile.getAbsolutePath());
287        }
288
289        return splitFiles;
290    }
291
292    /**
293     * Split sqlflow format JSON file
294     */
295    private static List<File> splitSqlflowJson(File inputFile, File outputDir, int splitSizeMB, Map<?, ?> content) throws IOException {
296        List<File> splitFiles = new ArrayList<>();
297        List<Map<?, ?>> queries = new ArrayList<>();
298
299        // Extract all queries
300        List<Map<?, ?>> servers = (List<Map<?, ?>>) content.get("servers");
301        if (servers != null) {
302            for (Map<?, ?> serverObject : servers) {
303                @SuppressWarnings("unchecked")
304                List<Map<?, ?>> serverQueries = (List<Map<?, ?>>) serverObject.get("queries");
305                if (serverQueries != null) {
306                    queries.addAll(serverQueries);
307                    serverObject.remove("queries");
308                }
309            }
310        }
311
312        if (queries.isEmpty()) {
313            logger.warn("No queries found in SQLFlow JSON file");
314            return splitFiles;
315        }
316
317        // Create metadata file
318        String extension = getFileExtension(inputFile.getName());
319        File metadataFile = new File(outputDir, getFileNameWithoutExtension(inputFile.getName()) + "_" + 0 + "_" + 1 + (extension.isEmpty() ? "" : "." + extension));
320        writeJsonFile(metadataFile, content);
321        splitFiles.add(metadataFile);
322        logger.info("Created metadata file: " + metadataFile.getAbsolutePath());
323
324        // Split queries
325        long splitSizeBytes = splitSizeMB * 1024 * 1024L;
326        int length = 0;
327        List<Map<?, ?>> temp = new ArrayList<>();
328        long startIndex = 1;
329        long endIndex = 1;
330
331        for (Map<?, ?> item : queries) {
332            temp.add(item);
333            String sourceCode = (String) item.get("sourceCode");
334            if (sourceCode != null) {
335                length += sourceCode.length();
336            }
337
338            if (length >= splitSizeBytes) {
339                File queryFile = createQueryFile(outputDir, inputFile, startIndex, endIndex, extension);
340                Map<String, Object> jsonObject = new LinkedHashMap<>();
341                jsonObject.put("createdBy", "grabit v1.7.0");
342                jsonObject.put("databases", new ArrayList<>());
343                jsonObject.put("queries", temp);
344                writeJsonFile(queryFile, jsonObject);
345                splitFiles.add(queryFile);
346                logger.info("Created query file: " + queryFile.getAbsolutePath());
347
348                temp.clear();
349                length = 0;
350                startIndex = endIndex;
351            }
352            endIndex++;
353        }
354
355        // Process remaining queries
356        if (!temp.isEmpty()) {
357            File queryFile = createQueryFile(outputDir, inputFile, startIndex, endIndex, extension);
358            Map<String, Object> jsonObject = new LinkedHashMap<>();
359            jsonObject.put("createdBy", "grabit v1.7.0");
360            jsonObject.put("databases", new ArrayList<>());
361            jsonObject.put("queries", temp);
362            writeJsonFile(queryFile, jsonObject);
363            splitFiles.add(queryFile);
364            logger.info("Created query file: " + queryFile.getAbsolutePath());
365        }
366
367        return splitFiles;
368    }
369
370    /**
371     * Create query file
372     */
373    private static File createQueryFile(File outputDir, File inputFile, long startIndex, long endIndex, String extension) {
374        String fileName = getFileNameWithoutExtension(inputFile.getName()) + "_" + startIndex + "_" + endIndex + (extension.isEmpty() ? "" : "." + extension);
375        return new File(outputDir, fileName);
376    }
377
378    private static Long getLastLineFromCurrentPos(RandomAccessFile raf, long currentPos, long fileLength, long splitSizeBytes, EDbVendor vendor) throws IOException {
379        raf.seek(currentPos);
380
381        long remainingBytes = fileLength - currentPos;
382        long readSize = Math.min(splitSizeBytes, remainingBytes);
383
384        byte[] buffer = new byte[(int) readSize];
385        int bytesRead = raf.read(buffer);
386        if (bytesRead <= 0) {
387            return null;
388        }
389
390        String partialContent = new String(buffer, 0, bytesRead, "UTF-8");
391        if (partialContent.isEmpty()) {
392            return null;
393        }
394
395        TGSqlParser parser = new TGSqlParser(vendor);
396        parser.sqltext = partialContent;
397        parser.getrawsqlstatements();
398
399        long lastValidatedLineNo = parser.getLastLineNoOfLastStatementBeenValidated();
400        return lastValidatedLineNo;
401    }
402
403    /**
404     * Find end position of Nth line from specified byte position
405     * Supports multiple newline formats: \n (Unix/Linux), \r\n (Windows), \r (Old Mac)
406     * Uses buffer for batch reading to improve performance
407     */
408    private static long findLineEndPosition(RandomAccessFile raf, long startPos, long lineCount) throws IOException {
409        raf.seek(startPos);
410        long newlineCount = 0;
411        long fileLength = raf.length();
412
413        byte[] buffer = new byte[BUFFER_SIZE];
414        long currentPos = startPos;
415
416        while (currentPos < fileLength) {
417            int toRead = (int) Math.min(buffer.length, fileLength - currentPos);
418            int bytesRead = raf.read(buffer, 0, toRead);
419            if (bytesRead <= 0) {
420                break;
421            }
422
423            for (int i = 0; i < bytesRead; i++) {
424                byte b = buffer[i];
425
426                if (b == '\n') {
427                    newlineCount++;
428                    if (newlineCount == lineCount) {
429                        return currentPos + i + 1;
430                    }
431                } else if (b == '\r') {
432                    if (i + 1 < bytesRead) {
433                        if (buffer[i + 1] == '\n') {
434                            newlineCount++;
435                            if (newlineCount == lineCount) {
436                                return currentPos + i + 2;
437                            }
438                            i++;
439                        } else {
440                            newlineCount++;
441                            if (newlineCount == lineCount) {
442                                return currentPos + i + 1;
443                            }
444                        }
445                    } else {
446                        long savedPos = raf.getFilePointer();
447                        int nextByte = raf.read();
448
449                        if (nextByte == '\n') {
450                            newlineCount++;
451                            if (newlineCount == lineCount) {
452                                return currentPos + i + 2;
453                            }
454                            currentPos++;
455                        } else {
456                            newlineCount++;
457                            if (newlineCount == lineCount) {
458                                return currentPos + i + 1;
459                            }
460                            if (nextByte != -1) {
461                                raf.seek(savedPos);
462                            }
463                        }
464                    }
465                }
466            }
467
468            currentPos += bytesRead;
469        }
470
471        return fileLength;
472    }
473
474    /**
475     * Split file by byte position
476     */
477    private static File splitFileByPosition(File outputDir, File inputFile, long startPos, long endPos, int fileIndex, long startLineNo, long endLineNo) throws IOException {
478        String fileName = getFileNameWithoutExtension(inputFile.getName()) + "_" + fileIndex + "_" + startLineNo + "_" + endLineNo + "." + getFileExtension(inputFile.getName());
479        File outputFile = new File(outputDir, fileName);
480
481        try (RandomAccessFile rafRead = new RandomAccessFile(inputFile, "r");
482             BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), java.nio.charset.StandardCharsets.UTF_8))) {
483
484            rafRead.seek(startPos);
485            long bytesToRead = endPos - startPos;
486            long bytesRemaining = bytesToRead;
487
488            byte[] buffer = new byte[BUFFER_SIZE];
489
490            while (bytesRemaining > 0) {
491                int toRead = (int) Math.min(buffer.length, bytesRemaining);
492                int bytesRead = rafRead.read(buffer, 0, toRead);
493                if (bytesRead <= 0) {
494                    break;
495                }
496
497                String chunk = new String(buffer, 0, bytesRead, java.nio.charset.StandardCharsets.UTF_8);
498                writer.write(chunk);
499                bytesRemaining -= bytesRead;
500            }
501
502            logger.info("split file " + inputFile.getName() + " (index: " + fileIndex + ", lines: " + startLineNo + "-" + endLineNo + ") from byte " + startPos + " to " + endPos + " to " + outputFile.getName());
503
504            return outputFile;
505        }
506    }
507}