package gudusoft.gsqlparser.util;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.TGSqlParser;
import gudusoft.gsqlparser.util.json.JSON;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Splits a large input file into smaller files written to an output directory.
 *
 * <p>Two kinds of input are handled:
 * <ul>
 *   <li>Files whose first non-whitespace character is '{' are treated as JSON and split by
 *       their {@code queries} entries, based on the {@code createdBy} field.</li>
 *   <li>Everything else is treated as SQL text and split at line boundaries reported by
 *       {@link TGSqlParser#getLastLineNoOfLastStatementBeenValidated()}.</li>
 * </ul>
 */
public class FileSplitter {

    /** Number of consecutive windows without a validated statement before a segment is skipped. */
    private static final int MAX_CONSECUTIVE_FAILURES = 3;
    /** Size in bytes of the buffer used for scanning and copying. */
    private static final int BUFFER_SIZE = 8192;
    /** Upper bound for a single parse window; keeps the long-to-int array size conversion safe. */
    private static final int MAX_READ_WINDOW = Integer.MAX_VALUE - 8;
    private static final Logger logger = LoggerFactory.getLogger(FileSplitter.class);

    /**
     * Split large file
     *
     * @param inputFile   Input file
     * @param outputDir   Output directory (created if it does not exist)
     * @param splitSizeMB Split size (MB)
     * @param dbVendor    Database vendor passed to the SQL parser
     * @return List of split files, in the order they were written
     * @throws IOException if the input file does not exist, the output directory cannot be
     *                     created, or reading/writing fails
     */
    public static List<File> splitFile(File inputFile, File outputDir, int splitSizeMB, EDbVendor dbVendor) throws IOException {
        List<File> splitFiles = new ArrayList<>();
        if (!inputFile.exists()) {
            throw new FileNotFoundException("Input file does not exist: " + inputFile.getAbsolutePath());
        }

        if (!outputDir.exists() && !outputDir.mkdirs()) {
            throw new IOException("Failed to create output directory: " + outputDir.getAbsolutePath());
        }

        // Check if it's a JSON file
        if (isJsonFile(inputFile)) {
            return splitJsonFile(inputFile, outputDir, splitSizeMB);
        }

        // Process regular SQL file. All factors are long so the product cannot overflow int.
        long splitSizeBytes = splitSizeMB * 1024L * 1024L;

        try (RandomAccessFile raf = new RandomAccessFile(inputFile, "r")) {
            long fileLength = raf.length();
            long currentPos = 0;
            long chunkStartPos = 0;
            int consecutiveFailures = 0;
            int fileIndex = 1;
            long globalStartLineNo = 1;

            while (currentPos < fileLength) {
                Long lastValidatedLineNo = getLastLineFromCurrentPos(raf, currentPos, fileLength, splitSizeBytes, dbVendor);
                if (lastValidatedLineNo == null) break;

                // -1 is handled as "no validated statement in this window": advance and retry
                // with a window that grows with the failure count.
                // NOTE(review): currentPos advances on a failure while chunkStartPos does not,
                // so a later successful chunk starts at chunkStartPos but its line numbers are
                // counted from currentPos only - confirm this is intended.
                while (lastValidatedLineNo != null && lastValidatedLineNo == -1) {
                    consecutiveFailures++;
                    logger.warn("No validated statement found at position " + currentPos + " (consecutive failures: " + consecutiveFailures + ")");

                    long incrementSize = Math.min(splitSizeBytes, fileLength - currentPos);
                    long previousPos = currentPos;
                    currentPos = Math.min(currentPos + incrementSize, fileLength);

                    if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES) {
                        logger.error("Reached maximum consecutive failures (" + MAX_CONSECUTIVE_FAILURES + "), skipping this segment and moving to next");
                        long skipSize = splitSizeBytes * (consecutiveFailures + 1);
                        currentPos = Math.min(previousPos + skipSize, fileLength);
                        chunkStartPos = currentPos;
                        consecutiveFailures = 0; // Reset consecutive failure counter
                        logger.info("Skipped to position " + currentPos + ", continuing with next segment");
                        break;
                    }

                    lastValidatedLineNo = getLastLineFromCurrentPos(raf, currentPos, fileLength, splitSizeBytes * (consecutiveFailures + 1), dbVendor);
                }

                // Nothing more could be read.
                if (lastValidatedLineNo == null) {
                    break;
                }

                // A segment was skipped; start over from the new position.
                if (lastValidatedLineNo == -1) {
                    continue;
                }

                consecutiveFailures = 0;
                long endPos = findLineEndPosition(raf, currentPos, lastValidatedLineNo);
                long globalEndLineNo = globalStartLineNo + lastValidatedLineNo - 1;

                File splitFile = splitFileByPosition(outputDir, inputFile, chunkStartPos, endPos, fileIndex, globalStartLineNo, globalEndLineNo);
                splitFiles.add(splitFile);

                chunkStartPos = endPos;
                currentPos = endPos;
                globalStartLineNo = globalEndLineNo + 1;
                fileIndex++;

                if (currentPos >= fileLength) {
                    break;
                }
            }
        }

        return splitFiles;
    }

    /**
     * Check if file is a JSON file: true when the first non-whitespace character is '{'.
     * Read errors are logged and reported as "not JSON".
     */
    private static boolean isJsonFile(File file) {
        // Explicit charset: FileReader would fall back to the platform default.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            int c;
            // Skip whitespace characters
            while ((c = reader.read()) != -1) {
                if (!Character.isWhitespace(c)) {
                    // Check if first non-whitespace character is '{'
                    return c == '{';
                }
            }
        } catch (IOException e) {
            logger.error("Error reading file to check if it's JSON: " + file.getAbsolutePath(), e);
        }
        return false;
    }

    /**
     * Get file extension (text after the last '.'), or an empty string when there is no '.'.
     */
    private static String getFileExtension(String fileName) {
        int lastDotIndex = fileName.lastIndexOf('.');
        if (lastDotIndex == -1) {
            return "";
        }
        return fileName.substring(lastDotIndex + 1);
    }

    /**
     * Get file name without extension (text before the last '.'), or the whole name when
     * there is no '.'.
     */
    private static String getFileNameWithoutExtension(String fileName) {
        int lastDotIndex = fileName.lastIndexOf('.');
        if (lastDotIndex == -1) {
            return fileName;
        }
        return fileName.substring(0, lastDotIndex);
    }

    /**
     * Move file. An existing target is deleted first; when a plain rename fails the file is
     * copied and the source deleted.
     */
    private static void moveFile(File source, File target) throws IOException {
        if (!source.exists()) {
            throw new FileNotFoundException("Source file does not exist: " + source.getAbsolutePath());
        }
        if (target.exists() && !target.delete()) {
            throw new IOException("Failed to delete existing target file: " + target.getAbsolutePath());
        }
        if (!source.renameTo(target)) {
            // If renameTo fails, try copy and delete
            copyFile(source, target);
            if (!source.delete()) {
                throw new IOException("Failed to delete source file after copy: " + source.getAbsolutePath());
            }
        }
    }

    /**
     * Copy file byte for byte, overwriting the target.
     */
    private static void copyFile(File source, File target) throws IOException {
        try (FileInputStream fis = new FileInputStream(source);
             FileOutputStream fos = new FileOutputStream(target)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int bytesRead;
            while ((bytesRead = fis.read(buffer)) != -1) {
                fos.write(buffer, 0, bytesRead);
            }
        }
    }

    /**
     * Split JSON file. The {@code createdBy} field selects the format: values containing
     * "sqldep" or "grabit" use the top-level {@code queries} list, values containing "sqlflow"
     * use the per-server {@code queries} lists. Anything else yields an empty result.
     */
    private static List<File> splitJsonFile(File inputFile, File outputDir, int splitSizeMB) throws IOException {
        List<File> splitFiles = new ArrayList<>();

        // Read file content
        String jsonContent = readFileContent(inputFile);
        Object parsed = JSON.parseObject(jsonContent);
        Map<?, ?> content = parsed instanceof Map ? (Map<?, ?>) parsed : null;
        Object createdByValue = content == null ? null : content.get("createdBy");
        String createdBy = createdByValue instanceof String ? (String) createdByValue : null;

        if (createdBy == null) {
            // NOTE(review): the message says "treating as regular file", but no files are
            // produced on this path - confirm whether a fallback to the SQL split is wanted.
            logger.warn("JSON file does not contain 'createdBy' field, treating as regular file");
            return splitFiles;
        }

        // Locale.ROOT keeps the match independent of the default locale (e.g. Turkish dotless i).
        String creator = createdBy.toLowerCase(Locale.ROOT);
        if (creator.contains("sqldep") || creator.contains("grabit")) {
            splitFiles.addAll(splitSqldepGrabitJson(inputFile, outputDir, splitSizeMB, content, createdBy));
        } else if (creator.contains("sqlflow")) {
            splitFiles.addAll(splitSqlflowJson(inputFile, outputDir, splitSizeMB, content));
        } else {
            logger.warn("Unknown JSON file type with createdBy: " + createdBy);
        }

        return splitFiles;
    }

    /** Reads the whole input file through {@code SQLUtil.getFileContent}. */
    private static String readFileContent(File inputFile) {
        return SQLUtil.getFileContent(inputFile);
    }

    /**
     * Write JSON file: serializes {@code content} and writes it to {@code file}.
     */
    private static void writeJsonFile(File file, Object content) throws IOException {
        String jsonString = JSON.toJSONString(content);
        SQLUtil.writeToFile(file, jsonString);
    }

    /**
     * Split sqldep or grabit format JSON file. Writes one metadata file (the input without its
     * {@code queries} entry) followed by one file per batch of queries.
     */
    private static List<File> splitSqldepGrabitJson(File inputFile, File outputDir, int splitSizeMB, Map<?, ?> content, String createdBy) throws IOException {
        List<File> splitFiles = new ArrayList<>();
        @SuppressWarnings("unchecked")
        List<Map<?, ?>> queries = (List<Map<?, ?>>) content.get("queries");
        if (queries == null || queries.isEmpty()) {
            logger.warn("No queries found in JSON file");
            return splitFiles;
        }

        // Remove queries field, create metadata file
        @SuppressWarnings("unchecked")
        Map<String, Object> metadataContent = new LinkedHashMap<>((Map<String, Object>) content);
        metadataContent.remove("queries");

        String extension = getFileExtension(inputFile.getName());
        File metadataFile = createQueryFile(outputDir, inputFile, 0, 1, extension);
        writeJsonFile(metadataFile, metadataContent);
        splitFiles.add(metadataFile);
        logger.info("Created metadata file: " + metadataFile.getAbsolutePath());

        // Split queries
        Map<String, Object> header = new LinkedHashMap<>();
        header.put("createdBy", createdBy);
        header.put("dbvendor", content.get("dbvendor"));
        splitFiles.addAll(writeQueryBatches(outputDir, inputFile, extension, splitSizeMB * 1024L * 1024L, queries, header));

        return splitFiles;
    }

    /**
     * Split sqlflow format JSON file. The {@code queries} lists of all servers are collected
     * and removed from {@code content}, which is then written as the metadata file; the
     * collected queries are written in batches.
     */
    private static List<File> splitSqlflowJson(File inputFile, File outputDir, int splitSizeMB, Map<?, ?> content) throws IOException {
        List<File> splitFiles = new ArrayList<>();
        List<Map<?, ?>> queries = new ArrayList<>();

        // Extract all queries
        @SuppressWarnings("unchecked")
        List<Map<?, ?>> servers = (List<Map<?, ?>>) content.get("servers");
        if (servers != null) {
            for (Map<?, ?> serverObject : servers) {
                @SuppressWarnings("unchecked")
                List<Map<?, ?>> serverQueries = (List<Map<?, ?>>) serverObject.get("queries");
                if (serverQueries != null) {
                    queries.addAll(serverQueries);
                    serverObject.remove("queries");
                }
            }
        }

        if (queries.isEmpty()) {
            logger.warn("No queries found in SQLFlow JSON file");
            return splitFiles;
        }

        // Create metadata file
        String extension = getFileExtension(inputFile.getName());
        File metadataFile = createQueryFile(outputDir, inputFile, 0, 1, extension);
        writeJsonFile(metadataFile, content);
        splitFiles.add(metadataFile);
        logger.info("Created metadata file: " + metadataFile.getAbsolutePath());

        // Split queries
        Map<String, Object> header = new LinkedHashMap<>();
        header.put("createdBy", "grabit v1.7.0");
        splitFiles.addAll(writeQueryBatches(outputDir, inputFile, extension, splitSizeMB * 1024L * 1024L, queries, header));

        return splitFiles;
    }

    /**
     * Writes {@code queries} to numbered query files. A batch is closed as soon as the summed
     * {@code sourceCode} length (in chars) of its queries reaches {@code splitSizeBytes}.
     * Each file holds the {@code header} entries followed by an empty {@code databases} list
     * and the batch under {@code queries}.
     *
     * <p>File name indices follow the original numbering: after a batch is written the next
     * start index equals the index of the last query written, and the trailing batch is named
     * with an end index one past the last query.
     * NOTE(review): these ranges are neither consistently inclusive nor half-open; kept as-is
     * because the names may be interpreted elsewhere - confirm before changing.
     */
    private static List<File> writeQueryBatches(File outputDir, File inputFile, String extension, long splitSizeBytes, List<Map<?, ?>> queries, Map<String, Object> header) throws IOException {
        List<File> written = new ArrayList<>();
        List<Map<?, ?>> batch = new ArrayList<>();
        long length = 0; // long: compared against a long threshold
        long startIndex = 1;
        long endIndex = 1;

        for (Map<?, ?> item : queries) {
            batch.add(item);
            String sourceCode = (String) item.get("sourceCode");
            if (sourceCode != null) {
                length += sourceCode.length();
            }

            if (length >= splitSizeBytes) {
                written.add(writeQueryBatch(outputDir, inputFile, startIndex, endIndex, extension, header, batch));
                // Safe to reuse the list: the batch has already been serialized.
                batch.clear();
                length = 0;
                startIndex = endIndex;
            }
            endIndex++;
        }

        // Process remaining queries
        if (!batch.isEmpty()) {
            written.add(writeQueryBatch(outputDir, inputFile, startIndex, endIndex, extension, header, batch));
        }

        return written;
    }

    /** Serializes one batch of queries to its query file and returns that file. */
    private static File writeQueryBatch(File outputDir, File inputFile, long startIndex, long endIndex, String extension, Map<String, Object> header, List<Map<?, ?>> batch) throws IOException {
        File queryFile = createQueryFile(outputDir, inputFile, startIndex, endIndex, extension);
        Map<String, Object> jsonObject = new LinkedHashMap<>(header);
        jsonObject.put("databases", new ArrayList<>());
        jsonObject.put("queries", batch);
        writeJsonFile(queryFile, jsonObject);
        logger.info("Created query file: " + queryFile.getAbsolutePath());
        return queryFile;
    }

    /**
     * Create query file handle named {@code <base>_<startIndex>_<endIndex>[.<extension>]}
     * inside {@code outputDir}. Nothing is written to disk here.
     */
    private static File createQueryFile(File outputDir, File inputFile, long startIndex, long endIndex, String extension) {
        String fileName = getFileNameWithoutExtension(inputFile.getName()) + "_" + startIndex + "_" + endIndex + (extension.isEmpty() ? "" : "." + extension);
        return new File(outputDir, fileName);
    }

    /**
     * Reads a window of at most {@code splitSizeBytes} bytes starting at {@code currentPos},
     * decodes it as UTF-8, runs the raw statement scan of the SQL parser over it and returns
     * the parser's last validated line number, relative to the window.
     *
     * @return the value of {@code getLastLineNoOfLastStatementBeenValidated()} (callers treat
     *         -1 as "no validated statement"), or {@code null} when nothing could be read
     */
    private static Long getLastLineFromCurrentPos(RandomAccessFile raf, long currentPos, long fileLength, long splitSizeBytes, EDbVendor vendor) throws IOException {
        raf.seek(currentPos);

        long remainingBytes = fileLength - currentPos;
        long readSize = Math.min(splitSizeBytes, remainingBytes);
        // Clamp before narrowing: a widened retry window may exceed what an array can hold.
        int windowSize = (int) Math.min(readSize, (long) MAX_READ_WINDOW);
        if (windowSize <= 0) {
            return null;
        }

        byte[] buffer = new byte[windowSize];
        // A single read() may return fewer bytes than requested; keep reading until full or EOF.
        int bytesRead = 0;
        while (bytesRead < buffer.length) {
            int n = raf.read(buffer, bytesRead, buffer.length - bytesRead);
            if (n <= 0) {
                break;
            }
            bytesRead += n;
        }
        if (bytesRead <= 0) {
            return null;
        }

        String partialContent = new String(buffer, 0, bytesRead, StandardCharsets.UTF_8);
        if (partialContent.isEmpty()) {
            return null;
        }

        TGSqlParser parser = new TGSqlParser(vendor);
        parser.sqltext = partialContent;
        parser.getrawsqlstatements();

        long lastValidatedLineNo = parser.getLastLineNoOfLastStatementBeenValidated();
        return lastValidatedLineNo;
    }

    /**
     * Find end position of Nth line from specified byte position
     * Supports multiple newline formats: \n (Unix/Linux), \r\n (Windows), \r (Old Mac)
     * Uses buffer for batch reading to improve performance
     *
     * @return the byte offset just past the terminator of line {@code lineCount}, or the file
     *         length when fewer line terminators exist after {@code startPos}
     */
    private static long findLineEndPosition(RandomAccessFile raf, long startPos, long lineCount) throws IOException {
        raf.seek(startPos);
        long newlineCount = 0;
        long fileLength = raf.length();

        byte[] buffer = new byte[BUFFER_SIZE];
        long currentPos = startPos;

        while (currentPos < fileLength) {
            int toRead = (int) Math.min(buffer.length, fileLength - currentPos);
            int bytesRead = raf.read(buffer, 0, toRead);
            if (bytesRead <= 0) {
                break;
            }

            for (int i = 0; i < bytesRead; i++) {
                byte b = buffer[i];

                if (b == '\n') {
                    newlineCount++;
                    if (newlineCount == lineCount) {
                        return currentPos + i + 1;
                    }
                } else if (b == '\r') {
                    if (i + 1 < bytesRead) {
                        if (buffer[i + 1] == '\n') {
                            // \r\n inside the buffer counts as one terminator.
                            newlineCount++;
                            if (newlineCount == lineCount) {
                                return currentPos + i + 2;
                            }
                            i++;
                        } else {
                            newlineCount++;
                            if (newlineCount == lineCount) {
                                return currentPos + i + 1;
                            }
                        }
                    } else {
                        // '\r' is the last byte of the buffer: peek one byte ahead to see
                        // whether it is the first half of a \r\n pair.
                        long savedPos = raf.getFilePointer();
                        int nextByte = raf.read();

                        if (nextByte == '\n') {
                            newlineCount++;
                            if (newlineCount == lineCount) {
                                return currentPos + i + 2;
                            }
                            // The peeked '\n' has been consumed; account for it.
                            currentPos++;
                        } else {
                            newlineCount++;
                            if (newlineCount == lineCount) {
                                return currentPos + i + 1;
                            }
                            if (nextByte != -1) {
                                // Not part of the terminator: un-read the peeked byte.
                                raf.seek(savedPos);
                            }
                        }
                    }
                }
            }

            currentPos += bytesRead;
        }

        return fileLength;
    }

    /**
     * Split file by byte position: copies bytes {@code [startPos, endPos)} of the input into
     * {@code <base>_<fileIndex>_<startLineNo>_<endLineNo>[.<extension>]} in {@code outputDir}.
     *
     * <p>The bytes are copied verbatim. Decoding each buffer separately would corrupt any
     * multi-byte character that straddles a buffer boundary.
     */
    private static File splitFileByPosition(File outputDir, File inputFile, long startPos, long endPos, int fileIndex, long startLineNo, long endLineNo) throws IOException {
        String extension = getFileExtension(inputFile.getName());
        // Same rule as createQueryFile: no trailing dot when the input has no extension.
        String fileName = getFileNameWithoutExtension(inputFile.getName()) + "_" + fileIndex + "_" + startLineNo + "_" + endLineNo + (extension.isEmpty() ? "" : "." + extension);
        File outputFile = new File(outputDir, fileName);

        try (RandomAccessFile rafRead = new RandomAccessFile(inputFile, "r");
             OutputStream out = new FileOutputStream(outputFile)) {

            rafRead.seek(startPos);
            long bytesRemaining = endPos - startPos;

            byte[] buffer = new byte[BUFFER_SIZE];

            while (bytesRemaining > 0) {
                int toRead = (int) Math.min(buffer.length, bytesRemaining);
                int bytesRead = rafRead.read(buffer, 0, toRead);
                if (bytesRead <= 0) {
                    break;
                }

                out.write(buffer, 0, bytesRead);
                bytesRemaining -= bytesRead;
            }

            logger.info("split file " + inputFile.getName() + " (index: " + fileIndex + ", lines: " + startLineNo + "-" + endLineNo + ") from byte " + startPos + " to " + endPos + " to " + outputFile.getName());

            return outputFile;
        }
    }
}