import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
import java.util.regex.*;

/**
 * Tool to remove duplicate YAML test files based on MD5 hash of SQL content.
 *
 * This tool scans a directory recursively for YAML files, extracts the SQL
 * from the input section, calculates MD5 hash, and removes duplicates within
 * the same directory, keeping only one file per unique SQL.
 *
 * The YAML is not parsed with a real YAML parser; the SQL is located with plain
 * text searches for {@code input:} followed by {@code sql:}. Files whose SQL
 * cannot be extracted (or is empty) are never deleted.
 *
 * Usage: java removeDuplicateYamlTestFile <directory>
 */
public class removeDuplicateYamlTestFile {

    private static int totalFilesRemoved = 0;
    private static int totalFilesScanned = 0;

    /**
     * Command-line entry point.
     *
     * Validates that {@code args[0]} names an existing directory, processes it
     * recursively and prints a summary. Exits with status 1 on bad usage, on an
     * invalid directory, or if processing throws.
     *
     * @param args command-line arguments; {@code args[0]} is the directory to scan
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Usage: java removeDuplicateYamlTestFile <directory>");
            System.out.println("  <directory> - The directory to scan for YAML files");
            System.exit(1);
        }

        String directoryPath = args[0];
        File rootDir = new File(directoryPath);

        if (!rootDir.exists()) {
            System.err.println("Error: Directory does not exist: " + directoryPath);
            System.exit(1);
        }

        if (!rootDir.isDirectory()) {
            System.err.println("Error: Path is not a directory: " + directoryPath);
            System.exit(1);
        }

        System.out.println("Scanning directory: " + rootDir.getAbsolutePath());
        System.out.println();

        try {
            processDirectory(rootDir);
        } catch (Exception e) {
            System.err.println("Error processing directory: " + e.getMessage());
            e.printStackTrace();
            System.exit(1);
        }

        System.out.println();
        System.out.println("=== Summary ===");
        System.out.println("Total YAML files scanned: " + totalFilesScanned);
        System.out.println("Total duplicate files removed: " + totalFilesRemoved);
    }

    /**
     * Process all subdirectories recursively.
     * For each directory, find duplicates within that directory only.
     *
     * @param dir the directory to process
     * @throws Exception if processing a directory fails
     */
    private static void processDirectory(File dir) throws Exception {
        // Collect the YAML files directly inside this directory (not subdirectories).
        // Only regular files qualify: a directory that happens to be named "x.yaml"
        // is not a test file and is handled by the recursion below instead.
        File[] yamlFiles = dir.listFiles((d, name) -> {
            String lower = name.toLowerCase(Locale.ROOT);
            return (lower.endsWith(".yaml") || lower.endsWith(".yml")) && new File(d, name).isFile();
        });

        if (yamlFiles != null && yamlFiles.length > 0) {
            processYamlFilesInDirectory(dir, yamlFiles);
        }

        // Then recursively process subdirectories
        File[] subdirs = dir.listFiles(File::isDirectory);
        if (subdirs != null) {
            for (File subdir : subdirs) {
                processDirectory(subdir);
            }
        }
    }

    /**
     * Process YAML files within a single directory, removing duplicates.
     *
     * Files are grouped by the MD5 of their extracted SQL. Within each group the
     * file whose name sorts first is kept and the others are deleted. Files that
     * cannot be read or that contain no SQL are skipped (with a warning for errors).
     *
     * @param dir       the directory the files live in (used for reporting only)
     * @param yamlFiles the YAML files found directly inside {@code dir}
     * @throws Exception declared for callers; per-file failures are reported as warnings
     */
    private static void processYamlFilesInDirectory(File dir, File[] yamlFiles) throws Exception {
        // Sort by filename up front so that both the kept file and the order of the
        // log output are deterministic (File.listFiles gives no ordering guarantee).
        File[] ordered = yamlFiles.clone();
        Arrays.sort(ordered, Comparator.comparing(File::getName));

        // Map: MD5 hash -> files with that hash, in filename order
        Map<String, List<File>> md5ToFiles = new LinkedHashMap<>();

        for (File yamlFile : ordered) {
            totalFilesScanned++;

            try {
                String sql = extractSqlFromYaml(yamlFile);
                if (sql == null || sql.trim().isEmpty()) {
                    // Skip files without SQL content
                    continue;
                }

                String md5 = calculateMD5(sql);
                md5ToFiles.computeIfAbsent(md5, k -> new ArrayList<>()).add(yamlFile);
            } catch (Exception e) {
                // Best effort: one unreadable file must not abort the whole run.
                System.err.println("Warning: Could not process file: " + yamlFile.getName() + " - " + e.getMessage());
            }
        }

        // Remove duplicates (keep the first file, remove the rest)
        int removedInDir = 0;
        for (List<File> files : md5ToFiles.values()) {
            File kept = files.get(0);
            for (File toRemove : files.subList(1, files.size())) {
                try {
                    Files.delete(toRemove.toPath());
                    System.out.println("Removed duplicate: " + toRemove.getAbsolutePath());
                    System.out.println("  (duplicate of: " + kept.getName() + ")");
                    removedInDir++;
                    totalFilesRemoved++;
                } catch (IOException e) {
                    System.err.println("Warning: Failed to delete file: " + toRemove.getAbsolutePath()
                            + " - " + e.getMessage());
                }
            }
        }

        if (removedInDir > 0) {
            System.out.println("Removed " + removedInDir + " duplicate(s) in: " + dir.getAbsolutePath());
            System.out.println();
        }
    }

    /**
     * Extract SQL content from the input section of a YAML file.
     * Uses simple text parsing to handle the YAML structure.
     *
     * The first {@code sql:} that appears after the first {@code input:} is used.
     * Supported value forms are block scalars ({@code |} and {@code >}, with any
     * header modifiers), single- or double-quoted strings, a plain value on the
     * same line, and an indented value starting on the next line.
     *
     * @param yamlFile the file to read (decoded as UTF-8)
     * @return the extracted SQL, or {@code null} if no SQL value could be located
     * @throws IOException if the file cannot be read
     */
    private static String extractSqlFromYaml(File yamlFile) throws IOException {
        String content = new String(Files.readAllBytes(yamlFile.toPath()), StandardCharsets.UTF_8);

        // First, try to find "input:" section
        int inputIndex = content.indexOf("input:");
        if (inputIndex == -1) {
            return null;
        }

        String afterInput = content.substring(inputIndex + "input:".length());

        // Find "sql:" within the input section
        int sqlIndex = afterInput.indexOf("sql:");
        if (sqlIndex == -1) {
            return null;
        }

        String afterSql = afterInput.substring(sqlIndex + "sql:".length());

        // Determine the scalar style from the first non-whitespace character
        String trimmed = afterSql.trim();

        if (trimmed.startsWith("|") || trimmed.startsWith(">")) {
            // Block scalar (literal or folded) - extract indented content.
            // Folded scalars must go here too: treating ">" as an inline value
            // would make every folded file look identical.
            return extractBlockScalar(afterSql);
        } else if (trimmed.startsWith("\"") || trimmed.startsWith("'")) {
            // Quoted string
            return extractQuotedString(trimmed);
        } else {
            // Could be inline or next line
            return extractInlineOrNextLine(afterSql);
        }
    }

    /**
     * Extract content from a YAML block scalar ({@code |} or {@code >}).
     *
     * The rest of the header line (chomping/indentation modifiers, comments) is
     * ignored. The indentation of the first non-blank content line defines the
     * block; the block ends at the first non-blank line indented less than that.
     * Line folding for {@code >} is not applied: the text is only used for
     * hashing, so the lines are kept as written.
     *
     * @param content text whose first non-whitespace character is the block indicator
     * @return the de-indented, trimmed block text (possibly empty), or {@code null}
     *         if there is no indicator, no line after the header, or the first
     *         content line is not indented
     */
    private static String extractBlockScalar(String content) {
        int indicator = 0;
        while (indicator < content.length() && Character.isWhitespace(content.charAt(indicator))) {
            indicator++;
        }
        if (indicator == content.length()) {
            return null;
        }
        char style = content.charAt(indicator);
        if (style != '|' && style != '>') {
            return null;
        }

        // Everything up to the first line break belongs to the block header.
        int headerEnd = content.indexOf('\n', indicator);
        if (headerEnd == -1) {
            // Header only, no content lines follow
            return null;
        }

        String[] lines = content.substring(headerEnd + 1).split("\n");
        StringBuilder result = new StringBuilder();
        int baseIndent = -1; // unknown until the first non-blank line is seen

        for (String line : lines) {
            if (line.trim().isEmpty()) {
                // Blank lines never terminate a block and never define its indent
                result.append('\n');
                continue;
            }

            int lineIndent = countLeadingIndent(line);
            if (baseIndent == -1) {
                if (lineIndent == 0) {
                    // No indentation found, the block is empty or malformed
                    return null;
                }
                baseIndent = lineIndent;
            }

            if (lineIndent < baseIndent) {
                // End of block (less indentation and non-empty)
                break;
            }
            result.append(line, baseIndent, line.length()).append('\n');
        }

        return result.toString().trim();
    }

    /**
     * Count the leading space/tab characters of a line.
     * Each character counts as one so the result can be used directly as a
     * {@code substring} offset.
     */
    private static int countLeadingIndent(String line) {
        int count = 0;
        while (count < line.length() && (line.charAt(count) == ' ' || line.charAt(count) == '\t')) {
            count++;
        }
        return count;
    }

    /**
     * Extract a quoted string value.
     *
     * Escaped quotes do not terminate the value: a backslash escapes the next
     * character inside double quotes, and a doubled {@code ''} stands for a quote
     * inside single quotes. The text between the quotes is returned as written
     * (escape sequences are not decoded).
     *
     * @param content text starting with the opening quote character
     * @return the text between the quotes, or {@code null} if the string is empty
     *         or the closing quote is missing
     */
    private static String extractQuotedString(String content) {
        if (content.isEmpty()) {
            return null;
        }
        char quote = content.charAt(0);
        int i = 1;
        while (i < content.length()) {
            char c = content.charAt(i);
            if (quote == '"' && c == '\\') {
                // Skip the escaped character, whatever it is
                i += 2;
                continue;
            }
            if (c == quote) {
                boolean doubledSingleQuote = quote == '\''
                        && i + 1 < content.length()
                        && content.charAt(i + 1) == '\'';
                if (doubledSingleQuote) {
                    i += 2;
                    continue;
                }
                return content.substring(1, i);
            }
            i++;
        }
        return null;
    }

    /**
     * Extract inline value or value on next line.
     *
     * If anything follows {@code sql:} on the same line, that single line is the
     * value (continuation lines of a multi-line plain scalar are not included).
     * Otherwise the following indented lines are read like a block scalar.
     *
     * @param content the text immediately after {@code sql:}
     * @return the extracted value, or {@code null} if none was found
     */
    private static String extractInlineOrNextLine(String content) {
        String[] lines = content.split("\n", 2);
        String firstLine = lines[0].trim();

        if (!firstLine.isEmpty()) {
            return firstLine;
        }

        if (lines.length > 1) {
            // Prepend a complete header line ("|" plus line break); without the line
            // break the first content line would be consumed as the block header.
            return extractBlockScalar("|\n" + lines[1]);
        }

        return null;
    }

    /**
     * Calculate MD5 hash of a string.
     *
     * @param input the text to hash
     * @return the MD5 digest of the trimmed, UTF-8 encoded input as 32 lowercase hex digits
     * @throws NoSuchAlgorithmException if the MD5 algorithm is not available
     */
    private static String calculateMD5(String input) throws NoSuchAlgorithmException {
        MessageDigest md = MessageDigest.getInstance("MD5");

        // Normalize the SQL: only surrounding whitespace is removed. Case is kept
        // as-is because string literals in SQL are case-sensitive.
        String normalized = input.trim();

        // Fixed charset so the hash does not depend on the platform default
        byte[] digest = md.digest(normalized.getBytes(StandardCharsets.UTF_8));

        // Convert to hex string
        StringBuilder hexString = new StringBuilder(digest.length * 2);
        for (byte b : digest) {
            String hex = Integer.toHexString(0xff & b);
            if (hex.length() == 1) {
                hexString.append('0');
            }
            hexString.append(hex);
        }

        return hexString.toString();
    }
}