001import java.io.*;
002import java.nio.file.*;
003import java.security.MessageDigest;
004import java.security.NoSuchAlgorithmException;
005import java.util.*;
006import java.util.regex.*;
007
/**
 * Command-line tool that removes duplicate YAML test files based on the MD5
 * hash of their SQL content.
 *
 * <p>The tool walks a directory tree. In every directory it reads each
 * {@code .yaml}/{@code .yml} file, extracts the value of the first {@code sql:}
 * key that follows the first {@code input:} key, hashes that value and deletes
 * all but one file per distinct SQL text. Duplicates are only detected among
 * files of the same directory; within a group the file whose name sorts first
 * is the one that is kept.
 *
 * <p>Extraction is a lightweight text scan, not a full YAML parser. It
 * understands block scalars ({@code |} and {@code >}), single- and
 * double-quoted strings, and plain scalars (inline, on the next line, and with
 * continuation lines). Files from which no SQL can be extracted are never
 * deleted.
 *
 * <p>Usage: {@code java removeDuplicateYamlTestFile <directory>}
 */
public class removeDuplicateYamlTestFile {

    /**
     * Matches the {@code input:} key. The look-behind rejects longer keys that
     * merely end in the same text, such as {@code test_input:}.
     */
    private static final Pattern INPUT_KEY = Pattern.compile("(?<![\\w-])input:");

    /**
     * Matches the {@code sql:} key. The look-behind rejects longer keys that
     * merely end in the same text, such as {@code mysql:}.
     */
    private static final Pattern SQL_KEY = Pattern.compile("(?<![\\w-])sql:");

    /** Number of columns a tab character counts for when measuring indentation. */
    private static final int TAB_WIDTH = 2;

    private static int totalFilesRemoved = 0;
    private static int totalFilesScanned = 0;

    /**
     * Entry point. Expects the directory to scan as the first argument and
     * exits with status 1 on a usage error or when processing fails.
     *
     * @param args command-line arguments; {@code args[0]} is the root directory
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Usage: java removeDuplicateYamlTestFile <directory>");
            System.out.println("  <directory> - The directory to scan for YAML files");
            System.exit(1);
        }

        String directoryPath = args[0];
        File rootDir = new File(directoryPath);

        if (!rootDir.exists()) {
            System.err.println("Error: Directory does not exist: " + directoryPath);
            System.exit(1);
        }

        if (!rootDir.isDirectory()) {
            System.err.println("Error: Path is not a directory: " + directoryPath);
            System.exit(1);
        }

        System.out.println("Scanning directory: " + rootDir.getAbsolutePath());
        System.out.println();

        try {
            processDirectory(rootDir);
        } catch (Exception e) {
            System.err.println("Error processing directory: " + e.getMessage());
            e.printStackTrace();
            System.exit(1);
        }

        System.out.println();
        System.out.println("=== Summary ===");
        System.out.println("Total YAML files scanned: " + totalFilesScanned);
        System.out.println("Total duplicate files removed: " + totalFilesRemoved);
    }

    /**
     * Processes {@code dir} and then, recursively, each of its subdirectories.
     * Duplicates are searched for within each directory separately.
     *
     * @param dir directory to process
     * @throws Exception if processing a directory fails
     */
    private static void processDirectory(File dir) throws Exception {
        // Only regular files: a directory that happens to be named "x.yaml"
        // must not be read (or counted) as a test file.
        FileFilter yamlOnly = removeDuplicateYamlTestFile::isYamlFile;
        File[] yamlFiles = dir.listFiles(yamlOnly);

        if (yamlFiles != null && yamlFiles.length > 0) {
            processYamlFilesInDirectory(dir, yamlFiles);
        }

        File[] subdirs = dir.listFiles(File::isDirectory);
        if (subdirs != null) {
            for (File subdir : subdirs) {
                processDirectory(subdir);
            }
        }
    }

    /** Returns true for regular files whose name ends in .yaml or .yml (any case). */
    private static boolean isYamlFile(File file) {
        if (!file.isFile()) {
            return false;
        }
        // Locale.ROOT keeps the case conversion independent of the default locale.
        String name = file.getName().toLowerCase(Locale.ROOT);
        return name.endsWith(".yaml") || name.endsWith(".yml");
    }

    /**
     * Groups the YAML files of one directory by the MD5 of their SQL and
     * deletes every file of a group except the one whose name sorts first.
     * Files that cannot be read, or that contain no SQL, are left untouched.
     *
     * @param dir       directory the files live in (used for reporting only)
     * @param yamlFiles YAML files directly inside {@code dir}
     * @throws Exception declared for callers; per-file failures are reported
     *                   as warnings and do not abort the run
     */
    private static void processYamlFilesInDirectory(File dir, File[] yamlFiles) throws Exception {
        // listFiles() returns entries in unspecified order. Sorting up front
        // makes both the kept file (first of each group) and the report order
        // deterministic.
        File[] ordered = yamlFiles.clone();
        Arrays.sort(ordered, Comparator.comparing(File::getName));

        // MD5 hash -> files with that hash, in file-name order.
        Map<String, List<File>> md5ToFiles = new LinkedHashMap<>();

        for (File yamlFile : ordered) {
            totalFilesScanned++;

            try {
                String sql = extractSqlFromYaml(yamlFile);
                if (sql == null || sql.trim().isEmpty()) {
                    // No SQL content: never a deletion candidate.
                    continue;
                }
                md5ToFiles.computeIfAbsent(calculateMD5(sql), k -> new ArrayList<>()).add(yamlFile);
            } catch (Exception e) {
                // Best effort: a single unreadable file must not stop the run.
                System.err.println("Warning: Could not process file: " + yamlFile.getName() + " - " + e.getMessage());
            }
        }

        int removedInDir = 0;
        for (List<File> files : md5ToFiles.values()) {
            File kept = files.get(0);
            for (File toRemove : files.subList(1, files.size())) {
                if (toRemove.delete()) {
                    System.out.println("Removed duplicate: " + toRemove.getAbsolutePath());
                    System.out.println("  (duplicate of: " + kept.getName() + ")");
                    removedInDir++;
                    totalFilesRemoved++;
                } else {
                    System.err.println("Warning: Failed to delete file: " + toRemove.getAbsolutePath());
                }
            }
        }

        if (removedInDir > 0) {
            System.out.println("Removed " + removedInDir + " duplicate(s) in: " + dir.getAbsolutePath());
            System.out.println();
        }
    }

    /**
     * Reads a YAML file as UTF-8 and extracts the SQL of its input section.
     *
     * @param yamlFile file to read
     * @return the SQL text, or {@code null} if none could be found
     * @throws IOException if the file cannot be read
     */
    private static String extractSqlFromYaml(File yamlFile) throws IOException {
        byte[] bytes = Files.readAllBytes(yamlFile.toPath());
        return extractSqlFromText(new String(bytes, java.nio.charset.StandardCharsets.UTF_8));
    }

    /**
     * Extracts the value of the first {@code sql:} key that follows the first
     * {@code input:} key in the given YAML text.
     *
     * @param yaml complete text of a YAML document
     * @return the SQL text, or {@code null} if the keys are missing or the
     *         value is empty or malformed
     */
    private static String extractSqlFromText(String yaml) {
        // CRLF and LF copies of the same test must hash identically.
        String content = yaml.replace("\r\n", "\n");

        Matcher input = INPUT_KEY.matcher(content);
        if (!input.find()) {
            return null;
        }
        Matcher sql = SQL_KEY.matcher(content);
        if (!sql.find(input.end())) {
            return null;
        }

        // Lines that belong to the value must be indented deeper than the key.
        int keyColumn = columnOf(content, sql.start());
        String rest = content.substring(sql.end());

        int valueStart = firstNonWhitespace(rest);
        if (valueStart == -1) {
            return null;
        }

        boolean onKeyLine = rest.lastIndexOf('\n', valueStart) == -1;
        if (!onKeyLine && columnOf(rest, valueStart) <= keyColumn) {
            // The next token is a sibling or parent key: the sql value is empty.
            return null;
        }

        int lineEnd = rest.indexOf('\n', valueStart);
        char first = rest.charAt(valueStart);

        if (first == '|' || first == '>') {
            // Literal or folded block scalar. The header line only carries the
            // indicator and optional modifiers; the content follows it.
            return lineEnd == -1 ? null : extractIndentedBlock(rest.substring(lineEnd + 1), keyColumn);
        }
        if (first == '"' || first == '\'') {
            return extractQuotedString(rest.substring(valueStart));
        }

        // Plain scalar: the remainder of this line plus any continuation lines.
        if (lineEnd == -1) {
            return rest.substring(valueStart).trim();
        }
        String firstLine = rest.substring(valueStart, lineEnd).trim();
        String continuation = extractIndentedBlock(rest.substring(lineEnd + 1), keyColumn);
        if (continuation == null || continuation.isEmpty()) {
            return firstLine;
        }
        return firstLine + "\n" + continuation;
    }

    /**
     * Collects the run of lines at the start of {@code body} that form one
     * indented block and removes their common indentation. The indentation of
     * the first non-blank line defines the block; the block ends at the first
     * non-blank line that is indented less.
     *
     * @param body         text starting on the line after the key or header
     * @param parentIndent column of the owning key; the block has to be
     *                     indented deeper than this
     * @return the trimmed block text, or {@code null} if there is no block
     */
    private static String extractIndentedBlock(String body, int parentIndent) {
        StringBuilder block = new StringBuilder();
        int baseIndent = -1;

        for (String line : body.split("\n", -1)) {
            if (line.trim().isEmpty()) {
                // Blank lines never end a block; outer ones are trimmed below.
                block.append('\n');
                continue;
            }

            int indent = indentWidth(line);
            if (baseIndent < 0) {
                if (indent <= parentIndent) {
                    return null;
                }
                baseIndent = indent;
            } else if (indent < baseIndent) {
                break;
            }
            block.append(stripIndent(line, baseIndent)).append('\n');
        }

        return baseIndent < 0 ? null : block.toString().trim();
    }

    /**
     * Extracts a quoted scalar. In double-quoted strings backslash escapes are
     * honoured; in single-quoted strings a doubled quote stands for one quote.
     *
     * @param content text whose first character is the opening quote
     * @return the unquoted value, or {@code null} if the closing quote is missing
     */
    private static String extractQuotedString(String content) {
        char quote = content.charAt(0);
        StringBuilder value = new StringBuilder();

        for (int i = 1; i < content.length(); i++) {
            char c = content.charAt(i);
            if (quote == '"' && c == '\\' && i + 1 < content.length()) {
                char escaped = content.charAt(++i);
                switch (escaped) {
                    case 'n':
                        value.append('\n');
                        break;
                    case 't':
                        value.append('\t');
                        break;
                    case 'r':
                        value.append('\r');
                        break;
                    case '"':
                    case '\\':
                    case '/':
                        value.append(escaped);
                        break;
                    default:
                        // Unknown escape: keep it verbatim rather than guess.
                        value.append('\\').append(escaped);
                        break;
                }
            } else if (c == quote) {
                boolean doubled = quote == '\'' && i + 1 < content.length() && content.charAt(i + 1) == '\'';
                if (!doubled) {
                    return value.toString();
                }
                value.append('\'');
                i++;
            } else {
                value.append(c);
            }
        }
        return null;
    }

    /** Returns the index of the first non-whitespace character of {@code text}, or -1. */
    private static int firstNonWhitespace(String text) {
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return i;
            }
        }
        return -1;
    }

    /** Returns the display column of {@code index} within its line, counting a tab as {@link #TAB_WIDTH}. */
    private static int columnOf(String text, int index) {
        int lineStart = text.lastIndexOf('\n', index - 1) + 1;
        int column = 0;
        for (int i = lineStart; i < index; i++) {
            column += text.charAt(i) == '\t' ? TAB_WIDTH : 1;
        }
        return column;
    }

    /** Returns the width of the leading indentation of {@code line}, counting a tab as {@link #TAB_WIDTH}. */
    private static int indentWidth(String line) {
        int width = 0;
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (c == ' ') {
                width++;
            } else if (c == '\t') {
                width += TAB_WIDTH;
            } else {
                break;
            }
        }
        return width;
    }

    /**
     * Removes leading indentation worth {@code width} columns from {@code line}.
     * Works on columns rather than characters so that tab-indented lines do
     * not lose the beginning of their content.
     */
    private static String stripIndent(String line, int width) {
        int index = 0;
        int consumed = 0;
        while (index < line.length() && consumed < width) {
            char c = line.charAt(index);
            if (c == ' ') {
                consumed++;
            } else if (c == '\t') {
                consumed += TAB_WIDTH;
            } else {
                break;
            }
            index++;
        }
        return line.substring(index);
    }

    /**
     * Calculates the MD5 hash of a string. Only surrounding whitespace is
     * ignored; the text is otherwise hashed as is (case-sensitively), encoded
     * as UTF-8 so the result does not depend on the platform charset.
     *
     * @param input text to hash
     * @return the digest as 32 lowercase hexadecimal characters
     * @throws NoSuchAlgorithmException if the runtime provides no MD5 digest
     */
    private static String calculateMD5(String input) throws NoSuchAlgorithmException {
        MessageDigest md = MessageDigest.getInstance("MD5");
        byte[] digest = md.digest(input.trim().getBytes(java.nio.charset.StandardCharsets.UTF_8));

        StringBuilder hexString = new StringBuilder(digest.length * 2);
        for (byte b : digest) {
            hexString.append(String.format("%02x", b & 0xff));
        }
        return hexString.toString();
    }
}