package gudusoft.gsqlparser.tools;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * 1. Take a JSON file as input.
 * 2. Read the JSON file created by CountSql.java, one record per line, with this structure:
 *   {"dbVendor":"dbvathena","statementType":"sstselect","sqlSize":156,"uniqueHashId":"0204e249ff1a6ae3f49db491e9709f13762314e95080e988d3ef99fe6b9c1773","sqlStatement":"WITH\r\ndataset AS (\r\n  SELECT ARRAY [1,2,2,3,3,4,5] AS items\r\n)\r\nSELECT array_agg(distinct i) AS array_items\r\nFROM dataset\r\nCROSS JOIN UNNEST(items) AS t(i);"}
 * 3. Treat SQL statements that share the same uniqueHashId as duplicates of one another.
 * 4. Count the duplicate SQL statements under each uniqueHashId; report only those that occur more than once.
 * 5. List the uniqueHashId, statementType, and sqlStatement of the duplicates, ordered by duplicate count from high to low.
 * 6. Write the result to a CSV file, named duplicate_sql_statements.csv by default.
 *
 * Sample output:
 * Total JSON records processed: 54617
 * Analysis complete. Results written to:
 * - Summary CSV: c:\prg\tmp\o.csv
 * - Detailed CSV with SQL examples: c:\prg\tmp\o_with_sql.csv
 * Total SQL statements processed: 54617
 * Unique SQL statement types: 24669
 * Found 9257 duplicate statement types.
 * Total duplicate statements: 29948
 *
 */
public class ManageSqlInJson {
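
    // Illustrative invocation (a sketch: the input file name below is hypothetical, the output
    // paths follow the sample run shown in the class comment). The input must contain one JSON
    // record per line, in the format produced by CountSql.java:
    //
    //   java gudusoft.gsqlparser.tools.ManageSqlInJson c:\prg\tmp\statements.json c:\prg\tmp\o.csv
    //
    // This writes the summary CSV to c:\prg\tmp\o.csv and the detailed CSV (with example SQL)
    // to c:\prg\tmp\o_with_sql.csv.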
    
    // Holds the metadata of one distinct SQL statement (keyed by uniqueHashId) parsed from the JSON input
    private static class SqlStatementInfo {
        String uniqueHashId;
        String statementType;
        String dbVendor;
        String sqlStatement;
        int count = 1; // number of times this uniqueHashId has been seen
        
        public SqlStatementInfo(String uniqueHashId, String statementType, String dbVendor, String sqlStatement) {
            this.uniqueHashId = uniqueHashId;
            this.statementType = statementType;
            this.dbVendor = dbVendor;
            this.sqlStatement = sqlStatement;
        }
    }
    
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Usage: java ManageSqlInJson <json_file_path> [output_csv_path]");
            return;
        }
        
        String jsonFilePath = args[0];
        String outputCsvPath = args.length > 1 ? args[1] : "duplicate_sql_statements.csv";
        
        try {
            // Process the JSON file and identify duplicates
            Map<String, SqlStatementInfo> statementMap = findDuplicateStatements(jsonFilePath);
            
            // Export results to CSV
            exportDuplicatesToCsv(statementMap, outputCsvPath);
            
        } catch (IOException e) {
            System.err.println("Error processing file: " + e.getMessage());
        }
    }
    
    /**
     * Find duplicate SQL statements in the JSON file by counting occurrences of each uniqueHashId
     * @param jsonFilePath Path to the JSON file
     * @return Map of uniqueHashId to statement information with occurrence counts (all statements, not only duplicates)
     * @throws IOException If there's an error reading the file
     */
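    // Example (illustrative): if the input contains three lines whose uniqueHashId is
    // "0204e249...", the returned map holds a single SqlStatementInfo for that hash with
    // count == 3, keeping the statementType, dbVendor and sqlStatement of the first line
    // seen as the representative sample.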
    private static Map<String, SqlStatementInfo> findDuplicateStatements(String jsonFilePath) throws IOException {
        Map<String, SqlStatementInfo> statementMap = new HashMap<>();
        int totalProcessed = 0;
        
        // Read the JSON file line by line
        try (BufferedReader reader = Files.newBufferedReader(Paths.get(jsonFilePath))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                
                totalProcessed++;
                
                // Extract information from JSON
                String uniqueHashId = extractJsonField(line, "uniqueHashId");
                String statementType = extractJsonField(line, "statementType");
                String dbVendor = extractJsonField(line, "dbVendor");
                String sqlStatement = extractJsonField(line, "sqlStatement");
                
                if (uniqueHashId == null || statementType == null || dbVendor == null || sqlStatement == null) {
                    System.err.println("Warning: Invalid JSON line: " + line);
                    continue;
                }
                
                // Count occurrences of each hash ID
                if (statementMap.containsKey(uniqueHashId)) {
                    statementMap.get(uniqueHashId).count++;
                    // We already have a sample statement, no need to store this one
                } else {
                    statementMap.put(uniqueHashId, new SqlStatementInfo(uniqueHashId, statementType, dbVendor, sqlStatement));
                }
            }
        }
        
        System.out.println("Total JSON records processed: " + totalProcessed);
        return statementMap;
    }
    
    /**
     * Extract a field value from a single-line JSON record using simple string matching
     * @param json JSON string
     * @param fieldName Name of the field to extract
     * @return Value of the field or null if not found
     */
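    // Example (illustrative, using the sample record from the class comment):
    //   extractJsonField(line, "dbVendor")      -> "dbvathena"
    //   extractJsonField(line, "statementType") -> "sstselect"
    //   extractJsonField(line, "sqlSize")       -> "156"   (numeric field, no surrounding quotes)
    //   extractJsonField(line, "sqlStatement")  -> the SQL text with JSON escape sequences kept as-is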
    private static String extractJsonField(String json, String fieldName) {
        String fieldPattern = "\"" + fieldName + "\":\"";
        int startIndex = json.indexOf(fieldPattern);
        
        if (startIndex == -1) {
            // Handle numeric fields (no quotes)
            if (fieldName.equals("sqlSize")) {
                fieldPattern = "\"" + fieldName + "\":";
                startIndex = json.indexOf(fieldPattern);
                if (startIndex == -1) {
                    return null;
                }
                
                startIndex += fieldPattern.length();
                int endIndex = json.indexOf(",", startIndex);
                if (endIndex == -1) {
                    endIndex = json.indexOf("}", startIndex);
                }
                
                if (endIndex == -1) {
                    return null;
                }
                
                return json.substring(startIndex, endIndex).trim();
            }
            return null;
        }
        
        startIndex += fieldPattern.length();
        
        // For SQL statement field, we need to handle escaped quotes
        if (fieldName.equals("sqlStatement")) {
            // Find the closing quote, handling escaped quotes
            StringBuilder value = new StringBuilder();
            boolean escaped = false;
            for (int i = startIndex; i < json.length(); i++) {
                char c = json.charAt(i);
                
                if (escaped) {
                    value.append(c);
                    escaped = false;
                } else if (c == '\\') {
                    value.append(c);
                    escaped = true;
                } else if (c == '"') {
                    break; // End of the string value
                } else {
                    value.append(c);
                }
            }
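            // Note: escape sequences in the JSON value (such as \" or \r\n) are appended verbatim;
            // the returned SQL text is not JSON-unescaped here.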
            return value.toString();
        } else {
            // Simple string field
            int endIndex = json.indexOf("\"", startIndex);
            
            if (endIndex == -1) {
                return null;
            }
            
            return json.substring(startIndex, endIndex);
        }
    }
    
    /**
     * Export duplicate SQL statements to CSV (a summary file plus a detailed file that includes example SQL)
     * @param statementMap Map of statement hash IDs to statement information
     * @param outputPath Path to the output CSV file
     * @throws IOException If there's an error writing the file
     */
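    // Rough shape of the output (header rows come straight from the code below; the data row
    // values are hypothetical):
    //
    //   Summary Statistics
    //   Total SQL Statements Processed,<total>
    //   Unique SQL Statement Types,<unique>
    //   Duplicate SQL Statement Types,<duplicateTypes>
    //   Total Duplicate Statements,<totalDuplicates>
    //
    //   Duplicate SQL Statements Details
    //   UniqueHashId,StatementType,DbVendor,DuplicateCount
    //   0204e249...,sstselect,dbvathena,3
    //
    // The *_with_sql.csv file has the same layout plus a quoted "Example SQL Statement" column.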
    private static void exportDuplicatesToCsv(Map<String, SqlStatementInfo> statementMap, String outputPath) throws IOException {
        // Filter for duplicates (count > 1) and sort by count (high to low)
        List<SqlStatementInfo> duplicates = statementMap.values().stream()
                .filter(info -> info.count > 1)
                .sorted((a, b) -> Integer.compare(b.count, a.count))
                .collect(Collectors.toList());
        
        // Calculate statistics
        int totalStatements = statementMap.values().stream().mapToInt(info -> info.count).sum();
        int totalDuplicates = duplicates.stream().mapToInt(info -> info.count).sum() - duplicates.size();
        int uniqueStatements = statementMap.size() - duplicates.size();
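        // Note: uniqueStatements counts the statement types that occur exactly once, whereas the
        // console summary below reports statementMap.size(), i.e. all distinct types (duplicated
        // or not); the two figures are different quantities.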
        
        // Create a separate file for the detailed SQL output
        String sqlOutputPath = outputPath.replace(".csv", "_with_sql.csv");
        
        // Write to CSV
        try (PrintWriter writer = new PrintWriter(new FileWriter(outputPath));
             PrintWriter sqlWriter = new PrintWriter(new FileWriter(sqlOutputPath))) {
            
            // Write statistics header to both files
            for (PrintWriter w : new PrintWriter[]{writer, sqlWriter}) {
                w.println("Summary Statistics");
                w.println("Total SQL Statements Processed," + totalStatements);
                w.println("Unique SQL Statement Types," + uniqueStatements);
                w.println("Duplicate SQL Statement Types," + duplicates.size());
                w.println("Total Duplicate Statements," + totalDuplicates);
                w.println();
            }
            
            // Header for duplicate details (regular CSV)
            writer.println("Duplicate SQL Statements Details");
            writer.println("UniqueHashId,StatementType,DbVendor,DuplicateCount");
            
            // Header for detailed output with SQL (separate file)
            sqlWriter.println("Duplicate SQL Statements Details (With SQL Examples)");
            sqlWriter.println("UniqueHashId,StatementType,DbVendor,DuplicateCount,Example SQL Statement");
            
            // Data rows
            for (SqlStatementInfo info : duplicates) {
                // Regular CSV (without SQL)
                writer.println(String.format("%s,%s,%s,%d",
                        info.uniqueHashId,
                        info.statementType,
                        info.dbVendor,
                        info.count));
                
                // Detailed CSV (with SQL)
                sqlWriter.println(String.format("%s,%s,%s,%d,\"%s\"",
                        info.uniqueHashId,
                        info.statementType,
                        info.dbVendor,
                        info.count,
                        escapeForCsv(info.sqlStatement)));
            }
            
            // Print processing summary to console
            System.out.println("Analysis complete. Results written to:");
            System.out.println("- Summary CSV: " + outputPath);
            System.out.println("- Detailed CSV with SQL examples: " + sqlOutputPath);
            System.out.println("Total SQL statements processed: " + totalStatements);
            System.out.println("Unique SQL statement types: " + statementMap.size());
            System.out.println("Found " + duplicates.size() + " duplicate statement types.");
            System.out.println("Total duplicate statements: " + totalDuplicates);
        }
    }
    
    /**
     * Escape a string so it can be embedded in a double-quoted CSV field
     * @param value The string to escape
     * @return Escaped string safe for CSV (quotes doubled, newlines replaced by spaces)
     */
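    // Example (illustrative): an input containing  SELECT "a"  followed by a line break and  FROM t
    // comes back as  SELECT ""a"" FROM t  -- quotes doubled, the line break replaced by a space.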
    private static String escapeForCsv(String value) {
        if (value == null) {
            return "";
        }
        
        // Replace double quotes with two double quotes (CSV standard)
        value = value.replace("\"", "\"\"");
        
        // Replace newlines with space to keep CSV record on one line
        value = value.replace("\r\n", " ").replace("\n", " ").replace("\r", " ");
        
        return value;
    }
}