Source code

001package gudusoft.gsqlparser.ir.semantic.diff;
002
003import java.util.ArrayList;
004import java.util.Collections;
005import java.util.HashMap;
006import java.util.HashSet;
007import java.util.LinkedHashSet;
008import java.util.List;
009import java.util.Map;
010import java.util.Set;
011
012/**
013 * Pure comparison: two {@link CanonicalLineageModel}s in, one
014 * {@link DivergenceReport} out. Pass order matters and is documented
015 * inline below — see the slice-7 plan §"Comparison passes".
016 */
017public final class DivergenceReporter {
018
019    private DivergenceReporter() {}
020
021    /**
022     * @param sqlName  human-readable name (e.g. "01_alias_collision")
023     * @param ir       result of projecting the Semantic IR
024     * @param dlineage result of projecting the dlineage XML
025     */
026    public static DivergenceReport report(String sqlName, ProjectorResult ir, ProjectorResult dlineage) {
027        if (sqlName == null || sqlName.isEmpty()) {
028            throw new IllegalArgumentException("sqlName must be non-empty");
029        }
030        if (ir == null || dlineage == null) {
031            throw new IllegalArgumentException("ir and dlineage results must not be null");
032        }
033
034        // Unsupported-side short-circuit. We still produce both models in the
035        // report (empty for the failing side) so the JSON shape is uniform.
036        List<Divergence> divs = new ArrayList<>();
037        if (!ir.isSupported()) {
038            divs.add(new Divergence(DivergenceClass.UNSUPPORTED_BY_IR, Divergence.QUERY_WIDE,
039                    detailFor(ir)));
040        }
041        if (!dlineage.isSupported()) {
042            divs.add(new Divergence(DivergenceClass.UNSUPPORTED_BY_DLINEAGE, Divergence.QUERY_WIDE,
043                    detailFor(dlineage)));
044        }
045        if (!ir.isSupported() || !dlineage.isSupported()) {
046            // When either side is unsupported the per-edge passes don't make
047            // sense; the query-wide divergence is the only one reported.
048            return new DivergenceReport(sqlName, ir.getModel(), dlineage.getModel(),
049                    sortDivergences(divs));
050        }
051
052        CanonicalLineageModel a = ir.getModel();
053        CanonicalLineageModel b = dlineage.getModel();
054
055        // Pass 1: output-presence symmetric difference.
056        // Drop SELECT edges for one-sided outputs so later passes don't double-count.
057        Set<String> aOnly = new LinkedHashSet<>(a.getOutputNames());
058        aOnly.removeAll(b.getOutputNames());
059        Set<String> bOnly = new LinkedHashSet<>(b.getOutputNames());
060        bOnly.removeAll(a.getOutputNames());
061        for (String n : aOnly) {
062            divs.add(new Divergence(DivergenceClass.IR_EXTRA_DEPENDENCY, n,
063                    Divergence.DETAIL_OUTPUT_PRESENT));
064        }
065        for (String n : bOnly) {
066            divs.add(new Divergence(DivergenceClass.IR_MISSING_DEPENDENCY, n,
067                    Divergence.DETAIL_OUTPUT_PRESENT));
068        }
069
070        Set<String> commonOutputs = new HashSet<>(a.getOutputNames());
071        commonOutputs.retainAll(b.getOutputNames());
072
073        Set<CanonicalLineageEdge> aEdges = filterEdgesForCommonOutputs(a.getEdges(), commonOutputs);
074        Set<CanonicalLineageEdge> bEdges = filterEdgesForCommonOutputs(b.getEdges(), commonOutputs);
075
076        // Pass 2: aggregation pass over common outputs.
077        for (String name : new java.util.TreeSet<>(commonOutputs)) {
078            Boolean ia = a.getAggregateByOutput().get(name);
079            Boolean da = b.getAggregateByOutput().get(name);
080            if (ia == null || da == null) continue;
081            if (!ia.equals(da)) {
082                divs.add(new Divergence(DivergenceClass.AGGREGATION_MISMATCH, name,
083                        "ir=" + ia + ",dlineage=" + da));
084            }
085        }
086
087        // Working sets we'll whittle down with binding/role passes.
088        Set<CanonicalLineageEdge> aRemaining = new LinkedHashSet<>(aEdges);
089        Set<CanonicalLineageEdge> bRemaining = new LinkedHashSet<>(bEdges);
090
091        // Pass 3: binding mismatch.
092        // Group by (role, outputName, baseColumn). If one side has exactly
093        // one edge and the other has exactly one edge with a different
094        // baseTable, that's a single binding mismatch. Drop both edges.
095        Map<String, List<CanonicalLineageEdge>> aByBindKey = groupBy(aRemaining, DivergenceReporter::bindingKey);
096        Map<String, List<CanonicalLineageEdge>> bByBindKey = groupBy(bRemaining, DivergenceReporter::bindingKey);
097        for (String key : new java.util.TreeSet<>(union(aByBindKey.keySet(), bByBindKey.keySet()))) {
098            List<CanonicalLineageEdge> al = aByBindKey.getOrDefault(key, Collections.emptyList());
099            List<CanonicalLineageEdge> bl = bByBindKey.getOrDefault(key, Collections.emptyList());
100            if (al.size() == 1 && bl.size() == 1) {
101                CanonicalLineageEdge ae = al.get(0);
102                CanonicalLineageEdge be = bl.get(0);
103                if (!ae.getBaseTable().equals(be.getBaseTable())) {
104                    divs.add(new Divergence(DivergenceClass.BINDING_MISMATCH,
105                            ae.getOutputName(),
106                            ae.getRole() + " " + ae.getBaseColumn()
107                                    + " ir=" + ae.getBaseTable()
108                                    + ",dlineage=" + be.getBaseTable()));
109                    aRemaining.remove(ae);
110                    bRemaining.remove(be);
111                }
112            }
113        }
114
115        // Pass 4: role mismatch (FILTER vs JOIN).
116        // Group by (outputName, baseTable, baseColumn) without role.
117        Map<String, List<CanonicalLineageEdge>> aByRoleKey = groupBy(aRemaining, DivergenceReporter::roleAgnosticKey);
118        Map<String, List<CanonicalLineageEdge>> bByRoleKey = groupBy(bRemaining, DivergenceReporter::roleAgnosticKey);
119        for (String key : new java.util.TreeSet<>(union(aByRoleKey.keySet(), bByRoleKey.keySet()))) {
120            List<CanonicalLineageEdge> al = aByRoleKey.getOrDefault(key, Collections.emptyList());
121            List<CanonicalLineageEdge> bl = bByRoleKey.getOrDefault(key, Collections.emptyList());
122            if (al.size() == 1 && bl.size() == 1) {
123                CanonicalLineageEdge ae = al.get(0);
124                CanonicalLineageEdge be = bl.get(0);
125                if (ae.getRole() != be.getRole()) {
126                    divs.add(new Divergence(DivergenceClass.FILTER_OR_JOIN_SCOPE_MISMATCH,
127                            ae.getOutputName(),
128                            ae.getBaseTable() + "." + ae.getBaseColumn()
129                                    + " ir=" + ae.getRole() + ",dlineage=" + be.getRole()));
130                    aRemaining.remove(ae);
131                    bRemaining.remove(be);
132                }
133            }
134        }
135
136        // Pass 5: symmetric difference. Whatever is left is a flat
137        // missing/extra divergence. Sort by canonical edge order so
138        // emission is deterministic.
139        List<CanonicalLineageEdge> aLeft = new ArrayList<>(aRemaining);
140        aLeft.removeAll(bRemaining);
141        List<CanonicalLineageEdge> bLeft = new ArrayList<>(bRemaining);
142        bLeft.removeAll(aRemaining);
143        Collections.sort(aLeft, CanonicalLineageEdge.ORDER);
144        Collections.sort(bLeft, CanonicalLineageEdge.ORDER);
145        for (CanonicalLineageEdge e : aLeft) {
146            divs.add(new Divergence(DivergenceClass.IR_EXTRA_DEPENDENCY,
147                    e.getOutputName(), edgeDetail(e)));
148        }
149        for (CanonicalLineageEdge e : bLeft) {
150            divs.add(new Divergence(DivergenceClass.IR_MISSING_DEPENDENCY,
151                    e.getOutputName(), edgeDetail(e)));
152        }
153
154        return new DivergenceReport(sqlName, a, b, sortDivergences(divs));
155    }
156
157    private static String detailFor(ProjectorResult r) {
158        StringBuilder sb = new StringBuilder(r.getReason().name());
159        if (r.getDetail() != null && !r.getDetail().isEmpty()) {
160            sb.append(": ").append(r.getDetail());
161        }
162        return sb.toString();
163    }
164
165    private static Set<CanonicalLineageEdge> filterEdgesForCommonOutputs(
166            Set<CanonicalLineageEdge> edges, Set<String> commonOutputs) {
167        // SELECT edges keyed by an output not in both sides → drop (already
168        // accounted for by output-presence pass). FILTER/JOIN edges have null
169        // outputName and always pass through.
170        Set<CanonicalLineageEdge> out = new LinkedHashSet<>();
171        for (CanonicalLineageEdge e : edges) {
172            if (e.getRole() == EdgeRole.SELECT && !commonOutputs.contains(e.getOutputName())) continue;
173            out.add(e);
174        }
175        return out;
176    }
177
178    private static String edgeDetail(CanonicalLineageEdge e) {
179        return e.getRole() + " " + e.getBaseTable() + "." + e.getBaseColumn();
180    }
181
182    private static String bindingKey(CanonicalLineageEdge e) {
183        // Include role so SELECT bindings don't merge with FILTER bindings
184        // for the same base column.
185        return e.getRole().name() + ""
186                + (e.getOutputName() == null ? "" : e.getOutputName()) + ""
187                + e.getBaseColumn();
188    }
189
190    private static String roleAgnosticKey(CanonicalLineageEdge e) {
191        return (e.getOutputName() == null ? "" : e.getOutputName()) + ""
192                + e.getBaseTable() + ""
193                + e.getBaseColumn();
194    }
195
196    private static Map<String, List<CanonicalLineageEdge>> groupBy(
197            Set<CanonicalLineageEdge> edges,
198            java.util.function.Function<CanonicalLineageEdge, String> keyFn) {
199        Map<String, List<CanonicalLineageEdge>> out = new HashMap<>();
200        for (CanonicalLineageEdge e : edges) {
201            out.computeIfAbsent(keyFn.apply(e), k -> new ArrayList<>()).add(e);
202        }
203        return out;
204    }
205
206    private static Set<String> union(Set<String> a, Set<String> b) {
207        Set<String> out = new HashSet<>(a);
208        out.addAll(b);
209        return out;
210    }
211
212    private static List<Divergence> sortDivergences(List<Divergence> divs) {
213        List<Divergence> sorted = new ArrayList<>(divs);
214        Collections.sort(sorted, Divergence.ORDER);
215        return sorted;
216    }
217}