001package gudusoft.gsqlparser.ir.semantic.diff;
002
003import gudusoft.gsqlparser.EDbVendor;
004import org.w3c.dom.Document;
005import org.w3c.dom.Element;
006import org.w3c.dom.Node;
007import org.w3c.dom.NodeList;
008import org.xml.sax.InputSource;
009
010import javax.xml.parsers.DocumentBuilder;
011import javax.xml.parsers.DocumentBuilderFactory;
012import java.io.StringReader;
013import java.util.ArrayDeque;
014import java.util.ArrayList;
015import java.util.Arrays;
016import java.util.Collections;
017import java.util.Deque;
018import java.util.HashMap;
019import java.util.HashSet;
020import java.util.LinkedHashMap;
021import java.util.LinkedHashSet;
022import java.util.List;
023import java.util.Locale;
024import java.util.Map;
025import java.util.Set;
026
027/**
028 * Project the existing {@code DataFlowAnalyzer} XML output into the same
029 * canonical form as the Semantic IR projector.
030 *
031 * <p>The projector consumes only the four XML shapes seen in the captured
032 * 5-SQL corpus baselines (see {@code phase0/golden/*.dlineage.xml}). Any
033 * input that doesn't fit produces a {@link ProjectorResult} carrying an
034 * {@link ProjectorResult.UnsupportedReason}; the harness translates it
035 * into a single {@link DivergenceClass#UNSUPPORTED_BY_DLINEAGE} divergence.
036 *
037 * <p>Aggregate handling — see slice-7 plan §"DlineageXmlProjector". The
038 * COUNT(*) heuristic detects function-resultset fdd sources that include
039 * a {@code column="*"} reference and suppresses SELECT fan-out from that
040 * function. {@code SUM(col)}/{@code AVG(col)}/{@code COUNT(col)} retain
041 * their fan-out.
042 */
043public final class DlineageXmlProjector {
044
045    /**
046     * Lower-cased function names whose presence on the immediate source-side
047     * of an output's fdd marks that output as aggregate. Mirrors the
048     * whitelist in {@code SemanticIRBuilder.AGGREGATE_FUNCTION_NAMES} so that
049     * scalar functions like {@code UPPER}, {@code COALESCE}, or {@code TRIM}
050     * do not cause a false {@link DivergenceClass#AGGREGATION_MISMATCH}
051     * against the IR's {@code OutputColumn.isAggregate()}.
052     *
053     * <p>Slice 30 added {@code mode} (PostgreSQL ordered-set aggregate). See
054     * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} for the matching
055     * window-vs-aggregate discriminator override.
056     */
057    private static final Set<String> AGGREGATE_FUNCTION_NAMES;
058    static {
059        Set<String> s = new HashSet<>(Arrays.asList(
060                "count", "sum", "avg", "min", "max",
061                "stddev", "variance", "var_samp", "var_pop",
062                "stddev_samp", "stddev_pop",
063                "listagg", "string_agg", "group_concat", "array_agg",
064                // Slice 30: PostgreSQL ordered-set aggregate. Mirrors
065                // SemanticIRBuilder.AGGREGATE_FUNCTION_NAMES; see also
066                // ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES below.
067                "mode",
068                // Slice 42: hypothetical-set ordered-set aggregates
069                // (Oracle / MSSQL {@code RANK(100) WITHIN GROUP (ORDER
070                // BY x)} family). On Oracle / MSSQL the dlineage XML for
071                // this shape emits NO {@code clauseType="orderby"} fdr —
072                // the slice-13 windowed discriminator
073                // {@link #isWindowFunctionResultset} returns false and
074                // these names mark the output aggregate=true. The OVER
075                // form on Oracle / MSSQL ({@code RANK() OVER (ORDER BY
076                // x)}) emits {@code clauseType="orderby"} and the
077                // discriminator returns true (windowed) — adding these
078                // names to the aggregate set therefore does not affect
079                // OVER-form classification. The PG direct-attachment
080                // hypothetical-set form ({@code rank(0.5) WITHIN GROUP
081                // (...)}) and PG OVER form share the same XML shape;
082                // both are classified as windowed (rank is NOT in
083                // ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES) — slice 42
084                // does NOT lift PG, the IR builder rejects PG
085                // hypothetical-set so the divergence harness never
086                // sees that case. Probe-confirmed
087                // {@code /tmp/probe42/Probe42.java}.
088                "rank", "dense_rank", "percent_rank", "cume_dist"));
089        AGGREGATE_FUNCTION_NAMES = Collections.unmodifiableSet(s);
090    }
091
092    /**
093     * Function names whose {@code clauseType="orderby"} fdr may be the
094     * WITHIN GROUP order-by rather than a window {@code OVER ORDER BY}.
095     *
096     * <p>Slice 30 introduced {@code mode}. Slice 35 added PostgreSQL
097     * {@code listagg}; slice 36 adds PostgreSQL {@code string_agg}. Probes
098     * show dlineage emits the WITHIN GROUP order-by for both as an fdr with
099     * {@code clauseType="orderby"} structurally identical to LISTAGG (probe
100     * /tmp/ProbeStringAggXml.java verified the XML for aliased and unaliased
101     * STRING_AGG WG: a {@code resultset type="function"} STRING_AGG wrapper,
102     * an {@code fdd effectType="function"} target/source pair for the arg,
103     * and a single {@code fdr clauseType="orderby"} for the WG key — no
104     * {@code clauseType="selectList"} fdr is emitted, so the slice-13
105     * projector otherwise mistakes the orderby fdr for a window discriminator
106     * and reports {@link DivergenceClass#AGGREGATION_MISMATCH}). This set
107     * does NOT blindly short-circuit window classification:
108     * {@link #isWindowFunctionResultset} still returns {@code true} when a
109     * {@code selectList} fdr is present (PARTITION BY / analytic dependency),
110     * and only suppresses the order-by-only shape.
111     *
112     * <p><b>Subset constraint</b>: must be a strict subset of
113     * {@link #AGGREGATE_FUNCTION_NAMES} so the IR-side / projector-side
114     * aggregate-flag invariants stay symmetric. The class-init check below
115     * enforces this.
116     *
117     * <p><b>Why not include {@code percentile_cont} / {@code percentile_disc}</b>:
118     * these have windowed forms in BigQuery / Vertica / Redshift / Oracle /
119     * SQL Server (e.g. {@code PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY x)
120     * OVER (PARTITION BY y)}). Lifting them requires a stronger structural
121     * discriminator. Similarly, hypothetical-set {@code rank} /
122     * {@code dense_rank} remain excluded because their PG XML is structurally
123     * identical to window form {@code RANK() OVER (ORDER BY ...)}.
124     */
125    private static final Set<String> ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES;
126    static {
127        Set<String> s = new HashSet<>();
128        s.add("mode");
129        s.add("listagg");
130        s.add("string_agg");
131        ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES = Collections.unmodifiableSet(s);
132        // Subset assertion: must be enforced at runtime (not via assert,
133        // which can be disabled). A future contributor adding a name to
134        // ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES without also adding it to
135        // AGGREGATE_FUNCTION_NAMES would produce silently-wrong projector
136        // output (the function would be "non-aggregate but
137        // not-windowed-either" — first-hop check skips it entirely).
138        if (!AGGREGATE_FUNCTION_NAMES.containsAll(ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES)) {
139            Set<String> missing = new HashSet<>(ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES);
140            missing.removeAll(AGGREGATE_FUNCTION_NAMES);
141            throw new IllegalStateException(
142                    "ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES must be a subset of "
143                            + "AGGREGATE_FUNCTION_NAMES; missing=" + missing);
144        }
145    }
146
147    /**
148     * Functions whose {@code (*)} argument is a row-counting marker rather
149     * than a real value dependency, so SELECT fan-out from them should be
150     * suppressed (otherwise the dlineage XML's expansion to every joined
151     * column would produce false {@link DivergenceClass#IR_MISSING_DEPENDENCY}
152     * edges relative to the IR's "aggregate, no sources" representation).
153     * Today the only widely-supported example is {@code COUNT(*)}.
154     */
155    private static final Set<String> NULL_ARG_AGGREGATES;
156    static {
157        Set<String> s = new HashSet<>();
158        s.add("count");
159        NULL_ARG_AGGREGATES = Collections.unmodifiableSet(s);
160    }
161
162    /**
163     * Slice 48 — wrapper-function names through which the projector will
164     * descend ONE fdd hop to look for a known ordered-set aggregate
165     * inner.
166     *
167     * <p>Slice 48 introduced {@code lower}. Slice 50 widened the set to
168     * include {@code upper} — the natural case-folding sibling. Slice 51
169     * widened the set further to include {@code concat} — the canonical
170     * multi-argument scalar concatenation wrapper. Slice 52 widened the
171     * set again to include {@code trim} / {@code ltrim} / {@code rtrim} —
172     * the canonical whitespace-trimming scalar wrappers. Slice 53 widened
173     * it to include {@code substring} / {@code substr} — the
174     * three-argument string-extraction wrappers. Slice 54 widened it once
175     * more to include {@code replace} — the canonical three-argument
176     * pattern-replacement wrapper. Slice 55 widens it again to include
177     * {@code lpad} / {@code rpad} — the canonical three-argument string
178     * padding wrappers. Slice 51 probes ({@code /tmp/Probe51.java},
179     * {@code /tmp/Probe51b.java}), slice 52 probe
180     * ({@code /tmp/Probe52.java}), slice 53 probe
181     * ({@code /tmp/Probe53.java}), slice 54 probe (deleted
182     * post-implementation; see {@code Slice54Test} javadoc) and slice 55
183     * probe (transient {@code Slice55LpadRpadProbeTest} inside the Maven
184     * test JVM, deleted post-confirmation; see {@code Slice55Test}
185     * javadoc) confirmed the dlineage XML for {@code CONCAT(...)} /
186     * {@code TRIM(...)} / {@code LTRIM(...)} / {@code RTRIM(...)} /
187     * {@code SUBSTRING(...)} / {@code SUBSTR(...)} / {@code REPLACE(...)}
188     * / {@code LPAD(...)} / {@code RPAD(...)} differs from
189     * {@code LOWER(...)} / {@code UPPER(...)} only in the wrapper
190     * resultset's {@code name} attribute. The literal-string /
191     * character-class / direction-marker / numeric start+length /
192     * pattern+replacement / pad-length / pad-character arguments do NOT
193     * generate extra {@code fdd} edges on the wrapping column
194     * (constants and BOTH/LEADING/TRAILING/FROM/FOR markers do not
195     * produce fdd edges in the dlineage XML), so the structural shape is
196     * byte-identical to the case-folding wrapper shape: one
197     * {@code fdd effectType="select"} from the SELECT-list resultset to
198     * the wrapper column, then one {@code fdd effectType="function"}
199     * from the wrapper column to the inner ordered-set aggregate's
200     * resultset, then the inner aggregate's own argument fdd and
201     * orderby fdr exactly as in the case-folding wrapper shape. The
202     * IR-side classification (aggregate=true via the slice-31 collector
203     * descending into the wrapping function body) and the dlineage-side
204     * aggregate classification lift symmetrically once the wrapper
205     * allowlist contains the new wrapper name.
206     *
207     * <p>The {@code ||} (string-concatenation) operator is a special
208     * non-case: PG / Snowflake parsers fold {@code expr || ' '} into
209     * the projection root WITHOUT emitting a {@code concat} or
210     * {@code ||}-named function-resultset wrapper. The dlineage XML for
211     * {@code LISTAGG WG || '|'} is byte-identical to the unwrapped
212     * {@code LISTAGG WG} form — no descent is needed because the
213     * existing first-hop aggregate detection
214     * (slice-30 / 35 / 36 carve-outs in
215     * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES}) already
216     * classifies that direct path as aggregate=true. Slice 51 / 52 / 53
217     * / 54 therefore do NOT add {@code ||} to this allowlist; the
218     * wrapper widen is only required for explicit named function calls.
219     *
220     * <p>{@code TRIM(BOTH ' ' FROM x)} (PG / Snowflake special grammar)
221     * is a syntactic surface only — slice 52 probe confirmed the
222     * dlineage XML still emits a {@code resultset type="function"}
223     * named {@code TRIM} with the same one-fdd-edge connection chain
224     * as canonical {@code TRIM(x)}. The descent therefore fires for
225     * both forms unchanged. Likewise, the PG SQL-standard
226     * {@code SUBSTRING(x FROM 1 FOR 5)} grammar form (with
227     * {@code FROM}/{@code FOR} keyword markers) emits the same chain
228     * shape as the canonical {@code SUBSTRING(x, 1, 5)} comma form —
229     * slice 53 probe confirmed parity. The two-argument
230     * {@code SUBSTRING(x, 1)} call (no length) emits the same chain
231     * because constants do not produce fdd edges. Slice 54
232     * {@code REPLACE(x, pattern, replacement)} probe confirmed the
233     * pattern and replacement string literals likewise produce no fdd
234     * edges on the wrapping {@code REPLACE.REPLACE} column, so the
235     * descent fires identically regardless of the literal values.
236     * Slice 55 {@code LPAD(x, length, padchar)} /
237     * {@code RPAD(x, length, padchar)} probe confirmed the integer
238     * pad-length and literal-string pad-character arguments likewise
239     * produce no fdd edges on the wrapping {@code LPAD.LPAD} /
240     * {@code RPAD.RPAD} column, so the descent fires identically
241     * regardless of the pad-length / pad-character values.
242     *
243     * <p>Slice 56 {@code LEFT(x, length)} / {@code RIGHT(x, length)}
244     * probe confirmed the integer length argument (2nd arg) likewise
245     * produces no fdd edges on the wrapping {@code LEFT.LEFT} /
246     * {@code RIGHT.RIGHT} column — only the inner ordered-set
247     * aggregate's resultset is the fdd source — so the descent fires
248     * identically regardless of the literal length value.
249     *
250     * <p>Slice 57 {@code REGEXP_REPLACE(x, pattern, replacement [,
251     * flags] [, position, occurrence, parameters])} probe (transient
252     * test inside the Maven test JVM, deleted post-confirmation)
253     * confirmed the regex pattern, replacement string, optional
254     * flag, and Snowflake positional / occurrence / parameter
255     * literals likewise produce no fdd edges on the wrapping
256     * {@code REGEXP_REPLACE.REGEXP_REPLACE} column — only the inner
257     * ordered-set aggregate's resultset is the fdd source — so the
258     * descent fires identically regardless of the constant
259     * pattern / replacement / flag / positional values. The PG
260     * 3-arg form, PG 4-arg flag form, SF 3-arg form, and SF 6-arg
261     * (position / occurrence / parameters) form all produce the
262     * same chain shape as slice-49 {@code LOWER(...)}.
263     *
264     * <p>The wrapper allowlist is intentionally NOT vendor-gated:
265     * PostgreSQL and Snowflake are the validated scope per the
266     * slice-57 probe, but any dialect whose dlineage XML emits a
267     * {@code REGEXP_REPLACE}-named function-resultset wrapper around
268     * a {@code mode} / {@code listagg} / {@code string_agg} inner
269     * with a non-window fdr (typically the WITHIN GROUP order-by
270     * fdr admitted by the slice-30 / 35 / 36 carve-outs in
271     * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES}) will lift
272     * transparently. This matches the slice-48 → 56 posture.
273     *
274     * <p>Other scalar wrappers ({@code COALESCE},
275     * {@code INITCAP}, {@code REVERSE}, {@code LENGTH} /
276     * {@code CHAR_LENGTH}, arithmetic) stay out of scope. Each
277     * carries a different argument-shape signature in the dlineage
278     * XML and would need an independent probe pass before being
279     * widened — slice 57 stays narrowly focused on the canonical
280     * case-folding / concat / trim / substring / replace / lpad /
281     * rpad / left / right / regexp_replace family.
282     */
283    private static final Set<String> SLICE48_DESCENT_WRAPPER_NAMES;
284    static {
285        Set<String> s = new HashSet<>();
286        s.add("lower");
287        s.add("upper");
288        s.add("concat");
289        s.add("trim");
290        s.add("ltrim");
291        s.add("rtrim");
292        s.add("substring");
293        s.add("substr");
294        s.add("replace");
295        s.add("lpad");
296        s.add("rpad");
297        s.add("left");
298        s.add("right");
299        s.add("regexp_replace");
300        SLICE48_DESCENT_WRAPPER_NAMES = Collections.unmodifiableSet(s);
301    }
302
303    /**
304     * Slice 48 — inner aggregate names that the {@link
305     * #SLICE48_DESCENT_WRAPPER_NAMES} descent is allowed to lift.
306     *
307     * <p>Slice 48 introduced {@code mode}. Slice 49 widens the set to
308     * also include {@code listagg} / {@code string_agg} — the natural
309     * sibling lift along the same {@code LOWER(...) WITHIN GROUP (...)}
310     * embedded form. Probe {@code /tmp/Probe49.java} confirmed PG /
311     * Snowflake {@code LOWER(LISTAGG(x.id, ',') WG (ORDER BY x.region))}
312     * dlineage XML matches the slice-48 descent shape exactly: a
313     * stacked {@code RS-1.s} → {@code LOWER} → {@code LISTAGG} chain of
314     * {@code resultset type="function"} elements linked by
315     * {@code fdd effectType="function"} edges, plus a single
316     * {@code fdr clauseType="orderby"} on the inner resultset for the
317     * WG ORDER BY column. The slice-30 / 35 / 36 carve-outs in
318     * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} include
319     * {@code listagg} and {@code string_agg}, so the defense-in-depth
320     * {@link #isWindowFunctionResultset} check at the bottom of the
321     * descent (which would otherwise mis-classify the orderby fdr as a
322     * window discriminator) returns false for these inner names — the
323     * descent is therefore trivially safe.
324     *
325     * <p>The extra {@code fdd effectType="function"} from
326     * {@code LISTAGG.LISTAGG} → base-table column (the function
327     * argument) is structurally orthogonal to the descent: the descent
328     * only flips {@code result.aggregate = true} and never alters the
329     * SELECT-terminal BFS, so the existing per-output SELECT lineage
330     * edges are unchanged.
331     *
332     * <p>Plain aggregates ({@code SUM} / {@code COUNT} / etc.) inside
333     * scalar wrappers stay outside the descent (slice-48 boundary
334     * preserved): they are NOT in
335     * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} so the descent's
336     * {@code isWindowFunctionResultset} carve-out would NOT apply if
337     * they were added here. Lifting them needs a separate slice.
338     * Hypothetical-set / {@code percentile_cont} / {@code percentile_disc}
339     * also stay out for the same reason — slice 47 / 48 probes
340     * confirmed their dlineage XML is structurally indistinguishable
341     * from window-OVER form for the inner resultset, so widening the
342     * descent allowlist would mis-classify the OVER form as aggregate.
343     *
344     * <p>Constraint: must be a strict subset of
345     * {@link #AGGREGATE_FUNCTION_NAMES}; the runtime check below
346     * mirrors the same constraint on
347     * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES}.
348     */
349    private static final Set<String> SLICE48_DESCENT_INNER_NAMES;
350    static {
351        Set<String> s = new HashSet<>();
352        s.add("mode");
353        s.add("listagg");
354        s.add("string_agg");
355        SLICE48_DESCENT_INNER_NAMES = Collections.unmodifiableSet(s);
356        if (!AGGREGATE_FUNCTION_NAMES.containsAll(SLICE48_DESCENT_INNER_NAMES)) {
357            Set<String> missing = new HashSet<>(SLICE48_DESCENT_INNER_NAMES);
358            missing.removeAll(AGGREGATE_FUNCTION_NAMES);
359            throw new IllegalStateException(
360                    "SLICE48_DESCENT_INNER_NAMES must be a subset of "
361                            + "AGGREGATE_FUNCTION_NAMES; missing=" + missing);
362        }
363    }
364
365    /** Policy for {@link #resolveBaseSources(String, Document, FanoutPolicy)}. */
366    public enum FanoutPolicy {
367        /** SELECT projection: function fdd whose argument is `*` or constant suppresses fan-out. */
368        SUPPRESS_AGGREGATE_NULL_ARG,
369        /** Row-influence: every fdd terminal kept regardless of intervening function nodes. */
370        INCLUDE_ALL
371    }
372
373    private DlineageXmlProjector() {}
374
375    /**
376     * Legacy entry point — equivalent to {@link #project(String, EDbVendor)}
377     * with {@code vendor=null}. Retained for callers that don't know or
378     * don't care about the source dialect (Phase-0 harness, divergence
379     * goldens, slice-7 comparison harness).
380     */
381    public static ProjectorResult project(String xml) {
382        return project(xml, null);
383    }
384
385    /**
386     * Slice 43 — vendor-scoped projection entry point. The {@code vendor}
387     * parameter pins the per-vendor API contract so a future cautious
388     * projector override can apply per-vendor name-whitelist exceptions
389     * (e.g. PG hypothetical-set {@code rank} / {@code dense_rank} /
390     * {@code percent_rank} / {@code cume_dist} where the WG form's XML
391     * is structurally indistinguishable from the OVER form). Slice 43
392     * deliberately makes <b>no behavior change</b> in the projector —
393     * today the {@link #AGGREGATE_FUNCTION_NAMES} /
394     * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} sets are vendor-
395     * agnostic and the {@code vendor} argument is recorded but not
396     * consulted. The byte-for-byte equivalence is enforced by
397     * {@code Slice43Test.projectorVendorScopeOverloadAccepts*Vendor}.
398     *
399     * <p>Future slices that wire vendor-specific overrides MUST keep
400     * this equivalence for callers that pass {@code vendor=null} (the
401     * legacy {@link #project(String)} delegates here with {@code null}).
402     * The follow-up slice for PG hypothetical-set top-level admission
403     * is the first expected consumer of the {@code vendor} parameter.
404     *
405     * @param xml dlineage XML output, must not be null/empty
406     * @param vendor source SQL dialect; may be {@code null} when the
407     *               caller has no vendor information
408     */
409    public static ProjectorResult project(String xml, EDbVendor vendor) {
410        if (xml == null || xml.isEmpty()) {
411            return ProjectorResult.unsupported(
412                    ProjectorResult.UnsupportedReason.MALFORMED_XML, "empty xml");
413        }
414        Document doc;
415        try {
416            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
417            dbf.setNamespaceAware(false);
418            // Defensive XML hardening: disable DTDs, external entities, and
419            // network access by default. The projector is a public API in
420            // src/main and may be called with untrusted input; matching the
421            // OWASP XXE-prevention recipe for JDK DocumentBuilderFactory.
422            applyXmlSecurity(dbf);
423            DocumentBuilder db = dbf.newDocumentBuilder();
424            // Belt-and-braces: even with the features above, refuse to
425            // resolve any external entity that slips through.
426            db.setEntityResolver((publicId, systemId) ->
427                    new InputSource(new java.io.StringReader("")));
428            db.setErrorHandler(new org.xml.sax.ErrorHandler() {
429                @Override public void warning(org.xml.sax.SAXParseException e) {}
430                @Override public void error(org.xml.sax.SAXParseException e) throws org.xml.sax.SAXException { throw e; }
431                @Override public void fatalError(org.xml.sax.SAXParseException e) throws org.xml.sax.SAXException { throw e; }
432            });
433            doc = db.parse(new InputSource(new StringReader(xml)));
434        } catch (Exception ex) {
435            return ProjectorResult.unsupported(
436                    ProjectorResult.UnsupportedReason.MALFORMED_XML, ex.getMessage());
437        }
438
439        // Identify the final resultset: exactly one <resultset> of a
440        // terminal-candidate type that is never referenced as the parent
441        // of any fdd OR fdr <source>. Slice 12 lifted set-op programs, so
442        // {@code select_union}, {@code select_intersect}, {@code select_minus}
443        // are also valid terminal types — dlineage emits one
444        // {@code RESULT_OF_<OP>-N} resultset of the matching set-op type
445        // whose columns each have an fdd edge with N sources (one per
446        // branch's per-position {@code select_list} column). Slice 23
447        // widened the parent-id collection to include fdr sources too,
448        // so the inner EXISTS-body resultset (referenced only via
449        // fdr clause="on") is correctly excluded from terminal candidates.
450        List<Element> terminalCandidates = new ArrayList<>();
451        NodeList resultsets = doc.getElementsByTagName("resultset");
452        for (int i = 0; i < resultsets.getLength(); i++) {
453            Element e = (Element) resultsets.item(i);
454            if (isTerminalResultsetType(e.getAttribute("type"))) {
455                terminalCandidates.add(e);
456            }
457        }
458        if (terminalCandidates.isEmpty()) {
459            return ProjectorResult.unsupported(
460                    ProjectorResult.UnsupportedReason.NO_RELATIONSHIPS,
461                    "no terminal-candidate <resultset> found "
462                            + "(select_list / select_union / select_intersect / "
463                            + "select_minus / select_except)");
464        }
465        Set<String> sourceParentIds = collectFddOrFdrSourceParentIds(doc);
466        List<Element> terminals = new ArrayList<>();
467        for (Element rs : terminalCandidates) {
468            if (!sourceParentIds.contains(rs.getAttribute("id"))) {
469                terminals.add(rs);
470            }
471        }
472        if (terminals.size() != 1) {
473            return ProjectorResult.unsupported(
474                    ProjectorResult.UnsupportedReason.MULTIPLE_TERMINAL_SELECTS,
475                    "expected exactly one terminal resultset, got " + terminals.size());
476        }
477        Element finalResultset = terminals.get(0);
478
479        Set<CanonicalLineageEdge> edges = new LinkedHashSet<>();
480        Set<String> outputNames = new LinkedHashSet<>();
481        Map<String, Boolean> aggregateByOutput = new LinkedHashMap<>();
482
483        // 1) SELECT projection per output column of the final resultset.
484        // Skip system pseudo-columns (e.g. RelationRows). They are the implicit
485        // fdr-target for row-influence, never user-visible projected columns.
486        NodeList finalCols = finalResultset.getElementsByTagName("column");
487        for (int i = 0; i < finalCols.getLength(); i++) {
488            Element col = (Element) finalCols.item(i);
489            // Only direct children of finalResultset count — getElementsByTagName
490            // returns descendants too in theory, but our XML schema has columns
491            // as direct children of resultset/table, so this is fine in practice.
492            if (col.getParentNode() != finalResultset) continue;
493            if ("system".equals(col.getAttribute("source"))) continue;
494
495            String outName = col.getAttribute("name").toLowerCase(Locale.ROOT);
496            outputNames.add(outName);
497            // Default false; flipped to true if BFS visits a function resultset.
498            aggregateByOutput.put(outName, false);
499
500            BfsResult br = projectColumn(col.getAttribute("id"), doc,
501                    FanoutPolicy.SUPPRESS_AGGREGATE_NULL_ARG);
502            if (br.aggregate) {
503                aggregateByOutput.put(outName, true);
504            }
505            for (TableColumn tc : br.terminals) {
506                edges.add(new CanonicalLineageEdge(EdgeRole.SELECT, outName, tc.table, tc.column));
507            }
508        }
509
510        // 2) Row-influence from fdr edges whose target is a column of the final
511        // resultset (typically the system RelationRows column).
512        String finalRsId = finalResultset.getAttribute("id");
513        // Walk the fdr graph: follow chains that target the final resultset's
514        // RelationRows, including indirect ones (e.g. RS-1.RelationRows -> sub.RelationRows
515        // in 04_nested_subquery). For each fdr source with clauseType where/joinCondition
516        // (or clause where/on), resolve to base columns via fdd closure.
517        NodeList fdrs = doc.getElementsByTagName("relationship");
518        // Collect fdr edges keyed by the parent_id of the target so we can
519        // chain RelationRows-targeting fdrs.
520        Map<String, List<Element>> fdrsByTargetParentId = new HashMap<>();
521        for (int i = 0; i < fdrs.getLength(); i++) {
522            Element rel = (Element) fdrs.item(i);
523            if (!"fdr".equals(rel.getAttribute("type"))) continue;
524            Element target = firstChildElement(rel, "target");
525            if (target == null) continue;
526            fdrsByTargetParentId.computeIfAbsent(target.getAttribute("parent_id"), k -> new ArrayList<>())
527                    .add(rel);
528        }
529
530        Deque<String> rowSetParents = new ArrayDeque<>();
531        Set<String> visitedRowSetParents = new HashSet<>();
532        rowSetParents.add(finalRsId);
533        visitedRowSetParents.add(finalRsId);
534        while (!rowSetParents.isEmpty()) {
535            String parentId = rowSetParents.removeFirst();
536            List<Element> rels = fdrsByTargetParentId.get(parentId);
537            if (rels == null) continue;
538            for (Element rel : rels) {
539                Element target = firstChildElement(rel, "target");
540                if (target == null) continue;
541                // Only follow fdrs whose target is a system column (RelationRows).
542                // Other fdrs (e.g. effectType="function" with target on a function
543                // resultset) carry aggregation semantics, not row-influence.
544                if (!"system".equals(target.getAttribute("source"))) continue;
545                // Process each <source> on this fdr.
546                NodeList sources = rel.getElementsByTagName("source");
547                for (int si = 0; si < sources.getLength(); si++) {
548                    Element src = (Element) sources.item(si);
549                    if (src.getParentNode() != rel) continue;
550                    String clauseType = src.getAttribute("clauseType");
551                    String relClause = rel.getAttribute("clause");
552                    EdgeRole role = clauseTypeToRole(clauseType, relClause);
553                    if (role == null) {
554                        // Source is itself a system RelationRows pointing at
555                        // another resultset — chain through it.
556                        if ("system".equals(src.getAttribute("source"))) {
557                            String nextParent = src.getAttribute("parent_id");
558                            if (visitedRowSetParents.add(nextParent)) {
559                                rowSetParents.add(nextParent);
560                            }
561                        }
562                        continue;
563                    }
564                    // Source identifies a column on a base table or a non-base
565                    // resultset. Resolve via fdd closure with INCLUDE_ALL so
566                    // function-arg suppression doesn't skew row influence.
567                    String srcParentId = src.getAttribute("parent_id");
568                    Element parent = elementById(doc, srcParentId);
569                    if (parent == null) continue;
570                    if (isBaseTable(parent)) {
571                        // Skip constantTable for row influence too.
572                        if ("constantTable".equals(parent.getAttribute("type"))) continue;
573                        String col = src.getAttribute("column");
574                        if ("*".equals(col)) continue;
575                        edges.add(new CanonicalLineageEdge(role, null,
576                                parent.getAttribute("name").toLowerCase(Locale.ROOT),
577                                col.toLowerCase(Locale.ROOT)));
578                    } else if (isResultset(parent)) {
579                        String srcId = src.getAttribute("id");
580                        BfsResult inner = resolveBaseSources(srcId, doc, FanoutPolicy.INCLUDE_ALL);
581                        for (TableColumn tc : inner.terminals) {
582                            edges.add(new CanonicalLineageEdge(role, null, tc.table, tc.column));
583                        }
584                    } // else: constantTable / function — already filtered
585                }
586            }
587        }
588
589        return ProjectorResult.ok(
590                new CanonicalLineageModel(edges, outputNames, aggregateByOutput));
591    }
592
593    /**
594     * Project one final-resultset output column. The aggregate flag is set
595     * <b>only when an immediate fdd source of this column lives on a
596     * function resultset whose function name is in
597     * {@link #AGGREGATE_FUNCTION_NAMES}</b>. This matches Semantic IR's
598     * per-output (non-transitive) aggregate semantics — see slice-6
599     * design notes on flag locality. The BFS itself walks transitively
600     * to collect base-table SELECT terminals.
601     */
602    private static BfsResult projectColumn(String columnId, Document doc, FanoutPolicy policy) {
603        BfsResult result = new BfsResult();
604
605        // First-hop aggregate detection. Local rule: this output is aggregate
606        // iff *any* immediate fdd source lives on a function resultset whose
607        // function name is in AGGREGATE_FUNCTION_NAMES AND the function
608        // resultset is NOT a window function (slice 13). We must scan all
609        // immediate function sources, not stop at the first one — a fdd whose
610        // sources include both `UPPER(...)` and `SUM(...)` columns must still
611        // mark the output aggregate.
612        //
613        // Slice 13 window-vs-aggregate discriminator: a function resultset is
614        // windowed iff at least one fdr targeting it carries a source with
615        // clauseType="selectList" (PARTITION BY) or clauseType="orderby"
616        // (OVER ORDER BY). Plain aggregates have only the RelationRows fdr;
617        // window functions add the PARTITION BY / OVER ORDER BY fdrs. Empty
618        // OVER () is not discriminable in dlineage XML (looks identical to a
619        // plain aggregate); the IR builder rejects empty OVER () to avoid
620        // manufacturing AGGREGATION_MISMATCH divergences.
621        outer:
622        for (Element fdd : findFddByTarget(doc, columnId)) {
623            for (Element fnRs : immediateFunctionSourceResultsets(fdd, doc)) {
624                String fnName = fnRs.getAttribute("name");
625                if (fnName == null || fnName.isEmpty()) continue;
626                if (!AGGREGATE_FUNCTION_NAMES.contains(fnName.toLowerCase(Locale.ROOT))) continue;
627                if (isWindowFunctionResultset(fnRs, doc)) continue;
628                result.aggregate = true;
629                break outer;
630            }
631        }
632
633        // Slice 48 — single-step descent through a known scalar wrapper.
634        // The first-hop check above mirrors the IR builder's per-output
635        // (non-transitive) aggregate semantics, but for a small set of
636        // case-folding scalar wrappers used to format the output of an
637        // ordered-set aggregate, the IR side classifies the projection
638        // as aggregate via the descendant ordered-set name. The
639        // projector must mirror that classification or the divergence
640        // harness reports a pre-existing
641        // {@link DivergenceClass#AGGREGATION_MISMATCH}.
642        //
643        // Lift surface as of slice 57:
644        //   wrapper-name ∈ SLICE48_DESCENT_WRAPPER_NAMES
645        //                = {lower, upper, concat, trim, ltrim, rtrim,
646        //                   substring, substr, replace, lpad, rpad,
647        //                   left, right, regexp_replace}
648        //   ∧ inner-name ∈ SLICE48_DESCENT_INNER_NAMES
649        //                = {mode, listagg, string_agg}
650        //   ∧ !isWindowFunctionResultset(inner).
651        //
652        // History:
653        //   - slice 48 introduced the descent with wrapper={lower},
654        //     inner={mode}.
655        //   - slice 49 widened inner to {mode, listagg, string_agg}.
656        //   - slice 50 widened wrapper to {lower, upper}.
657        //   - slice 51 widened wrapper to {lower, upper, concat}.
658        //   - slice 52 widened wrapper to
659        //     {lower, upper, concat, trim, ltrim, rtrim}.
660        //   - slice 53 widened wrapper to
661        //     {lower, upper, concat, trim, ltrim, rtrim, substring,
662        //      substr}.
663        //   - slice 54 widened wrapper to
664        //     {lower, upper, concat, trim, ltrim, rtrim, substring,
665        //      substr, replace}.
666        //   - slice 55 widened wrapper to
667        //     {lower, upper, concat, trim, ltrim, rtrim, substring,
668        //      substr, replace, lpad, rpad}.
669        //   - slice 56 widened wrapper to
670        //     {lower, upper, concat, trim, ltrim, rtrim, substring,
671        //      substr, replace, lpad, rpad, left, right}.
672        //   - slice 57 widens wrapper to
673        //     {lower, upper, concat, trim, ltrim, rtrim, substring,
674        //      substr, replace, lpad, rpad, left, right,
675        //      regexp_replace}.
676        // The {@code ||} string-concatenation operator does NOT emit a
677        // wrapper resultset on PG / Snowflake (the parser folds
678        // {@code expr || ' '} into the projection root directly), so no
679        // descent is needed for that shape — the existing first-hop
680        // aggregate detection already handles it. Other scalar wrappers
681        // ({@code COALESCE}, {@code INITCAP}, {@code REVERSE},
682        // {@code LENGTH} / {@code CHAR_LENGTH}, arithmetic) stay
683        // outside the descent — their argument-shape signatures
684        // differ and require independent probes.
685        //
686        // The descent is also one-level only — {@code LOWER(LOWER(mode WG))}
687        // remains divergent because the wrapper's immediate function
688        // source is {@code lower}, not {@code mode}. This keeps the
689        // descent O(immediate-fdds) and avoids any risk of
690        // double-counting through arbitrarily deep wrappings.
691        if (!result.aggregate) {
692            // The first-hop loop above scans the column's fdds for an
693            // immediate function-resultset source whose own column was
694            // the projection's value (e.g. {@code RS-1.s} fdd-source
695            // {@code LOWER.LOWER}). To descend one level we walk through
696            // each scalar-wrapper source's <i>column</i> and look at
697            // that column's own fdd targets — the wrapper-internal fdd
698            // (e.g. fdd targeting {@code LOWER.LOWER} with source
699            // {@code mode.mode}). The {@code findFddByTarget} helper
700            // matches on the target column's {@code id}, NOT the
701            // resultset id, so we must look up the wrapper's child
702            // column id (slice-48 descent's distinguishing detail vs
703            // the existing first-hop check).
704            outerDescent:
705            for (Element fdd : findFddByTarget(doc, columnId)) {
706                NodeList sources = fdd.getElementsByTagName("source");
707                for (int si = 0; si < sources.getLength(); si++) {
708                    Element src = (Element) sources.item(si);
709                    if (src.getParentNode() != fdd) continue;
710                    String wrapperParentId = src.getAttribute("parent_id");
711                    Element wrapperRs = elementById(doc, wrapperParentId);
712                    if (wrapperRs == null
713                            || !isResultset(wrapperRs)
714                            || !"function".equals(wrapperRs.getAttribute("type"))) continue;
715                    String wrapperName = wrapperRs.getAttribute("name");
716                    if (wrapperName == null) continue;
717                    if (!SLICE48_DESCENT_WRAPPER_NAMES.contains(
718                            wrapperName.toLowerCase(Locale.ROOT))) continue;
719                    String wrapperColumnId = src.getAttribute("id");
720                    if (wrapperColumnId == null || wrapperColumnId.isEmpty()) continue;
721                    for (Element wrapperFdd : findFddByTarget(doc, wrapperColumnId)) {
722                        for (Element innerRs : immediateFunctionSourceResultsets(wrapperFdd, doc)) {
723                            String innerName = innerRs.getAttribute("name");
724                            if (innerName == null) continue;
725                            if (!SLICE48_DESCENT_INNER_NAMES.contains(
726                                    innerName.toLowerCase(Locale.ROOT))) continue;
727                            // Defense-in-depth: even though `mode` is in
728                            // ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES (so
729                            // an `orderby`-only fdr is not classified as
730                            // windowed), a `selectList` fdr (PARTITION
731                            // BY) on the inner resultset still indicates
732                            // the OVER form. Skip those PARTITION-BY
733                            // windowed inners so the slice-48 descent does
734                            // not lift that OVER projection. The known
735                            // OVER-only-ORDER-BY carve-out remains
736                            // classified as plain aggregate on the
737                            // projector side; IR rejects that form, and
738                            // Slice48Test documents the limitation.
739                            if (isWindowFunctionResultset(innerRs, doc)) continue;
740                            result.aggregate = true;
741                            break outerDescent;
742                        }
743                    }
744                }
745            }
746        }
747
748        // BFS for SELECT terminals. The aggregate flag is no longer touched
749        // inside the BFS; the only thing the function nodes still control is
750        // SELECT-fan-out suppression for COUNT(*)-shaped calls.
751        Set<String> visited = new HashSet<>();
752        Deque<String> q = new ArrayDeque<>();
753        q.add(columnId);
754        visited.add(columnId);
755        while (!q.isEmpty()) {
756            String cur = q.removeFirst();
757            for (Element fdd : findFddByTarget(doc, cur)) {
758                Element targetEl = firstChildElement(fdd, "target");
759                if (targetEl == null) continue;
760                Element targetParent = elementById(doc, targetEl.getAttribute("parent_id"));
761                boolean targetIsFunction = targetParent != null
762                        && isResultset(targetParent)
763                        && "function".equals(targetParent.getAttribute("type"));
764                if (targetIsFunction
765                        && policy == FanoutPolicy.SUPPRESS_AGGREGATE_NULL_ARG
766                        && shouldSuppressFunctionFanout(fdd, targetParent)) {
767                    continue;
768                }
769                NodeList sources = fdd.getElementsByTagName("source");
770                for (int si = 0; si < sources.getLength(); si++) {
771                    Element src = (Element) sources.item(si);
772                    if (src.getParentNode() != fdd) continue;
773                    String col = src.getAttribute("column");
774                    if ("*".equals(col)) continue;
775                    String parentId = src.getAttribute("parent_id");
776                    Element parent = elementById(doc, parentId);
777                    if (parent == null) continue;
778                    if (isBaseTable(parent)) {
779                        if ("constantTable".equals(parent.getAttribute("type"))) continue;
780                        result.terminals.add(new TableColumn(
781                                parent.getAttribute("name").toLowerCase(Locale.ROOT),
782                                col.toLowerCase(Locale.ROOT)));
783                    } else if (isResultset(parent)) {
784                        String nextId = src.getAttribute("id");
785                        if (visited.add(nextId)) {
786                            q.add(nextId);
787                        }
788                    }
789                    // constantTable: skipped already; function resultset
790                    // descent is handled when nextId is dequeued and its fdd
791                    // is matched above.
792                }
793            }
794        }
795        return result;
796    }
797
798    /**
799     * Public-ish helper used by the row-influence path. Same BFS as
800     * {@link #projectColumn} but the aggregate flag is meaningless here.
801     */
802    static BfsResult resolveBaseSources(String columnId, Document doc, FanoutPolicy policy) {
803        return projectColumn(columnId, doc, policy);
804    }
805
806    /**
807     * Return every function-resultset name that appears as the parent of an
808     * immediate {@code <source>} on this fdd. Multiple are possible when an
809     * expression projects something like {@code UPPER(name) || SUM(salary)}
810     * and dlineage produces sources from both function nodes.
811     */
812    private static List<String> immediateFunctionSourceNames(Element fdd, Document doc) {
813        List<String> names = new ArrayList<>();
814        NodeList sources = fdd.getElementsByTagName("source");
815        for (int si = 0; si < sources.getLength(); si++) {
816            Element src = (Element) sources.item(si);
817            if (src.getParentNode() != fdd) continue;
818            Element parent = elementById(doc, src.getAttribute("parent_id"));
819            if (parent != null && isResultset(parent)
820                    && "function".equals(parent.getAttribute("type"))) {
821                String name = parent.getAttribute("name");
822                if (name != null && !name.isEmpty()) names.add(name);
823            }
824        }
825        return names;
826    }
827
828    /**
829     * Slice 13: parallel to {@link #immediateFunctionSourceNames} but
830     * returns the function-resultset {@link Element}s themselves so the
831     * caller can inspect them with {@link #isWindowFunctionResultset}.
832     */
833    private static List<Element> immediateFunctionSourceResultsets(Element fdd, Document doc) {
834        List<Element> els = new ArrayList<>();
835        NodeList sources = fdd.getElementsByTagName("source");
836        for (int si = 0; si < sources.getLength(); si++) {
837            Element src = (Element) sources.item(si);
838            if (src.getParentNode() != fdd) continue;
839            Element parent = elementById(doc, src.getAttribute("parent_id"));
840            if (parent != null && isResultset(parent)
841                    && "function".equals(parent.getAttribute("type"))) {
842                els.add(parent);
843            }
844        }
845        return els;
846    }
847
848    /**
849     * Slice 13: window-vs-aggregate discriminator. A function resultset is
850     * windowed iff it has at least one {@code fdr} edge whose target's
851     * {@code parent_id} equals the function resultset's id AND whose source
852     * carries {@code clauseType="selectList"} (PARTITION BY) or
853     * {@code clauseType="orderby"} (OVER ORDER BY). Plain (non-windowed)
854     * aggregates have only the RelationRows fdr; the addition of PARTITION
855     * BY / OVER ORDER BY fdrs is dlineage's way of recording the analytic
856     * dependencies of a windowed call.
857     *
858     * <p>Empty {@code OVER ()} window functions are NOT discriminable here
859     * (their XML is byte-identical to a plain aggregate). Slice 13's IR
860     * builder rejects empty {@code OVER ()} so the divergence harness
861     * never sees that case in practice.
862     *
863     * <p>Probed against Oracle (slice-13 corpus); cross-vendor parity
864     * (PostgreSQL, BigQuery, SparkSQL) is unproven and deferred.
865     *
866     * <p>Slice 35 override: function names in
867     * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} suppress an
868     * order-by-only discriminator. PostgreSQL emits a
869     * {@code clauseType="orderby"} fdr for the WITHIN GROUP ORDER BY of
870     * {@code LISTAGG(... ) WITHIN GROUP (...)}, which the slice-13
871     * discriminator would otherwise mis-classify as windowed. A
872     * {@code selectList} fdr still wins and keeps analytic/window shapes
873     * classified as windowed.
874     */
875    private static boolean isWindowFunctionResultset(Element fnRs, Document doc) {
876        if (fnRs == null) return false;
877        String fnName = fnRs.getAttribute("name");
878        boolean orderByWithinGroupCandidate = fnName != null
879                && ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES.contains(
880                        fnName.toLowerCase(Locale.ROOT));
881        String fnRsId = fnRs.getAttribute("id");
882        if (fnRsId == null || fnRsId.isEmpty()) return false;
883        boolean sawOrderByDiscriminator = false;
884        NodeList rels = doc.getElementsByTagName("relationship");
885        for (int i = 0; i < rels.getLength(); i++) {
886            Element rel = (Element) rels.item(i);
887            if (!"fdr".equals(rel.getAttribute("type"))) continue;
888            Element target = firstChildElement(rel, "target");
889            if (target == null) continue;
890            if (!fnRsId.equals(target.getAttribute("parent_id"))) continue;
891            NodeList sources = rel.getElementsByTagName("source");
892            for (int si = 0; si < sources.getLength(); si++) {
893                Element src = (Element) sources.item(si);
894                if (src.getParentNode() != rel) continue;
895                String ct = src.getAttribute("clauseType");
896                if ("selectList".equals(ct)) return true;
897                if ("orderby".equals(ct)) sawOrderByDiscriminator = true;
898            }
899        }
900        return sawOrderByDiscriminator && !orderByWithinGroupCandidate;
901    }
902
903    /**
904     * Decide whether the SELECT fan-out from a function-targeted fdd should
905     * be suppressed. Two conditions must hold:
906     *
907     * <ol>
908     *   <li>The function name is in {@link #NULL_ARG_AGGREGATES}. Today
909     *       only {@code COUNT}; this prevents a hypothetical
910     *       {@code SUM(t.*)} (some dialects allow it) from silently losing
911     *       lineage.</li>
912     *   <li>Either at least one source has {@code column="*"} (the
913     *       {@code COUNT(*)} shape) or every source is a constant-table
914     *       column ({@code COUNT(1)} / {@code COUNT(literal)}).</li>
915     * </ol>
916     */
917    private static boolean shouldSuppressFunctionFanout(Element functionFdd, Element functionResultset) {
918        if (functionResultset == null) return false;
919        String fnName = functionResultset.getAttribute("name");
920        if (fnName == null || fnName.isEmpty()) return false;
921        if (!NULL_ARG_AGGREGATES.contains(fnName.toLowerCase(Locale.ROOT))) return false;
922
923        boolean anyStar = false;
924        boolean anyNonConstant = false;
925        boolean anySource = false;
926        NodeList sources = functionFdd.getElementsByTagName("source");
927        for (int si = 0; si < sources.getLength(); si++) {
928            Element src = (Element) sources.item(si);
929            if (src.getParentNode() != functionFdd) continue;
930            anySource = true;
931            if ("*".equals(src.getAttribute("column"))) {
932                anyStar = true;
933                continue;
934            }
935            String parentId = src.getAttribute("parent_id");
936            Element parent = elementById(functionFdd.getOwnerDocument(), parentId);
937            if (parent != null && isBaseTable(parent)
938                    && "constantTable".equals(parent.getAttribute("type"))) {
939                continue;
940            }
941            anyNonConstant = true;
942        }
943        if (!anySource) return false;
944        if (anyStar) return true;
945        return !anyNonConstant;
946    }
947
948    private static List<Element> findFddByTarget(Document doc, String columnId) {
949        List<Element> out = new ArrayList<>();
950        NodeList rels = doc.getElementsByTagName("relationship");
951        for (int i = 0; i < rels.getLength(); i++) {
952            Element rel = (Element) rels.item(i);
953            if (!"fdd".equals(rel.getAttribute("type"))) continue;
954            Element target = firstChildElement(rel, "target");
955            if (target == null) continue;
956            if (columnId.equals(target.getAttribute("id"))) {
957                out.add(rel);
958            }
959        }
960        return out;
961    }
962
963    /**
964     * Collect parent IDs of fdd <b>and</b> fdr sources. The terminal-resultset
965     * detection treats any resultset that is referenced as a source of either
966     * data-dependency (fdd) <b>or</b> row-influence (fdr) as non-terminal.
967     *
968     * <p>Slice 23 motivation: an EXISTS subquery in JOIN ON contributes its
969     * inner SELECT-list resultset only via an {@code fdr clause="on"} edge
970     * (the inner result is plumbed into outer's RelationRows as a row-
971     * influence source, NOT as a data dependency on any outer-projected
972     * column). Without considering fdr sources, both the outer SELECT-list
973     * resultset AND the inner EXISTS-body resultset would qualify as
974     * terminal candidates, producing a false MULTIPLE_TERMINAL_SELECTS.
975     *
976     * <p>Pre-slice-23 corpus entries (FROM-subquery, scalar-subquery,
977     * set-op) all referenced inner resultsets via fdd, so the original
978     * fdd-only check sufficed for them; widening to fdr is additive and
979     * cannot mis-classify their inner resultsets as referenced when they
980     * weren't already.
981     */
982    private static Set<String> collectFddOrFdrSourceParentIds(Document doc) {
983        Set<String> out = new HashSet<>();
984        NodeList rels = doc.getElementsByTagName("relationship");
985        for (int i = 0; i < rels.getLength(); i++) {
986            Element rel = (Element) rels.item(i);
987            String relType = rel.getAttribute("type");
988            if (!"fdd".equals(relType) && !"fdr".equals(relType)) continue;
989            NodeList sources = rel.getElementsByTagName("source");
990            for (int si = 0; si < sources.getLength(); si++) {
991                Element src = (Element) sources.item(si);
992                if (src.getParentNode() != rel) continue;
993                String parentId = src.getAttribute("parent_id");
994                if (parentId != null && !parentId.isEmpty()) {
995                    out.add(parentId);
996                }
997            }
998        }
999        return out;
1000    }
1001
1002    /**
1003     * Apply OWASP-recommended XXE-prevention features to a
1004     * {@link DocumentBuilderFactory}. Each set is best-effort: a parser
1005     * that does not expose a feature swallows the failure but the others
1006     * still apply.
1007     */
1008    private static void applyXmlSecurity(DocumentBuilderFactory dbf) {
1009        String[] disable = {
1010                // Disallow inline DTDs entirely — strongest mitigation.
1011                "http://apache.org/xml/features/disallow-doctype-decl",
1012        };
1013        String[] enable = {
1014                javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING,
1015        };
1016        String[] disableExt = {
1017                "http://xml.org/sax/features/external-general-entities",
1018                "http://xml.org/sax/features/external-parameter-entities",
1019                "http://apache.org/xml/features/nonvalidating/load-external-dtd",
1020        };
1021        for (String f : disable) {
1022            try { dbf.setFeature(f, true); } catch (Exception ignore) {}
1023        }
1024        for (String f : enable) {
1025            try { dbf.setFeature(f, true); } catch (Exception ignore) {}
1026        }
1027        for (String f : disableExt) {
1028            try { dbf.setFeature(f, false); } catch (Exception ignore) {}
1029        }
1030        try {
1031            dbf.setAttribute(javax.xml.XMLConstants.ACCESS_EXTERNAL_DTD, "");
1032        } catch (Exception ignore) {}
1033        try {
1034            dbf.setAttribute(javax.xml.XMLConstants.ACCESS_EXTERNAL_SCHEMA, "");
1035        } catch (Exception ignore) {}
1036        dbf.setXIncludeAware(false);
1037        dbf.setExpandEntityReferences(false);
1038    }
1039
1040    private static Element elementById(Document doc, String id) {
1041        if (id == null || id.isEmpty()) return null;
1042        // The XML uses string ids on table/resultset/column elements; not
1043        // declared as XML IDs, so we lookup by attribute.
1044        for (String tag : new String[]{"table", "resultset", "column"}) {
1045            NodeList nl = doc.getElementsByTagName(tag);
1046            for (int i = 0; i < nl.getLength(); i++) {
1047                Element e = (Element) nl.item(i);
1048                if (id.equals(e.getAttribute("id"))) return e;
1049            }
1050        }
1051        return null;
1052    }
1053
1054    private static Element firstChildElement(Element parent, String tag) {
1055        NodeList nl = parent.getChildNodes();
1056        for (int i = 0; i < nl.getLength(); i++) {
1057            Node n = nl.item(i);
1058            if (n.getNodeType() != Node.ELEMENT_NODE) continue;
1059            if (tag.equals(n.getNodeName())) return (Element) n;
1060        }
1061        return null;
1062    }
1063
1064    private static boolean isBaseTable(Element parent) {
1065        return "table".equals(parent.getTagName());
1066    }
1067
1068    private static boolean isResultset(Element parent) {
1069        return "resultset".equals(parent.getTagName());
1070    }
1071
1072    /**
1073     * True iff the resultset {@code type} is a candidate to be the
1074     * program's terminal resultset. {@code select_list} is the standard
1075     * shape (slice 1+). Slice 12 added the set-op result types
1076     * ({@code select_union}, {@code select_intersect}, {@code select_minus},
1077     * {@code select_except}): dlineage emits one {@code RESULT_OF_<OP>-N}
1078     * resultset of the matching type per top-level set-op program; that
1079     * resultset's columns have one fdd edge each with N sources (one per
1080     * branch's per-position {@code select_list} column), and the existing
1081     * {@code projectColumn} BFS walks those fdd chains transparently to
1082     * reach base-table terminals. {@code select_except} (PostgreSQL /
1083     * SQL Server EXCEPT) is distinct from {@code select_minus} (Oracle /
1084     * Spark / Hive MINUS) at the dlineage layer even though they are
1085     * semantically equivalent.
1086     */
1087    private static boolean isTerminalResultsetType(String type) {
1088        if (type == null) return false;
1089        return "select_list".equals(type)
1090                || "select_union".equals(type)
1091                || "select_intersect".equals(type)
1092                || "select_minus".equals(type)
1093                || "select_except".equals(type);
1094    }
1095
1096    private static EdgeRole clauseTypeToRole(String clauseType, String relClause) {
1097        if ("where".equals(clauseType) || "where".equals(relClause)) return EdgeRole.FILTER;
1098        if ("joinCondition".equals(clauseType) || "on".equals(relClause)) return EdgeRole.JOIN;
1099        return null;
1100    }
1101
1102    /** Result of projecting one column: terminals plus the aggregate flag. */
1103    static final class BfsResult {
1104        final List<TableColumn> terminals = new ArrayList<>();
1105        boolean aggregate = false;
1106    }
1107
1108    /** Lower-cased base (table, column) pair. */
1109    static final class TableColumn {
1110        final String table;
1111        final String column;
1112        TableColumn(String table, String column) {
1113            this.table = table;
1114            this.column = column;
1115        }
1116    }
1117}