001package gudusoft.gsqlparser.ir.semantic.diff; 002 003import gudusoft.gsqlparser.EDbVendor; 004import org.w3c.dom.Document; 005import org.w3c.dom.Element; 006import org.w3c.dom.Node; 007import org.w3c.dom.NodeList; 008import org.xml.sax.InputSource; 009 010import javax.xml.parsers.DocumentBuilder; 011import javax.xml.parsers.DocumentBuilderFactory; 012import java.io.StringReader; 013import java.util.ArrayDeque; 014import java.util.ArrayList; 015import java.util.Arrays; 016import java.util.Collections; 017import java.util.Deque; 018import java.util.HashMap; 019import java.util.HashSet; 020import java.util.LinkedHashMap; 021import java.util.LinkedHashSet; 022import java.util.List; 023import java.util.Locale; 024import java.util.Map; 025import java.util.Set; 026 027/** 028 * Project the existing {@code DataFlowAnalyzer} XML output into the same 029 * canonical form as the Semantic IR projector. 030 * 031 * <p>The projector consumes only the four XML shapes seen in the captured 032 * 5-SQL corpus baselines (see {@code phase0/golden/*.dlineage.xml}). Any 033 * input that doesn't fit produces a {@link ProjectorResult} carrying an 034 * {@link ProjectorResult.UnsupportedReason}; the harness translates it 035 * into a single {@link DivergenceClass#UNSUPPORTED_BY_DLINEAGE} divergence. 036 * 037 * <p>Aggregate handling — see slice-7 plan §"DlineageXmlProjector". The 038 * COUNT(*) heuristic detects function-resultset fdd sources that include 039 * a {@code column="*"} reference and suppresses SELECT fan-out from that 040 * function. {@code SUM(col)}/{@code AVG(col)}/{@code COUNT(col)} retain 041 * their fan-out. 042 */ 043public final class DlineageXmlProjector { 044 045 /** 046 * Lower-cased function names whose presence on the immediate source-side 047 * of an output's fdd marks that output as aggregate. Mirrors the 048 * whitelist in {@code SemanticIRBuilder.AGGREGATE_FUNCTION_NAMES} so that 049 * scalar functions like {@code UPPER}, {@code COALESCE}, or {@code TRIM} 050 * do not cause a false {@link DivergenceClass#AGGREGATION_MISMATCH} 051 * against the IR's {@code OutputColumn.isAggregate()}. 052 * 053 * <p>Slice 30 added {@code mode} (PostgreSQL ordered-set aggregate). See 054 * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} for the matching 055 * window-vs-aggregate discriminator override. 056 */ 057 private static final Set<String> AGGREGATE_FUNCTION_NAMES; 058 static { 059 Set<String> s = new HashSet<>(Arrays.asList( 060 "count", "sum", "avg", "min", "max", 061 "stddev", "variance", "var_samp", "var_pop", 062 "stddev_samp", "stddev_pop", 063 "listagg", "string_agg", "group_concat", "array_agg", 064 // Slice 30: PostgreSQL ordered-set aggregate. Mirrors 065 // SemanticIRBuilder.AGGREGATE_FUNCTION_NAMES; see also 066 // ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES below. 067 "mode", 068 // Slice 42: hypothetical-set ordered-set aggregates 069 // (Oracle / MSSQL {@code RANK(100) WITHIN GROUP (ORDER 070 // BY x)} family). On Oracle / MSSQL the dlineage XML for 071 // this shape emits NO {@code clauseType="orderby"} fdr — 072 // the slice-13 windowed discriminator 073 // {@link #isWindowFunctionResultset} returns false and 074 // these names mark the output aggregate=true. The OVER 075 // form on Oracle / MSSQL ({@code RANK() OVER (ORDER BY 076 // x)}) emits {@code clauseType="orderby"} and the 077 // discriminator returns true (windowed) — adding these 078 // names to the aggregate set therefore does not affect 079 // OVER-form classification. The PG direct-attachment 080 // hypothetical-set form ({@code rank(0.5) WITHIN GROUP 081 // (...)}) and PG OVER form share the same XML shape; 082 // both are classified as windowed (rank is NOT in 083 // ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES) — slice 42 084 // does NOT lift PG, the IR builder rejects PG 085 // hypothetical-set so the divergence harness never 086 // sees that case. Probe-confirmed 087 // {@code /tmp/probe42/Probe42.java}. 088 "rank", "dense_rank", "percent_rank", "cume_dist")); 089 AGGREGATE_FUNCTION_NAMES = Collections.unmodifiableSet(s); 090 } 091 092 /** 093 * Function names whose {@code clauseType="orderby"} fdr may be the 094 * WITHIN GROUP order-by rather than a window {@code OVER ORDER BY}. 095 * 096 * <p>Slice 30 introduced {@code mode}. Slice 35 added PostgreSQL 097 * {@code listagg}; slice 36 adds PostgreSQL {@code string_agg}. Probes 098 * show dlineage emits the WITHIN GROUP order-by for both as an fdr with 099 * {@code clauseType="orderby"} structurally identical to LISTAGG (probe 100 * /tmp/ProbeStringAggXml.java verified the XML for aliased and unaliased 101 * STRING_AGG WG: a {@code resultset type="function"} STRING_AGG wrapper, 102 * an {@code fdd effectType="function"} target/source pair for the arg, 103 * and a single {@code fdr clauseType="orderby"} for the WG key — no 104 * {@code clauseType="selectList"} fdr is emitted, so the slice-13 105 * projector otherwise mistakes the orderby fdr for a window discriminator 106 * and reports {@link DivergenceClass#AGGREGATION_MISMATCH}). This set 107 * does NOT blindly short-circuit window classification: 108 * {@link #isWindowFunctionResultset} still returns {@code true} when a 109 * {@code selectList} fdr is present (PARTITION BY / analytic dependency), 110 * and only suppresses the order-by-only shape. 111 * 112 * <p><b>Subset constraint</b>: must be a strict subset of 113 * {@link #AGGREGATE_FUNCTION_NAMES} so the IR-side / projector-side 114 * aggregate-flag invariants stay symmetric. The class-init check below 115 * enforces this. 116 * 117 * <p><b>Why not include {@code percentile_cont} / {@code percentile_disc}</b>: 118 * these have windowed forms in BigQuery / Vertica / Redshift / Oracle / 119 * SQL Server (e.g. {@code PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY x) 120 * OVER (PARTITION BY y)}). Lifting them requires a stronger structural 121 * discriminator. Similarly, hypothetical-set {@code rank} / 122 * {@code dense_rank} remain excluded because their PG XML is structurally 123 * identical to window form {@code RANK() OVER (ORDER BY ...)}. 124 */ 125 private static final Set<String> ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES; 126 static { 127 Set<String> s = new HashSet<>(); 128 s.add("mode"); 129 s.add("listagg"); 130 s.add("string_agg"); 131 ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES = Collections.unmodifiableSet(s); 132 // Subset assertion: must be enforced at runtime (not via assert, 133 // which can be disabled). A future contributor adding a name to 134 // ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES without also adding it to 135 // AGGREGATE_FUNCTION_NAMES would produce silently-wrong projector 136 // output (the function would be "non-aggregate but 137 // not-windowed-either" — first-hop check skips it entirely). 138 if (!AGGREGATE_FUNCTION_NAMES.containsAll(ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES)) { 139 Set<String> missing = new HashSet<>(ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES); 140 missing.removeAll(AGGREGATE_FUNCTION_NAMES); 141 throw new IllegalStateException( 142 "ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES must be a subset of " 143 + "AGGREGATE_FUNCTION_NAMES; missing=" + missing); 144 } 145 } 146 147 /** 148 * Functions whose {@code (*)} argument is a row-counting marker rather 149 * than a real value dependency, so SELECT fan-out from them should be 150 * suppressed (otherwise the dlineage XML's expansion to every joined 151 * column would produce false {@link DivergenceClass#IR_MISSING_DEPENDENCY} 152 * edges relative to the IR's "aggregate, no sources" representation). 153 * Today the only widely-supported example is {@code COUNT(*)}. 154 */ 155 private static final Set<String> NULL_ARG_AGGREGATES; 156 static { 157 Set<String> s = new HashSet<>(); 158 s.add("count"); 159 NULL_ARG_AGGREGATES = Collections.unmodifiableSet(s); 160 } 161 162 /** 163 * Slice 48 — wrapper-function names through which the projector will 164 * descend ONE fdd hop to look for a known ordered-set aggregate 165 * inner. 166 * 167 * <p>Slice 48 introduced {@code lower}. Slice 50 widened the set to 168 * include {@code upper} — the natural case-folding sibling. Slice 51 169 * widened the set further to include {@code concat} — the canonical 170 * multi-argument scalar concatenation wrapper. Slice 52 widened the 171 * set again to include {@code trim} / {@code ltrim} / {@code rtrim} — 172 * the canonical whitespace-trimming scalar wrappers. Slice 53 widened 173 * it to include {@code substring} / {@code substr} — the 174 * three-argument string-extraction wrappers. Slice 54 widened it once 175 * more to include {@code replace} — the canonical three-argument 176 * pattern-replacement wrapper. Slice 55 widens it again to include 177 * {@code lpad} / {@code rpad} — the canonical three-argument string 178 * padding wrappers. Slice 51 probes ({@code /tmp/Probe51.java}, 179 * {@code /tmp/Probe51b.java}), slice 52 probe 180 * ({@code /tmp/Probe52.java}), slice 53 probe 181 * ({@code /tmp/Probe53.java}), slice 54 probe (deleted 182 * post-implementation; see {@code Slice54Test} javadoc) and slice 55 183 * probe (transient {@code Slice55LpadRpadProbeTest} inside the Maven 184 * test JVM, deleted post-confirmation; see {@code Slice55Test} 185 * javadoc) confirmed the dlineage XML for {@code CONCAT(...)} / 186 * {@code TRIM(...)} / {@code LTRIM(...)} / {@code RTRIM(...)} / 187 * {@code SUBSTRING(...)} / {@code SUBSTR(...)} / {@code REPLACE(...)} 188 * / {@code LPAD(...)} / {@code RPAD(...)} differs from 189 * {@code LOWER(...)} / {@code UPPER(...)} only in the wrapper 190 * resultset's {@code name} attribute. The literal-string / 191 * character-class / direction-marker / numeric start+length / 192 * pattern+replacement / pad-length / pad-character arguments do NOT 193 * generate extra {@code fdd} edges on the wrapping column 194 * (constants and BOTH/LEADING/TRAILING/FROM/FOR markers do not 195 * produce fdd edges in the dlineage XML), so the structural shape is 196 * byte-identical to the case-folding wrapper shape: one 197 * {@code fdd effectType="select"} from the SELECT-list resultset to 198 * the wrapper column, then one {@code fdd effectType="function"} 199 * from the wrapper column to the inner ordered-set aggregate's 200 * resultset, then the inner aggregate's own argument fdd and 201 * orderby fdr exactly as in the case-folding wrapper shape. The 202 * IR-side classification (aggregate=true via the slice-31 collector 203 * descending into the wrapping function body) and the dlineage-side 204 * aggregate classification lift symmetrically once the wrapper 205 * allowlist contains the new wrapper name. 206 * 207 * <p>The {@code ||} (string-concatenation) operator is a special 208 * non-case: PG / Snowflake parsers fold {@code expr || ' '} into 209 * the projection root WITHOUT emitting a {@code concat} or 210 * {@code ||}-named function-resultset wrapper. The dlineage XML for 211 * {@code LISTAGG WG || '|'} is byte-identical to the unwrapped 212 * {@code LISTAGG WG} form — no descent is needed because the 213 * existing first-hop aggregate detection 214 * (slice-30 / 35 / 36 carve-outs in 215 * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES}) already 216 * classifies that direct path as aggregate=true. Slice 51 / 52 / 53 217 * / 54 therefore do NOT add {@code ||} to this allowlist; the 218 * wrapper widen is only required for explicit named function calls. 219 * 220 * <p>{@code TRIM(BOTH ' ' FROM x)} (PG / Snowflake special grammar) 221 * is a syntactic surface only — slice 52 probe confirmed the 222 * dlineage XML still emits a {@code resultset type="function"} 223 * named {@code TRIM} with the same one-fdd-edge connection chain 224 * as canonical {@code TRIM(x)}. The descent therefore fires for 225 * both forms unchanged. Likewise, the PG SQL-standard 226 * {@code SUBSTRING(x FROM 1 FOR 5)} grammar form (with 227 * {@code FROM}/{@code FOR} keyword markers) emits the same chain 228 * shape as the canonical {@code SUBSTRING(x, 1, 5)} comma form — 229 * slice 53 probe confirmed parity. The two-argument 230 * {@code SUBSTRING(x, 1)} call (no length) emits the same chain 231 * because constants do not produce fdd edges. Slice 54 232 * {@code REPLACE(x, pattern, replacement)} probe confirmed the 233 * pattern and replacement string literals likewise produce no fdd 234 * edges on the wrapping {@code REPLACE.REPLACE} column, so the 235 * descent fires identically regardless of the literal values. 236 * Slice 55 {@code LPAD(x, length, padchar)} / 237 * {@code RPAD(x, length, padchar)} probe confirmed the integer 238 * pad-length and literal-string pad-character arguments likewise 239 * produce no fdd edges on the wrapping {@code LPAD.LPAD} / 240 * {@code RPAD.RPAD} column, so the descent fires identically 241 * regardless of the pad-length / pad-character values. 242 * 243 * <p>Slice 56 {@code LEFT(x, length)} / {@code RIGHT(x, length)} 244 * probe confirmed the integer length argument (2nd arg) likewise 245 * produces no fdd edges on the wrapping {@code LEFT.LEFT} / 246 * {@code RIGHT.RIGHT} column — only the inner ordered-set 247 * aggregate's resultset is the fdd source — so the descent fires 248 * identically regardless of the literal length value. 249 * 250 * <p>Slice 57 {@code REGEXP_REPLACE(x, pattern, replacement [, 251 * flags] [, position, occurrence, parameters])} probe (transient 252 * test inside the Maven test JVM, deleted post-confirmation) 253 * confirmed the regex pattern, replacement string, optional 254 * flag, and Snowflake positional / occurrence / parameter 255 * literals likewise produce no fdd edges on the wrapping 256 * {@code REGEXP_REPLACE.REGEXP_REPLACE} column — only the inner 257 * ordered-set aggregate's resultset is the fdd source — so the 258 * descent fires identically regardless of the constant 259 * pattern / replacement / flag / positional values. The PG 260 * 3-arg form, PG 4-arg flag form, SF 3-arg form, and SF 6-arg 261 * (position / occurrence / parameters) form all produce the 262 * same chain shape as slice-49 {@code LOWER(...)}. 263 * 264 * <p>The wrapper allowlist is intentionally NOT vendor-gated: 265 * PostgreSQL and Snowflake are the validated scope per the 266 * slice-57 probe, but any dialect whose dlineage XML emits a 267 * {@code REGEXP_REPLACE}-named function-resultset wrapper around 268 * a {@code mode} / {@code listagg} / {@code string_agg} inner 269 * with a non-window fdr (typically the WITHIN GROUP order-by 270 * fdr admitted by the slice-30 / 35 / 36 carve-outs in 271 * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES}) will lift 272 * transparently. This matches the slice-48 → 56 posture. 273 * 274 * <p>Other scalar wrappers ({@code COALESCE}, 275 * {@code INITCAP}, {@code REVERSE}, {@code LENGTH} / 276 * {@code CHAR_LENGTH}, arithmetic) stay out of scope. Each 277 * carries a different argument-shape signature in the dlineage 278 * XML and would need an independent probe pass before being 279 * widened — slice 57 stays narrowly focused on the canonical 280 * case-folding / concat / trim / substring / replace / lpad / 281 * rpad / left / right / regexp_replace family. 282 */ 283 private static final Set<String> SLICE48_DESCENT_WRAPPER_NAMES; 284 static { 285 Set<String> s = new HashSet<>(); 286 s.add("lower"); 287 s.add("upper"); 288 s.add("concat"); 289 s.add("trim"); 290 s.add("ltrim"); 291 s.add("rtrim"); 292 s.add("substring"); 293 s.add("substr"); 294 s.add("replace"); 295 s.add("lpad"); 296 s.add("rpad"); 297 s.add("left"); 298 s.add("right"); 299 s.add("regexp_replace"); 300 SLICE48_DESCENT_WRAPPER_NAMES = Collections.unmodifiableSet(s); 301 } 302 303 /** 304 * Slice 48 — inner aggregate names that the {@link 305 * #SLICE48_DESCENT_WRAPPER_NAMES} descent is allowed to lift. 306 * 307 * <p>Slice 48 introduced {@code mode}. Slice 49 widens the set to 308 * also include {@code listagg} / {@code string_agg} — the natural 309 * sibling lift along the same {@code LOWER(...) WITHIN GROUP (...)} 310 * embedded form. Probe {@code /tmp/Probe49.java} confirmed PG / 311 * Snowflake {@code LOWER(LISTAGG(x.id, ',') WG (ORDER BY x.region))} 312 * dlineage XML matches the slice-48 descent shape exactly: a 313 * stacked {@code RS-1.s} → {@code LOWER} → {@code LISTAGG} chain of 314 * {@code resultset type="function"} elements linked by 315 * {@code fdd effectType="function"} edges, plus a single 316 * {@code fdr clauseType="orderby"} on the inner resultset for the 317 * WG ORDER BY column. The slice-30 / 35 / 36 carve-outs in 318 * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} include 319 * {@code listagg} and {@code string_agg}, so the defense-in-depth 320 * {@link #isWindowFunctionResultset} check at the bottom of the 321 * descent (which would otherwise mis-classify the orderby fdr as a 322 * window discriminator) returns false for these inner names — the 323 * descent is therefore trivially safe. 324 * 325 * <p>The extra {@code fdd effectType="function"} from 326 * {@code LISTAGG.LISTAGG} → base-table column (the function 327 * argument) is structurally orthogonal to the descent: the descent 328 * only flips {@code result.aggregate = true} and never alters the 329 * SELECT-terminal BFS, so the existing per-output SELECT lineage 330 * edges are unchanged. 331 * 332 * <p>Plain aggregates ({@code SUM} / {@code COUNT} / etc.) inside 333 * scalar wrappers stay outside the descent (slice-48 boundary 334 * preserved): they are NOT in 335 * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} so the descent's 336 * {@code isWindowFunctionResultset} carve-out would NOT apply if 337 * they were added here. Lifting them needs a separate slice. 338 * Hypothetical-set / {@code percentile_cont} / {@code percentile_disc} 339 * also stay out for the same reason — slice 47 / 48 probes 340 * confirmed their dlineage XML is structurally indistinguishable 341 * from window-OVER form for the inner resultset, so widening the 342 * descent allowlist would mis-classify the OVER form as aggregate. 343 * 344 * <p>Constraint: must be a strict subset of 345 * {@link #AGGREGATE_FUNCTION_NAMES}; the runtime check below 346 * mirrors the same constraint on 347 * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES}. 348 */ 349 private static final Set<String> SLICE48_DESCENT_INNER_NAMES; 350 static { 351 Set<String> s = new HashSet<>(); 352 s.add("mode"); 353 s.add("listagg"); 354 s.add("string_agg"); 355 SLICE48_DESCENT_INNER_NAMES = Collections.unmodifiableSet(s); 356 if (!AGGREGATE_FUNCTION_NAMES.containsAll(SLICE48_DESCENT_INNER_NAMES)) { 357 Set<String> missing = new HashSet<>(SLICE48_DESCENT_INNER_NAMES); 358 missing.removeAll(AGGREGATE_FUNCTION_NAMES); 359 throw new IllegalStateException( 360 "SLICE48_DESCENT_INNER_NAMES must be a subset of " 361 + "AGGREGATE_FUNCTION_NAMES; missing=" + missing); 362 } 363 } 364 365 /** Policy for {@link #resolveBaseSources(String, Document, FanoutPolicy)}. */ 366 public enum FanoutPolicy { 367 /** SELECT projection: function fdd whose argument is `*` or constant suppresses fan-out. */ 368 SUPPRESS_AGGREGATE_NULL_ARG, 369 /** Row-influence: every fdd terminal kept regardless of intervening function nodes. */ 370 INCLUDE_ALL 371 } 372 373 private DlineageXmlProjector() {} 374 375 /** 376 * Legacy entry point — equivalent to {@link #project(String, EDbVendor)} 377 * with {@code vendor=null}. Retained for callers that don't know or 378 * don't care about the source dialect (Phase-0 harness, divergence 379 * goldens, slice-7 comparison harness). 380 */ 381 public static ProjectorResult project(String xml) { 382 return project(xml, null); 383 } 384 385 /** 386 * Slice 43 — vendor-scoped projection entry point. The {@code vendor} 387 * parameter pins the per-vendor API contract so a future cautious 388 * projector override can apply per-vendor name-whitelist exceptions 389 * (e.g. PG hypothetical-set {@code rank} / {@code dense_rank} / 390 * {@code percent_rank} / {@code cume_dist} where the WG form's XML 391 * is structurally indistinguishable from the OVER form). Slice 43 392 * deliberately makes <b>no behavior change</b> in the projector — 393 * today the {@link #AGGREGATE_FUNCTION_NAMES} / 394 * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} sets are vendor- 395 * agnostic and the {@code vendor} argument is recorded but not 396 * consulted. The byte-for-byte equivalence is enforced by 397 * {@code Slice43Test.projectorVendorScopeOverloadAccepts*Vendor}. 398 * 399 * <p>Future slices that wire vendor-specific overrides MUST keep 400 * this equivalence for callers that pass {@code vendor=null} (the 401 * legacy {@link #project(String)} delegates here with {@code null}). 402 * The follow-up slice for PG hypothetical-set top-level admission 403 * is the first expected consumer of the {@code vendor} parameter. 404 * 405 * @param xml dlineage XML output, must not be null/empty 406 * @param vendor source SQL dialect; may be {@code null} when the 407 * caller has no vendor information 408 */ 409 public static ProjectorResult project(String xml, EDbVendor vendor) { 410 if (xml == null || xml.isEmpty()) { 411 return ProjectorResult.unsupported( 412 ProjectorResult.UnsupportedReason.MALFORMED_XML, "empty xml"); 413 } 414 Document doc; 415 try { 416 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); 417 dbf.setNamespaceAware(false); 418 // Defensive XML hardening: disable DTDs, external entities, and 419 // network access by default. The projector is a public API in 420 // src/main and may be called with untrusted input; matching the 421 // OWASP XXE-prevention recipe for JDK DocumentBuilderFactory. 422 applyXmlSecurity(dbf); 423 DocumentBuilder db = dbf.newDocumentBuilder(); 424 // Belt-and-braces: even with the features above, refuse to 425 // resolve any external entity that slips through. 426 db.setEntityResolver((publicId, systemId) -> 427 new InputSource(new java.io.StringReader(""))); 428 db.setErrorHandler(new org.xml.sax.ErrorHandler() { 429 @Override public void warning(org.xml.sax.SAXParseException e) {} 430 @Override public void error(org.xml.sax.SAXParseException e) throws org.xml.sax.SAXException { throw e; } 431 @Override public void fatalError(org.xml.sax.SAXParseException e) throws org.xml.sax.SAXException { throw e; } 432 }); 433 doc = db.parse(new InputSource(new StringReader(xml))); 434 } catch (Exception ex) { 435 return ProjectorResult.unsupported( 436 ProjectorResult.UnsupportedReason.MALFORMED_XML, ex.getMessage()); 437 } 438 439 // Identify the final resultset: exactly one <resultset> of a 440 // terminal-candidate type that is never referenced as the parent 441 // of any fdd OR fdr <source>. Slice 12 lifted set-op programs, so 442 // {@code select_union}, {@code select_intersect}, {@code select_minus} 443 // are also valid terminal types — dlineage emits one 444 // {@code RESULT_OF_<OP>-N} resultset of the matching set-op type 445 // whose columns each have an fdd edge with N sources (one per 446 // branch's per-position {@code select_list} column). Slice 23 447 // widened the parent-id collection to include fdr sources too, 448 // so the inner EXISTS-body resultset (referenced only via 449 // fdr clause="on") is correctly excluded from terminal candidates. 450 List<Element> terminalCandidates = new ArrayList<>(); 451 NodeList resultsets = doc.getElementsByTagName("resultset"); 452 for (int i = 0; i < resultsets.getLength(); i++) { 453 Element e = (Element) resultsets.item(i); 454 if (isTerminalResultsetType(e.getAttribute("type"))) { 455 terminalCandidates.add(e); 456 } 457 } 458 if (terminalCandidates.isEmpty()) { 459 return ProjectorResult.unsupported( 460 ProjectorResult.UnsupportedReason.NO_RELATIONSHIPS, 461 "no terminal-candidate <resultset> found " 462 + "(select_list / select_union / select_intersect / " 463 + "select_minus / select_except)"); 464 } 465 Set<String> sourceParentIds = collectFddOrFdrSourceParentIds(doc); 466 List<Element> terminals = new ArrayList<>(); 467 for (Element rs : terminalCandidates) { 468 if (!sourceParentIds.contains(rs.getAttribute("id"))) { 469 terminals.add(rs); 470 } 471 } 472 if (terminals.size() != 1) { 473 return ProjectorResult.unsupported( 474 ProjectorResult.UnsupportedReason.MULTIPLE_TERMINAL_SELECTS, 475 "expected exactly one terminal resultset, got " + terminals.size()); 476 } 477 Element finalResultset = terminals.get(0); 478 479 Set<CanonicalLineageEdge> edges = new LinkedHashSet<>(); 480 Set<String> outputNames = new LinkedHashSet<>(); 481 Map<String, Boolean> aggregateByOutput = new LinkedHashMap<>(); 482 483 // 1) SELECT projection per output column of the final resultset. 484 // Skip system pseudo-columns (e.g. RelationRows). They are the implicit 485 // fdr-target for row-influence, never user-visible projected columns. 486 NodeList finalCols = finalResultset.getElementsByTagName("column"); 487 for (int i = 0; i < finalCols.getLength(); i++) { 488 Element col = (Element) finalCols.item(i); 489 // Only direct children of finalResultset count — getElementsByTagName 490 // returns descendants too in theory, but our XML schema has columns 491 // as direct children of resultset/table, so this is fine in practice. 492 if (col.getParentNode() != finalResultset) continue; 493 if ("system".equals(col.getAttribute("source"))) continue; 494 495 String outName = col.getAttribute("name").toLowerCase(Locale.ROOT); 496 outputNames.add(outName); 497 // Default false; flipped to true if BFS visits a function resultset. 498 aggregateByOutput.put(outName, false); 499 500 BfsResult br = projectColumn(col.getAttribute("id"), doc, 501 FanoutPolicy.SUPPRESS_AGGREGATE_NULL_ARG); 502 if (br.aggregate) { 503 aggregateByOutput.put(outName, true); 504 } 505 for (TableColumn tc : br.terminals) { 506 edges.add(new CanonicalLineageEdge(EdgeRole.SELECT, outName, tc.table, tc.column)); 507 } 508 } 509 510 // 2) Row-influence from fdr edges whose target is a column of the final 511 // resultset (typically the system RelationRows column). 512 String finalRsId = finalResultset.getAttribute("id"); 513 // Walk the fdr graph: follow chains that target the final resultset's 514 // RelationRows, including indirect ones (e.g. RS-1.RelationRows -> sub.RelationRows 515 // in 04_nested_subquery). For each fdr source with clauseType where/joinCondition 516 // (or clause where/on), resolve to base columns via fdd closure. 517 NodeList fdrs = doc.getElementsByTagName("relationship"); 518 // Collect fdr edges keyed by the parent_id of the target so we can 519 // chain RelationRows-targeting fdrs. 520 Map<String, List<Element>> fdrsByTargetParentId = new HashMap<>(); 521 for (int i = 0; i < fdrs.getLength(); i++) { 522 Element rel = (Element) fdrs.item(i); 523 if (!"fdr".equals(rel.getAttribute("type"))) continue; 524 Element target = firstChildElement(rel, "target"); 525 if (target == null) continue; 526 fdrsByTargetParentId.computeIfAbsent(target.getAttribute("parent_id"), k -> new ArrayList<>()) 527 .add(rel); 528 } 529 530 Deque<String> rowSetParents = new ArrayDeque<>(); 531 Set<String> visitedRowSetParents = new HashSet<>(); 532 rowSetParents.add(finalRsId); 533 visitedRowSetParents.add(finalRsId); 534 while (!rowSetParents.isEmpty()) { 535 String parentId = rowSetParents.removeFirst(); 536 List<Element> rels = fdrsByTargetParentId.get(parentId); 537 if (rels == null) continue; 538 for (Element rel : rels) { 539 Element target = firstChildElement(rel, "target"); 540 if (target == null) continue; 541 // Only follow fdrs whose target is a system column (RelationRows). 542 // Other fdrs (e.g. effectType="function" with target on a function 543 // resultset) carry aggregation semantics, not row-influence. 544 if (!"system".equals(target.getAttribute("source"))) continue; 545 // Process each <source> on this fdr. 546 NodeList sources = rel.getElementsByTagName("source"); 547 for (int si = 0; si < sources.getLength(); si++) { 548 Element src = (Element) sources.item(si); 549 if (src.getParentNode() != rel) continue; 550 String clauseType = src.getAttribute("clauseType"); 551 String relClause = rel.getAttribute("clause"); 552 EdgeRole role = clauseTypeToRole(clauseType, relClause); 553 if (role == null) { 554 // Source is itself a system RelationRows pointing at 555 // another resultset — chain through it. 556 if ("system".equals(src.getAttribute("source"))) { 557 String nextParent = src.getAttribute("parent_id"); 558 if (visitedRowSetParents.add(nextParent)) { 559 rowSetParents.add(nextParent); 560 } 561 } 562 continue; 563 } 564 // Source identifies a column on a base table or a non-base 565 // resultset. Resolve via fdd closure with INCLUDE_ALL so 566 // function-arg suppression doesn't skew row influence. 567 String srcParentId = src.getAttribute("parent_id"); 568 Element parent = elementById(doc, srcParentId); 569 if (parent == null) continue; 570 if (isBaseTable(parent)) { 571 // Skip constantTable for row influence too. 572 if ("constantTable".equals(parent.getAttribute("type"))) continue; 573 String col = src.getAttribute("column"); 574 if ("*".equals(col)) continue; 575 edges.add(new CanonicalLineageEdge(role, null, 576 parent.getAttribute("name").toLowerCase(Locale.ROOT), 577 col.toLowerCase(Locale.ROOT))); 578 } else if (isResultset(parent)) { 579 String srcId = src.getAttribute("id"); 580 BfsResult inner = resolveBaseSources(srcId, doc, FanoutPolicy.INCLUDE_ALL); 581 for (TableColumn tc : inner.terminals) { 582 edges.add(new CanonicalLineageEdge(role, null, tc.table, tc.column)); 583 } 584 } // else: constantTable / function — already filtered 585 } 586 } 587 } 588 589 return ProjectorResult.ok( 590 new CanonicalLineageModel(edges, outputNames, aggregateByOutput)); 591 } 592 593 /** 594 * Project one final-resultset output column. The aggregate flag is set 595 * <b>only when an immediate fdd source of this column lives on a 596 * function resultset whose function name is in 597 * {@link #AGGREGATE_FUNCTION_NAMES}</b>. This matches Semantic IR's 598 * per-output (non-transitive) aggregate semantics — see slice-6 599 * design notes on flag locality. The BFS itself walks transitively 600 * to collect base-table SELECT terminals. 601 */ 602 private static BfsResult projectColumn(String columnId, Document doc, FanoutPolicy policy) { 603 BfsResult result = new BfsResult(); 604 605 // First-hop aggregate detection. Local rule: this output is aggregate 606 // iff *any* immediate fdd source lives on a function resultset whose 607 // function name is in AGGREGATE_FUNCTION_NAMES AND the function 608 // resultset is NOT a window function (slice 13). We must scan all 609 // immediate function sources, not stop at the first one — a fdd whose 610 // sources include both `UPPER(...)` and `SUM(...)` columns must still 611 // mark the output aggregate. 612 // 613 // Slice 13 window-vs-aggregate discriminator: a function resultset is 614 // windowed iff at least one fdr targeting it carries a source with 615 // clauseType="selectList" (PARTITION BY) or clauseType="orderby" 616 // (OVER ORDER BY). Plain aggregates have only the RelationRows fdr; 617 // window functions add the PARTITION BY / OVER ORDER BY fdrs. Empty 618 // OVER () is not discriminable in dlineage XML (looks identical to a 619 // plain aggregate); the IR builder rejects empty OVER () to avoid 620 // manufacturing AGGREGATION_MISMATCH divergences. 621 outer: 622 for (Element fdd : findFddByTarget(doc, columnId)) { 623 for (Element fnRs : immediateFunctionSourceResultsets(fdd, doc)) { 624 String fnName = fnRs.getAttribute("name"); 625 if (fnName == null || fnName.isEmpty()) continue; 626 if (!AGGREGATE_FUNCTION_NAMES.contains(fnName.toLowerCase(Locale.ROOT))) continue; 627 if (isWindowFunctionResultset(fnRs, doc)) continue; 628 result.aggregate = true; 629 break outer; 630 } 631 } 632 633 // Slice 48 — single-step descent through a known scalar wrapper. 634 // The first-hop check above mirrors the IR builder's per-output 635 // (non-transitive) aggregate semantics, but for a small set of 636 // case-folding scalar wrappers used to format the output of an 637 // ordered-set aggregate, the IR side classifies the projection 638 // as aggregate via the descendant ordered-set name. The 639 // projector must mirror that classification or the divergence 640 // harness reports a pre-existing 641 // {@link DivergenceClass#AGGREGATION_MISMATCH}. 642 // 643 // Lift surface as of slice 57: 644 // wrapper-name ∈ SLICE48_DESCENT_WRAPPER_NAMES 645 // = {lower, upper, concat, trim, ltrim, rtrim, 646 // substring, substr, replace, lpad, rpad, 647 // left, right, regexp_replace} 648 // ∧ inner-name ∈ SLICE48_DESCENT_INNER_NAMES 649 // = {mode, listagg, string_agg} 650 // ∧ !isWindowFunctionResultset(inner). 651 // 652 // History: 653 // - slice 48 introduced the descent with wrapper={lower}, 654 // inner={mode}. 655 // - slice 49 widened inner to {mode, listagg, string_agg}. 656 // - slice 50 widened wrapper to {lower, upper}. 657 // - slice 51 widened wrapper to {lower, upper, concat}. 658 // - slice 52 widened wrapper to 659 // {lower, upper, concat, trim, ltrim, rtrim}. 660 // - slice 53 widened wrapper to 661 // {lower, upper, concat, trim, ltrim, rtrim, substring, 662 // substr}. 663 // - slice 54 widened wrapper to 664 // {lower, upper, concat, trim, ltrim, rtrim, substring, 665 // substr, replace}. 666 // - slice 55 widened wrapper to 667 // {lower, upper, concat, trim, ltrim, rtrim, substring, 668 // substr, replace, lpad, rpad}. 669 // - slice 56 widened wrapper to 670 // {lower, upper, concat, trim, ltrim, rtrim, substring, 671 // substr, replace, lpad, rpad, left, right}. 672 // - slice 57 widens wrapper to 673 // {lower, upper, concat, trim, ltrim, rtrim, substring, 674 // substr, replace, lpad, rpad, left, right, 675 // regexp_replace}. 676 // The {@code ||} string-concatenation operator does NOT emit a 677 // wrapper resultset on PG / Snowflake (the parser folds 678 // {@code expr || ' '} into the projection root directly), so no 679 // descent is needed for that shape — the existing first-hop 680 // aggregate detection already handles it. Other scalar wrappers 681 // ({@code COALESCE}, {@code INITCAP}, {@code REVERSE}, 682 // {@code LENGTH} / {@code CHAR_LENGTH}, arithmetic) stay 683 // outside the descent — their argument-shape signatures 684 // differ and require independent probes. 685 // 686 // The descent is also one-level only — {@code LOWER(LOWER(mode WG))} 687 // remains divergent because the wrapper's immediate function 688 // source is {@code lower}, not {@code mode}. This keeps the 689 // descent O(immediate-fdds) and avoids any risk of 690 // double-counting through arbitrarily deep wrappings. 691 if (!result.aggregate) { 692 // The first-hop loop above scans the column's fdds for an 693 // immediate function-resultset source whose own column was 694 // the projection's value (e.g. {@code RS-1.s} fdd-source 695 // {@code LOWER.LOWER}). To descend one level we walk through 696 // each scalar-wrapper source's <i>column</i> and look at 697 // that column's own fdd targets — the wrapper-internal fdd 698 // (e.g. fdd targeting {@code LOWER.LOWER} with source 699 // {@code mode.mode}). The {@code findFddByTarget} helper 700 // matches on the target column's {@code id}, NOT the 701 // resultset id, so we must look up the wrapper's child 702 // column id (slice-48 descent's distinguishing detail vs 703 // the existing first-hop check). 704 outerDescent: 705 for (Element fdd : findFddByTarget(doc, columnId)) { 706 NodeList sources = fdd.getElementsByTagName("source"); 707 for (int si = 0; si < sources.getLength(); si++) { 708 Element src = (Element) sources.item(si); 709 if (src.getParentNode() != fdd) continue; 710 String wrapperParentId = src.getAttribute("parent_id"); 711 Element wrapperRs = elementById(doc, wrapperParentId); 712 if (wrapperRs == null 713 || !isResultset(wrapperRs) 714 || !"function".equals(wrapperRs.getAttribute("type"))) continue; 715 String wrapperName = wrapperRs.getAttribute("name"); 716 if (wrapperName == null) continue; 717 if (!SLICE48_DESCENT_WRAPPER_NAMES.contains( 718 wrapperName.toLowerCase(Locale.ROOT))) continue; 719 String wrapperColumnId = src.getAttribute("id"); 720 if (wrapperColumnId == null || wrapperColumnId.isEmpty()) continue; 721 for (Element wrapperFdd : findFddByTarget(doc, wrapperColumnId)) { 722 for (Element innerRs : immediateFunctionSourceResultsets(wrapperFdd, doc)) { 723 String innerName = innerRs.getAttribute("name"); 724 if (innerName == null) continue; 725 if (!SLICE48_DESCENT_INNER_NAMES.contains( 726 innerName.toLowerCase(Locale.ROOT))) continue; 727 // Defense-in-depth: even though `mode` is in 728 // ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES (so 729 // an `orderby`-only fdr is not classified as 730 // windowed), a `selectList` fdr (PARTITION 731 // BY) on the inner resultset still indicates 732 // the OVER form. Skip those PARTITION-BY 733 // windowed inners so the slice-48 descent does 734 // not lift that OVER projection. The known 735 // OVER-only-ORDER-BY carve-out remains 736 // classified as plain aggregate on the 737 // projector side; IR rejects that form, and 738 // Slice48Test documents the limitation. 739 if (isWindowFunctionResultset(innerRs, doc)) continue; 740 result.aggregate = true; 741 break outerDescent; 742 } 743 } 744 } 745 } 746 } 747 748 // BFS for SELECT terminals. The aggregate flag is no longer touched 749 // inside the BFS; the only thing the function nodes still control is 750 // SELECT-fan-out suppression for COUNT(*)-shaped calls. 751 Set<String> visited = new HashSet<>(); 752 Deque<String> q = new ArrayDeque<>(); 753 q.add(columnId); 754 visited.add(columnId); 755 while (!q.isEmpty()) { 756 String cur = q.removeFirst(); 757 for (Element fdd : findFddByTarget(doc, cur)) { 758 Element targetEl = firstChildElement(fdd, "target"); 759 if (targetEl == null) continue; 760 Element targetParent = elementById(doc, targetEl.getAttribute("parent_id")); 761 boolean targetIsFunction = targetParent != null 762 && isResultset(targetParent) 763 && "function".equals(targetParent.getAttribute("type")); 764 if (targetIsFunction 765 && policy == FanoutPolicy.SUPPRESS_AGGREGATE_NULL_ARG 766 && shouldSuppressFunctionFanout(fdd, targetParent)) { 767 continue; 768 } 769 NodeList sources = fdd.getElementsByTagName("source"); 770 for (int si = 0; si < sources.getLength(); si++) { 771 Element src = (Element) sources.item(si); 772 if (src.getParentNode() != fdd) continue; 773 String col = src.getAttribute("column"); 774 if ("*".equals(col)) continue; 775 String parentId = src.getAttribute("parent_id"); 776 Element parent = elementById(doc, parentId); 777 if (parent == null) continue; 778 if (isBaseTable(parent)) { 779 if ("constantTable".equals(parent.getAttribute("type"))) continue; 780 result.terminals.add(new TableColumn( 781 parent.getAttribute("name").toLowerCase(Locale.ROOT), 782 col.toLowerCase(Locale.ROOT))); 783 } else if (isResultset(parent)) { 784 String nextId = src.getAttribute("id"); 785 if (visited.add(nextId)) { 786 q.add(nextId); 787 } 788 } 789 // constantTable: skipped already; function resultset 790 // descent is handled when nextId is dequeued and its fdd 791 // is matched above. 792 } 793 } 794 } 795 return result; 796 } 797 798 /** 799 * Public-ish helper used by the row-influence path. Same BFS as 800 * {@link #projectColumn} but the aggregate flag is meaningless here. 801 */ 802 static BfsResult resolveBaseSources(String columnId, Document doc, FanoutPolicy policy) { 803 return projectColumn(columnId, doc, policy); 804 } 805 806 /** 807 * Return every function-resultset name that appears as the parent of an 808 * immediate {@code <source>} on this fdd. Multiple are possible when an 809 * expression projects something like {@code UPPER(name) || SUM(salary)} 810 * and dlineage produces sources from both function nodes. 811 */ 812 private static List<String> immediateFunctionSourceNames(Element fdd, Document doc) { 813 List<String> names = new ArrayList<>(); 814 NodeList sources = fdd.getElementsByTagName("source"); 815 for (int si = 0; si < sources.getLength(); si++) { 816 Element src = (Element) sources.item(si); 817 if (src.getParentNode() != fdd) continue; 818 Element parent = elementById(doc, src.getAttribute("parent_id")); 819 if (parent != null && isResultset(parent) 820 && "function".equals(parent.getAttribute("type"))) { 821 String name = parent.getAttribute("name"); 822 if (name != null && !name.isEmpty()) names.add(name); 823 } 824 } 825 return names; 826 } 827 828 /** 829 * Slice 13: parallel to {@link #immediateFunctionSourceNames} but 830 * returns the function-resultset {@link Element}s themselves so the 831 * caller can inspect them with {@link #isWindowFunctionResultset}. 832 */ 833 private static List<Element> immediateFunctionSourceResultsets(Element fdd, Document doc) { 834 List<Element> els = new ArrayList<>(); 835 NodeList sources = fdd.getElementsByTagName("source"); 836 for (int si = 0; si < sources.getLength(); si++) { 837 Element src = (Element) sources.item(si); 838 if (src.getParentNode() != fdd) continue; 839 Element parent = elementById(doc, src.getAttribute("parent_id")); 840 if (parent != null && isResultset(parent) 841 && "function".equals(parent.getAttribute("type"))) { 842 els.add(parent); 843 } 844 } 845 return els; 846 } 847 848 /** 849 * Slice 13: window-vs-aggregate discriminator. A function resultset is 850 * windowed iff it has at least one {@code fdr} edge whose target's 851 * {@code parent_id} equals the function resultset's id AND whose source 852 * carries {@code clauseType="selectList"} (PARTITION BY) or 853 * {@code clauseType="orderby"} (OVER ORDER BY). Plain (non-windowed) 854 * aggregates have only the RelationRows fdr; the addition of PARTITION 855 * BY / OVER ORDER BY fdrs is dlineage's way of recording the analytic 856 * dependencies of a windowed call. 857 * 858 * <p>Empty {@code OVER ()} window functions are NOT discriminable here 859 * (their XML is byte-identical to a plain aggregate). Slice 13's IR 860 * builder rejects empty {@code OVER ()} so the divergence harness 861 * never sees that case in practice. 862 * 863 * <p>Probed against Oracle (slice-13 corpus); cross-vendor parity 864 * (PostgreSQL, BigQuery, SparkSQL) is unproven and deferred. 865 * 866 * <p>Slice 35 override: function names in 867 * {@link #ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} suppress an 868 * order-by-only discriminator. PostgreSQL emits a 869 * {@code clauseType="orderby"} fdr for the WITHIN GROUP ORDER BY of 870 * {@code LISTAGG(... ) WITHIN GROUP (...)}, which the slice-13 871 * discriminator would otherwise mis-classify as windowed. A 872 * {@code selectList} fdr still wins and keeps analytic/window shapes 873 * classified as windowed. 874 */ 875 private static boolean isWindowFunctionResultset(Element fnRs, Document doc) { 876 if (fnRs == null) return false; 877 String fnName = fnRs.getAttribute("name"); 878 boolean orderByWithinGroupCandidate = fnName != null 879 && ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES.contains( 880 fnName.toLowerCase(Locale.ROOT)); 881 String fnRsId = fnRs.getAttribute("id"); 882 if (fnRsId == null || fnRsId.isEmpty()) return false; 883 boolean sawOrderByDiscriminator = false; 884 NodeList rels = doc.getElementsByTagName("relationship"); 885 for (int i = 0; i < rels.getLength(); i++) { 886 Element rel = (Element) rels.item(i); 887 if (!"fdr".equals(rel.getAttribute("type"))) continue; 888 Element target = firstChildElement(rel, "target"); 889 if (target == null) continue; 890 if (!fnRsId.equals(target.getAttribute("parent_id"))) continue; 891 NodeList sources = rel.getElementsByTagName("source"); 892 for (int si = 0; si < sources.getLength(); si++) { 893 Element src = (Element) sources.item(si); 894 if (src.getParentNode() != rel) continue; 895 String ct = src.getAttribute("clauseType"); 896 if ("selectList".equals(ct)) return true; 897 if ("orderby".equals(ct)) sawOrderByDiscriminator = true; 898 } 899 } 900 return sawOrderByDiscriminator && !orderByWithinGroupCandidate; 901 } 902 903 /** 904 * Decide whether the SELECT fan-out from a function-targeted fdd should 905 * be suppressed. Two conditions must hold: 906 * 907 * <ol> 908 * <li>The function name is in {@link #NULL_ARG_AGGREGATES}. Today 909 * only {@code COUNT}; this prevents a hypothetical 910 * {@code SUM(t.*)} (some dialects allow it) from silently losing 911 * lineage.</li> 912 * <li>Either at least one source has {@code column="*"} (the 913 * {@code COUNT(*)} shape) or every source is a constant-table 914 * column ({@code COUNT(1)} / {@code COUNT(literal)}).</li> 915 * </ol> 916 */ 917 private static boolean shouldSuppressFunctionFanout(Element functionFdd, Element functionResultset) { 918 if (functionResultset == null) return false; 919 String fnName = functionResultset.getAttribute("name"); 920 if (fnName == null || fnName.isEmpty()) return false; 921 if (!NULL_ARG_AGGREGATES.contains(fnName.toLowerCase(Locale.ROOT))) return false; 922 923 boolean anyStar = false; 924 boolean anyNonConstant = false; 925 boolean anySource = false; 926 NodeList sources = functionFdd.getElementsByTagName("source"); 927 for (int si = 0; si < sources.getLength(); si++) { 928 Element src = (Element) sources.item(si); 929 if (src.getParentNode() != functionFdd) continue; 930 anySource = true; 931 if ("*".equals(src.getAttribute("column"))) { 932 anyStar = true; 933 continue; 934 } 935 String parentId = src.getAttribute("parent_id"); 936 Element parent = elementById(functionFdd.getOwnerDocument(), parentId); 937 if (parent != null && isBaseTable(parent) 938 && "constantTable".equals(parent.getAttribute("type"))) { 939 continue; 940 } 941 anyNonConstant = true; 942 } 943 if (!anySource) return false; 944 if (anyStar) return true; 945 return !anyNonConstant; 946 } 947 948 private static List<Element> findFddByTarget(Document doc, String columnId) { 949 List<Element> out = new ArrayList<>(); 950 NodeList rels = doc.getElementsByTagName("relationship"); 951 for (int i = 0; i < rels.getLength(); i++) { 952 Element rel = (Element) rels.item(i); 953 if (!"fdd".equals(rel.getAttribute("type"))) continue; 954 Element target = firstChildElement(rel, "target"); 955 if (target == null) continue; 956 if (columnId.equals(target.getAttribute("id"))) { 957 out.add(rel); 958 } 959 } 960 return out; 961 } 962 963 /** 964 * Collect parent IDs of fdd <b>and</b> fdr sources. The terminal-resultset 965 * detection treats any resultset that is referenced as a source of either 966 * data-dependency (fdd) <b>or</b> row-influence (fdr) as non-terminal. 967 * 968 * <p>Slice 23 motivation: an EXISTS subquery in JOIN ON contributes its 969 * inner SELECT-list resultset only via an {@code fdr clause="on"} edge 970 * (the inner result is plumbed into outer's RelationRows as a row- 971 * influence source, NOT as a data dependency on any outer-projected 972 * column). Without considering fdr sources, both the outer SELECT-list 973 * resultset AND the inner EXISTS-body resultset would qualify as 974 * terminal candidates, producing a false MULTIPLE_TERMINAL_SELECTS. 975 * 976 * <p>Pre-slice-23 corpus entries (FROM-subquery, scalar-subquery, 977 * set-op) all referenced inner resultsets via fdd, so the original 978 * fdd-only check sufficed for them; widening to fdr is additive and 979 * cannot mis-classify their inner resultsets as referenced when they 980 * weren't already. 981 */ 982 private static Set<String> collectFddOrFdrSourceParentIds(Document doc) { 983 Set<String> out = new HashSet<>(); 984 NodeList rels = doc.getElementsByTagName("relationship"); 985 for (int i = 0; i < rels.getLength(); i++) { 986 Element rel = (Element) rels.item(i); 987 String relType = rel.getAttribute("type"); 988 if (!"fdd".equals(relType) && !"fdr".equals(relType)) continue; 989 NodeList sources = rel.getElementsByTagName("source"); 990 for (int si = 0; si < sources.getLength(); si++) { 991 Element src = (Element) sources.item(si); 992 if (src.getParentNode() != rel) continue; 993 String parentId = src.getAttribute("parent_id"); 994 if (parentId != null && !parentId.isEmpty()) { 995 out.add(parentId); 996 } 997 } 998 } 999 return out; 1000 } 1001 1002 /** 1003 * Apply OWASP-recommended XXE-prevention features to a 1004 * {@link DocumentBuilderFactory}. Each set is best-effort: a parser 1005 * that does not expose a feature swallows the failure but the others 1006 * still apply. 1007 */ 1008 private static void applyXmlSecurity(DocumentBuilderFactory dbf) { 1009 String[] disable = { 1010 // Disallow inline DTDs entirely — strongest mitigation. 1011 "http://apache.org/xml/features/disallow-doctype-decl", 1012 }; 1013 String[] enable = { 1014 javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, 1015 }; 1016 String[] disableExt = { 1017 "http://xml.org/sax/features/external-general-entities", 1018 "http://xml.org/sax/features/external-parameter-entities", 1019 "http://apache.org/xml/features/nonvalidating/load-external-dtd", 1020 }; 1021 for (String f : disable) { 1022 try { dbf.setFeature(f, true); } catch (Exception ignore) {} 1023 } 1024 for (String f : enable) { 1025 try { dbf.setFeature(f, true); } catch (Exception ignore) {} 1026 } 1027 for (String f : disableExt) { 1028 try { dbf.setFeature(f, false); } catch (Exception ignore) {} 1029 } 1030 try { 1031 dbf.setAttribute(javax.xml.XMLConstants.ACCESS_EXTERNAL_DTD, ""); 1032 } catch (Exception ignore) {} 1033 try { 1034 dbf.setAttribute(javax.xml.XMLConstants.ACCESS_EXTERNAL_SCHEMA, ""); 1035 } catch (Exception ignore) {} 1036 dbf.setXIncludeAware(false); 1037 dbf.setExpandEntityReferences(false); 1038 } 1039 1040 private static Element elementById(Document doc, String id) { 1041 if (id == null || id.isEmpty()) return null; 1042 // The XML uses string ids on table/resultset/column elements; not 1043 // declared as XML IDs, so we lookup by attribute. 1044 for (String tag : new String[]{"table", "resultset", "column"}) { 1045 NodeList nl = doc.getElementsByTagName(tag); 1046 for (int i = 0; i < nl.getLength(); i++) { 1047 Element e = (Element) nl.item(i); 1048 if (id.equals(e.getAttribute("id"))) return e; 1049 } 1050 } 1051 return null; 1052 } 1053 1054 private static Element firstChildElement(Element parent, String tag) { 1055 NodeList nl = parent.getChildNodes(); 1056 for (int i = 0; i < nl.getLength(); i++) { 1057 Node n = nl.item(i); 1058 if (n.getNodeType() != Node.ELEMENT_NODE) continue; 1059 if (tag.equals(n.getNodeName())) return (Element) n; 1060 } 1061 return null; 1062 } 1063 1064 private static boolean isBaseTable(Element parent) { 1065 return "table".equals(parent.getTagName()); 1066 } 1067 1068 private static boolean isResultset(Element parent) { 1069 return "resultset".equals(parent.getTagName()); 1070 } 1071 1072 /** 1073 * True iff the resultset {@code type} is a candidate to be the 1074 * program's terminal resultset. {@code select_list} is the standard 1075 * shape (slice 1+). Slice 12 added the set-op result types 1076 * ({@code select_union}, {@code select_intersect}, {@code select_minus}, 1077 * {@code select_except}): dlineage emits one {@code RESULT_OF_<OP>-N} 1078 * resultset of the matching type per top-level set-op program; that 1079 * resultset's columns have one fdd edge each with N sources (one per 1080 * branch's per-position {@code select_list} column), and the existing 1081 * {@code projectColumn} BFS walks those fdd chains transparently to 1082 * reach base-table terminals. {@code select_except} (PostgreSQL / 1083 * SQL Server EXCEPT) is distinct from {@code select_minus} (Oracle / 1084 * Spark / Hive MINUS) at the dlineage layer even though they are 1085 * semantically equivalent. 1086 */ 1087 private static boolean isTerminalResultsetType(String type) { 1088 if (type == null) return false; 1089 return "select_list".equals(type) 1090 || "select_union".equals(type) 1091 || "select_intersect".equals(type) 1092 || "select_minus".equals(type) 1093 || "select_except".equals(type); 1094 } 1095 1096 private static EdgeRole clauseTypeToRole(String clauseType, String relClause) { 1097 if ("where".equals(clauseType) || "where".equals(relClause)) return EdgeRole.FILTER; 1098 if ("joinCondition".equals(clauseType) || "on".equals(relClause)) return EdgeRole.JOIN; 1099 return null; 1100 } 1101 1102 /** Result of projecting one column: terminals plus the aggregate flag. */ 1103 static final class BfsResult { 1104 final List<TableColumn> terminals = new ArrayList<>(); 1105 boolean aggregate = false; 1106 } 1107 1108 /** Lower-cased base (table, column) pair. */ 1109 static final class TableColumn { 1110 final String table; 1111 final String column; 1112 TableColumn(String table, String column) { 1113 this.table = table; 1114 this.column = column; 1115 } 1116 } 1117}