001package gudusoft.gsqlparser.ir.semantic.builder; 002 003import gudusoft.gsqlparser.EBoundaryType; 004import gudusoft.gsqlparser.EDbObjectType; 005import gudusoft.gsqlparser.EDbVendor; 006import gudusoft.gsqlparser.EExpressionType; 007import gudusoft.gsqlparser.EJoinType; 008import gudusoft.gsqlparser.ELimitRowType; 009import gudusoft.gsqlparser.EPseudoTableType; 010import gudusoft.gsqlparser.ESetOperatorType; 011import gudusoft.gsqlparser.EUniqueRowFilterType; 012import gudusoft.gsqlparser.ir.semantic.ColumnRef; 013import gudusoft.gsqlparser.ir.semantic.Diagnostic; 014import gudusoft.gsqlparser.ir.semantic.DiagnosticCode; 015import gudusoft.gsqlparser.ir.semantic.FrameBound; 016import gudusoft.gsqlparser.nodes.TParseTreeNode; 017import gudusoft.gsqlparser.ir.semantic.LineageEdge; 018import gudusoft.gsqlparser.ir.semantic.LineageRef; 019import gudusoft.gsqlparser.ir.semantic.OutputColumn; 020import gudusoft.gsqlparser.ir.semantic.RelationKind; 021import gudusoft.gsqlparser.ir.semantic.RelationSource; 022import gudusoft.gsqlparser.ir.semantic.RowLimit; 023import gudusoft.gsqlparser.ir.semantic.RowLimitKind; 024import gudusoft.gsqlparser.ir.semantic.SemanticProgram; 025import gudusoft.gsqlparser.ir.semantic.SetOperator; 026import gudusoft.gsqlparser.ir.semantic.StatementGraph; 027import gudusoft.gsqlparser.ir.semantic.TargetRelation; 028import gudusoft.gsqlparser.ir.semantic.WindowFrame; 029import gudusoft.gsqlparser.ir.semantic.WindowSpec; 030import gudusoft.gsqlparser.ir.semantic.binding.ColumnBinding; 031import gudusoft.gsqlparser.ir.semantic.binding.FromSubqueryNaming; 032import gudusoft.gsqlparser.ir.semantic.binding.NameBindingProvider; 033import gudusoft.gsqlparser.ir.semantic.binding.RelationBinding; 034import gudusoft.gsqlparser.ir.semantic.binding.UsingScope; 035import gudusoft.gsqlparser.nodes.TCTE; 036import gudusoft.gsqlparser.nodes.TCTEList; 037import gudusoft.gsqlparser.nodes.TExpression; 038import gudusoft.gsqlparser.nodes.TExpressionList; 039import 
gudusoft.gsqlparser.nodes.TFetchFirstClause; 040import gudusoft.gsqlparser.nodes.TFunctionCall; 041import gudusoft.gsqlparser.nodes.TLimitClause; 042import gudusoft.gsqlparser.nodes.TOffsetClause; 043import gudusoft.gsqlparser.nodes.TTopClause; 044import gudusoft.gsqlparser.nodes.TGroupBy; 045import gudusoft.gsqlparser.nodes.TGroupByItem; 046import gudusoft.gsqlparser.nodes.TGroupByItemList; 047import gudusoft.gsqlparser.nodes.TJoin; 048import gudusoft.gsqlparser.nodes.TJoinItem; 049import gudusoft.gsqlparser.nodes.TJoinItemList; 050import gudusoft.gsqlparser.nodes.TJoinList; 051import gudusoft.gsqlparser.nodes.TObjectName; 052import gudusoft.gsqlparser.nodes.TObjectNameList; 053import gudusoft.gsqlparser.nodes.TOrderBy; 054import gudusoft.gsqlparser.nodes.TOutputClause; 055import gudusoft.gsqlparser.nodes.TReturningClause; 056import gudusoft.gsqlparser.nodes.TOrderByItem; 057import gudusoft.gsqlparser.nodes.TOrderByItemList; 058import gudusoft.gsqlparser.nodes.TParseTreeVisitor; 059import gudusoft.gsqlparser.nodes.TPartitionClause; 060import gudusoft.gsqlparser.nodes.TResultColumn; 061import gudusoft.gsqlparser.nodes.TResultColumnList; 062import gudusoft.gsqlparser.nodes.TSelectDistinct; 063import gudusoft.gsqlparser.nodes.TTable; 064import gudusoft.gsqlparser.nodes.TWhereClause; 065import gudusoft.gsqlparser.nodes.TWindowDef; 066import gudusoft.gsqlparser.nodes.TWithinGroup; 067import gudusoft.gsqlparser.nodes.TWindowFrame; 068import gudusoft.gsqlparser.nodes.TWindowFrameBoundary; 069import gudusoft.gsqlparser.resolver2.ResolutionStatus; 070import gudusoft.gsqlparser.EInsertSource; 071import gudusoft.gsqlparser.nodes.TColumnDefinition; 072import gudusoft.gsqlparser.nodes.TColumnDefinitionList; 073import gudusoft.gsqlparser.nodes.TViewAliasClause; 074import gudusoft.gsqlparser.nodes.TViewAliasItem; 075import gudusoft.gsqlparser.nodes.TViewAliasItemList; 076import gudusoft.gsqlparser.stmt.TCreateTableSqlStatement; 077import 
gudusoft.gsqlparser.stmt.TCreateViewSqlStatement; 078import gudusoft.gsqlparser.stmt.TDeleteSqlStatement; 079import gudusoft.gsqlparser.stmt.TInsertSqlStatement; 080import gudusoft.gsqlparser.stmt.TMergeSqlStatement; 081import gudusoft.gsqlparser.stmt.TSelectSqlStatement; 082import gudusoft.gsqlparser.stmt.TUpdateSqlStatement; 083import gudusoft.gsqlparser.nodes.TMergeWhenClause; 084import gudusoft.gsqlparser.nodes.TMergeUpdateClause; 085import gudusoft.gsqlparser.nodes.TMergeInsertClause; 086 087import java.util.ArrayDeque; 088import java.util.ArrayList; 089import java.util.Collections; 090import java.util.Deque; 091import java.util.EnumSet; 092import java.util.HashMap; 093import java.util.HashSet; 094import java.util.IdentityHashMap; 095import java.util.LinkedHashMap; 096import java.util.LinkedHashSet; 097import java.util.List; 098import java.util.Locale; 099import java.util.Map; 100import java.util.Set; 101 102/** 103 * Builds a {@link SemanticProgram} from a parsed and resolved 104 * {@link TSelectSqlStatement}. 105 * 106 * <p>Current scope (after slice 9): SELECT with one or more base-table or 107 * CTE sources, optional WHERE, optional JOIN of base tables with ON 108 * conditions, optional GROUP BY (slice 6), optional WITH clause including 109 * chained CTEs (each CTE sees the ones declared strictly before it), 110 * optional FROM-clause subquery (slice 5), optional row-deduplication via 111 * {@code SELECT DISTINCT} or Oracle's {@code SELECT UNIQUE} synonym 112 * (slice 8 — see {@link StatementGraph#isDistinct()}), optional ORDER BY 113 * over physical column references or column-bearing expressions 114 * (slice 9 — see {@link StatementGraph#getOrderByColumnRefs()}). 115 * Expression projections like {@code salary * 2 AS doubled} or 116 * {@code a.x + a.y} are accepted and marked 117 * {@link OutputColumn#isDerived()}; aggregate function calls (slice 6) 118 * are flagged via {@link OutputColumn#isAggregate()}. 
119 * 120 * <p>Slice 9 lifts {@code ORDER BY} for sort keys that are physical 121 * column references or expressions over them. The collected references 122 * surface as {@link StatementGraph#getOrderByColumnRefs()}. Sort 123 * direction ({@code ASC}/{@code DESC}) and null placement 124 * ({@code NULLS FIRST}/{@code NULLS LAST}) are presentation metadata 125 * and are not modelled. Ordinal forms ({@code ORDER BY 1}) and 126 * projection-alias forms ({@code SELECT id AS x ... ORDER BY x}) are 127 * rejected so the dependency information is never silently lost; a 128 * later slice can model output-position references explicitly. The 129 * canonical lineage model (slice 7) deliberately ignores ORDER BY — 130 * sort order changes presentation, not column dependency or row-set 131 * membership. 132 * 133 * <p>Row-limit clauses ({@code LIMIT}, {@code TOP}, {@code OFFSET}, 134 * {@code FETCH FIRST}) are rejected statement-wide, including the 135 * SQL Server-style {@code ORDER BY ... OFFSET ... FETCH NEXT}. With 136 * a row-limit present, {@code ORDER BY} ceases to be presentation-only 137 * and starts deciding which rows survive — the canonical-model 138 * exclusion would no longer be sound, so the entire statement is out 139 * of scope until a future slice models row-limit semantics. 140 * 141 * <p>Slice 10 lifts {@code HAVING}: the predicate's column references 142 * are collected into {@link StatementGraph#getHavingColumnRefs()} via 143 * {@link #buildHavingColumnRefs}. The same visitor pattern as projection 144 * and ORDER BY rejects subqueries (scalar, EXISTS, IN-SELECT, ANY/ALL/ 145 * SOME) and window functions before {@link #collectColumnRefs} runs, so 146 * inner-scope refs never leak. HAVING without GROUP BY is supported (the 147 * parser still attaches a {@link TGroupBy} node with empty items). 148 * HAVING is row-influence semantically but does not contribute to the 149 * canonical lineage model — see 150 * {@link StatementGraph#getHavingColumnRefs()} for why. 
151 * 152 * <p>Slice 11 lifted uncorrelated scalar subqueries in projection; 153 * scalar bodies are extracted as their own statements via 154 * {@link #extractScalarSubqueriesAsStatements} with the synthetic-name 155 * convention {@code <scalar_subquery_<index>>}. 156 * 157 * <p>Slice 12 lifts set operations (UNION / UNION ALL / INTERSECT / 158 * INTERSECT ALL / MINUS / MINUS ALL / EXCEPT / EXCEPT ALL) at the top 159 * level and as CTE bodies. Each branch becomes its own 160 * {@link StatementGraph} with synthetic name 161 * {@code <set_op_branch_<index>>}; the outer set-op statement carries 162 * empty {@code relations} and lineage edges fan out per-position to 163 * each branch. The flatten descends the left-leaning AST iteratively 164 * (per CLAUDE.md — no recursion on {@code leftStmt}/{@code rightStmt}). 165 * See {@link #buildSetOpProgram}. 166 * 167 * <p>Slice 22 lifts window-function frame clauses 168 * ({@code ROWS}/{@code RANGE}/{@code GROUPS BETWEEN ...}); the frame 169 * unit, start bound, and optional end bound are captured in 170 * {@link WindowFrame} hung off 171 * {@link WindowSpec#getFrame()}. Frame info is presentation-only 172 * (dlineage XML harvests no frame information) and does NOT contribute 173 * to the canonical lineage model — same status as slice-13's 174 * PARTITION BY / OVER ORDER BY refs. Per-bound EXCLUDE clauses 175 * (Netezza-reachable) and non-constant offsets (PG 176 * {@code simple_object_name_t}, ANSI {@code parenthesis_t}) are still 177 * rejected. 
178 * 179 * <p>Still rejected: {@code WITH RECURSIVE}, {@code DISTINCT ON (...)} 180 * and other non-{@code DISTINCT}/{@code UNIQUE} row-filters, 181 * scalar-body constant-only projections (zero column refs), 182 * correlated scalar subqueries, scalar bodies with 183 * subqueries in WHERE/JOIN ON/GROUP BY, multi-column scalar inner, 184 * scalar subqueries embedded in larger projection expressions including 185 * EXISTS-in-projection, embedded window functions in larger projection 186 * expressions, window functions in scalar-subquery bodies, window 187 * functions in WHERE/JOIN ON/GROUP BY/HAVING/ORDER BY, empty 188 * {@code OVER ()}, frame clauses with non-constant offsets (PG 189 * {@code simple_object_name_t}, ANSI {@code parenthesis_t}), frame 190 * {@code EXCLUDE} clauses (Netezza-reachable), named windows, 191 * vendor-specific window extensions ({@code FILTER (WHERE ...)}, 192 * {@code WITHIN GROUP}, 193 * {@code KEEP DENSE_RANK}, Hive {@code DISTRIBUTE BY}/{@code CLUSTER BY}/ 194 * {@code SORT BY}/{@code PARTITION BY ... SORT (...)}), non-physical 195 * {@code PARTITION BY} / OVER {@code ORDER BY} refs (literals, 196 * subqueries, function calls, expressions, expression-alias references), 197 * window function names outside the slice-13 allowlist, 198 * (slice 63 lifts explicit {@code CROSS JOIN}, slice 64 lifts 199 * {@code JOIN ... USING (...)}, and slice 66 lifts {@code NATURAL JOIN} 200 * at outer / CTE-body / FROM-subquery-body call sites; all three stay 201 * rejected inside scalar / set-op-branch / set-op-CTE / predicate bodies; 202 * NATURAL additionally requires resolvable catalog metadata on both 203 * sides, with a side-specific reject otherwise), duplicate aliases, 204 * Oracle 205 * {@code ORDER SIBLINGS BY}, Teradata {@code ORDER BY ... 
RESET WHEN}, 206 * row-limit clauses, ORDER BY ordinals/aliases, Teradata {@code QUALIFY} 207 * clause, set operations nested in FROM-subquery / scalar bodies, 208 * mixed-operator and mixed-{@code _ALL} set-op chains, set-op outer 209 * ORDER BY / row-limit clauses, set-op internal-node modifiers, branch 210 * column-count mismatch, set-op branches with FROM-subquery / scalar 211 * projection / their own CTE list, nested WITH on set-op CTE body. The 212 * builder fails fast outside this scope so callers see the unsupported 213 * case immediately rather than receiving a half-built IR. 214 */ 215public final class SemanticIRBuilder { 216 217 /** 218 * Reserved name prefix for synthetic scalar-subquery body 219 * statements (slice 11). Names take the form 220 * {@code "<scalar_subquery_<index>>"}; the angle brackets ensure 221 * no collision with real CTE names or FROM-clause aliases. 222 * {@link #isScalarSyntheticName(String)} is the only authorised 223 * detector — both this builder and 224 * {@code SemanticIRProjector.BodyIndexes} use it so the convention 225 * lives in one place. 226 */ 227 public static final String SCALAR_BODY_PREFIX = "<scalar_subquery_"; 228 229 /** 230 * Strict regex for synthetic scalar-subquery-body names. Format is 231 * exactly {@code <scalar_subquery_<digits>>} — pinning the digits 232 * suffix and the closing angle bracket prevents a real (quoted) 233 * CTE alias that happens to begin with the prefix from being 234 * misclassified as a synthetic name and silently skipped by 235 * {@code BodyIndexes}. 236 */ 237 private static final java.util.regex.Pattern SCALAR_NAME_PATTERN = 238 java.util.regex.Pattern.compile("<scalar_subquery_\\d+>"); 239 240 /** 241 * True iff {@code name} is a synthetic scalar-subquery-body name 242 * created by this builder (slice 11). 
Used by 243 * {@code SemanticIRProjector.BodyIndexes} to skip such bodies when 244 * building the CTE/FROM-subquery name lookup tables — scalar 245 * bodies are reached only via lineage edges, never via relations. 246 * 247 * <p>The match is strict: the name must be the full reserved 248 * pattern {@code <scalar_subquery_<digits>>}. A real CTE alias 249 * that happens to start with {@code <scalar_subquery_} but 250 * doesn't match the digits-and-closing-bracket suffix is NOT 251 * skipped (codex impl-review round-1 SHOULD 2). 252 */ 253 public static boolean isScalarSyntheticName(String name) { 254 return name != null && SCALAR_NAME_PATTERN.matcher(name).matches(); 255 } 256 257 /** 258 * Reserved name prefix for synthetic set-op-branch body statements 259 * (slice 12). Names take the form {@code "<set_op_branch_<index>>"}; 260 * the angle brackets ensure no collision with real CTE names or 261 * FROM-clause aliases. {@link #isSetOpBranchSyntheticName(String)} is 262 * the only authorised detector — both this builder and 263 * {@code SemanticIRProjector.BodyIndexes} use it so the convention 264 * lives in one place (slice-11 process lesson #10 generalised). 265 */ 266 public static final String SET_OP_BRANCH_PREFIX = "<set_op_branch_"; 267 268 /** 269 * Strict regex for synthetic set-op-branch-body names. Format is 270 * exactly {@code <set_op_branch_<digits>>} — pinning the digits 271 * suffix and the closing angle bracket prevents a real (quoted) CTE 272 * alias that happens to begin with the prefix from being 273 * misclassified as a synthetic name and silently skipped by 274 * {@code BodyIndexes}. 275 */ 276 private static final java.util.regex.Pattern SET_OP_BRANCH_NAME_PATTERN = 277 java.util.regex.Pattern.compile("^<set_op_branch_\\d+>$"); 278 279 /** 280 * True iff {@code name} is a synthetic set-op-branch-body name 281 * created by this builder (slice 12). 
Used by 282 * {@code SemanticIRProjector.BodyIndexes} to skip such bodies when 283 * building the CTE/FROM-subquery name lookup tables — set-op 284 * branches are reached only via lineage edges, never via relations. 285 * 286 * <p>The match is strict: the name must be the full reserved 287 * pattern {@code <set_op_branch_<digits>>}. 288 */ 289 public static boolean isSetOpBranchSyntheticName(String name) { 290 return name != null && SET_OP_BRANCH_NAME_PATTERN.matcher(name).matches(); 291 } 292 293 /** 294 * Reserved name prefix for synthetic predicate-subquery body statements 295 * (slice 23 — uncorrelated EXISTS extracted from outer-SELECT JOIN ON). 296 * Names take the form {@code "<predicate_subquery_<index>>"}; the angle 297 * brackets ensure no collision with real CTE names or FROM-clause aliases. 298 * {@link #isPredicateSubquerySyntheticName(String)} is the only authorised 299 * detector — both this builder and {@code SemanticIRProjector.BodyIndexes} 300 * use it so the convention lives in one place. 301 */ 302 public static final String PREDICATE_BODY_PREFIX = "<predicate_subquery_"; 303 304 /** 305 * Strict regex for synthetic predicate-subquery-body names. Format is 306 * exactly {@code <predicate_subquery_<digits>>}; pinning the digit suffix 307 * and the closing angle bracket prevents a real (quoted) CTE alias that 308 * happens to begin with the prefix from being misclassified as a synthetic 309 * name and silently skipped by {@code BodyIndexes}. 310 */ 311 private static final java.util.regex.Pattern PREDICATE_BODY_NAME_PATTERN = 312 java.util.regex.Pattern.compile("<predicate_subquery_\\d+>"); 313 314 /** 315 * True iff {@code name} is a synthetic predicate-subquery-body name 316 * created by this builder (slice 23). Used by 317 * {@code SemanticIRProjector.BodyIndexes} to skip such bodies when 318 * building the CTE/FROM-subquery name lookup tables — predicate-subquery 319 * bodies are unreachable from outer (no relation edge, no lineage edge). 
320 */ 321 public static boolean isPredicateSubquerySyntheticName(String name) { 322 return name != null && PREDICATE_BODY_NAME_PATTERN.matcher(name).matches(); 323 } 324 325 /** 326 * Aggregate function names recognized by the builder's per-output 327 * aggregate flag detection (slice 6 originated; slice 29 / slice 30 328 * extended). Treated as case-insensitive. Callers should go through 329 * {@link #isAggregateFunction(TExpression)} rather than reading this 330 * set directly. 331 * 332 * <p>Slice-29 extensions: dialect aggregates {@code listagg}, 333 * {@code string_agg}, {@code group_concat}, {@code array_agg}. 334 * Slice-30 extension: {@code mode} (PostgreSQL ordered-set aggregate; 335 * admitted via the slice-29 WITHIN GROUP path under 336 * {@code findUnsupportedWithinGroupFunctionName}). Slice 30 also 337 * removes {@code mode} from {@link #WINDOW_FUNCTION_NAMES} via an 338 * explicit {@code s.remove("mode")} so the slice-13 window allowlist 339 * isn't widened — see {@link #WINDOW_FUNCTION_NAMES} JavaDoc and 340 * {@code DlineageXmlProjector.ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} for 341 * the matching window-vs-aggregate discriminator override. 342 */ 343 private static final Set<String> AGGREGATE_FUNCTION_NAMES; 344 static { 345 Set<String> s = new HashSet<>(); 346 s.add("count"); 347 s.add("sum"); 348 s.add("avg"); 349 s.add("min"); 350 s.add("max"); 351 s.add("stddev"); 352 s.add("variance"); 353 s.add("var_samp"); 354 s.add("var_pop"); 355 s.add("stddev_samp"); 356 s.add("stddev_pop"); 357 // Common dialect-specific aggregates so the flag has fewer false negatives. 358 s.add("listagg"); // Oracle, PostgreSQL 16+ 359 s.add("string_agg"); // PostgreSQL, SQL Server 360 s.add("group_concat"); // MySQL 361 s.add("array_agg"); // PostgreSQL, Snowflake, BigQuery 362 // Slice 30: PostgreSQL ordered-set aggregate. 
Unlike percentile_cont / 363 // percentile_disc / rank-family, mode() has no documented window form 364 // in any GSP-supported vendor; admitting it lets the WITHIN GROUP path 365 // accept it in JOIN ON predicate subqueries (slice 29 lift extension) 366 // AND lets DlineageXmlProjector mark its output aggregate=true. 367 // Defensive: WINDOW_FUNCTION_NAMES below subtracts mode after 368 // s.addAll(AGGREGATE_FUNCTION_NAMES) so mode() OVER (...) stays 369 // rejected by the slice-13 window allowlist. 370 s.add("mode"); // PostgreSQL ordered-set aggregate (slice 30) 371 AGGREGATE_FUNCTION_NAMES = Collections.unmodifiableSet(s); 372 } 373 374 /** 375 * Slice 42: hypothetical-set ordered-set aggregate function names that 376 * are ALSO valid window functions. Unlike {@link #AGGREGATE_FUNCTION_NAMES} 377 * these names are admitted as aggregates ONLY when the call carries a 378 * {@link #isWithinGroupOnlyWindowDef WITHIN-GROUP-only} 379 * {@link TWindowDef} attachment (Oracle / SQL Server parser style — 380 * {@code RANK(100) WITHIN GROUP (ORDER BY x.id)} produces 381 * {@code fn.getWindowDef()!=null}, {@code wd.isIncludingOverClause()== 382 * false}, {@code wd.getWithinGroup()!=null}). Any other shape — direct 383 * {@code fn.getWithinGroup()} (PG / Snowflake style), 384 * {@code fn.getWindowDef()} with {@code OVER (...)}, or no attachment 385 * at all — keeps the existing window-function classification. 386 * 387 * <p>The set is intentionally NOT merged into 388 * {@link #AGGREGATE_FUNCTION_NAMES} because that would also lift the PG 389 * direct-attachment hypothetical-set form ({@code rank(0.5) WITHIN GROUP 390 * (ORDER BY x.salary)}). 
Pre-plan probe ({@code /tmp/probe42/Probe42.java}) 391 * confirmed PG dlineage XML for hypothetical-set is structurally 392 * indistinguishable from {@code rank() OVER (ORDER BY x)} (both emit 393 * {@code clauseType="orderby"} fdr) — admitting PG hypothetical-set 394 * would manufacture an {@code AGGREGATION_MISMATCH} divergence on the 395 * windowed form because the projector's 396 * {@code DlineageXmlProjector.isWindowFunctionResultset} cannot tell 397 * the two forms apart on PG. 398 * 399 * <p>The Oracle / MSSQL hypothetical-set form, by contrast, emits 400 * neither a {@code clauseType="orderby"} fdr nor a 401 * {@code clauseType="selectList"} fdr (probe-confirmed) — so the 402 * projector's slice-13 windowed-vs-aggregate discriminator returns 403 * {@code false} and the matching projector-side 404 * {@code AGGREGATE_FUNCTION_NAMES} entry marks the output aggregate. 405 * Their OVER form ({@code RANK() OVER (ORDER BY x.id)}) emits 406 * {@code clauseType="orderby"} as expected and stays correctly 407 * classified as windowed. 408 * 409 * <p>Vendor-gated to Oracle / MSSQL inside 410 * {@link #isAdmittedTopLevelWithinGroupAggregate} and 411 * {@link #findUnsupportedWithinGroupFunctionName}; the PG 412 * direct-attachment shape never satisfies the 413 * {@link #isWithinGroupOnlyWindowDef} predicate (because PG sets 414 * {@code fn.getWindowDef()==null}) so the carve-out cannot accidentally 415 * fire on PG. 
416 */ 417 private static final Set<String> HYPOTHETICAL_SET_AGGREGATE_NAMES; 418 static { 419 Set<String> s = new HashSet<>(); 420 s.add("rank"); 421 s.add("dense_rank"); 422 s.add("percent_rank"); 423 s.add("cume_dist"); 424 HYPOTHETICAL_SET_AGGREGATE_NAMES = Collections.unmodifiableSet(s); 425 } 426 427 /** 428 * Slice 42: true iff {@code fn} is an Oracle / MSSQL hypothetical-set 429 * ordered-set aggregate call shape — {@code RANK} / {@code DENSE_RANK} / 430 * {@code PERCENT_RANK} / {@code CUME_DIST} with 431 * {@link #isWithinGroupOnlyWindowDef WITHIN-GROUP-only} 432 * {@link TWindowDef} attachment. Used both as a name-whitelist 433 * discriminator (so PG direct {@code fn.getWithinGroup()} cannot 434 * accidentally pass through, since PG sets {@code fn.getWindowDef()== 435 * null}) and as the {@link #isAggregateFunction} carve-out trigger. 436 */ 437 private static boolean isHypotheticalSetWithinGroupCall(TFunctionCall fn) { 438 if (fn == null) return false; 439 if (!isWithinGroupOnlyWindowDef(fn.getWindowDef())) return false; 440 if (fn.getFunctionName() == null) return false; 441 String name = fn.getFunctionName().toString(); 442 if (name == null || name.isEmpty()) return false; 443 return HYPOTHETICAL_SET_AGGREGATE_NAMES.contains( 444 name.toLowerCase(Locale.ROOT)); 445 } 446 447 /** 448 * Predicate-bearing join types accepted by the current builder. 449 * Slice 64: each must carry either an ON condition or a USING 450 * clause; the per-key {@code joinColumnRefs} emission happens in 451 * {@link #buildRelations} for USING and via 452 * {@link #collectColumnRefs} for ON. NATURAL, semi/anti, 453 * vendor-specific joins, and nested-join sources stay rejected so 454 * the IR cannot quietly drop a row-set predicate. The unqualified 455 * output-naming case for USING merged keys is deferred to S65. 
456 */ 457 private static final EnumSet<EJoinType> ALLOWED_PREDICATE_JOIN_TYPES = EnumSet.of( 458 EJoinType.inner, 459 EJoinType.left, 460 EJoinType.right, 461 EJoinType.full, 462 EJoinType.fullouter, 463 EJoinType.leftouter, 464 EJoinType.rightouter, 465 EJoinType.join 466 ); 467 468 /** 469 * Slice 63 — join types admitted by the builder but that must NOT 470 * carry an ON or USING clause. Currently just {@code CROSS}; the 471 * tier exists so that future ON-less shapes can join the same path 472 * with the same shape contract. Slice 66 added a separate 473 * {@link #NATURAL_JOIN_TYPES} tier because NATURAL has its own 474 * catalog-required reject path that CROSS does not. 475 */ 476 private static final EnumSet<EJoinType> ALLOWED_ON_LESS_JOIN_TYPES = EnumSet.of( 477 EJoinType.cross 478 ); 479 480 /** 481 * Slice 66 — NATURAL join types. Each MUST NOT carry an ON or USING 482 * clause. Each MUST have resolvable catalog metadata on BOTH sides; 483 * a missing-catalog reject fires inside {@link #buildRelations} 484 * with a side-specific diagnostic. The shared-column list is 485 * inferred from the running {@link LeftOutputState} ∩ right's 486 * catalog and feeds into {@link #emitMergedJoinRefs} the same way 487 * a syntactically-declared USING list does. 
488 */ 489 private static final EnumSet<EJoinType> NATURAL_JOIN_TYPES = EnumSet.of( 490 EJoinType.natural, 491 EJoinType.natural_inner, 492 EJoinType.natural_left, 493 EJoinType.natural_right, 494 EJoinType.natural_leftouter, 495 EJoinType.natural_rightouter, 496 EJoinType.natural_full, 497 EJoinType.natural_fullouter 498 ); 499 500 private static boolean isNaturalJoinType(EJoinType jt) { 501 return jt != null && NATURAL_JOIN_TYPES.contains(jt); 502 } 503 504 private SemanticIRBuilder() {} 505 506 public static SemanticProgram build(TSelectSqlStatement select, NameBindingProvider provider) { 507 if (select == null) { 508 throw new IllegalArgumentException("select must not be null"); 509 } 510 if (provider == null) { 511 throw new IllegalArgumentException("provider must not be null"); 512 } 513 List<StatementGraph> stmts = new ArrayList<>(); 514 List<LineageEdge> lineage = new ArrayList<>(); 515 Map<String, Integer> cteNameToStatementIndex = new HashMap<>(); 516 Map<String, List<String>> ctePublishedColumns = new HashMap<>(); 517 518 TCTEList cteList = select.getCteList(); 519 boolean hasOuterCteList = cteList != null && cteList.size() > 0; 520 521 // Slice 108 Phase 0 — extract inline SELECT-side CTE walker into 522 // buildSelectCteList helper. Phase 0 is behaviour-preserving: the 523 // helper called with allowShadowOverride=false and 524 // additionalAllCteNames=null reproduces the prior inline walker. 525 // Phase 1 (shadow override) and Phase 3 (mixed outer+inner WITH on 526 // INSERT) reuse the same helper from buildInsert. 527 buildSelectCteList(cteList, provider, stmts, lineage, 528 cteNameToStatementIndex, ctePublishedColumns, 529 /*allowShadowOverride=*/ false, 530 /*additionalAllCteNames=*/ null); 531 532 // Slice 108 Phase 0 — extract outer-SELECT processing into 533 // buildSelectBodyAfterCteWalk helper. 
The hasOuterCteListAlreadyProcessed 534 // flag is passed explicitly so the buildInsert shadow path can null 535 // both AST CTE lists before calling and still claim "CTEs already 536 // walked" (round-2 codex BLOCKER 4 fix). 537 buildSelectBodyAfterCteWalk(select, provider, stmts, lineage, 538 cteNameToStatementIndex, ctePublishedColumns, 539 /*hasOuterCteListAlreadyProcessed=*/ hasOuterCteList); 540 541 return new SemanticProgram(stmts, lineage); 542 } 543 544 /** 545 * Slice 108 — walk a SELECT-side WITH clause and append each CTE body 546 * to {@code stmts} as a preceding statement. Extracted from the inline 547 * walker that previously lived in {@link #build} (lines ~516–663 pre- 548 * slice-108). Mirrors the slice-101 {@link #buildMergeCteList}, 549 * slice-105 {@link #buildUpdateCteList}, and slice-106 550 * {@link #buildDeleteCteList} helpers. 551 * 552 * <p>Phase 0 (behaviour-preserving refactor): {@code allowShadowOverride 553 * = false} and {@code additionalAllCteNames = null} reproduce the 554 * pre-slice-108 inline walker byte-for-byte. 555 * 556 * <p>Phase 1 (shadow admit): {@code allowShadowOverride = true} enables 557 * the mixed outer+inner WITH on INSERT shadow case (slice 108). When 558 * called from {@link #buildInsert}, the OUTER pass runs with 559 * {@code allowShadowOverride=false}, populating 560 * {@code cteNameToStatementIndex} and {@code ctePublishedColumns} with 561 * outer CTE bindings. The INNER pass then runs with 562 * {@code allowShadowOverride=true} and 563 * {@code additionalAllCteNames=outerAllNames}. 
The inner pass:
     * <ul>
     *   <li>uses a fresh local {@code localVisibleSoFar} for intra-list
     *       duplicate detection (so {@code DUPLICATE_CTE_NAME} still
     *       fires for inner {@code x, x} even when outer also declares
     *       {@code x});</li>
     *   <li>snapshots {@code cteNameToStatementIndex.keySet()} at entry
     *       into {@code outerKeysSnapshot}; the union
     *       {@code outerKeysSnapshot ∪ localVisibleSoFar} drives BOTH
     *       {@link #rejectForwardCteReferences} AND
     *       {@link NameBindingProvider#withCteContext} (round-2 codex
     *       BLOCKER 3 fix — keeps inner-y references to outer-x from
     *       being falsely flagged as forward references);</li>
     *   <li>on collision with an outer entry (after a successful body
     *       build), {@link Map#put} overrides the
     *       {@code cteNameToStatementIndex} and {@code ctePublishedColumns}
     *       entries so the source SELECT sees the INNER body. The OUTER
     *       body stays in {@code stmts[]} at its earlier position; its
     *       cteMap entry is just no longer referenced by name (PG nested-
     *       WITH inner-shadows-outer semantics).</li>
     * </ul>
     *
     * <p>{@code additionalAllCteNames} is unioned into the per-call
     * {@code allCteNames} that {@link #rejectForwardCteReferences} consults
     * (round-1 codex BLOCKER 2 fix — keeps each scope's forward-ref check
     * narrow so an outer CTE body referencing a base-table whose name
     * happens to coincide with an inner CTE name does NOT falsely flag).
     *
     * <p>Note: {@code outerKeysSnapshot} is only taken when
     * {@code allowShadowOverride} is {@code true}. When it is {@code false}
     * the visible set handed to each CTE body is {@code localVisibleSoFar}
     * alone, regardless of what {@code cteNameToStatementIndex} already holds.
     *
     * @param cteList the WITH list to walk; {@code null} or empty is a no-op
     * @param provider base binding provider; narrowed per CTE body via
     *        {@link NameBindingProvider#withCteContext}
     * @param stmts statement list that each built CTE body (and any bodies
     *        extracted on its behalf) is appended to
     * @param lineage lineage-edge list, appended to as bodies are built
     * @param cteNameToStatementIndex lower-cased CTE name → index in
     *        {@code stmts}; receives one {@code put} per CTE in the list
     * @param ctePublishedColumns lower-cased CTE name → published column
     *        names; receives one {@code put} per CTE in the list
     * @param allowShadowOverride when {@code true}, names already present in
     *        {@code cteNameToStatementIndex} at entry are visible to every
     *        body in this list
     * @param additionalAllCteNames extra names for the forward-reference
     *        check; may be {@code null} or empty
     * @throws SemanticIRBuildException with
     *         {@code DUPLICATE_CTE_NAME} when a name repeats (case-insensitively)
     *         within {@code cteList}; other rejections are raised by the
     *         helpers this method calls
     */
    private static void buildSelectCteList(
            TCTEList cteList,
            NameBindingProvider provider,
            List<StatementGraph> stmts,
            List<LineageEdge> lineage,
            Map<String, Integer> cteNameToStatementIndex,
            Map<String, List<String>> ctePublishedColumns,
            boolean allowShadowOverride,
            Set<String> additionalAllCteNames) {
        if (cteList == null || cteList.size() == 0) {
            return;
        }
        rejectRecursiveCtes(cteList);

        // Per-call allCteNames for rejectForwardCteReferences. The optional
        // additionalAllCteNames extends this scope (Phase 1: outer names
        // visible to inner CTE body forward-ref checks). Phase 0 path passes
        // null, so this is just collectCteNames(cteList).
        Set<String> allCteNames;
        if (additionalAllCteNames != null && !additionalAllCteNames.isEmpty()) {
            // Copy before adding so the set returned by collectCteNames is
            // never mutated here.
            allCteNames = new HashSet<>(collectCteNames(cteList));
            allCteNames.addAll(additionalAllCteNames);
        } else {
            allCteNames = collectCteNames(cteList);
        }

        // Phase 1: snapshot outer-scope CTE names at entry so subsequent
        // iterations of this list always see the FULL outer scope for
        // forward-ref classification and withCteContext, even if a shadow
        // override later overwrites a name's cteMap entry.
        Set<String> outerKeysSnapshot = allowShadowOverride
                ? new HashSet<>(cteNameToStatementIndex.keySet())
                : null;

        // Build each CTE body left-to-right. Each CTE sees CTEs declared
        // strictly before it (standard SQL chain semantics, slice 4).
        // Slice 18: CTE bodies accept FROM-subqueries (mirroring the
        // outer-SELECT extraction path) AND scalar-subquery projections
        // (slice 11): for each CTE body, FROM-subqueries are extracted
        // first, then scalar bodies, then the CTE body is built/appended.
        // The per-CTE-body subqueryAliasToIndex is local to the iteration
        // so different CTE bodies cannot collide on FROM-subquery aliases.
        // Slice 60: running map of "CTE name → published column names"
        // for star expansion. Each CTE's published columns are added
        // AFTER its body is built so a CTE cannot self-reference and
        // forward references (rejected earlier) cannot leak through.
        // Set-op CTE bodies use the merged StatementGraph.outputColumns.
        // For non-set-op CTE bodies the column names also come from
        // StatementGraph.outputColumns. Explicit CTE column lists are
        // rejected at the star expander, not at populate time.
        Set<String> localVisibleSoFar = new HashSet<>();
        for (int i = 0; i < cteList.size(); i++) {
            TCTE cte = cteList.getCTE(i);
            String cteName = cte.getTableName().toString();
            // Locale.ROOT keeps the map key independent of the JVM default locale.
            String cteNameLower = cteName.toLowerCase(Locale.ROOT);
            // Slice 15 MUST 9 / round-4 MUST 1: reject duplicate CTE
            // names BEFORE rejectForwardCteReferences so duplicate-name
            // diagnostics are not preempted by forward-reference
            // diagnostics. cteNameToStatementIndex is keyed lower-case;
            // a duplicate entry would silently overwrite the earlier
            // body and leave OUTER_REFERENCE-of-CTE pointing at the
            // wrong statement.
            //
            // Slice 108: intra-list duplicate check uses localVisibleSoFar
            // (NOT outerKeysSnapshot) so an inner CTE shadowing an outer
            // CTE is admitted while inner-x, inner-x stays rejected (round-1
            // codex BLOCKER 1 fix).
            if (localVisibleSoFar.contains(cteNameLower)) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.DUPLICATE_CTE_NAME,
                                "duplicate CTE name '" + cteName
                                        + "' in WITH clause; CTE names must be unique", cte));
            }
            // Slice 108: effectiveVisible = outerKeysSnapshot ∪ localVisibleSoFar.
            // Drives BOTH rejectForwardCteReferences AND
            // bodyProvider.withCteContext so inner-y body referencing outer-x
            // is admitted (round-2 codex BLOCKER 3 fix).
            //
            // When one side of the union is empty the other set is used
            // directly (aliased, not copied); a new set is only allocated
            // when both sides are non-empty.
            Set<String> effectiveVisible;
            if (outerKeysSnapshot != null) {
                if (outerKeysSnapshot.isEmpty()) {
                    effectiveVisible = localVisibleSoFar;
                } else if (localVisibleSoFar.isEmpty()) {
                    effectiveVisible = outerKeysSnapshot;
                } else {
                    effectiveVisible = new HashSet<>(outerKeysSnapshot);
                    effectiveVisible.addAll(localVisibleSoFar);
                }
            } else {
                effectiveVisible = localVisibleSoFar;
            }
            rejectForwardCteReferences(cte, allCteNames, effectiveVisible);
            // Slice 60: bodyProvider gets the CTE-context narrowing
            // first; the effective-alias-keyed in-scope map is
            // applied LATER, after the body's own FROM-subqueries
            // are extracted (so we can walk the body's FROM clause
            // and resolve each relation to its effective alias).
            // This deferred narrowing replaces the slice-60 v1 path
            // that put the running ctePublishedColumns map (CTE-name
            // keyed) directly on the provider — that keying class
            // could collide when a subquery alias matched a CTE
            // name (codex diff-review).
            NameBindingProvider bodyProvider = provider.withCteContext(effectiveVisible);
            TSelectSqlStatement cteBody = cte.getSubquery();
            // Slice 103 — snapshot lineage size BEFORE the body branch so
            // the slice-102 rename helper can rewrite outgoing
            // STATEMENT_OUTPUT refs in [lineageSize0, lineage.size())
            // without touching prior CTE bodies' edges. Covers BOTH the
            // set-op and non-set-op branches (mirrors slice-102
            // buildMergeCteList at line ~5820).
            int lineageSize0 = lineage.size();
            int bodyIdx;
            if (cteBody != null
                    && cteBody.getSetOperatorType() != null
                    && cteBody.getSetOperatorType() != ESetOperatorType.none) {
                // Slice 12: set-op CTE body. The outer set-op statement
                // carries the CTE name so BodyIndexes.cteByConsumerAndName
                // resolves it (slice-18 consumer-keyed projector lookup).
                // The CTE body's CTE list (if any) is rejected as a
                // nested-WITH inside buildSetOpProgram.
                bodyIdx = buildSetOpProgram(cteBody, bodyProvider, stmts, lineage,
                        cteNameToStatementIndex, cteName,
                        /*hasOuterCteListAlreadyProcessed=*/ false);
                cteNameToStatementIndex.put(cteNameLower, bodyIdx);
            } else {
                // Slice 18: snapshot/rollback around recursive
                // FROM-subquery extraction inside this CTE body.
                // Mirrors the outer-SELECT wrapper below and the
                // slice-16 set-op wrapper. Currently defensive: a
                // thrown exception in a deeper level would otherwise
                // leak siblings/ancestors at this CTE's level into
                // stmts/lineage. The wrapper truncates back to the
                // pre-extraction boundary and rethrows. Per-CTE
                // granularity: earlier CTE bodies in the same WITH
                // list are NOT rolled back (they're already complete).
                int cteStmtsSize0 = stmts.size();
                int cteLineageSize0 = lineage.size();
                Map<String, Integer> cteSubqueryAliasToIndex;
                try {
                    // Slice 60: pass the running ctePublishedColumns
                    // so the body's own FROM-subqueries see earlier
                    // CTEs at every recursion level.
                    cteSubqueryAliasToIndex =
                            extractFromSubqueriesAsStatements(cteBody, bodyProvider,
                                    stmts, lineage, cteNameToStatementIndex,
                                    ctePublishedColumns);
                } catch (RuntimeException ex) {
                    // Truncate from the tail back to the pre-call sizes, then rethrow.
                    while (stmts.size() > cteStmtsSize0) stmts.remove(stmts.size() - 1);
                    while (lineage.size() > cteLineageSize0) lineage.remove(lineage.size() - 1);
                    throw ex;
                }
                EnclosingScope cteEnclosing = buildEnclosingScope(cteBody,
                        cteNameToStatementIndex, cteSubqueryAliasToIndex,
                        /*parent=*/ null);
                Map<Integer, ScalarInfo> cteScalarMap =
                        extractScalarSubqueriesAsStatements(cteBody,
                                bodyProvider, stmts, lineage,
                                cteNameToStatementIndex, cteEnclosing,
                                /*allowRecursiveScalarSubqueryExtraction=*/ true);
                // Slice 60 (codex diff-review): build the per-CTE
                // effective-alias-keyed in-scope map by walking the
                // CTE body's FROM list. CTE references and
                // FROM-subquery aliases live in the same FROM
                // namespace (preflight rejects duplicates), so
                // effective-alias keying makes a name collision
                // physically impossible.
                Map<String, List<String>> cteBodyInScope =
                        buildEffectiveAliasInScopeMap(cteBody, bodyProvider,
                                ctePublishedColumns, cteSubqueryAliasToIndex,
                                stmts);
                NameBindingProvider cteBodyProviderWithStar = bodyProvider
                        .withInScopeRelationColumns(cteBodyInScope);
                // Slice 114 — switch from the 7-arg buildSelectStatement
                // to the 14-arg buildSelectStatementImpl so the CTE
                // body's WHERE clause can extract uncorrelated predicate
                // subqueries (IN-SELECT / EXISTS / NOT EXISTS / scalar
                // comparison / ANY-ALL-SOME) as their own statements.
                // The wrapper mirrors the outer-SELECT entry pattern in
                // {@link #build}: if the build appends predicate bodies
                // and then a later post-extraction reject fires, the
                // try/catch truncates stmts/lineage back to the
                // pre-call boundary so a partial extraction doesn't
                // leak into the program. The slice-113 set-op branch
                // call site is itself enclosed by the slice-16
                // SET-OP-WIDE rollback at {@link #buildSetOpProgram};
                // the CTE-body call sites do NOT inherit a similar
                // enclosing wrapper, which is why slice 114 adds one
                // here. The from-subquery / scalar-subquery
                // extractions above this point have their own
                // slice-17/18 wrappers, so the pre-CALL snapshot
                // bounds the truncate exactly to whatever
                // buildSelectStatementImpl appended.
                int cteBodyStmtsSnapshot = stmts.size();
                int cteBodyLineageSnapshot = lineage.size();
                StatementGraph body;
                try {
                    body = buildSelectStatementImpl(cteBody,
                            cteBodyProviderWithStar, cteName,
                            /*hasOuterCteListAlreadyProcessed=*/ false,
                            /*allowFromSubqueries=*/ true,
                            /*allowScalarProjectionSubqueries=*/ true,
                            /*allowWindowProjection=*/ true,
                            // Slice 114 — keep JOIN-ON predicate
                            // subqueries rejected inside CTE bodies
                            // (preserve slice 23/26 contract; the lift
                            // is WHERE-only; the two flags are
                            // independent per slice 113 split).
                            /*allowJoinOnPredicateSubqueries=*/ false,
                            /*stmtsForExtraction=*/ stmts,
                            /*lineageForExtraction=*/ lineage,
                            /*cteMapForExtraction=*/ cteNameToStatementIndex,
                            /*isPredicateBody=*/ false,
                            /*whereClauseContext=*/ PredicateClauseContext.CTE_BODY_WHERE,
                            /*allowWherePredicateSubqueries=*/ true);
                } catch (RuntimeException ex) {
                    // Truncate from the tail back to the pre-call sizes, then rethrow.
                    while (stmts.size() > cteBodyStmtsSnapshot) stmts.remove(stmts.size() - 1);
                    while (lineage.size() > cteBodyLineageSnapshot) lineage.remove(lineage.size() - 1);
                    throw ex;
                }
                // The body's index is its position at append time.
                bodyIdx = stmts.size();
                stmts.add(body);
                // Slice 108 — emit lineage BEFORE the cteMap.put so that
                // in the shadow case (allowShadowOverride=true with
                // cteNameLower already in cteMap from outer pass), the
                // body's column refs to <cteNameLower> still resolve to
                // the OUTER body (PG inner-x body sees outer-x via the
                // closer-enclosing-not-yet-shadowed fallback). Non-shadow
                // cases are unaffected because cteMap does not yet contain
                // cteNameLower at this point and the body cannot reference
                // its own name without going through the recursive-CTE
                // path (already rejected upstream).
                emitLineageForStatement(body, bodyIdx, lineage,
                        cteNameToStatementIndex, cteSubqueryAliasToIndex,
                        cteScalarMap);
                cteNameToStatementIndex.put(cteNameLower, bodyIdx);
            }
            // Slice 103 — apply the slice-102 rename helper if the CTE
            // declares an explicit column list (no-op otherwise). The
            // helper returns the published column list (renamed if
            // explicit, else body's inner names). Slice-60's
            // `ctePublishedColumns.put` is collapsed into this single
            // call site (covers both branches above).
            List<String> publishedCols = applyExplicitCteColumnListRename(
                    cte, stmts, lineage, bodyIdx, lineageSize0, "SELECT");
            ctePublishedColumns.put(cteNameLower, publishedCols);
            // Only now does this CTE become visible to later CTEs in the list.
            localVisibleSoFar.add(cteNameLower);
        }
    }

    /**
     * Slice 108 — outer-SELECT processing extracted from the previous inline
     * body of {@link #build} (lines ~665–763 pre-slice-108).
     *
     * <p>{@code hasOuterCteListAlreadyProcessed} is an EXPLICIT boolean
     * parameter (round-2 codex BLOCKER 4 fix). Previously this was inferred
     * from {@code select.getCteList() != null && size > 0}; after the
     * slice-108 buildInsert shadow path nulls {@code source.getCteList()}
     * before calling, that inference would be wrong. The caller passes the
     * truth.
     *
     * <p>The post-walk {@code cteNameToStatementIndex.keySet()} replaces the
     * pre-walk {@code allCteNames} because the walker has populated every
     * declared CTE name by lowercase key — they are equal sets.
     *
     * @param select the SELECT whose body (set-op or plain) is built; its CTE
     *        list is not walked here
     * @param provider base binding provider; narrowed with the CTE names
     *        currently in {@code cteNameToStatementIndex}
     * @param stmts statement list the extracted bodies and the outer
     *        statement are appended to
     * @param lineage lineage-edge list, appended to as statements are built
     * @param cteNameToStatementIndex lower-cased CTE name → index in
     *        {@code stmts}, as populated by the preceding CTE walk
     * @param ctePublishedColumns lower-cased CTE name → published column names
     * @param hasOuterCteListAlreadyProcessed forwarded unchanged to
     *        {@code buildSetOpProgram} / {@code buildSelectStatementImpl}
     */
    private static void buildSelectBodyAfterCteWalk(
            TSelectSqlStatement select,
            NameBindingProvider provider,
            List<StatementGraph> stmts,
            List<LineageEdge> lineage,
            Map<String, Integer> cteNameToStatementIndex,
            Map<String, List<String>> ctePublishedColumns,
            boolean hasOuterCteListAlreadyProcessed) {
        // Defensive copy: later puts into the map must not change the name
        // set already handed to the provider.
        Set<String> allCteNames = cteNameToStatementIndex.isEmpty()
                ? Collections.<String>emptySet()
                : new HashSet<>(cteNameToStatementIndex.keySet());

        // Slice 12: top-level set-op dispatch. CTE list (if any) was
        // already processed above; pass hasOuterCteListAlreadyProcessed=true
        // so buildSetOpProgram doesn't re-flag it as a nested WITH.
        if (select.getSetOperatorType() != null
                && select.getSetOperatorType() != ESetOperatorType.none) {
            NameBindingProvider outerProvider = provider.withCteContext(allCteNames);
            buildSetOpProgram(select, outerProvider, stmts, lineage,
                    cteNameToStatementIndex, /*setOpName=*/ null,
                    /*hasOuterCteListAlreadyProcessed=*/ hasOuterCteListAlreadyProcessed);
            return;
        }

        // Outer statement: pre-extract any FROM-clause subqueries as their
        // own statements, then any scalar-subquery projections, then build
        // the outer body, then emit lineage with the global CTE map, the
        // outer-local subquery alias map, AND the scalar-projection map.
        // Slice 60: outerProvider gets the CTE-context narrowing here;
        // the effective-alias-keyed in-scope map is applied LATER, after
        // outer FROM-subqueries are extracted. The same deferred
        // narrowing pattern as the CTE-body branch — see the codex
        // diff-review note on alias/CTE-name collision.
        NameBindingProvider outerProvider = provider.withCteContext(allCteNames);
        // Slice 17: snapshot/rollback around recursive FROM-subquery
        // extraction. The recursive extractor mutates stmts/lineage as
        // each level's bodies land; if a deeper-level rejection fires
        // after sibling/ancestor mutations, this wrapper truncates the
        // lists back to the pre-call boundary and rethrows. Mirrors the
        // slice-16 buildSetOpProgram wrapper (§14.18 process lesson #21:
        // when a class of mutation-free checks can fire after partial
        // mutation, close it transactionally instead of point-fixing).
        //
        // The rollback is currently defensive: build() allocates fresh
        // stmts/lineage per invocation, so a thrown exception's caller
        // cannot directly observe leaked state. The wrapper is kept
        // because (a) the slice-17 preflight closes the most direct
        // partial-mutation classes BEFORE the recursive extraction
        // runs, but recursive levels can still fail at deeper rejection
        // points (e.g. a nested set-op-in-FROM-subquery body inside a
        // sibling that succeeds at the preflight); (b) consistency with
        // slice 16's pattern means a future refactor that lifts the
        // build() per-call list allocation does not silently re-open
        // the partial-mutation class.
        int stmtsSize0 = stmts.size();
        int lineageSize0 = lineage.size();
        Map<String, Integer> outerSubqueryAliasToIndex;
        try {
            outerSubqueryAliasToIndex =
                    extractFromSubqueriesAsStatements(select, outerProvider,
                            stmts, lineage, cteNameToStatementIndex,
                            ctePublishedColumns);
        } catch (RuntimeException ex) {
            // Truncate from the tail back to the pre-call sizes, then rethrow.
            while (stmts.size() > stmtsSize0) stmts.remove(stmts.size() - 1);
            while (lineage.size() > lineageSize0) lineage.remove(lineage.size() - 1);
            throw ex;
        }
        EnclosingScope outerEnclosing = buildEnclosingScope(select,
                cteNameToStatementIndex, outerSubqueryAliasToIndex,
                /*parent=*/ null);
        Map<Integer, ScalarInfo> outerScalarMap =
                extractScalarSubqueriesAsStatements(select, outerProvider,
                        stmts, lineage, cteNameToStatementIndex, outerEnclosing,
                        /*allowRecursiveScalarSubqueryExtraction=*/ true);
        // Slice 60 (codex diff-review): build the outer's
        // effective-alias-keyed in-scope map by walking the outer
        // SELECT's FROM list. Effective-alias keying eliminates the
        // CTE-name vs subquery-alias collision class.
        Map<String, List<String>> outerInScope = buildEffectiveAliasInScopeMap(
                select, outerProvider, ctePublishedColumns,
                outerSubqueryAliasToIndex, stmts);
        NameBindingProvider outerProviderWithStar = outerProvider
                .withInScopeRelationColumns(outerInScope);
        // Slice 23: outer-SELECT path uses buildSelectStatementImpl directly so
        // the slice-23 EXISTS-extraction can append predicate-body statements
        // to `stmts`/`lineage`. Snapshot/rollback wrapper around the call
        // matches the slice-16/17/20 pattern: a partial extraction (e.g. third
        // EXISTS rejected after first two extracted) truncates the lists.
        int outerStmtsSnapshot = stmts.size();
        int outerLineageSnapshot = lineage.size();
        StatementGraph outer;
        try {
            outer = buildSelectStatementImpl(select, outerProviderWithStar, null,
                    /*hasOuterCteListAlreadyProcessed=*/ hasOuterCteListAlreadyProcessed,
                    /*allowFromSubqueries=*/ true,
                    /*allowScalarProjectionSubqueries=*/ true,
                    /*allowWindowProjection=*/ true,
                    /*allowJoinOnPredicateSubqueries=*/ true,
                    stmts, lineage,
                    /*cteMapForExtraction=*/ cteNameToStatementIndex,
                    /*isPredicateBody=*/ false,
                    /*whereClauseContext=*/ PredicateClauseContext.SELECT_WHERE,
                    /*allowWherePredicateSubqueries=*/ true);
        } catch (RuntimeException e) {
            // Truncate from the tail back to the pre-call sizes, then rethrow.
            while (stmts.size() > outerStmtsSnapshot) stmts.remove(stmts.size() - 1);
            while (lineage.size() > outerLineageSnapshot) lineage.remove(lineage.size() - 1);
            throw e;
        }
        // The outer statement is appended last, after every body extracted for it.
        int outerIndex = stmts.size();
        stmts.add(outer);
        emitLineageForStatement(outer, outerIndex, lineage,
                cteNameToStatementIndex, outerSubqueryAliasToIndex, outerScalarMap);
    }

    /**
     * Slice 78 — admit a single {@code INSERT INTO target SELECT ...}
     * statement.
Builds the source SELECT via {@link #build} (reusing
     * the existing pipeline unchanged), then appends an {@code "INSERT"}-
     * kind {@link StatementGraph} carrying the target relation and
     * cross-statement lineage edges.
     *
     * <p>Admitted shape: {@code INSERT INTO <target> [(c1, c2, ...)]
     * <subquery-SELECT>}. Rejections:
     * <ul>
     *   <li>{@link EInsertSource#values}, {@code values_empty},
     *       {@code default_values}, {@code execute},
     *       {@code values_function}, {@code values_multi_table},
     *       {@code hive_query}, {@code values_oracle_record},
     *       {@code set_column_value}, {@code value_table} →
     *       {@link DiagnosticCode#INSERT_SOURCE_NOT_SUPPORTED}.</li>
     *   <li>Oracle {@code INSERT ALL} / {@code INSERT FIRST} →
     *       {@link DiagnosticCode#INSERT_MULTI_TABLE_NOT_SUPPORTED}.
     *       Hive multi-insert ({@code multiInsertStatements} non-empty) is
     *       routed to {@link #buildHiveMultiInsert} instead of rejected.</li>
     *   <li>Missing target table (defensive — the parser usually rejects
     *       first) → {@link DiagnosticCode#INSERT_TARGET_MISSING}.</li>
     *   <li>Explicit column list arity ≠ source SELECT output count →
     *       {@link DiagnosticCode#INSERT_COLUMN_COUNT_MISMATCH}.</li>
     * </ul>
     *
     * <p>The source SELECT is built first via {@code build()} and its
     * full {@link SemanticProgram} (CTE bodies + scalar bodies +
     * FROM-subquery bodies + outer SELECT + cross-stmt lineage) is
     * appended verbatim to the returned program. The INSERT
     * {@link StatementGraph} is appended LAST; its
     * {@link StatementGraph#getRelations() relations} lists the source
     * SELECT as a single {@link RelationKind#SUBQUERY} entry whose
     * {@code qualifiedName} is the source SELECT's outer-statement name
     * (synthesised when needed). All other column-ref lists stay empty
     * on the INSERT — an INSERT has no projection of its own.
     *
     * <p>Cross-statement {@link LineageEdge}s for the INSERT are
     * {@code from = TABLE_COLUMN(target_qname, target_col_i_name)}
     * and {@code to = STATEMENT_OUTPUT(selectIdx, source_output_i_name)}.
     * Target column names are the explicit INSERT column-list spellings
     * when supplied, else the source SELECT's positional output names.
     *
     * <p>Exception to the "via {@code build()}" rule: when BOTH the INSERT
     * and its source SELECT carry a non-empty WITH list, the source program
     * is assembled directly with two {@link #buildSelectCteList} passes
     * (outer, then inner) followed by {@link #buildSelectBodyAfterCteWalk}.
     *
     * <p>The CTE lists on {@code insert} and on its source SELECT are
     * temporarily reassigned while the source is built and are set back to
     * their original references in a {@code finally} block.
     *
     * @param insert the INSERT statement to admit; must not be {@code null}
     * @param provider name-binding provider; must not be {@code null}
     * @return the source SELECT's program followed by the INSERT graph, as
     *         assembled by {@link #assembleInsertGraphAndLineage}
     * @throws IllegalArgumentException if {@code insert} or {@code provider}
     *         is {@code null}
     * @throws SemanticIRBuildException for the rejected shapes listed above
     *         and for {@code OUTPUT ... INTO}
     *         ({@link DiagnosticCode#OUTPUT_INTO_NOT_SUPPORTED})
     */
    public static SemanticProgram buildInsert(TInsertSqlStatement insert,
                                              NameBindingProvider provider) {
        if (insert == null) {
            throw new IllegalArgumentException("insert must not be null");
        }
        if (provider == null) {
            throw new IllegalArgumentException("provider must not be null");
        }

        // Oracle INSERT ALL / FIRST rejects: their multi-value AST shape
        // is fundamentally different from the Hive multi-insert path.
        // Slice 78 scopes single-target INSERT SELECT; slice 93 lifts
        // the Hive multi-insert case via buildHiveMultiInsert.
        if (insert.isInsertAll() || insert.isInsertFirst()) {
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.INSERT_MULTI_TABLE_NOT_SUPPORTED,
                    "multi-table INSERT (INSERT ALL / INSERT FIRST) is not "
                            + "supported by SemanticIRBuilder.buildInsert; "
                            + "slice 78 admits single-target INSERT INTO <target> SELECT ...",
                    insert));
        }
        // Slice 109 — outer-WITH on Hive multi-insert
        // (`WITH x AS (...) FROM x INSERT INTO t1 SELECT ... INSERT INTO t2
        // SELECT ...`) is now admitted via buildHiveMultiInsert's CTE-aware
        // path. The slice-104 early reject for this shape is removed; the
        // helper builds the outer CTE bodies ONCE upfront and reuses the
        // shared cteMap/publishedMap across every sub-SELECT.
        // INSERT_OUTER_WITH_ON_HIVE_MULTI_INSERT_NOT_SUPPORTED stays declared-
        // but-unreached for API stability (slice 71/72/82/86/95/96/97/98/108
        // retain-for-documentation precedent).
        //
        // Hive multi-insert: FROM src INSERT INTO t1 SELECT ... INSERT INTO t2 SELECT ...
        // Each sub-SELECT already carries the shared FROM source in its fromClause.
        if (!insert.getMultiInsertStatements().isEmpty()) {
            return buildHiveMultiInsert(insert, provider);
        }

        EInsertSource src = insert.getInsertSource();
        if (src != EInsertSource.subquery) {
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.INSERT_SOURCE_NOT_SUPPORTED,
                    "INSERT source '" + src + "' is not supported by "
                            + "SemanticIRBuilder.buildInsert; slice 78 admits "
                            + "subquery-source INSERT only (INSERT INTO <target> SELECT ...)",
                    insert));
        }

        // Slice 85 — cheap statement-level OUTPUT_INTO reject runs
        // BEFORE the source SELECT is built so a multi-violation
        // shape (e.g. `INSERT INTO t OUTPUT INSERTED.x INTO #log
        // SELECT ... FROM bad_join`) routes to the cheaper structural
        // code first.
        if (insert.getOutputClause() != null
                && insert.getOutputClause().getIntoTable() != null) {
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.OUTPUT_INTO_NOT_SUPPORTED,
                    "INSERT OUTPUT ... INTO <target> writes a second target; "
                            + "slice 85 admits projection-only OUTPUT",
                    insert));
        }

        TTable targetTable = insert.getTargetTable();
        if (targetTable == null || targetTable.getTableName() == null) {
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.INSERT_TARGET_MISSING,
                    "INSERT statement has no resolvable target table",
                    insert));
        }
        String targetQName = targetTable.getTableName().toString();
        if (targetQName.isEmpty()) {
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.INSERT_TARGET_MISSING,
                    "INSERT target table name is empty",
                    insert));
        }

        TSelectSqlStatement source = insert.getSubQuery();
        if (source == null) {
            // Defensive: getInsertSource() == subquery but subQuery is
            // null. Reported as INSERT_SOURCE_NOT_SUPPORTED — the
            // source-side counterpart of the INSERT_TARGET_MISSING
            // checks above.
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.INSERT_SOURCE_NOT_SUPPORTED,
                    "INSERT source is declared as subquery but no SELECT "
                            + "statement was attached",
                    insert));
        }

        // Slice 104 — outer-WITH on INSERT. The parser attaches the outer
        // WITH clause to insert.getCteList(), NOT to source.getCteList().
        // Before slice 104 buildInsert ignored insert.getCteList(), which
        // silently mis-bound CTE references in the source SELECT as
        // TABLE-kind relations with phantom columns. The slice-104 fix is
        // an AST handoff: move insert.getCteList() onto source.getCteList()
        // for the duration of the inner build(source) call so the
        // slice-103 SELECT-side CTE walker handles construction, rename,
        // and rejects (recursive / duplicate / forward-reference / arity
        // mismatch). Restore in finally so the AST is observably
        // unchanged to the caller (Java field references — token-chain
        // state is perturbed by setCteList(null)'s removeTokens() but
        // observably benign for downstream Semantic IR).
        //
        // Slice 107 / 108 — mixed outer-WITH + inner-WITH on INSERT. PG /
        // Oracle / Snowflake admit at parse. Three sub-cases:
        //   (a) Only outer WITH populated. AST handoff (slice 104): move
        //       insert.cteList onto source.cteList and call build(source).
        //   (b) Only inner WITH populated. Pass through unchanged (the
        //       walker handles it on its own).
        //   (c) Both outer and inner WITH populated. Slice 107 admitted
        //       this for disjoint names via a flat-merge; slice 108 admits
        //       it for the SHADOWING case too (`WITH x ... INSERT ... WITH
        //       x ... SELECT ... FROM x` — inner shadows outer per
        //       PG/Oracle/Snowflake nested-WITH semantics). The slice-108
        //       implementation uses a TWO-PASS walker invocation in this
        //       method: outer pass first (allowShadowOverride=false), then
        //       inner pass (allowShadowOverride=true,
        //       additionalAllCteNames=outer-names). The walker's two-set
        //       visibility model (outerKeysSnapshot ∪ localVisibleSoFar)
        //       keeps PG semantics correct: inner-x's body sees outer-x via
        //       the cteMap (override is post-build), and inner CTEs declared
        //       after inner-x see inner-x. The OUTER body stays in stmts[]
        //       at its position; its cteMap entry is just no longer
        //       referenced by name (shadowed). Source SELECT's `FROM x`
        //       resolves to inner-x.
        //
        // INSERT_MIXED_OUTER_AND_INNER_WITH_NOT_SUPPORTED stays declared but
        // is no longer reached by slice 108. Slice107Test §F/§Q (cross-
        // boundary duplicate rejects) are deleted; positive coverage moves
        // to Slice108Test.
        TCTEList outerCtes = insert.getCteList();
        TCTEList savedSourceCtes = source.getCteList();
        boolean handoffApplied = false;
        SemanticProgram inner;
        boolean haveOuterCtes = outerCtes != null && outerCtes.size() > 0;
        boolean haveInnerCtes = savedSourceCtes != null && savedSourceCtes.size() > 0;
        if (haveOuterCtes && haveInnerCtes) {
            // Slice 108 — two-pass walker. Null both AST CTE lists before
            // calling buildSelectBodyAfterCteWalk so the helper does not
            // re-process source.getCteList(). hasOuterCteListAlreadyProcessed
            // is passed true (round-2 codex BLOCKER 4 fix).
            source.setCteList(null);
            insert.setCteList(null);
            // Set for symmetry with the single-sided branch; this branch's
            // finally restores unconditionally and does not read the flag.
            handoffApplied = true;
            try {
                List<StatementGraph> innerStmts = new ArrayList<>();
                List<LineageEdge> innerLineage = new ArrayList<>();
                Map<String, Integer> cteMap = new HashMap<>();
                Map<String, List<String>> publishedMap = new HashMap<>();
                // Outer pass: outerAllNames as its own scope.
                buildSelectCteList(outerCtes, provider, innerStmts, innerLineage,
                        cteMap, publishedMap,
                        /*allowShadowOverride=*/ false,
                        /*additionalAllCteNames=*/ null);
                // Inner pass: outerAllNames also visible for forward-ref
                // classification (round-1 codex BLOCKER 2 fix); shadow
                // override admits cross-boundary duplicate names.
                Set<String> outerAllNames = collectCteNames(outerCtes);
                buildSelectCteList(savedSourceCtes, provider, innerStmts, innerLineage,
                        cteMap, publishedMap,
                        /*allowShadowOverride=*/ true,
                        /*additionalAllCteNames=*/ outerAllNames);
                // Source SELECT body sees the post-pass cteMap (inner wins
                // for shadowed names).
                buildSelectBodyAfterCteWalk(source, provider, innerStmts, innerLineage,
                        cteMap, publishedMap,
                        /*hasOuterCteListAlreadyProcessed=*/ true);
                inner = new SemanticProgram(innerStmts, innerLineage);
            } finally {
                // Put both original list references back, success or failure.
                source.setCteList(savedSourceCtes);
                insert.setCteList(outerCtes);
            }
        } else {
            // Single-sided cases. Slice 104 AST handoff for outer-only;
            // pass-through for inner-only or no CTEs.
            if (haveOuterCtes) {
                source.setCteList(outerCtes);
                insert.setCteList(null);
                handoffApplied = true;
            }
            try {
                inner = build(source, provider);
            } finally {
                // Only undo the handoff if one was actually performed.
                if (handoffApplied) {
                    source.setCteList(savedSourceCtes);
                    insert.setCteList(outerCtes);
                }
            }
        }

        // Slice 93 — delegate INSERT-graph assembly to the shared helper
        // used by both single-target (slice 78) and Hive multi-insert
        // (slice 93). out is freshly empty so the helper's rebase offset
        // is 0 (no-op for inner lineage). RETURNING/OUTPUT clauses are
        // passed directly (slice 85 still owns the projection build).
        List<StatementGraph> out = new ArrayList<>(inner.getStatements().size() + 1);
        List<LineageEdge> outLineage = new ArrayList<>();
        assembleInsertGraphAndLineage(
                insert, targetTable, targetQName, inner,
                "INSERT",
                insert.getReturningClause(),
                insert.getOutputClause(),
                out, outLineage, provider);
        return new SemanticProgram(out, outLineage);
    }

    /**
     * Slice 93 — admit a Hive multi-insert block of the form
     * {@code FROM src INSERT INTO t1 SELECT col1 INSERT INTO t2 SELECT col2}.
     *
     * <p>The parser represents the whole block as one {@link TInsertSqlStatement}
     * whose first INSERT-SELECT pair is the primary statement and whose
     * additional pairs are in {@link TInsertSqlStatement#getMultiInsertStatements()}.
1222 * Crucially, each sub-SELECT already carries the shared FROM source in its own 1223 * {@code fromClause} / {@code fromSourceTable} — no post-processing is needed. 1224 * 1225 * <p>Produces a flat {@link SemanticProgram} containing per-pair blocks of 1226 * statements concatenated in INSERT order: each block contributes its source 1227 * SELECT's inner statements (CTE bodies / FROM-subquery bodies extracted by 1228 * {@link #build}) followed by its outer SELECT followed by an INSERT graph. 1229 * The minimum is {@code 2N} statements (one SELECT + one INSERT per target); 1230 * sub-SELECTs with extracted inner programs produce more. Each INSERT carries 1231 * cross-statement lineage edges pointing at its preceding SELECT via 1232 * {@link LineageRef#statementOutput}; per-pair inner lineage edges are 1233 * rebased by the current {@code out.size()} so absolute statement indices 1234 * remain valid across the concatenated program. 1235 * 1236 * <p>Safety note on the source-table fallback: this method enables 1237 * {@code provider.withSourceTableFallback(true)} so secondary sub-SELECTs 1238 * (which Resolver2 does not traverse) can still bind their column refs. 1239 * The fallback is constrained at the provider level to fire only when 1240 * Phase 2 did not run AND any explicit qualifier matches Phase 1's source — 1241 * see {@link Resolver2NameBindingProvider#bindColumn}. Current Hive 1242 * multi-insert parses always present a single FROM source, so Phase 1's 1243 * unqualified-column resolution is unambiguous in practice. 1244 */ 1245 private static SemanticProgram buildHiveMultiInsert(TInsertSqlStatement insert, 1246 NameBindingProvider provider) { 1247 // Slice 93 — source-table fallback strategy for Hive multi-insert. 
1248 // 1249 // TSQLResolver2 does NOT process the secondary inserts in 1250 // getMultiInsertStatements(): their column refs have 1251 // resolution == null (Phase 2 did not run) even though Phase 1's 1252 // linkColumnToTable sets sourceTable. To let collectColumnRefs 1253 // accept these bindings, we enable a narrow source-table fallback 1254 // in the provider — but ONLY when every sub-SELECT has a SINGLE 1255 // FROM source (the common Hive multi-insert shape that current 1256 // parser support admits). In single-source contexts, Phase 1's 1257 // unqualified-column resolution is unambiguous; the fallback is 1258 // safe (round-2 codex Q1 BLOCKING). 1259 // 1260 // If any sub-SELECT has multiple FROM sources, Phase 1 may have 1261 // heuristically picked one source for an unqualified column — 1262 // promoting that to EXACT_MATCH could silently mis-bind. In that 1263 // case the fallback stays disabled; users must qualify column 1264 // references in the secondary branch (the qualifier-matches-source 1265 // safety in bindColumn still allows qualified refs through). 1266 boolean singleSource = isSingleSourceMultiInsert(insert); 1267 NameBindingProvider effectiveProvider = singleSource 1268 ? provider.withSourceTableFallback(true) 1269 : provider; 1270 1271 List<StatementGraph> out = new ArrayList<>(); 1272 List<LineageEdge> outLineage = new ArrayList<>(); 1273 1274 // Slice 109 — outer WITH on multi-insert: build the CTE bodies ONCE 1275 // upfront so each sub-SELECT's `FROM x` resolves against the shared 1276 // cteMap/publishedMap. The parser attaches the outer WITH to the 1277 // primary insert's getCteList(); sub-INSERTs in 1278 // getMultiInsertStatements() carry null cteLists. The AST handoff 1279 // mirrors the slice-104 single-target pattern but only nulls 1280 // insert.getCteList() — there is no source SELECT to move it onto 1281 // because each sub-INSERT has its own. 
1282 TCTEList outerCtes = insert.getCteList(); 1283 Map<String, Integer> cteMap = new HashMap<>(); 1284 Map<String, List<String>> publishedMap = new HashMap<>(); 1285 boolean handoffApplied = false; 1286 if (outerCtes != null && outerCtes.size() > 0) { 1287 insert.setCteList(null); 1288 handoffApplied = true; 1289 try { 1290 buildSelectCteList(outerCtes, effectiveProvider, out, outLineage, 1291 cteMap, publishedMap, 1292 /*allowShadowOverride=*/ false, 1293 /*additionalAllCteNames=*/ null); 1294 } catch (RuntimeException ex) { 1295 // Restore eagerly on CTE-build failure so a downstream caller 1296 // observing the AST sees the original cteList. 1297 insert.setCteList(outerCtes); 1298 throw ex; 1299 } 1300 } 1301 1302 try { 1303 // Primary INSERT (first target) 1304 appendOneHiveInsert(insert, effectiveProvider, out, outLineage, 1305 cteMap, publishedMap); 1306 1307 // Additional INSERTs from getMultiInsertStatements() 1308 for (Object miObj : insert.getMultiInsertStatements()) { 1309 appendOneHiveInsert((TInsertSqlStatement) miObj, effectiveProvider, 1310 out, outLineage, cteMap, publishedMap); 1311 } 1312 } finally { 1313 if (handoffApplied) { 1314 insert.setCteList(outerCtes); 1315 } 1316 } 1317 1318 return new SemanticProgram(out, outLineage); 1319 } 1320 1321 /** 1322 * Slice 93 — true when every INSERT-SELECT pair in a Hive multi-insert 1323 * block has a single FROM source (i.e., one entry in 1324 * {@code subQuery.getTables()}). Guards the source-table fallback so 1325 * Phase 1's heuristic source assignment is only trusted in contexts 1326 * where it is provably unambiguous (round-2 codex Q1 BLOCKING). 
1327 */ 1328 private static boolean isSingleSourceMultiInsert(TInsertSqlStatement insert) { 1329 if (!isSingleSourceSubQuery(insert.getSubQuery())) { 1330 return false; 1331 } 1332 for (Object miObj : insert.getMultiInsertStatements()) { 1333 TInsertSqlStatement mi = (TInsertSqlStatement) miObj; 1334 if (!isSingleSourceSubQuery(mi.getSubQuery())) { 1335 return false; 1336 } 1337 } 1338 return true; 1339 } 1340 1341 private static boolean isSingleSourceSubQuery(TSelectSqlStatement sel) { 1342 return sel != null && sel.getTables() != null && sel.getTables().size() == 1; 1343 } 1344 1345 /** 1346 * Build one INSERT-SELECT pair into {@code out} / {@code outLineage}. 1347 * Called by {@link #buildHiveMultiInsert} for the primary and each 1348 * additional INSERT in a Hive multi-insert block. Each call validates 1349 * the target/source, builds the source SELECT via {@link #build}, and 1350 * delegates the post-build INSERT-graph assembly to 1351 * {@link #assembleInsertGraphAndLineage} so the layout exactly mirrors 1352 * the single-target slice-78 INSERT path (the helper also handles 1353 * inner-lineage rebasing when {@code out} is non-empty). 
1354 */ 1355 private static void appendOneHiveInsert(TInsertSqlStatement insert, 1356 NameBindingProvider provider, 1357 List<StatementGraph> out, 1358 List<LineageEdge> outLineage, 1359 Map<String, Integer> cteMap, 1360 Map<String, List<String>> publishedMap) { 1361 TTable targetTable = insert.getTargetTable(); 1362 if (targetTable == null || targetTable.getTableName() == null) { 1363 throw new SemanticIRBuildException(Diagnostic.error( 1364 DiagnosticCode.INSERT_TARGET_MISSING, 1365 "Hive multi-insert: INSERT has no resolvable target table", 1366 insert)); 1367 } 1368 String targetQName = targetTable.getTableName().toString(); 1369 if (targetQName.isEmpty()) { 1370 throw new SemanticIRBuildException(Diagnostic.error( 1371 DiagnosticCode.INSERT_TARGET_MISSING, 1372 "Hive multi-insert: INSERT target table name is empty", 1373 insert)); 1374 } 1375 TSelectSqlStatement source = insert.getSubQuery(); 1376 if (source == null) { 1377 throw new SemanticIRBuildException(Diagnostic.error( 1378 DiagnosticCode.INSERT_SOURCE_NOT_SUPPORTED, 1379 "Hive multi-insert: INSERT has no source SELECT", 1380 insert)); 1381 } 1382 1383 // Slice 109 — when outer CTEs are present (cteMap non-empty), build 1384 // the source SELECT via buildSelectBodyAfterCteWalk directly into 1385 // out/outLineage so it sees the shared cteMap/publishedMap. The 1386 // slice-93 path (no outer CTEs) keeps the build(source, provider) + 1387 // assembleInsertGraphAndLineage flow unchanged. 1388 if (cteMap.isEmpty()) { 1389 SemanticProgram inner = build(source, provider); 1390 // Hive has no RETURNING/OUTPUT — pass null clauses directly. 1391 assembleInsertGraphAndLineage( 1392 insert, targetTable, targetQName, inner, 1393 "Hive multi-insert: INSERT", 1394 /*returningClause=*/ null, 1395 /*outputClause=*/ null, 1396 out, outLineage, provider); 1397 return; 1398 } 1399 1400 // Slice 109 — defensive: parser probe shows sub-SELECTs in Hive 1401 // multi-insert do NOT carry their own cteList. 
If a future parser 1402 // change ever attached one, mixed outer+inner WITH semantics would 1403 // need slice-107/108-style two-pass walker support; until then the 1404 // shape rejects with the existing mixed-WITH code. 1405 if (source.getCteList() != null && source.getCteList().size() > 0) { 1406 throw new SemanticIRBuildException(Diagnostic.error( 1407 DiagnosticCode.INSERT_MIXED_OUTER_AND_INNER_WITH_NOT_SUPPORTED, 1408 "Hive multi-insert: mixed outer + inner WITH on a " 1409 + "sub-SELECT is not supported by " 1410 + "SemanticIRBuilder.buildHiveMultiInsert; " 1411 + "slice 109 admits outer-only WITH on multi-insert", 1412 insert)); 1413 } 1414 1415 // Snapshot out.size() so the source SELECT and its inner extractions 1416 // are pinned to known positions. The slice-23 EXISTS-extraction and 1417 // FROM-subquery extraction paths inside buildSelectBodyAfterCteWalk 1418 // append directly to out/outLineage; the source SELECT lands LAST. 1419 int beforeSelectIdx = out.size(); 1420 buildSelectBodyAfterCteWalk(source, provider, out, outLineage, 1421 cteMap, publishedMap, 1422 /*hasOuterCteListAlreadyProcessed=*/ true); 1423 if (out.size() <= beforeSelectIdx) { 1424 // Defensive: buildSelectBodyAfterCteWalk always appends at least 1425 // the source SELECT; this branch is unreachable in practice. 1426 throw new SemanticIRBuildException(Diagnostic.error( 1427 DiagnosticCode.INSERT_SOURCE_NOT_SUPPORTED, 1428 "Hive multi-insert: INSERT source built no statements", 1429 insert)); 1430 } 1431 int selectIdx = out.size() - 1; 1432 assembleInsertTargetGraphFromAppended( 1433 insert, targetTable, targetQName, selectIdx, 1434 "Hive multi-insert: INSERT", 1435 /*returningClause=*/ null, 1436 /*outputClause=*/ null, 1437 out, outLineage, provider); 1438 } 1439 1440 /** 1441 * Slice 93 — shared INSERT-graph assembly used by both the slice-78 1442 * single-target {@link #buildInsert} and the slice-93 Hive multi-insert 1443 * {@link #appendOneHiveInsert}. 
Appends {@code inner.getStatements()} 1444 * to {@code out} (rebasing {@code inner.getLineage()}'s STATEMENT_OUTPUT 1445 * indices when {@code out} is non-empty), then appends an INSERT-kind 1446 * {@link StatementGraph} and per-source-output cross-statement 1447 * {@link LineageEdge}s. 1448 * 1449 * <p>Discriminators between the two callers: 1450 * <ul> 1451 * <li>{@code diagnosticPrefix} is woven into column-count-mismatch 1452 * and empty-inner-source error messages so the originating call 1453 * site is identifiable.</li> 1454 * <li>{@code returningClause} / {@code outputClause} are passed 1455 * directly to {@link #buildReturningColumns} (slice 78 supplies 1456 * the INSERT's RETURNING/OUTPUT clauses; slice 93's Hive path 1457 * passes {@code null}/{@code null} since Hive has no 1458 * RETURNING/OUTPUT). Passing the clauses directly keeps the 1459 * discriminator visible at every call site rather than hidden 1460 * behind a boolean (round-2 codex Q3 suggestion).</li> 1461 * </ul> 1462 * 1463 * <p>Mutates both {@code out} and {@code outLineage}. 1464 */ 1465 private static void assembleInsertGraphAndLineage( 1466 TInsertSqlStatement insert, 1467 TTable targetTable, 1468 String targetQName, 1469 SemanticProgram inner, 1470 String diagnosticPrefix, 1471 TReturningClause returningClause, 1472 TOutputClause outputClause, 1473 List<StatementGraph> out, 1474 List<LineageEdge> outLineage, 1475 NameBindingProvider provider) { 1476 List<StatementGraph> innerStmts = inner.getStatements(); 1477 if (innerStmts.isEmpty()) { 1478 // Defensive: build() always returns at least one statement when 1479 // it doesn't throw. This branch is unreachable in practice but 1480 // surfaces a structured diagnostic instead of an 1481 // IndexOutOfBoundsException on the sourceOuter access below. 
1482 throw new SemanticIRBuildException(Diagnostic.error( 1483 DiagnosticCode.INSERT_SOURCE_NOT_SUPPORTED, 1484 diagnosticPrefix + " source built no statements", 1485 insert)); 1486 } 1487 1488 // Rebase inner lineage edges by the current out.size() offset 1489 // (round-2 codex Q4 BLOCKING). For the slice-78 single-target 1490 // path out is empty (offset=0) so rebase is a no-op; for the 1491 // slice-93 Hive path each subsequent INSERT-SELECT pair adds 1492 // an offset matching the absolute position of its inner block. 1493 int offset = out.size(); 1494 int selectIdx = offset + innerStmts.size() - 1; 1495 out.addAll(innerStmts); 1496 for (LineageEdge e : inner.getLineage()) { 1497 outLineage.add(rebaseLineageEdge(e, offset)); 1498 } 1499 1500 StatementGraph sourceOuter = innerStmts.get(innerStmts.size() - 1); 1501 List<OutputColumn> sourceOutputs = sourceOuter.getOutputColumns(); 1502 int sourceOutCount = sourceOutputs.size(); 1503 1504 // Optional explicit INSERT column list. Verbatim bare-name 1505 // spelling per slice-78 contract; arity mismatch rejects. 1506 TObjectNameList colList = insert.getColumnList(); 1507 List<String> targetColumnNames = new ArrayList<>(); 1508 if (colList != null && colList.size() > 0) { 1509 for (int i = 0; i < colList.size(); i++) { 1510 TObjectName n = colList.getObjectName(i); 1511 targetColumnNames.add(n == null ? "" : n.toString()); 1512 } 1513 if (targetColumnNames.size() != sourceOutCount) { 1514 throw new SemanticIRBuildException(Diagnostic.error( 1515 DiagnosticCode.INSERT_COLUMN_COUNT_MISMATCH, 1516 diagnosticPrefix + " column list has " 1517 + targetColumnNames.size() 1518 + " column(s) but source SELECT produced " 1519 + sourceOutCount + " output(s)", 1520 insert)); 1521 } 1522 } 1523 1524 // INSERT StatementGraph — slice-78 single-target shape with the 1525 // source SELECT as a SUBQUERY-kind relation entry. 
1526 String sourceName = sourceOuter.getName(); 1527 String sourceRelAlias = (sourceName != null && !sourceName.isEmpty()) 1528 ? sourceName : "__insert_source__"; 1529 RelationBinding sourceBinding = new RelationBinding( 1530 RelationKind.SUBQUERY, sourceRelAlias); 1531 List<RelationSource> insertRelations = new ArrayList<>(); 1532 insertRelations.add(new RelationSource(sourceRelAlias, sourceBinding)); 1533 1534 RelationBinding targetBinding = new RelationBinding( 1535 RelationKind.TABLE, targetQName); 1536 TargetRelation target = new TargetRelation(targetBinding, targetColumnNames); 1537 1538 int insertIdx = out.size(); 1539 String insertTargetAlias = effectiveAliasOf(targetTable); 1540 if (insertTargetAlias == null || insertTargetAlias.isEmpty()) { 1541 insertTargetAlias = targetQName; 1542 } 1543 // Slice 85: RETURNING/OUTPUT projections. Clauses are passed 1544 // through directly from the call site (slice-78 single-target 1545 // forwards the INSERT's own clauses; slice-93 Hive multi-insert 1546 // forwards null/null since Hive has no RETURNING/OUTPUT). 
1547 List<OutputColumn> returningCols = buildReturningColumns( 1548 returningClause, 1549 outputClause, 1550 "INSERT", 1551 targetQName, 1552 insertTargetAlias, 1553 targetTable, 1554 /*fromSideRelations=*/ Collections.<RelationSource>emptyList(), 1555 provider, 1556 insertIdx, 1557 outLineage, 1558 insert); 1559 1560 StatementGraph insertOuter = new StatementGraph( 1561 /*name=*/ null, 1562 "INSERT", 1563 insertRelations, 1564 /*outputColumns=*/ Collections.<OutputColumn>emptyList(), 1565 returningCols, 1566 /*filterColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1567 /*joinColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1568 /*groupByColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1569 /*havingColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1570 /*orderByColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1571 /*distinctOnColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1572 /*distinct=*/ false, 1573 /*setOperator=*/ null, 1574 /*rowLimit=*/ null, 1575 target); 1576 out.add(insertOuter); 1577 1578 // Cross-statement lineage: target.col_i ← STATEMENT_OUTPUT(selectIdx, srcName_i) 1579 for (int i = 0; i < sourceOutCount; i++) { 1580 String srcName = sourceOutputs.get(i).getName(); 1581 String tgtName = (i < targetColumnNames.size()) 1582 ? targetColumnNames.get(i) : srcName; 1583 if (tgtName == null || tgtName.isEmpty()) { 1584 continue; 1585 } 1586 outLineage.add(new LineageEdge( 1587 LineageRef.tableColumn(targetQName, tgtName), 1588 LineageRef.statementOutput(selectIdx, srcName))); 1589 } 1590 } 1591 1592 /** 1593 * Slice 109 — assemble the INSERT-target half (TargetRelation, INSERT 1594 * StatementGraph, RETURNING/OUTPUT projections, and cross-statement 1595 * lineage edges) when the source SELECT and its inner extractions have 1596 * ALREADY been appended directly to {@code out}/{@code outLineage} by 1597 * {@link #buildSelectBodyAfterCteWalk}. 
The slice-93 1598 * {@link #assembleInsertGraphAndLineage} helper, by contrast, takes a 1599 * pre-built {@link SemanticProgram} and rebases STATEMENT_OUTPUT 1600 * indices on the way in — that path is unused here because the source 1601 * SELECT was already built into absolute positions in {@code out}. 1602 * 1603 * <p>{@code selectIdx} must be the position of the source SELECT in 1604 * {@code out} (last statement appended by the caller before this helper 1605 * runs). RETURNING/OUTPUT clauses are passed directly (Hive multi- 1606 * insert callers pass {@code null}/{@code null}); other DMLs that 1607 * adopt this helper later can forward their own. 1608 */ 1609 private static void assembleInsertTargetGraphFromAppended( 1610 TInsertSqlStatement insert, 1611 TTable targetTable, 1612 String targetQName, 1613 int selectIdx, 1614 String diagnosticPrefix, 1615 TReturningClause returningClause, 1616 TOutputClause outputClause, 1617 List<StatementGraph> out, 1618 List<LineageEdge> outLineage, 1619 NameBindingProvider provider) { 1620 StatementGraph sourceOuter = out.get(selectIdx); 1621 List<OutputColumn> sourceOutputs = sourceOuter.getOutputColumns(); 1622 int sourceOutCount = sourceOutputs.size(); 1623 1624 TObjectNameList colList = insert.getColumnList(); 1625 List<String> targetColumnNames = new ArrayList<>(); 1626 if (colList != null && colList.size() > 0) { 1627 for (int i = 0; i < colList.size(); i++) { 1628 TObjectName n = colList.getObjectName(i); 1629 targetColumnNames.add(n == null ? 
"" : n.toString()); 1630 } 1631 if (targetColumnNames.size() != sourceOutCount) { 1632 throw new SemanticIRBuildException(Diagnostic.error( 1633 DiagnosticCode.INSERT_COLUMN_COUNT_MISMATCH, 1634 diagnosticPrefix + " column list has " 1635 + targetColumnNames.size() 1636 + " column(s) but source SELECT produced " 1637 + sourceOutCount + " output(s)", 1638 insert)); 1639 } 1640 } 1641 1642 String sourceName = sourceOuter.getName(); 1643 String sourceRelAlias = (sourceName != null && !sourceName.isEmpty()) 1644 ? sourceName : "__insert_source__"; 1645 RelationBinding sourceBinding = new RelationBinding( 1646 RelationKind.SUBQUERY, sourceRelAlias); 1647 List<RelationSource> insertRelations = new ArrayList<>(); 1648 insertRelations.add(new RelationSource(sourceRelAlias, sourceBinding)); 1649 1650 RelationBinding targetBinding = new RelationBinding( 1651 RelationKind.TABLE, targetQName); 1652 TargetRelation target = new TargetRelation(targetBinding, targetColumnNames); 1653 1654 int insertIdx = out.size(); 1655 String insertTargetAlias = effectiveAliasOf(targetTable); 1656 if (insertTargetAlias == null || insertTargetAlias.isEmpty()) { 1657 insertTargetAlias = targetQName; 1658 } 1659 List<OutputColumn> returningCols = buildReturningColumns( 1660 returningClause, 1661 outputClause, 1662 "INSERT", 1663 targetQName, 1664 insertTargetAlias, 1665 targetTable, 1666 /*fromSideRelations=*/ Collections.<RelationSource>emptyList(), 1667 provider, 1668 insertIdx, 1669 outLineage, 1670 insert); 1671 1672 StatementGraph insertOuter = new StatementGraph( 1673 /*name=*/ null, 1674 "INSERT", 1675 insertRelations, 1676 /*outputColumns=*/ Collections.<OutputColumn>emptyList(), 1677 returningCols, 1678 /*filterColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1679 /*joinColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1680 /*groupByColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1681 /*havingColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1682 /*orderByColumnRefs=*/ 
Collections.<ColumnRef>emptyList(), 1683 /*distinctOnColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1684 /*distinct=*/ false, 1685 /*setOperator=*/ null, 1686 /*rowLimit=*/ null, 1687 target); 1688 out.add(insertOuter); 1689 1690 for (int i = 0; i < sourceOutCount; i++) { 1691 String srcName = sourceOutputs.get(i).getName(); 1692 String tgtName = (i < targetColumnNames.size()) 1693 ? targetColumnNames.get(i) : srcName; 1694 if (tgtName == null || tgtName.isEmpty()) { 1695 continue; 1696 } 1697 outLineage.add(new LineageEdge( 1698 LineageRef.tableColumn(targetQName, tgtName), 1699 LineageRef.statementOutput(selectIdx, srcName))); 1700 } 1701 } 1702 1703 /** 1704 * Slice 93 — rebase a {@link LineageEdge}'s {@code STATEMENT_OUTPUT} 1705 * statement indices by {@code offset}. {@code TABLE_COLUMN} refs are 1706 * returned unchanged. Used to concatenate inner {@link SemanticProgram}s 1707 * into a larger one (Hive multi-insert: each INSERT-SELECT pair's inner 1708 * program contributes its own block of statements). 1709 */ 1710 private static LineageEdge rebaseLineageEdge(LineageEdge e, int offset) { 1711 if (offset == 0) { 1712 return e; 1713 } 1714 LineageRef from = rebaseLineageRef(e.getFrom(), offset); 1715 LineageRef to = rebaseLineageRef(e.getTo(), offset); 1716 if (from == e.getFrom() && to == e.getTo()) { 1717 return e; 1718 } 1719 return new LineageEdge(from, to); 1720 } 1721 1722 private static LineageRef rebaseLineageRef(LineageRef ref, int offset) { 1723 if (ref == null) { 1724 return null; 1725 } 1726 if (ref.getKind() != LineageRef.Kind.STATEMENT_OUTPUT) { 1727 return ref; 1728 } 1729 return LineageRef.statementOutput( 1730 ref.getStatementIndex() + offset, ref.getOutputName()); 1731 } 1732 1733 /** 1734 * Slice 79 — admit a single {@code CREATE TABLE target [(c1, ...)] AS 1735 * SELECT ...} (CTAS) statement. 
Builds the source SELECT via 1736 * {@link #build} unchanged, then appends a {@code "CREATE_TABLE"}- 1737 * kind {@link StatementGraph} carrying the target relation and 1738 * cross-statement lineage edges (mirrors slice-78 INSERT). 1739 * 1740 * <p>Admitted shape: {@code CREATE [OR REPLACE] TABLE target 1741 * [(c1, c2, ...)] AS <subquery-SELECT>}. Plain 1742 * {@code CREATE TABLE target (a INT, b VARCHAR)} (column DDL with 1743 * no AS SELECT) is rejected via 1744 * {@link DiagnosticCode#CREATE_AS_NO_SOURCE_SELECT}. Explicit 1745 * column-list arity mismatch surfaces as 1746 * {@link DiagnosticCode#CREATE_AS_COLUMN_COUNT_MISMATCH}; a 1747 * missing / empty target name surfaces (defensively) as 1748 * {@link DiagnosticCode#CREATE_AS_TARGET_MISSING}. 1749 * 1750 * <p>For CTAS the explicit column-list spellings come from 1751 * {@link TCreateTableSqlStatement#getColumnList()} — only the bare 1752 * column name from each {@link TColumnDefinition} is consumed; 1753 * data-type tokens are ignored by slice 79. 1754 */ 1755 public static SemanticProgram buildCreateTable(TCreateTableSqlStatement create, 1756 NameBindingProvider provider) { 1757 if (create == null) { 1758 throw new IllegalArgumentException("create must not be null"); 1759 } 1760 if (provider == null) { 1761 throw new IllegalArgumentException("provider must not be null"); 1762 } 1763 1764 // Target name extraction. CTAS exposes the target via the 1765 // TCustomSqlStatement-inherited getTargetTable(); the explicit 1766 // getTableName() is a thin wrapper around tables[0].getTableName() 1767 // and also works. Use getTableName() for symmetry with the 1768 // slice-78 INSERT path. 
1769 TObjectName targetName = create.getTableName(); 1770 if (targetName == null) { 1771 throw new SemanticIRBuildException(Diagnostic.error( 1772 DiagnosticCode.CREATE_AS_TARGET_MISSING, 1773 "CREATE TABLE has no resolvable target table name", 1774 create)); 1775 } 1776 String targetQName = targetName.toString(); 1777 if (targetQName == null || targetQName.isEmpty()) { 1778 throw new SemanticIRBuildException(Diagnostic.error( 1779 DiagnosticCode.CREATE_AS_TARGET_MISSING, 1780 "CREATE TABLE target table name is empty", 1781 create)); 1782 } 1783 1784 TSelectSqlStatement source = create.getSubQuery(); 1785 if (source == null) { 1786 throw new SemanticIRBuildException(Diagnostic.error( 1787 DiagnosticCode.CREATE_AS_NO_SOURCE_SELECT, 1788 "CREATE TABLE has no AS SELECT subquery; slice 79 admits " 1789 + "CTAS (CREATE TABLE <target> [(c1, ...)] AS SELECT ...) only", 1790 create)); 1791 } 1792 1793 // Pull explicit column-list spellings BEFORE building the inner 1794 // — keeps the error path cheap for the structural-invalid case 1795 // (CTAS with column count mismatch is detected after the inner 1796 // build because we don't know the source output count yet). 1797 List<String> targetColumnNames = new ArrayList<>(); 1798 TColumnDefinitionList colList = create.getColumnList(); 1799 if (colList != null && colList.size() > 0) { 1800 for (int i = 0; i < colList.size(); i++) { 1801 TColumnDefinition cd = colList.getColumn(i); 1802 TObjectName n = (cd == null) ? null : cd.getColumnName(); 1803 String spelling = (n == null) ? "" : n.toString(); 1804 targetColumnNames.add(spelling); 1805 } 1806 } 1807 1808 return assembleCreateLikeProgram(create, source, provider, 1809 "CREATE_TABLE", targetQName, targetColumnNames); 1810 } 1811 1812 /** 1813 * Slice 79 — admit a single 1814 * {@code CREATE [OR REPLACE] VIEW v [(c1, ...)] AS SELECT ...} 1815 * statement. 
Mirrors {@link #buildCreateTable} except the source 1816 * SELECT is fetched via {@link TCreateViewSqlStatement#getSubquery()} 1817 * (lowercase 'q'), the target name from 1818 * {@link TCreateViewSqlStatement#getViewName()}, and the explicit 1819 * column-list spellings from {@link TViewAliasClause} on the AST. 1820 */ 1821 public static SemanticProgram buildCreateView(TCreateViewSqlStatement create, 1822 NameBindingProvider provider) { 1823 if (create == null) { 1824 throw new IllegalArgumentException("create must not be null"); 1825 } 1826 if (provider == null) { 1827 throw new IllegalArgumentException("provider must not be null"); 1828 } 1829 1830 TObjectName viewName = create.getViewName(); 1831 if (viewName == null) { 1832 throw new SemanticIRBuildException(Diagnostic.error( 1833 DiagnosticCode.CREATE_AS_TARGET_MISSING, 1834 "CREATE VIEW has no resolvable view name", 1835 create)); 1836 } 1837 String targetQName = viewName.toString(); 1838 if (targetQName == null || targetQName.isEmpty()) { 1839 throw new SemanticIRBuildException(Diagnostic.error( 1840 DiagnosticCode.CREATE_AS_TARGET_MISSING, 1841 "CREATE VIEW target view name is empty", 1842 create)); 1843 } 1844 1845 TSelectSqlStatement source = create.getSubquery(); 1846 if (source == null) { 1847 throw new SemanticIRBuildException(Diagnostic.error( 1848 DiagnosticCode.CREATE_AS_NO_SOURCE_SELECT, 1849 "CREATE VIEW has no AS SELECT subquery; slice 79 admits " 1850 + "CREATE VIEW <target> [(c1, ...)] AS SELECT ... only", 1851 create)); 1852 } 1853 1854 // View-side explicit column aliases via viewAliasClause. Items 1855 // whose alias is null are preserved as empty-string entries so 1856 // a parser-quirk gap doesn't silently collapse the list and 1857 // shift later aliases onto wrong source-output positions — 1858 // count-mismatch detection downstream stays accurate 1859 // (codex diff-review round 1 P2 catch). 
1860 List<String> targetColumnNames = new ArrayList<>(); 1861 TViewAliasClause aliasClause = create.getViewAliasClause(); 1862 if (aliasClause != null) { 1863 TViewAliasItemList items = aliasClause.getViewAliasItemList(); 1864 if (items != null) { 1865 for (int i = 0; i < items.size(); i++) { 1866 TViewAliasItem item = items.getViewAliasItem(i); 1867 TObjectName alias = (item == null) ? null : item.getAlias(); 1868 String spelling = (alias == null) ? "" : alias.toString(); 1869 targetColumnNames.add(spelling); 1870 } 1871 } 1872 } 1873 1874 return assembleCreateLikeProgram(create, source, provider, 1875 "CREATE_VIEW", targetQName, targetColumnNames); 1876 } 1877 1878 /** 1879 * Shared assembly path for slice-79 CTAS / CREATE VIEW. Given a 1880 * pre-validated target name and the (possibly empty) list of 1881 * explicit column-list spellings, builds the source SELECT, 1882 * validates column-list arity, and emits the outer 1883 * StatementGraph + cross-stmt lineage edges. Mirrors the 1884 * post-source half of slice-78 {@link #buildInsert}. 
1885 */ 1886 private static SemanticProgram assembleCreateLikeProgram( 1887 TParseTreeNode anchor, TSelectSqlStatement source, 1888 NameBindingProvider provider, String outerKind, 1889 String targetQName, List<String> targetColumnNames) { 1890 SemanticProgram inner = build(source, provider); 1891 List<StatementGraph> innerStmts = inner.getStatements(); 1892 if (innerStmts.isEmpty()) { 1893 throw new SemanticIRBuildException(Diagnostic.error( 1894 DiagnosticCode.CREATE_AS_NO_SOURCE_SELECT, 1895 "CREATE source built no statements", 1896 anchor)); 1897 } 1898 int selectIdx = innerStmts.size() - 1; 1899 StatementGraph sourceOuter = innerStmts.get(selectIdx); 1900 List<OutputColumn> sourceOutputs = sourceOuter.getOutputColumns(); 1901 int sourceOutCount = sourceOutputs.size(); 1902 1903 if (!targetColumnNames.isEmpty() 1904 && targetColumnNames.size() != sourceOutCount) { 1905 throw new SemanticIRBuildException(Diagnostic.error( 1906 DiagnosticCode.CREATE_AS_COLUMN_COUNT_MISMATCH, 1907 outerKind.equals("CREATE_TABLE") 1908 ? ("CREATE TABLE column list has " + targetColumnNames.size() 1909 + " column(s) but source SELECT produced " 1910 + sourceOutCount + " output(s)") 1911 : ("CREATE VIEW alias list has " + targetColumnNames.size() 1912 + " column(s) but source SELECT produced " 1913 + sourceOutCount + " output(s)"), 1914 anchor)); 1915 } 1916 1917 String sourceName = sourceOuter.getName(); 1918 String sourceRelAlias = (sourceName != null && !sourceName.isEmpty()) 1919 ? 
sourceName : "__create_source__"; 1920 RelationBinding sourceBinding = new RelationBinding( 1921 RelationKind.SUBQUERY, sourceRelAlias); 1922 List<RelationSource> createRelations = new ArrayList<>(); 1923 createRelations.add(new RelationSource(sourceRelAlias, sourceBinding)); 1924 1925 RelationBinding targetBinding = new RelationBinding( 1926 RelationKind.TABLE, targetQName); 1927 TargetRelation target = new TargetRelation(targetBinding, targetColumnNames); 1928 1929 List<StatementGraph> out = new ArrayList<>(innerStmts.size() + 1); 1930 out.addAll(innerStmts); 1931 List<LineageEdge> outLineage = new ArrayList<>(inner.getLineage()); 1932 1933 StatementGraph createOuter = new StatementGraph( 1934 /*name=*/ null, 1935 outerKind, 1936 createRelations, 1937 /*outputColumns=*/ Collections.<OutputColumn>emptyList(), 1938 /*filterColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1939 /*joinColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1940 /*groupByColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1941 /*havingColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1942 /*orderByColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1943 /*distinctOnColumnRefs=*/ Collections.<ColumnRef>emptyList(), 1944 /*distinct=*/ false, 1945 /*setOperator=*/ null, 1946 /*rowLimit=*/ null, 1947 target); 1948 out.add(createOuter); 1949 1950 for (int i = 0; i < sourceOutCount; i++) { 1951 String srcName = sourceOutputs.get(i).getName(); 1952 String tgtName = (i < targetColumnNames.size()) 1953 ? targetColumnNames.get(i) : srcName; 1954 if (tgtName == null || tgtName.isEmpty()) { 1955 continue; 1956 } 1957 outLineage.add(new LineageEdge( 1958 LineageRef.tableColumn(targetQName, tgtName), 1959 LineageRef.statementOutput(selectIdx, srcName))); 1960 } 1961 1962 return new SemanticProgram(out, outLineage); 1963 } 1964 1965 /** 1966 * Slice 80 / 82 — admit {@code UPDATE target SET c1 = expr1, 1967 * c2 = expr2, ... [FROM source_list] [WHERE pred]} statements. 
1968 * Emits one {@code "UPDATE"}-kind {@link StatementGraph} carrying 1969 * the target relation plus synthetic {@link OutputColumn} entries 1970 * per SET assignment (output name = SET LHS verbatim spelling; 1971 * sources = column refs collected from the RHS expression). 1972 * Optional WHERE refs surface on 1973 * {@link StatementGraph#getFilterColumnRefs()}. 1974 * 1975 * <p>Slice 82 lifts the slice-80 {@code UPDATE_JOINED_NOT_SUPPORTED} 1976 * reject for the common PG / MSSQL / BigQuery / Snowflake / Redshift 1977 * FROM-side joined UPDATE shapes. The IR shape gains two slots: 1978 * {@code relations[]} now carries TABLE-kind RelationSources for 1979 * FROM-side sources (slice 80 left empty), and 1980 * {@code joinColumnRefs[]} now carries ON-clause column refs from 1981 * FROM-side JOINs. The target stays on 1982 * {@link StatementGraph#getTarget()}; a reference-identity filter 1983 * excludes the target's own TTable instance from {@code relations[]}. 1984 * 1985 * <p>Admitted shape: 1986 * <ul> 1987 * <li>Single-target UPDATE without FROM (slice 80) — 1988 * {@code relations[]} stays empty.</li> 1989 * <li>PG / BQ / SF / RS {@code UPDATE t SET ... FROM source} 1990 * (single FROM source).</li> 1991 * <li>PG / BQ {@code UPDATE t SET ... FROM s1, s2, ...} 1992 * (comma-FROM list).</li> 1993 * <li>PG / MSSQL {@code UPDATE t SET ... FROM s1 [INNER|LEFT|RIGHT|FULL OUTER] JOIN s2 ON ...} 1994 * — ON refs populate {@code joinColumnRefs[]}.</li> 1995 * <li>MSSQL {@code UPDATE t SET ... FROM t INNER JOIN s ON ...} 1996 * — target may appear in FROM; reference-identity filter 1997 * excludes the target's own TTable instance from 1998 * {@code relations[]}.</li> 1999 * <li>Explicit {@code CROSS JOIN} (no ON; semantically equivalent 2000 * to comma-FROM).</li> 2001 * <li>SET LHS is a {@link EExpressionType#simple_object_name_t} 2002 * column reference (qualified {@code t.x} or bare {@code x}). 
2003 * Oracle tuple {@code SET (a, b) = (...)} (LHS = list_t) 2004 * rejects via 2005 * {@link DiagnosticCode#UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED}.</li> 2006 * <li>SET RHS may be any expression NOT containing a scalar 2007 * subquery and NOT containing a window function. Subqueries 2008 * reject via 2009 * {@link DiagnosticCode#UPDATE_SET_HAS_SUBQUERY_NOT_SUPPORTED}; 2010 * window functions reuse the existing 2011 * {@link DiagnosticCode#CLAUSE_WINDOW_FUNCTION_LEAK} 2012 * routed through {@link #rejectWindowFunctionInScope}.</li> 2013 * <li>Optional WHERE clause — existing WHERE-side rejects 2014 * (subqueries, window functions) continue to apply via the 2015 * shared {@link #containsAnySubquery} + 2016 * {@code rejectWindowFunctionInScope} helpers used by SELECT 2017 * WHERE.</li> 2018 * </ul> 2019 * 2020 * <p>Slice 82 reject scope, with slice 83 admitting subquery FROM 2021 * sources (the slice-82 {@code UPDATE_FROM_SUBQUERY_NOT_SUPPORTED} 2022 * code stays declared but unreached — slice-71/72 2023 * retain-for-documentation precedent): 2024 * <ul> 2025 * <li>Subquery as a FROM source — slice 83 admits via the 2026 * SELECT-side {@code processDirectSubqueryTable} extractor, 2027 * publishing a SUBQUERY-kind {@link RelationSource} and a 2028 * cross-statement {@link LineageEdge} per subquery-bound 2029 * output source.</li> 2030 * <li>USING in any FROM-side join item — 2031 * {@link DiagnosticCode#UPDATE_FROM_JOIN_USING_NOT_SUPPORTED}.</li> 2032 * <li>NATURAL JOIN in any FROM-side join item — 2033 * {@link DiagnosticCode#UPDATE_FROM_JOIN_NATURAL_NOT_SUPPORTED}.</li> 2034 * <li>Subquery in any ON condition — 2035 * {@link DiagnosticCode#UPDATE_JOIN_ON_HAS_SUBQUERY_NOT_SUPPORTED}.</li> 2036 * <li>Window function in any ON condition — reuses 2037 * {@link DiagnosticCode#CLAUSE_WINDOW_FUNCTION_LEAK} via 2038 * {@link #rejectWindowFunctionInScope}.</li> 2039 * </ul> 2040 * 2041 * <p>Deferred (rejected at the outer level before any SET 2042 * processing): 2043 * <ul> 
*   <li>Top-level WITH on UPDATE →
     *       {@link DiagnosticCode#UPDATE_CTE_NOT_SUPPORTED}. Since slice
     *       105 the method body admits top-level WITH (step 1), so this
     *       code stays declared but is not raised by this method.</li>
     *   <li>RETURNING projection (PG / Oracle) →
     *       {@link DiagnosticCode#UPDATE_RETURNING_CLAUSE_NOT_SUPPORTED}.
     *       Since slice 85 the projection is admitted and built after the
     *       SET / WHERE / FROM walks; the code stays declared but
     *       unreached.</li>
     *   <li>OUTPUT projection (SQL Server) →
     *       {@link DiagnosticCode#UPDATE_OUTPUT_CLAUSE_NOT_SUPPORTED}.
     *       Since slice 85 projection-only OUTPUT is admitted; only
     *       {@code OUTPUT ... INTO} still rejects, via
     *       {@link DiagnosticCode#OUTPUT_INTO_NOT_SUPPORTED}.</li>
     *   <li>ORDER BY / LIMIT on UPDATE (MySQL / Couchbase) →
     *       {@link DiagnosticCode#UPDATE_ORDER_BY_OR_LIMIT_NOT_SUPPORTED}.</li>
     *   <li>Empty / missing SET clause, Couchbase UNSET-only updates →
     *       {@link DiagnosticCode#UPDATE_NO_SET_CLAUSE}.</li>
     *   <li>Missing target table (defensive) →
     *       {@link DiagnosticCode#UPDATE_TARGET_MISSING}.</li>
     * </ul>
     *
     * <p>Cross-statement {@link LineageEdge}s, one per SET assignment:
     * <pre>
     *   from = LineageRef.tableColumn(targetQName, target_col_i)
     *   to   = LineageRef.statementOutput(updateIdx, output_name_i)
     * </pre>
     * {@code updateIdx} is the UPDATE statement's own position in the
     * returned program ({@code stmts.size()} just before the UPDATE graph
     * is appended); it is 0 only when no CTE body or subquery was
     * extracted ahead of it. The synthetic output IS the per-assignment
     * "projection" that flows into the target column. This is the
     * slice-78 INSERT contract (TABLE_COLUMN → STATEMENT_OUTPUT) with the
     * source SELECT replaced by the UPDATE's own per-assignment outputs;
     * consumers read {@code outputs[i].sources} to enumerate the RHS
     * column refs that feed the target column.
     *
     * @param update   parsed UPDATE statement; must not be {@code null}
     * @param provider name-binding provider used to resolve column
     *                 references; must not be {@code null}
     * @return a program whose statement list ends with the UPDATE graph,
     *         preceded by every statement extracted while building it,
     *         together with the collected lineage edges
     * @throws IllegalArgumentException if {@code update} or
     *         {@code provider} is {@code null}
     * @throws SemanticIRBuildException if the statement has a shape this
     *         builder rejects (see the lists above)
     */
    public static SemanticProgram buildUpdate(TUpdateSqlStatement update,
                                              NameBindingProvider provider) {
        if (update == null) {
            throw new IllegalArgumentException("update must not be null");
        }
        if (provider == null) {
            throw new IllegalArgumentException("provider must not be null");
        }

        // Slice 86 — defensive UsingScope reset at entry so a parent
        // scope cannot leak into UPDATE's binding decisions. Mirrors
        // SELECT-side buildSelectStatementImpl (slice 65). The UPDATE's
        // own UsingScope is installed after the FROM-join walker (step
        // 5.8 below).
        provider = provider.withUsingScope(UsingScope.EMPTY);

        // 1) Slice 105 — admit top-level WITH on UPDATE. Walks the CTE
        // list left-to-right, building each body as a preceding
        // StatementGraph and producing cteNameToStatementIndex +
        // ctePublishedColumns for the FROM-as-CTE branch in
        // buildUpdateRelation below. Mirrors the slice-101 MERGE walker.
        // `stmts` / `lineage` are allocated here (hoisted from the prior
        // slice-83 location) so the CTE walker can append.
        // UPDATE_CTE_NOT_SUPPORTED stays declared-but-unreached
        // (slice 71/72/82/86/95/96/97/98/99/100/101/102/103/104 precedent).
        List<StatementGraph> stmts = new ArrayList<>();
        List<LineageEdge> lineage = new ArrayList<>();
        Map<String, List<String>> ctePublishedColumns = new LinkedHashMap<>();
        Map<String, Integer> cteNameToStatementIndex = buildUpdateCteList(
                update, provider, stmts, lineage, ctePublishedColumns);

        // 2) Target table — defensive (parser usually rejects first).
        TTable targetTable = update.getTargetTable();
        if (targetTable == null || targetTable.getTableName() == null) {
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.UPDATE_TARGET_MISSING,
                    "UPDATE statement has no resolvable target table",
                    update));
        }
        String targetQName = targetTable.getTableName().toString();
        if (targetQName == null || targetQName.isEmpty()) {
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.UPDATE_TARGET_MISSING,
                    "UPDATE target table name is empty",
                    update));
        }

        // 3) Slice 82 — FROM-side joined UPDATE is now admitted. The
        // slice-80 UPDATE_JOINED_NOT_SUPPORTED rejects (which fired on
        // update.tables.size() > 1 and update.getFromSourceJoin() != null)
        // are removed. Shape-specific handling in the FROM walk (step
        // 5.7) replaces them. UPDATE_JOINED_NOT_SUPPORTED remains
        // declared but unreached for API stability (the residual
        // join-form-target shape `UPDATE (a JOIN b) SET ...` does not
        // parse in any supported dialect — verified by AST probe).
        //
        // Reject ordering within buildUpdate: the statement-level checks
        // visible below (target-missing / OUTPUT ... INTO / ORDER BY /
        // LIMIT / SET-empty) all run before the per-source FROM walk so
        // a single rejection wins on multi-violation shapes (e.g. an
        // UPDATE with both `OUTPUT ... INTO` and an unsupported FROM
        // item rejects on OUTPUT ... INTO first). The CTE walker of
        // step 1 runs before all of them.

        // 4) Slice 85 lifts the RETURNING / OUTPUT rejects — projections
        // are now admitted via buildReturningColumns, called after the
        // SET / WHERE / FROM walks complete (the projection expressions
        // need the providerWithStar binding constructed in step 5.5).
        // The cheap statement-level OUTPUT_INTO reject fires here so a
        // multi-violation shape (OUTPUT … INTO target with RETURNING
        // content errors) routes to the cheaper structural code first.
        // UPDATE_RETURNING_CLAUSE_NOT_SUPPORTED and
        // UPDATE_OUTPUT_CLAUSE_NOT_SUPPORTED stay declared but
        // unreached (slice 71/72 retain-for-documentation precedent).
        if (update.getOutputClause() != null
                && update.getOutputClause().getIntoTable() != null) {
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.OUTPUT_INTO_NOT_SUPPORTED,
                    "UPDATE OUTPUT ... INTO <target> writes a second target; "
                            + "slice 85 admits projection-only OUTPUT",
                    update));
        }
        if (update.getOrderByClause() != null
                || update.getLimitClause() != null) {
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.UPDATE_ORDER_BY_OR_LIMIT_NOT_SUPPORTED,
                    "UPDATE with ORDER BY / LIMIT (MySQL / Couchbase) is "
                            + "not supported by SemanticIRBuilder.buildUpdate; "
                            + "slice 80 admits no row-pruning on UPDATE",
                    update));
        }

        // 5) SET / UNSET — slice 80 requires a non-empty SET clause; a
        // Couchbase UNSET-only update (UnSetTerms populated, SET empty)
        // routes through the same code with discriminating message text.
        TResultColumnList sets = update.getResultColumnList();
        boolean hasUnSet = update.getUnSetTerms() != null
                && update.getUnSetTerms().size() > 0;
        if (sets == null || sets.size() == 0) {
            String reason = hasUnSet
                    ? "UPDATE has only an UNSET clause (Couchbase); slice 80 "
                            + "requires a non-empty SET clause"
                    : "UPDATE has no SET clause";
            throw new SemanticIRBuildException(Diagnostic.error(
                    DiagnosticCode.UPDATE_NO_SET_CLAUSE,
                    reason,
                    update));
        }

        // 5.5) Slice 83 — extract FROM subqueries as their own
        // StatementGraphs (after slice 105's CTE walker so the CTE
        // bodies precede any extracted FROM-subquery in the program).
        //
        // The extractor reuses the SELECT-side
        // processDirectSubqueryTable — passing the slice-105
        // cteNameToStatementIndex + ctePublishedColumns so a nested
        // SELECT inside a FROM-subquery can still resolve outer CTE
        // references through CTEScope (Resolver2 already binds CTE
        // refs in UPDATE correctly; the maps are passed for parity with
        // the SELECT/MERGE call sites). Inner predicate subqueries in
        // WHERE / JOIN ON / GROUP BY are caught by the slice-17 leak
        // guard (rejectSubqueriesInFromSubqueryBodyClauses).
        //
        // No snapshot/rollback wrapper here (codex round-1 Q5 NICE):
        // buildUpdate owns fresh local stmts/lineage lists and
        // propagates exceptions to the caller — no observer can see
        // partial mutation.
        //
        // Slice 110 — decorate `provider` with `withCteContext` BEFORE
        // passing it to `extractUpdateFromSubqueries` so a nested SELECT
        // inside an extracted FROM-subquery body (e.g.
        // `UPDATE t SET col = sub.x FROM (SELECT id, x FROM cte) sub`)
        // routes CTE refs through `RelationKind.CTE`. Mirrors the
        // slice-106 DELETE-side `providerWithCte` pattern (codex
        // round-2 Q2 BLOCKING fix in slice 106). The slice-105 UPDATE
        // site missed this decoration; slice 110 closes the gap here
        // since it also relies on the same decoration for the
        // WHERE-side predicate-subquery extraction in step 7.
        NameBindingProvider providerWithCte = cteNameToStatementIndex.isEmpty()
                ? provider
                : provider.withCteContext(cteNameToStatementIndex.keySet());
        Map<String, Integer> subqueryAliasToIndex =
                extractUpdateFromSubqueries(update, providerWithCte, stmts, lineage,
                        cteNameToStatementIndex, ctePublishedColumns);
        // Build the in-scope map (subquery-alias → published column
        // names, plus CTE-bound alias → CTE published columns) so
        // `provider.withInScopeRelationColumns(map)` recognises
        // `sub.x` AND `cte.x` for the consuming UPDATE. Base-table
        // FROM-side relations don't need an entry; their column
        // resolution stays on the Resolver2 catalog path.
        Map<String, List<String>> updateInScope = buildUpdateInScopeMap(
                update, subqueryAliasToIndex, stmts,
                cteNameToStatementIndex, ctePublishedColumns);
        // Slice 110 — base `providerWithStar` on `providerWithCte`
        // (instead of raw `provider`) so SET RHS / WHERE / RETURNING
        // collectors and the slice-86 USING/NATURAL walker all see the
        // outer CTE context. Without this, a CTE-bound reference inside
        // a JOIN ON expression or a SET RHS scalar would bind as TABLE-
        // kind even when the CTE is declared at the UPDATE level.
        NameBindingProvider providerWithStar = updateInScope.isEmpty()
                ? providerWithCte
                : providerWithCte.withInScopeRelationColumns(updateInScope);

        // 5.7) Slice 86 — relocated from slice-82 step 8. The FROM-side
        // join walker now runs BEFORE SET RHS / WHERE collection so the
        // slice-86 UsingScope (step 5.8 below) can be applied to those
        // collectors. Slice 65 SELECT-side ordering: buildRelations →
        // buildUsingScope → buildOutputColumns / buildFilter / etc.
        // The join walker uses `providerWithStar` (inScope only — no
        // UsingScope yet) because USING/NATURAL emit joinColumnRefs[]
        // directly via emitMergedJoinRefs without consulting UsingScope.
        //
        // The walker treats `update.getJoins()` as the authoritative
        // FROM-list representation:
        //   - PG plain `FROM s`         → joins=[{table=s, items=[]}]
        //   - PG comma-FROM             → joins=[{s1, items=[]}, {s2, items=[]}, ...]
        //   - PG / MSSQL explicit JOIN  → joins=[{driver, items=[item1,...]}]
        //   - MSSQL target-in-FROM      → joins=[{target_alias, items=[other,...]}]
        //
        // For each TJoin: the driver table goes through buildUpdateRelation
        // (which applies the slice-82 FROM-source rejects + identity
        // filter); each JoinItem is walked through buildUpdateJoinItem
        // which (slice 86) admits USING / NATURAL via slice-64/65/66
        // shared helpers in addition to ON / CROSS.
        List<RelationSource> relations = new ArrayList<>();
        // Slice 82 codex round-1 Q2 BLOCKING — LinkedHashSet dedup spans
        // the whole FROM so a column appearing in two ON clauses
        // produces one entry. Slice 86 USING/NATURAL emit refs also flow
        // through this dedup.
        java.util.LinkedHashSet<ColumnRef> joinRefsSet =
                new java.util.LinkedHashSet<>();
        // NOTE(review): the loop assumes update.getJoins() is never
        // null — TODO confirm against TUpdateSqlStatement.
        for (TJoin join : update.getJoins()) {
            TTable leftTable = join.getTable();
            buildUpdateRelation(leftTable, targetTable, relations, update,
                    cteNameToStatementIndex);
            TJoinItemList items = join.getJoinItems();
            if (items == null) continue;
            // Slice 86 — per top-level TJoin LeftOutputState seeded
            // with providerWithStar (codex round-1 B2 BLOCKING: inScope
            // installed so extracted FROM-subquery drivers' published
            // columns are visible to lookupRelationColumnNames for
            // NATURAL inference). Reset between top-level TJoins so
            // comma-FROM groups stay independent (matches SELECT-side
            // buildRelations slice-66 behavior).
            LeftOutputState leftState = new LeftOutputState();
            seedLeftOutput(leftState, leftTable, providerWithStar);
            for (int i = 0; i < items.size(); i++) {
                // NOTE(review): `item` is not read again in this loop
                // body — buildUpdateJoinItem receives (items, i)
                // instead. Kept as-is; confirm getJoinItem has no side
                // effect before removing the local.
                TJoinItem item = items.getJoinItem(i);
                // Slice 86 — extended buildUpdateJoinItem signature
                // threads the join context (topJoin / items / itemIndex)
                // and LeftOutputState to the USING/NATURAL admit paths
                // so they can call the SELECT-side slice-64/65/66
                // shared helpers verbatim.
                // Slice 105 — threads cteNameToStatementIndex so the
                // join walker's per-item buildUpdateRelation call can
                // route objectname-typed CTE references to a SUBQUERY-
                // kind RelationSource pointing at the CTE statement.
                buildUpdateJoinItem(join, items, i, targetTable,
                        providerWithStar, relations, joinRefsSet, leftState,
                        update, cteNameToStatementIndex);
            }
        }
        // Freeze the deduplicated join refs in insertion order.
        List<ColumnRef> joinRefs = new ArrayList<>(joinRefsSet);

        // 5.8) Slice 86 — install the UPDATE's own UsingScope on
        // providerWithStar AFTER the join walker so SET RHS / WHERE /
        // RETURNING refs see merged-key resolution (mirrors SELECT-side
        // buildSelectStatementImpl slice 65 ordering). The join walker
        // itself emits joinColumnRefs via direct emit-refs helpers, so
        // UsingScope is irrelevant to ON refs (matches SELECT-side
        // contract).
        UsingScope updateUsingScope = buildUpdateUsingScope(update, providerWithStar);
        if (!updateUsingScope.isEmpty()) {
            providerWithStar = providerWithStar.withUsingScope(updateUsingScope);
        }

        // 5.9) Slice 115 — extract uncorrelated scalar subqueries on SET
        // RHS as their own <scalar_subquery_<idx>> StatementGraphs
        // appended to `stmts` BEFORE the UPDATE statement. Mirrors slice
        // 11 SELECT-side scalar projection extraction. A SET assignment
        // whose RHS is exactly a top-level subquery_t admits as a scalar
        // SET RHS: the body is built via buildSelectStatement (with the
        // slice-11 scalar-body invariants: allowFromSubqueries=false,
        // allowScalarProjectionSubqueries=false, allowWindowProjection=
        // false), inner predicate-leak guards run, and the resulting
        // ScalarInfo (extracted body index + inner output name) is stored
        // for the per-assignment loop and lineage emission below. The
        // result is keyed by assignment ordinal and may carry several
        // ScalarInfo entries per assignment (slice 119 mixed-expression
        // RHS, handled in step 6).
        //
        // Correlated scalar subqueries (whose inner refs would resolve to
        // an outer alias such as the UPDATE target or a FROM-side
        // relation) STILL reject via the slice-11 promoter called with
        // EnclosingScope.empty() — the inner ref's alias does not match
        // any local relation and no enclosing scope is provided, so
        // promoteCorrelatedRefsToOuterReference throws
        // SCALAR_SUBQUERY_UNKNOWN_RELATION_ALIAS. Lifting UPDATE-side
        // correlation is a follow-up slice (slice 14 SELECT analogue
        // extended to UPDATE).
        Map<Integer, List<ScalarInfo>> setRhsScalarInfo =
                extractScalarSubqueriesFromUpdateSetRhs(update, providerWithStar,
                        stmts, lineage, cteNameToStatementIndex,
                        subqueryAliasToIndex);

        // 6) Per-assignment processing. Each TResultColumn carries an
        // assignment_t TExpression whose leftOperand is the SET LHS
        // (target column reference) and whose rightOperand is the value
        // expression. We collect:
        //   - target column spelling  → TargetRelation.columns[i]
        //   - synthetic output name   → outputs[i].name (verbatim LHS
        //                               spelling, mirrors slice-78
        //                               INSERT column-list contract)
        //   - RHS source column refs  → outputs[i].sources
        // Every iteration either throws or appends exactly one entry to
        // both `outputs` and `targetColumnNames`, so the two lists stay
        // index-aligned with `sets`.
        List<OutputColumn> outputs = new ArrayList<>();
        List<String> targetColumnNames = new ArrayList<>();
        for (int i = 0; i < sets.size(); i++) {
            TResultColumn rc = sets.getResultColumn(i);
            TExpression assignment = (rc == null) ? null : rc.getExpr();
            // Defensive: per TUpdateSqlStatement's javadoc each SET term
            // is an assignment_t. If the parser produced something else
            // (no AST shape observed in the tested corpora) we still
            // route through TUPLE_ASSIGNMENT_NOT_SUPPORTED so an
            // unexpected shape surfaces a stable diagnostic.
            if (assignment == null
                    || assignment.getExpressionType() != EExpressionType.assignment_t) {
                throw new SemanticIRBuildException(Diagnostic.error(
                        DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED,
                        "UPDATE SET assignment #" + (i + 1) + " is not a "
                                + "simple column-value assignment_t; slice 80 "
                                + "admits target_col = expr assignments only",
                        update));
            }
            TExpression lhs = assignment.getLeftOperand();
            TExpression rhs = assignment.getRightOperand();
            if (lhs == null || rhs == null) {
                throw new SemanticIRBuildException(Diagnostic.error(
                        DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED,
                        "UPDATE SET assignment #" + (i + 1)
                                + " is missing an operand",
                        update));
            }
            // Tuple LHS (Oracle) - SET (a, b) = (SELECT c1, c2 FROM ...)
            // surfaces as list_t. Reject before any subquery-on-RHS
            // walk so the diagnostic clearly identifies the tuple shape.
            if (lhs.getExpressionType() == EExpressionType.list_t) {
                throw new SemanticIRBuildException(Diagnostic.error(
                        DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED,
                        "UPDATE SET tuple assignment '(a, b) = ...' is not "
                                + "supported by SemanticIRBuilder.buildUpdate; "
                                + "slice 80 admits target_col = expr only",
                        update));
            }
            // Any other non-column LHS shape reports its expression type.
            if (lhs.getExpressionType() != EExpressionType.simple_object_name_t) {
                throw new SemanticIRBuildException(Diagnostic.error(
                        DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED,
                        "UPDATE SET assignment #" + (i + 1) + " LHS is "
                                + "expressionType=" + lhs.getExpressionType()
                                + "; slice 80 admits simple column references only",
                        update));
            }
            TObjectName targetCol = lhs.getObjectOperand();
            if (targetCol == null) {
                throw new SemanticIRBuildException(Diagnostic.error(
                        DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED,
                        "UPDATE SET assignment #" + (i + 1) + " LHS has no "
                                + "TObjectName operand",
                        update));
            }
            // Verbatim LHS spelling: used both as the target column name
            // and as the synthetic output name.
            String colSpelling = targetCol.toString();

            // Slice 115 — top-level subquery_t SET RHS already extracted
            // in step 5.9 as a <scalar_subquery_<idx>> StatementGraph.
            // OutputColumn carries empty sources; slice-115/119 cross-stmt
            // edge below wires the consumer to the extracted body.
            if (rhs.getExpressionType() == EExpressionType.subquery_t) {
                targetColumnNames.add(colSpelling);
                outputs.add(new OutputColumn(colSpelling,
                        /*derived=*/ true, /*aggregate=*/ false,
                        Collections.<ColumnRef>emptyList()));
                continue;
            }
            // Slice 119 — mixed-expression scalar subquery path: subquery
            // nested inside a compound RHS (e.g. `SET col = (SELECT...) + 1`).
            // The scalar(s) were already extracted in step 5.9; collect only
            // the non-subquery column refs by skipping extracted subq nodes.
            if (containsAnySubqueryExpression(rhs)) {
                List<TExpression> subqRootsList =
                        collectNestedSubqueryExpressions(rhs);
                if (subqRootsList.isEmpty()) {
                    // P2-1 codex-review: containsAnySubqueryExpression
                    // returned true (via getSubQuery() != null) but no
                    // subquery_t nodes were found by acceptChildren
                    // traversal (e.g. EXISTS or non-scalar predicate
                    // subquery). Preserve the original reject so lineage
                    // is not silently dropped.
                    throw new SemanticIRBuildException(Diagnostic.error(
                            DiagnosticCode.UPDATE_SET_HAS_SUBQUERY_NOT_SUPPORTED,
                            "UPDATE SET assignment #" + (i + 1) + " right-hand "
                                    + "side contains a non-scalar subquery "
                                    + "(slice 119 admits only scalar subquery_t "
                                    + "inside compound expressions)",
                            update));
                }
                // Identity-based set: membership is by node reference,
                // not by equals().
                Set<TExpression> subqRoots = Collections.newSetFromMap(
                        new IdentityHashMap<TExpression, Boolean>());
                subqRoots.addAll(subqRootsList);
                // P2-2 codex-review: window functions in the non-subquery
                // part of a compound RHS are still illegal. Use the
                // skipping variant so scalar body contents are not scanned
                // (window functions inside a scalar SELECT are legitimate).
                rejectWindowFunctionInScopeSkipping(rhs, "UPDATE SET RHS",
                        subqRoots);
                List<ColumnRef> sources = collectColumnRefsSkipping(
                        rhs, providerWithStar, subqRoots);
                targetColumnNames.add(colSpelling);
                outputs.add(new OutputColumn(colSpelling,
                        /*derived=*/ true, /*aggregate=*/ false, sources));
                continue;
            }
            // Window function on RHS — reuse the existing scope reject.
            rejectWindowFunctionInScope(rhs, "UPDATE SET RHS");

            // Collect physical column refs from the RHS expression.
            List<ColumnRef> sources = collectColumnRefs(rhs, providerWithStar);
            // A bare column reference on the RHS is a pass-through;
            // every other expression shape counts as derived.
            boolean derived =
                    rhs.getExpressionType() != EExpressionType.simple_object_name_t;
            targetColumnNames.add(colSpelling);
            outputs.add(new OutputColumn(colSpelling,
                    derived, /*aggregate=*/ false, sources));
        }

        // 7) WHERE refs — slice 110 lifts the slice-80 blanket subquery
        // reject by routing uncorrelated predicate-subquery wrappers
        // (IN-SELECT / EXISTS / NOT EXISTS / scalar comparison /
        // ANY-ALL-SOME) through the slice-23+ JOIN-ON extraction pipeline
        // refactored to take a PredicateClauseContext. Each extracted
        // wrapper lands as its own <predicate_subquery_<i>> StatementGraph
        // BEFORE the UPDATE statement (so updateIdx already accounts for
        // them via stmts.size() below). Remaining non-subquery refs flow
        // into filterColumnRefs via collectColumnRefsSkipping. SET-RHS
        // subqueries are out of scope for this step; scalar ones are
        // handled in steps 5.9 / 6 above (slices 115 / 119).
        // Window functions in non-subquery subtrees still reject via
        // the existing rejectWindowFunctionInScopeSkipping helper.
        List<ColumnRef> filterRefs;
        TWhereClause where = update.getWhereClause();
        if (where == null || where.getCondition() == null) {
            filterRefs = Collections.<ColumnRef>emptyList();
        } else {
            Set<TExpression> extractedWhereRoots =
                    Collections.<TExpression>emptySet();
            if (containsAnySubquery(where)) {
                // Slice 110 — `providerWithStar` already carries
                // `withCteContext(cteNameToStatementIndex.keySet())`
                // (applied at the providerWithCte → providerWithStar
                // chain above) so the predicate body's inner SELECT's
                // `FROM cte` refs route through `RelationKind.CTE`.
                // Without that, emitLineageForStatement would lose the
                // STATEMENT_OUTPUT → STATEMENT_OUTPUT edge to the CTE
                // body.
                extractedWhereRoots =
                        extractUncorrelatedPredicateSubqueriesFromClause(
                                where.getCondition(), providerWithStar,
                                stmts, lineage, cteNameToStatementIndex,
                                PredicateClauseContext.UPDATE_WHERE);
                rejectAnyRemainingSubqueriesFromClause(
                        where.getCondition(), extractedWhereRoots,
                        PredicateClauseContext.UPDATE_WHERE);
            }
            rejectWindowFunctionInScopeSkipping(where, "WHERE clause",
                    extractedWhereRoots);
            // Slice 83 — providerWithStar so WHERE refs against
            // extracted subquery aliases bind correctly. Slice 110 —
            // skip extracted predicate-subquery subtrees so inner refs
            // do not leak into outer filterColumnRefs (mirrors the
            // slice-23 JOIN-ON ref collector).
            filterRefs = collectColumnRefsSkipping(where, providerWithStar,
                    extractedWhereRoots);
        }

        // (Step 8 of slice 82 was relocated to step 5.7 by slice 86 so
        // the slice-86 UsingScope built at step 5.8 can apply to the
        // SET RHS / WHERE collectors above. The walker logic itself is
        // unchanged from slice 82's contract — only its position moved.)

        RelationBinding targetBinding = new RelationBinding(
                RelationKind.TABLE, targetQName);
        TargetRelation target = new TargetRelation(targetBinding, targetColumnNames);

        // Slice 85 — build RETURNING / OUTPUT projection columns BEFORE
        // the StatementGraph so the new returningColumns slot can be
        // populated. updateIdx is computed first (deterministic — the
        // DML's position is stmts.size() at the moment of the
        // upcoming stmts.add(updateStmt)). The helper receives
        // `lineage` and `updateIdx` so it can record edges against the
        // UPDATE's position.
        int updateIdx = stmts.size();
        // UPDATE target alias = effective alias from the target's
        // TTable (slice-82 / slice-83 use the same convention for
        // FROM-side reference identity), falling back to the qualified
        // target name when no alias is available. FROM-side relations
        // is the walked `relations[]` list already built above.
        String updateTargetAlias = effectiveAliasOf(targetTable);
        if (updateTargetAlias == null || updateTargetAlias.isEmpty()) {
            updateTargetAlias = targetQName;
        }
        List<OutputColumn> returningColumns = buildReturningColumns(
                update.getReturningClause(),
                update.getOutputClause(),
                "UPDATE",
                targetQName,
                updateTargetAlias,
                /*targetTable=*/ targetTable,
                relations,
                providerWithStar,
                updateIdx,
                lineage,
                update);

        // The clause slots that have no UPDATE counterpart (GROUP BY /
        // HAVING / ORDER BY / DISTINCT ON / set operator / row limit)
        // are passed as empty lists, false, or null.
        StatementGraph updateStmt = new StatementGraph(
                /*name=*/ null,
                "UPDATE",
                relations,
                outputs,
                returningColumns,
                filterRefs,
                /*joinColumnRefs=*/ joinRefs,
                /*groupByColumnRefs=*/ Collections.<ColumnRef>emptyList(),
                /*havingColumnRefs=*/ Collections.<ColumnRef>emptyList(),
                /*orderByColumnRefs=*/ Collections.<ColumnRef>emptyList(),
                /*distinctOnColumnRefs=*/ Collections.<ColumnRef>emptyList(),
                /*distinct=*/ false,
                /*setOperator=*/ null,
                /*rowLimit=*/ null,
                target);

        // Slice 83 — updateIdx is dynamic: stmts already contains every
        // statement extracted above (CTE bodies, FROM subqueries, SET-RHS
        // scalar subqueries, WHERE predicate subqueries). The
        // slice-78/80 contract `target.col_i ← STATEMENT_OUTPUT(idx,
        // out_i)` is preserved by indexing the UPDATE's own statement
        // position rather than the slice-80 hardcoded 0.
        stmts.add(updateStmt);

        // Slice 78/80 cross-stmt edges — one per SET assignment:
        //   target.col_i ← STATEMENT_OUTPUT(updateIdx, out_i)
        for (int i = 0; i < outputs.size(); i++) {
            String tgtName = targetColumnNames.get(i);
            String outName = outputs.get(i).getName();
            if (tgtName == null || tgtName.isEmpty()
                    || outName == null || outName.isEmpty()) {
                // Defensive — both should be the same verbatim spelling.
                continue;
            }
            lineage.add(new LineageEdge(
                    LineageRef.tableColumn(targetQName, tgtName),
                    LineageRef.statementOutput(updateIdx, outName)));
        }

        // Slice 115/119 — for each SET assignment with scalar subqueries
        // extracted in step 5.9, emit one cross-stmt wire edge per
        // extracted scalar:
        //   STATEMENT_OUTPUT(updateIdx, outName) →
        //       STATEMENT_OUTPUT(scalarIdx, innerOutputName)
        // mirrors the SELECT-side slice-11 emission in
        // emitLineageForStatement. Runs AFTER the slice-78/80 target
        // edge loop so the target edge is always emitted first; an
        // assignment whose RHS is exactly a top-level scalar subquery
        // has empty OutputColumn.sources by construction, so the
        // slice-83 emitUpdateSubquerySourceEdges call below is a no-op
        // for those outputs. Ordinals outside `outputs` and outputs
        // without a name are skipped defensively.
        if (!setRhsScalarInfo.isEmpty()) {
            for (Map.Entry<Integer, List<ScalarInfo>> e : setRhsScalarInfo.entrySet()) {
                int ord = e.getKey();
                if (ord < 0 || ord >= outputs.size()) continue;
                String outName = outputs.get(ord).getName();
                if (outName == null || outName.isEmpty()) continue;
                for (ScalarInfo info : e.getValue()) {
                    lineage.add(new LineageEdge(
                            LineageRef.statementOutput(updateIdx, outName),
                            LineageRef.statementOutput(info.statementIndex,
                                    info.innerOutputName)));
                }
            }
        }

        // Slice 83 — emit STATEMENT_OUTPUT(updateIdx, out_i) →
        // STATEMENT_OUTPUT(subIdx, col) edges for output sources that
        // bind to a SUBQUERY-kind relation in this UPDATE's
        // relations[]. Base-table FROM-side sources stay as
        // outputs[i].sources only — preserves the slice-82 contract
        // that joined UPDATE without subqueries emits exactly ONE
        // cross-stmt edge per SET assignment (the target edge above).
        // Slice 105 — combine the slice-83 subqueryAliasToIndex with
        // the slice-105 CTE-as-relation alias→cteIdx entries so a SET
        // RHS reference to a CTE column (which lives on a SUBQUERY-
        // kind relation per slice 105) still produces a cross-stmt
        // STATEMENT_OUTPUT edge to the CTE body. Without the merge the
        // visible OutputColumn.sources stays correct but lineage[]
        // silently drops the canonical edge (codex round-2 Q5).
        Map<String, Integer> combinedAliasToSubIdx =
                buildUpdateCombinedAliasToSubIdx(update,
                        subqueryAliasToIndex, cteNameToStatementIndex);
        if (!combinedAliasToSubIdx.isEmpty()) {
            emitUpdateSubquerySourceEdges(updateStmt, updateIdx,
                    combinedAliasToSubIdx, lineage);
        }

        return new SemanticProgram(stmts, lineage);
    }

    /**
     * Slice 83 — emit STATEMENT_OUTPUT → STATEMENT_OUTPUT edges from
     * each UPDATE output to its subquery-bound source column. Walks
     * {@code outputs[i].sources} and, for any source whose
     * {@code relationAlias} matches a SUBQUERY-kind entry in the
     * statement's {@link RelationSource} list, emits an edge to the
     * corresponding extracted subquery's STATEMENT_OUTPUT position.
     *
     * <p>Why not call {@link #emitLineageForStatement}? The SELECT-path
     * helper emits edges for ALL output sources (TABLE-kind →
     * TABLE_COLUMN; CTE/SUBQUERY-kind → STATEMENT_OUTPUT). For UPDATE
     * the slice-78/80 contract is intentionally narrower: the only
     * cross-stmt edge per SET assignment is the target edge. Adding
     * STATEMENT_OUTPUT → TABLE_COLUMN edges for base-table FROM-side
     * sources would change the cross-stmt edge count contract that
     * slice-82 tests assert ({@code edges.size() == numAssignments}).
     * The slice-83 emitter therefore is SUBQUERY-only — base-table
     * FROM-side refs continue to surface via {@code outputs[i].sources}
     * but emit no extra LineageEdge.
2656 */ 2657 private static void emitUpdateSubquerySourceEdges( 2658 StatementGraph updateStmt, 2659 int updateIdx, 2660 Map<String, Integer> subqueryAliasToIndex, 2661 List<LineageEdge> lineage) { 2662 // Codex slice-83 diff-review Q1 BLOCKING — both the map and 2663 // the lookup must use the same casing policy so SQL like 2664 // `... FROM (SELECT ...) sub WHERE ... SUB.x = …` (resolver-2 2665 // may surface either case in `src.getRelationAlias()` depending 2666 // on dialect and quoting) still finds the SUBQUERY-kind entry. 2667 // The slice-83 inScope map and `subqueryAliasToIndex` are both 2668 // keyed lowercase; do the same here. (SELECT-side 2669 // `emitLineageForStatement` uses case-sensitive equality — 2670 // pre-existing limitation; a separate refactor.) 2671 Map<String, RelationSource> aliasToRelation = new HashMap<>(); 2672 for (RelationSource rs : updateStmt.getRelations()) { 2673 String key = rs.getAlias(); 2674 // Skip null / empty aliases — empty-string would produce a 2675 // vacuous "" key and could spuriously match other empty-alias 2676 // relations (codex round-2 Q2 advisory). 
2677 if (key == null || key.isEmpty()) continue; 2678 aliasToRelation.put(key.toLowerCase(Locale.ROOT), rs); 2679 } 2680 for (OutputColumn out : updateStmt.getOutputColumns()) { 2681 String outName = out.getName(); 2682 if (outName == null || outName.isEmpty()) continue; 2683 for (ColumnRef src : out.getSources()) { 2684 String srcAlias = src.getRelationAlias(); 2685 if (srcAlias == null || srcAlias.isEmpty()) continue; 2686 RelationSource rel = aliasToRelation.get( 2687 srcAlias.toLowerCase(Locale.ROOT)); 2688 if (rel == null) continue; 2689 if (rel.getBinding() == null 2690 || rel.getBinding().getKind() != RelationKind.SUBQUERY) { 2691 continue; 2692 } 2693 Integer subIdx = subqueryAliasToIndex.get( 2694 rel.getAlias().toLowerCase(Locale.ROOT)); 2695 if (subIdx == null) continue; 2696 lineage.add(new LineageEdge( 2697 LineageRef.statementOutput(updateIdx, outName), 2698 LineageRef.statementOutput(subIdx, src.getColumnName()))); 2699 } 2700 } 2701 } 2702 2703 /** 2704 * Slice 115 — walk the UPDATE's SET clause and extract each 2705 * assignment whose RHS is exactly a top-level 2706 * {@link EExpressionType#subquery_t} as its own 2707 * {@code <scalar_subquery_<idx>>} {@link StatementGraph} appended to 2708 * {@code stmts} BEFORE the UPDATE. Mirrors the SELECT-side 2709 * {@link #extractScalarSubqueriesAsStatementsInternal} slice-11 2710 * pipeline but iterates SET assignments instead of result columns. 2711 * Returns {@code assignmentOrdinal → ScalarInfo} so {@link #buildUpdate} 2712 * can wire the cross-stmt edge for each extracted body. 
2713 * 2714 * <p>Scope rejects (mirroring slice 11): 2715 * <ul> 2716 * <li>Multi-column inner SELECT — 2717 * {@link DiagnosticCode#SCALAR_SUBQUERY_COLUMN_COUNT}.</li> 2718 * <li>Inner projection has no alias and no column name — 2719 * {@link DiagnosticCode#SCALAR_SUBQUERY_INNER_PROJECTION_UNNAMED}.</li> 2720 * <li>Subqueries in scalar body's WHERE / JOIN ON / GROUP BY — 2721 * slice-11 {@link #rejectSubqueriesInScalarBodyClauses}.</li> 2722 * <li>FROM-subqueries inside scalar body — 2723 * {@code allowFromSubqueries=false} (slice-15 invariant).</li> 2724 * <li>Nested scalar projections inside scalar body — 2725 * {@code allowScalarProjectionSubqueries=false} (set-op-branch 2726 * precedent; slice 115 initial scope).</li> 2727 * <li>Window functions in scalar body — 2728 * {@code allowWindowProjection=false} (slice-11 precedent).</li> 2729 * <li>Correlated scalar subqueries (inner refs to outer aliases) — 2730 * {@link #promoteCorrelatedRefsToOuterReference} called with 2731 * {@link EnclosingScope#empty()} throws 2732 * {@link DiagnosticCode#SCALAR_SUBQUERY_UNKNOWN_RELATION_ALIAS}. 2733 * Lifting UPDATE-side correlation is a follow-up slice 2734 * (slice 14 SELECT analogue extended to UPDATE).</li> 2735 * </ul> 2736 * 2737 * <p>Snapshot/rollback wrapper around the loop body mirrors 2738 * {@link #extractScalarSubqueriesAsStatements} so a partial 2739 * extraction (e.g. second of two scalar SET RHS fails on shape 2740 * validation) truncates {@code stmts}/{@code lineage} back to the 2741 * pre-call boundary. 2742 * 2743 * <p>Assignments whose RHS is not a top-level {@code subquery_t} are 2744 * silently skipped here; they fall through to the per-assignment 2745 * loop's existing slice-80 / slice-115 mixed-expression reject path. 
2746 */ 2747 private static Map<Integer, List<ScalarInfo>> extractScalarSubqueriesFromUpdateSetRhs( 2748 TUpdateSqlStatement update, 2749 NameBindingProvider provider, 2750 List<StatementGraph> stmts, 2751 List<LineageEdge> lineage, 2752 Map<String, Integer> cteNameToStatementIndex, 2753 Map<String, Integer> subqueryAliasToIndex) { 2754 TResultColumnList sets = update.getResultColumnList(); 2755 if (sets == null || sets.size() == 0) { 2756 return Collections.<Integer, List<ScalarInfo>>emptyMap(); 2757 } 2758 // Fast pre-scan: any SET RHS that contains a subquery (top-level 2759 // subquery_t or nested inside a compound expression)? Avoids the 2760 // snapshot/rollback wrapper overhead when none are present. 2761 // Slice 115 handled top-level subquery_t only; slice 119 extends 2762 // to mixed-expression RHS (e.g. `SET col = (SELECT...) + 1`). 2763 // Tuple-LHS assignments (Oracle SET (a, b) = ...) are 2764 // intentionally skipped so the per-assignment loop's 2765 // UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED reject (slice 80 2766 // contract) wins. 
2767 boolean anySubquery = false; 2768 for (int i = 0; i < sets.size(); i++) { 2769 TResultColumn rc = sets.getResultColumn(i); 2770 if (rc == null || rc.getExpr() == null) continue; 2771 TExpression assignment = rc.getExpr(); 2772 if (assignment.getExpressionType() != EExpressionType.assignment_t) { 2773 continue; 2774 } 2775 TExpression lhs = assignment.getLeftOperand(); 2776 if (lhs == null 2777 || lhs.getExpressionType() == EExpressionType.list_t) { 2778 continue; 2779 } 2780 TExpression rhs = assignment.getRightOperand(); 2781 if (rhs != null && containsAnySubqueryExpression(rhs)) { 2782 anySubquery = true; 2783 break; 2784 } 2785 } 2786 if (!anySubquery) { 2787 return Collections.<Integer, List<ScalarInfo>>emptyMap(); 2788 } 2789 int stmtsSnapshot = stmts.size(); 2790 int lineageSnapshot = lineage.size(); 2791 try { 2792 return extractScalarSubqueriesFromUpdateSetRhsInternal( 2793 update, provider, stmts, lineage, 2794 cteNameToStatementIndex, subqueryAliasToIndex, sets); 2795 } catch (RuntimeException ex) { 2796 while (stmts.size() > stmtsSnapshot) stmts.remove(stmts.size() - 1); 2797 while (lineage.size() > lineageSnapshot) lineage.remove(lineage.size() - 1); 2798 throw ex; 2799 } 2800 } 2801 2802 /** 2803 * Internal body of {@link #extractScalarSubqueriesFromUpdateSetRhs}; 2804 * wrapped with snapshot/rollback by the public entry point. Do not 2805 * call directly from non-wrapper sites. 
2806 */ 2807 private static Map<Integer, List<ScalarInfo>> extractScalarSubqueriesFromUpdateSetRhsInternal( 2808 TUpdateSqlStatement update, 2809 NameBindingProvider provider, 2810 List<StatementGraph> stmts, 2811 List<LineageEdge> lineage, 2812 Map<String, Integer> cteNameToStatementIndex, 2813 Map<String, Integer> subqueryAliasToIndex, 2814 TResultColumnList sets) { 2815 Map<Integer, List<ScalarInfo>> ordinalToInfo = new HashMap<>(); 2816 for (int i = 0; i < sets.size(); i++) { 2817 TResultColumn rc = sets.getResultColumn(i); 2818 if (rc == null || rc.getExpr() == null) continue; 2819 TExpression assignment = rc.getExpr(); 2820 if (assignment.getExpressionType() != EExpressionType.assignment_t) { 2821 continue; 2822 } 2823 TExpression lhs = assignment.getLeftOperand(); 2824 // Skip tuple-LHS assignments — the per-assignment loop's 2825 // slice-80 UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED reject 2826 // should win for these (e.g. Oracle `SET (a, b) = (SELECT 2827 // c1, c2 FROM ...)`). Without this skip, the inner SELECT's 2828 // multi-column projection would surface as 2829 // SCALAR_SUBQUERY_COLUMN_COUNT here instead. 2830 if (lhs == null 2831 || lhs.getExpressionType() == EExpressionType.list_t) { 2832 continue; 2833 } 2834 TExpression rhs = assignment.getRightOperand(); 2835 if (rhs == null || !containsAnySubqueryExpression(rhs)) { 2836 continue; // no subquery in this RHS — handled by per-assignment loop 2837 } 2838 // "outer alias" used in diagnostic messages — the SET LHS 2839 // column spelling. Mirrors the slice-11 `outerAlias` role. 2840 String outerAlias = (lhs.getExpressionType() == EExpressionType.simple_object_name_t 2841 && lhs.getObjectOperand() != null) 2842 ? lhs.getObjectOperand().toString() 2843 : ("SET assignment #" + (i + 1)); 2844 2845 // Determine which subquery TExpression nodes to extract. 2846 // Slice 115 path: RHS is exactly a top-level subquery_t → 2847 // single-element list. 
2848 // Slice 119 path: RHS is a compound expression (arithmetic, 2849 // CASE, function) containing one or more subquery_t nodes 2850 // at any depth → list in traversal order. 2851 List<TExpression> subqExprs; 2852 if (rhs.getExpressionType() == EExpressionType.subquery_t) { 2853 subqExprs = Collections.singletonList(rhs); 2854 } else { 2855 subqExprs = collectNestedSubqueryExpressions(rhs); 2856 } 2857 if (subqExprs.isEmpty()) continue; // defensive (containsAnySubqueryExpression true but none found) 2858 2859 // Build the UPDATE-side enclosing scope once per assignment 2860 // (used by each per-scalar correlation promotion below). 2861 EnclosingScope innerEnclosing = buildUpdateEnclosingScope(update, 2862 cteNameToStatementIndex, subqueryAliasToIndex, 2863 /*parent=*/ null); 2864 2865 List<ScalarInfo> infos = new ArrayList<>(); 2866 for (TExpression subqExpr : subqExprs) { 2867 TSelectSqlStatement inner = subqExpr.getSubQuery(); 2868 if (inner == null) { 2869 throw new SemanticIRBuildException( 2870 Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_NO_INNER_SELECT, 2871 "scalar subquery on UPDATE SET RHS for '" + outerAlias 2872 + "' has no inner SELECT", rc)); 2873 } 2874 // Pre-recursion validation (matches slice 11 ordering): 2875 // inspect inner column count and naming before recursive 2876 // build so the diagnostic is scalar-specific. 
2877 TResultColumnList innerRcl = inner.getResultColumnList(); 2878 if (innerRcl == null || innerRcl.size() == 0) { 2879 throw new SemanticIRBuildException( 2880 Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_COLUMN_COUNT, 2881 "scalar subquery on UPDATE SET RHS for '" + outerAlias 2882 + "' must project exactly one column, got 0", rc)); 2883 } 2884 if (innerRcl.size() != 1) { 2885 throw new SemanticIRBuildException( 2886 Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_COLUMN_COUNT, 2887 "scalar subquery on UPDATE SET RHS for '" + outerAlias 2888 + "' must project exactly one column, got " 2889 + innerRcl.size(), rc)); 2890 } 2891 TResultColumn innerCol = innerRcl.getResultColumn(0); 2892 String innerAlias = innerCol.getColumnAlias(); 2893 String innerColName = innerCol.getColumnNameOnly(); 2894 boolean innerHasName = 2895 (innerAlias != null && !innerAlias.isEmpty()) 2896 || (innerColName != null && !innerColName.isEmpty()); 2897 if (!innerHasName && !isConstantExpression(innerCol.getExpr())) { 2898 throw new SemanticIRBuildException( 2899 Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_INNER_PROJECTION_UNNAMED, 2900 "scalar subquery on UPDATE SET RHS for '" + outerAlias 2901 + "' inner projection has no alias and no column " 2902 + "name; add an explicit alias inside the subquery", 2903 rc)); 2904 } 2905 // Predicate-leak guard: scalar body's WHERE / JOIN ON / 2906 // GROUP BY must not contain subqueries. 2907 rejectSubqueriesInScalarBodyClauses(inner, outerAlias); 2908 2909 // Slice 117 / 119 — decorate provider with the inner 2910 // SELECT's local FROM aliases for tolerant outer-binding. 2911 Set<String> innerLocalAliases = precomputeInnerLocalAliases(inner); 2912 NameBindingProvider tolerantProvider = innerLocalAliases.isEmpty() 2913 ? 
provider 2914 : provider.withTolerantOuterBinding(innerLocalAliases); 2915 2916 String scalarName = SCALAR_BODY_PREFIX + stmts.size() + ">"; 2917 StatementGraph innerStmt = buildSelectStatement(inner, tolerantProvider, 2918 scalarName, 2919 /*hasOuterCteListAlreadyProcessed=*/ false, 2920 /*allowFromSubqueries=*/ false, 2921 /*allowScalarProjectionSubqueries=*/ false, 2922 /*allowWindowProjection=*/ false); 2923 innerStmt = promoteCorrelatedRefsToOuterReference( 2924 innerStmt, outerAlias, innerEnclosing); 2925 int idx = stmts.size(); 2926 stmts.add(innerStmt); 2927 String innerOutName = effectiveOutputName(innerCol); 2928 infos.add(new ScalarInfo(idx, innerOutName)); 2929 emitLineageForStatement(innerStmt, idx, lineage, 2930 cteNameToStatementIndex, 2931 innerEnclosing.flattenSubqueryAliasToIndex(), 2932 Collections.<Integer, ScalarInfo>emptyMap()); 2933 } 2934 ordinalToInfo.put(i, infos); 2935 } 2936 return ordinalToInfo; 2937 } 2938 2939 /** 2940 * Slice 82 — process one FROM-side source table for joined 2941 * {@link #buildUpdate}. Applies the slice-82 reject contract for 2942 * non-table FROM sources, then appends a TABLE-kind 2943 * {@link RelationSource} unless the table is the target 2944 * (reference-identity filter — clean IR semantics: relations[] 2945 * models read-side sources only). 2946 */ 2947 private static void buildUpdateRelation(TTable t, TTable targetTable, 2948 List<RelationSource> relations, 2949 TUpdateSqlStatement update, 2950 Map<String, Integer> cteNameToStatementIndex) { 2951 if (t == null) { 2952 return; // defensive — parser should never produce a null table 2953 } 2954 if (t.getTableType() == gudusoft.gsqlparser.ETableSource.subquery) { 2955 // Slice 83 — admit FROM-side subqueries. The inner SELECT 2956 // has already been extracted as its own StatementGraph by 2957 // {@link #extractUpdateFromSubqueries} (step 5.5 of 2958 // buildUpdate). 
Here we publish the SUBQUERY-kind 2959 // {@link RelationSource} so {@code outputs[i].sources} 2960 // resolved via the inScope-enhanced provider can route to 2961 // it. Alias and qualifiedName both use 2962 // {@code effectiveAliasOf(t)} — matching slice-14 / slice-58 2963 // SUBQUERY-kind convention used by SELECT. 2964 // 2965 // {@code UPDATE_FROM_SUBQUERY_NOT_SUPPORTED} stays declared 2966 // but unreached (slice-71/72 retain-for-documentation 2967 // precedent — keeps the public DiagnosticCode enum stable 2968 // for consumers that route by code). 2969 String subAlias = effectiveAliasOf(t); 2970 if (subAlias != null && !subAlias.isEmpty()) { 2971 relations.add(new RelationSource(subAlias, 2972 new RelationBinding(RelationKind.SUBQUERY, subAlias))); 2973 } 2974 return; 2975 } 2976 if (t.getTableType() == gudusoft.gsqlparser.ETableSource.join) { 2977 // Defensive: TTable wrapping a TJoin. Not reached by any 2978 // observed parser path on the supported dialects (slice-82 2979 // probe set), but gets its own DiagnosticCode so consumers 2980 // can route this distinct shape without parsing message 2981 // text — per slice-80's message-text-discrimination 2982 // contract (codex round-1 Q4 BLOCKING). 2983 throw new SemanticIRBuildException(Diagnostic.error( 2984 DiagnosticCode.UPDATE_FROM_NESTED_JOIN_NOT_SUPPORTED, 2985 "UPDATE FROM source is a nested join wrapper; " 2986 + "slice 82 admits simple table FROM sources only", 2987 update)); 2988 } 2989 // Reference-identity filter: target's own TTable instance is 2990 // excluded from relations[]. In MSSQL `UPDATE T2 ... FROM Table2 T2 …`, 2991 // tables[0] / joins[0].getTable() IS the same instance as 2992 // update.getTargetTable(); excluding it keeps the IR clean 2993 // (relations[] models reads, target models writes). 
The 2994 // catalog-miss WARN walker's target-first ordering handles the 2995 // cross-instance-same-name MSSQL self-join edge case where two 2996 // distinct TTable instances share the same qualified name. 2997 if (t == targetTable) { 2998 return; 2999 } 3000 TObjectName tName = t.getTableName(); 3001 if (tName == null) { 3002 return; // defensive 3003 } 3004 // Slice 105 — FROM-side CTE detection. When the FROM-side table 3005 // is an objectname-typed reference whose bare name matches a 3006 // declared CTE in this UPDATE's outer WITH clause, emit a 3007 // SUBQUERY-kind RelationSource pointing at the CTE statement 3008 // (mirrors MERGE USING-as-CTE in slice 101). The slice-77 3009 // catalog-miss WARN walker filters to RelationKind.TABLE so 3010 // CTE-bound relations are naturally skipped, even when the 3011 // catalog also declares the same name (codex round-2 Q4 3012 // confirmed YES). The cross-stmt lineage edge from 3013 // STATEMENT_OUTPUT(updateIdx,col) → STATEMENT_OUTPUT(cteIdx,col) 3014 // is emitted by emitUpdateSubquerySourceEdges using the 3015 // combined alias→subIdx map. 3016 if (cteNameToStatementIndex != null 3017 && !cteNameToStatementIndex.isEmpty()) { 3018 String bareName = tName.toString(); 3019 if (bareName != null && !bareName.isEmpty()) { 3020 String bareNameLower = bareName.toLowerCase(Locale.ROOT); 3021 if (cteNameToStatementIndex.containsKey(bareNameLower)) { 3022 String cteAlias = effectiveAliasOf(t); 3023 if (cteAlias == null || cteAlias.isEmpty()) { 3024 cteAlias = bareName; 3025 } 3026 relations.add(new RelationSource(cteAlias, 3027 new RelationBinding(RelationKind.SUBQUERY, cteAlias))); 3028 return; 3029 } 3030 } 3031 } 3032 // effectiveAliasOf returns the SQL-written alias if present, 3033 // else the table name. RelationSource requires a non-empty 3034 // alias; this matches the slice-58/59 buildRelation contract. 
3035 relations.add(new RelationSource(effectiveAliasOf(t), 3036 new RelationBinding(RelationKind.TABLE, tName.toString()))); 3037 } 3038 3039 /** 3040 * Slice 82 (extended by slice 86) — process one {@link TJoinItem} 3041 * for joined {@link #buildUpdate}. Routes USING / NATURAL JoinItems 3042 * through the SELECT-side slice-64/65/66 shared helpers 3043 * ({@link #populateUsingJoinRefs} / {@link #emitMergedJoinRefs} / 3044 * {@link #naturalSharedKeys}) so the UPDATE join walker emits the 3045 * same {@code joinColumnRefs[]} shape as a SELECT body. ON / CROSS 3046 * JoinItems retain the slice-82 reject contract (subquery in ON, 3047 * window in ON) and ref-collection path. 3048 * 3049 * <p>Slice 86 signature extension: the join context 3050 * ({@code topJoin}, {@code items}, {@code itemIndex}) and the 3051 * per-top-level-TJoin {@link LeftOutputState} are required by the 3052 * shared helpers — the prior-relations chain for emit-refs and the 3053 * accumulated left row type for NATURAL inference. 3054 * 3055 * <p>USING / NATURAL shape conflicts (USING+ON, NATURAL+USING, 3056 * NATURAL+ON) reuse the existing slice-64/66 codes 3057 * ({@link DiagnosticCode#JOIN_WITH_BOTH_ON_AND_USING}, 3058 * {@link DiagnosticCode#NATURAL_WITH_USING}, 3059 * {@link DiagnosticCode#NATURAL_WITH_ON}) rather than introducing 3060 * UPDATE-specific codes — matching slice 86's "reuse SELECT-side 3061 * machinery verbatim" architecture. 3062 * 3063 * <p>Slice 82's lifted reject codes 3064 * ({@link DiagnosticCode#UPDATE_FROM_JOIN_USING_NOT_SUPPORTED} and 3065 * {@link DiagnosticCode#UPDATE_FROM_JOIN_NATURAL_NOT_SUPPORTED}) 3066 * stay declared-but-unreached for API stability — slice 71/72/82 3067 * retain-for-documentation precedent. 
3068 */ 3069 private static void buildUpdateJoinItem(TJoin topJoin, 3070 TJoinItemList items, 3071 int itemIndex, 3072 TTable targetTable, 3073 NameBindingProvider provider, 3074 List<RelationSource> relations, 3075 java.util.LinkedHashSet<ColumnRef> joinRefs, 3076 LeftOutputState leftState, 3077 TUpdateSqlStatement update, 3078 Map<String, Integer> cteNameToStatementIndex) { 3079 if (items == null) return; 3080 TJoinItem item = items.getJoinItem(itemIndex); 3081 if (item == null) return; 3082 3083 TObjectNameList usingCols = item.getUsingColumns(); 3084 boolean hasUsing = usingCols != null && usingCols.size() > 0; 3085 boolean isNatural = isNaturalJoinType(item.getJoinType()); 3086 boolean hasOn = item.getOnCondition() != null; 3087 3088 // Slice 86 — USING/NATURAL admit paths. Shape conflicts use the 3089 // slice-64/66 SELECT-side codes verbatim; the UPDATE-specific 3090 // lifted codes (UPDATE_FROM_JOIN_USING_NOT_SUPPORTED / 3091 // UPDATE_FROM_JOIN_NATURAL_NOT_SUPPORTED) are no longer thrown 3092 // (declared-but-unreached for API stability). 3093 if (isNatural && hasUsing) { 3094 throw new SemanticIRBuildException(Diagnostic.error( 3095 DiagnosticCode.NATURAL_WITH_USING, 3096 "NATURAL JOIN must not carry a USING clause; choose " 3097 + "either NATURAL or USING, not both", item)); 3098 } 3099 if (isNatural && hasOn) { 3100 throw new SemanticIRBuildException(Diagnostic.error( 3101 DiagnosticCode.NATURAL_WITH_ON, 3102 "NATURAL JOIN must not carry an ON condition; rewrite " 3103 + "as JOIN ... ON, or drop the NATURAL keyword", item)); 3104 } 3105 if (hasUsing && hasOn) { 3106 throw new SemanticIRBuildException(Diagnostic.error( 3107 DiagnosticCode.JOIN_WITH_BOTH_ON_AND_USING, 3108 "JOIN cannot carry both ON and USING; choose one", item)); 3109 } 3110 3111 if (hasUsing) { 3112 // Right-side table first: applies slice-82 source-shape 3113 // rejects + identity filter exactly as the ON path. 
3114 buildUpdateRelation(item.getTable(), targetTable, relations, update, 3115 cteNameToStatementIndex); 3116 // Slice 64 emit-refs: left-then-right per key, walking 3117 // priorRelations = topJoin.getTable() + items[0..itemIndex-1]. 3118 List<ColumnRef> usingRefs = new ArrayList<>(); 3119 populateUsingJoinRefs(topJoin, items, itemIndex, item.getTable(), 3120 usingCols, provider, usingRefs); 3121 joinRefs.addAll(usingRefs); 3122 // Slice 66 LeftOutputState update: merge right's columns 3123 // into accumulated state so a subsequent NATURAL JoinItem 3124 // sees the row type (matches SELECT-side 3125 // {@code buildRelations}). 3126 List<String> usingKeyNames = new ArrayList<>(usingCols.size()); 3127 for (int k = 0; k < usingCols.size(); k++) { 3128 TObjectName key = usingCols.getObjectName(k); 3129 if (key == null) continue; 3130 String keyName = key.getColumnNameOnly(); 3131 if (keyName != null && !keyName.isEmpty()) { 3132 usingKeyNames.add(keyName); 3133 } 3134 } 3135 mergeRightIntoLeftOutput(leftState, item.getTable(), provider, 3136 usingKeyNames); 3137 return; 3138 } 3139 3140 if (isNatural) { 3141 // Right-side table first; identity filter excludes target. 3142 buildUpdateRelation(item.getTable(), targetTable, relations, update, 3143 cteNameToStatementIndex); 3144 // Slice 66 catalog-required NATURAL inference. Reject with 3145 // NATURAL_CATALOG_REQUIRED (re-use SELECT-side code) when 3146 // either side lacks resolvable column metadata. 
3147 NaturalKeyResult r = naturalSharedKeys(leftState, item.getTable(), provider); 3148 if (r.kind != NaturalKeyResult.Kind.SUCCESS) { 3149 throw new SemanticIRBuildException(Diagnostic.error( 3150 DiagnosticCode.NATURAL_CATALOG_REQUIRED, 3151 formatNaturalCatalogReject(r), item)); 3152 } 3153 List<ColumnRef> naturalRefs = new ArrayList<>(); 3154 emitMergedJoinRefs(JoinKind.NATURAL, r.keys, topJoin, items, 3155 itemIndex, item.getTable(), provider, naturalRefs); 3156 joinRefs.addAll(naturalRefs); 3157 // Update LeftOutputState with the right's columns (merging 3158 // shared keys into existing slots, appending non-shared 3159 // columns as new entries). 3160 mergeRightIntoLeftOutput(leftState, item.getTable(), provider, r.keys); 3161 return; 3162 } 3163 3164 // ON / CROSS branch — slice-82 contract preserved. 3165 buildUpdateRelation(item.getTable(), targetTable, relations, update, 3166 cteNameToStatementIndex); 3167 // Slice 86 — append right to LeftOutputState so subsequent 3168 // NATURAL JoinItems in the same top-level TJoin observe the 3169 // accumulated row type. CROSS / ON contribute non-merged 3170 // columns to state (matches SELECT-side appendRightToLeftOutput). 3171 appendRightToLeftOutput(leftState, item.getTable(), provider); 3172 TExpression onCond = item.getOnCondition(); 3173 if (onCond == null) return; // CROSS JOIN: no ON. 3174 if (containsAnySubqueryExpression(onCond)) { 3175 throw new SemanticIRBuildException(Diagnostic.error( 3176 DiagnosticCode.UPDATE_JOIN_ON_HAS_SUBQUERY_NOT_SUPPORTED, 3177 "UPDATE FROM JOIN ON condition contains a subquery; " 3178 + "slice 82 admits scalar predicates only", 3179 item)); 3180 } 3181 rejectWindowFunctionInScope(onCond, "UPDATE FROM JOIN ON"); 3182 joinRefs.addAll(collectColumnRefs(onCond, provider)); 3183 } 3184 3185 /** 3186 * Slice 83 — extract every FROM-side subquery in 3187 * {@code update.getJoins()} as its own {@link StatementGraph} 3188 * appended to {@code stmts} before the UPDATE itself. 
Walks both 3189 * the driver TTable of each TJoin AND each JoinItem's right table. 3190 * Returns an alias → stmts-index map so the consuming UPDATE can 3191 * (a) build its in-scope column map via 3192 * {@link #buildUpdateInScopeMap}, and (b) emit 3193 * STATEMENT_OUTPUT → STATEMENT_OUTPUT edges via 3194 * {@link #emitUpdateSubquerySourceEdges}. 3195 * 3196 * <p>Reuses the SELECT-side {@link #processDirectSubqueryTable} 3197 * verbatim, passing empty CTE maps because slice 80 already 3198 * rejects top-level WITH on UPDATE 3199 * ({@link DiagnosticCode#UPDATE_CTE_NOT_SUPPORTED}). The inner 3200 * SELECT's own FROM-subqueries are handled recursively by the 3201 * helper. Inner predicate subqueries in WHERE / JOIN ON / 3202 * GROUP BY are caught by the slice-17 leak guard 3203 * ({@link #rejectSubqueriesInFromSubqueryBodyClauses}). Inner 3204 * top-level WITH is rejected by 3205 * {@code buildSelectStatement(hasOuterCteListAlreadyProcessed=false)}. 3206 * Inner scalar projection subqueries are rejected by 3207 * {@code buildSelectStatement(allowScalarProjectionSubqueries=false)}. 3208 * 3209 * <p>No mutation-guard wrapper here: buildUpdate owns fresh local 3210 * lists and exceptions propagate to the caller (codex round-1 Q5 3211 * NICE). 3212 */ 3213 private static Map<String, Integer> extractUpdateFromSubqueries( 3214 TUpdateSqlStatement update, 3215 NameBindingProvider provider, 3216 List<StatementGraph> stmts, 3217 List<LineageEdge> lineage, 3218 Map<String, Integer> cteNameToStatementIndex, 3219 Map<String, List<String>> ctePublishedColumns) { 3220 Map<String, Integer> aliasToIndex = new HashMap<>(); 3221 TJoinList joins = update.getJoins(); 3222 if (joins == null) return aliasToIndex; 3223 // Slice 105 — forward the outer-WITH CTE maps so a nested SELECT 3224 // inside an extracted FROM-subquery body can resolve outer-WITH 3225 // CTE references. Resolver2 wires CTEScope already; the maps are 3226 // forwarded for parity with the SELECT / MERGE call sites. 
3227 Map<String, Integer> cteMap = cteNameToStatementIndex == null 3228 ? Collections.<String, Integer>emptyMap() 3229 : cteNameToStatementIndex; 3230 Map<String, List<String>> ctePublished = ctePublishedColumns == null 3231 ? Collections.<String, List<String>>emptyMap() 3232 : ctePublishedColumns; 3233 for (TJoin join : joins) { 3234 // Driver table — may be a subquery (PG / Snowflake / BQ / 3235 // Redshift `UPDATE t SET … FROM (SELECT …) sub` shape). 3236 processDirectSubqueryTable(join.getTable(), provider, 3237 stmts, lineage, cteMap, ctePublished, aliasToIndex); 3238 TJoinItemList items = join.getJoinItems(); 3239 if (items == null) continue; 3240 for (int i = 0; i < items.size(); i++) { 3241 TJoinItem item = items.getJoinItem(i); 3242 if (item == null) continue; 3243 // Right-side table of a JoinItem — may be a subquery 3244 // (MSSQL / PG `UPDATE t SET … FROM x JOIN (SELECT …) 3245 // sub ON …` shape). 3246 processDirectSubqueryTable(item.getTable(), provider, 3247 stmts, lineage, cteMap, ctePublished, aliasToIndex); 3248 } 3249 } 3250 return aliasToIndex; 3251 } 3252 3253 /** 3254 * Slice 83 — build an effective-alias-keyed in-scope map publishing 3255 * each extracted FROM-subquery's output column names. The consuming 3256 * UPDATE wraps its provider via 3257 * {@code provider.withInScopeRelationColumns(map)} so {@code sub.x} 3258 * resolves to the subquery's published column rather than failing 3259 * resolution against the catalog. 3260 * 3261 * <p>Base-table FROM-side relations do not need an entry: their 3262 * column resolution stays on the Resolver2 catalog path. Slice 60's 3263 * SELECT-side {@link #buildEffectiveAliasInScopeMap} also skips 3264 * base-table relations. 
3265 * 3266 * <p>Slice 105 — when an outer WITH clause declares a CTE and a 3267 * FROM-side relation references that CTE by its bare name, publish 3268 * the CTE's column names against the FROM-side effective alias so 3269 * SET RHS / WHERE / ON refs against the CTE alias bind correctly. 3270 */ 3271 private static Map<String, List<String>> buildUpdateInScopeMap( 3272 TUpdateSqlStatement update, 3273 Map<String, Integer> subqueryAliasToIndex, 3274 List<StatementGraph> stmts, 3275 Map<String, Integer> cteNameToStatementIndex, 3276 Map<String, List<String>> ctePublishedColumns) { 3277 Map<String, List<String>> result = new HashMap<>(); 3278 boolean haveSubq = subqueryAliasToIndex != null 3279 && !subqueryAliasToIndex.isEmpty(); 3280 boolean haveCte = cteNameToStatementIndex != null 3281 && !cteNameToStatementIndex.isEmpty(); 3282 if (!haveSubq && !haveCte) { 3283 return result; 3284 } 3285 TJoinList joins = update.getJoins(); 3286 if (joins == null) return result; 3287 for (TJoin join : joins) { 3288 addUpdateRelationToInScopeMap(join.getTable(), 3289 subqueryAliasToIndex, stmts, result, 3290 cteNameToStatementIndex, ctePublishedColumns); 3291 TJoinItemList items = join.getJoinItems(); 3292 if (items == null) continue; 3293 for (int i = 0; i < items.size(); i++) { 3294 TJoinItem item = items.getJoinItem(i); 3295 if (item == null) continue; 3296 addUpdateRelationToInScopeMap(item.getTable(), 3297 subqueryAliasToIndex, stmts, result, 3298 cteNameToStatementIndex, ctePublishedColumns); 3299 } 3300 } 3301 return result; 3302 } 3303 3304 private static void addUpdateRelationToInScopeMap(TTable t, 3305 Map<String, Integer> subqueryAliasToIndex, 3306 List<StatementGraph> stmts, 3307 Map<String, List<String>> result, 3308 Map<String, Integer> cteNameToStatementIndex, 3309 Map<String, List<String>> ctePublishedColumns) { 3310 if (t == null) return; 3311 // Slice 105 — CTE-as-FROM-relation in-scope publication. 
When 3312 // the FROM-side table is an objectname-typed reference whose 3313 // bare name matches a declared outer CTE, publish the CTE's 3314 // own column names against the FROM-side effective alias so 3315 // SET RHS / WHERE refs against the CTE alias bind correctly. 3316 if (cteNameToStatementIndex != null 3317 && !cteNameToStatementIndex.isEmpty() 3318 && ctePublishedColumns != null 3319 && t.getTableType() 3320 == gudusoft.gsqlparser.ETableSource.objectname) { 3321 TObjectName tName = t.getTableName(); 3322 if (tName != null) { 3323 String bare = tName.toString(); 3324 if (bare != null && !bare.isEmpty()) { 3325 String bareLower = bare.toLowerCase(Locale.ROOT); 3326 if (cteNameToStatementIndex.containsKey(bareLower)) { 3327 String aliasKey = effectiveAliasLowerCaseOrNull(t); 3328 if (aliasKey == null) aliasKey = bareLower; 3329 List<String> cols = ctePublishedColumns.get(bareLower); 3330 if (cols != null) { 3331 result.put(aliasKey, cols); 3332 } 3333 return; 3334 } 3335 } 3336 } 3337 } 3338 if (t.getTableType() != gudusoft.gsqlparser.ETableSource.subquery) { 3339 return; 3340 } 3341 if (subqueryAliasToIndex == null) { 3342 return; 3343 } 3344 String key = effectiveAliasLowerCaseOrNull(t); 3345 if (key == null) return; 3346 Integer idx = subqueryAliasToIndex.get(key); 3347 if (idx == null) return; 3348 result.put(key, outputColumnNames(stmts.get(idx))); 3349 } 3350 3351 /** 3352 * Slice 81 / slice 84 — admit single-target and joined 3353 * {@code DELETE} statements and produce a {@code "DELETE"}-kind 3354 * {@link StatementGraph} (§8.1.4 row D11 follow-up via slice 84's 3355 * joined-DELETE candidate (a)). 3356 * 3357 * <p>Structurally mirrors slice-80 + slice-82 + slice-83 3358 * {@link #buildUpdate} but with no SET clause and an empty 3359 * {@code outputColumns} list — DELETE has no projection of its 3360 * own (RETURNING / OUTPUT projections are deferred to a later 3361 * slice). 
The target relation is exposed via the slice-78 3362 * {@link TargetRelation} slot; its {@code columns} list is 3363 * intentionally empty because DELETE removes whole rows rather 3364 * than writing specific columns. 3365 * 3366 * <p>WHERE-side reads still surface on 3367 * {@link StatementGraph#getFilterColumnRefs()} so downstream 3368 * governance can audit "what predicates does this DELETE depend 3369 * on". Cross-statement {@link LineageEdge}s are NOT emitted (the 3370 * slice-78 / slice-80 {@code target.col_i ← STATEMENT_OUTPUT(…)} 3371 * contract has no DELETE analogue: there is no source 3372 * projection). 3373 * 3374 * <p>Slice 84 admit scope (lifts slice-81's blanket joined-DELETE 3375 * reject for the common PG / MSSQL FROM-side shapes; mirrors 3376 * slice 82 + slice 83 onto DELETE): 3377 * <ul> 3378 * <li>PG / Snowflake / BQ / Redshift {@code DELETE FROM t USING 3379 * source_list [WHERE]} — {@code source_list} = simple table, 3380 * comma-separated tables, or chain of explicit JOIN ... ON 3381 * (driver is taken from {@code referenceJoins}).</li> 3382 * <li>MSSQL {@code DELETE FROM t FROM driver_table [JOIN other 3383 * ON ...] [WHERE]} — the target may itself appear in the 3384 * FROM-FROM clause as a different TTable instance.</li> 3385 * <li>MSSQL {@code DELETE alias FROM t alias INNER JOIN ... 
ON …} 3386 * — the alias-form DELETE where target is matched by alias.</li> 3387 * <li>CROSS JOIN inside USING — no ON; semantically equivalent 3388 * to comma-FROM.</li> 3389 * <li>{@code DELETE FROM t USING (SELECT …) s [WHERE]} — 3390 * FROM-subquery as a USING source; mirrors slice-83 UPDATE 3391 * FROM-subquery extraction.</li> 3392 * </ul> 3393 * 3394 * <p>Slice 84 reject scope (preserves slice-81 reject coverage 3395 * for shapes that still need a refinement slice): 3396 * <ul> 3397 * <li>{@link DiagnosticCode#DELETE_JOINED_NOT_SUPPORTED} — any 3398 * shape with {@code delete.getJoins().size() > 0}: MySQL 3399 * multi-target {@code DELETE T1, T2 FROM …}, MySQL 3400 * self-reference {@code DELETE T1 FROM T1}, MySQL 3401 * multi-USING {@code DELETE FROM T1 USING T1, T2}. 3402 * Candidates (c) and (d) in §8.1.4 lift these later.</li> 3403 * <li>{@link DiagnosticCode#DELETE_FROM_JOIN_USING_NOT_SUPPORTED} 3404 * — {@code USING(col1, col2)} on a FROM-side join item; 3405 * mirror of slice-82 {@code UPDATE_FROM_JOIN_USING_*}.</li> 3406 * <li>{@link DiagnosticCode#DELETE_FROM_JOIN_NATURAL_NOT_SUPPORTED} 3407 * — {@code NATURAL JOIN} on a FROM-side join item.</li> 3408 * <li>{@link DiagnosticCode#DELETE_FROM_NESTED_JOIN_NOT_SUPPORTED} 3409 * — defensive: TTable wrapping a TJoin in the FROM source 3410 * (not reached by any observed parser path on supported 3411 * dialects, but kept distinct from the subquery code per 3412 * slice-80 message-text-discrimination contract).</li> 3413 * <li>{@link DiagnosticCode#DELETE_JOIN_ON_HAS_SUBQUERY_NOT_SUPPORTED} 3414 * — subquery in a JOIN ON predicate.</li> 3415 * </ul> 3416 * 3417 * <p>Other rejected shapes (slice-81 baseline preserved): 3418 * {@link DiagnosticCode#DELETE_CTE_NOT_SUPPORTED}, 3419 * {@link DiagnosticCode#DELETE_TARGET_MISSING}, 3420 * {@link DiagnosticCode#DELETE_RETURNING_CLAUSE_NOT_SUPPORTED}, 3421 * {@link DiagnosticCode#DELETE_OUTPUT_CLAUSE_NOT_SUPPORTED}, 3422 * {@link 
DiagnosticCode#DELETE_ORDER_BY_OR_LIMIT_NOT_SUPPORTED}. 3423 * 3424 * <p>WHERE-side subqueries reuse the existing 3425 * {@link DiagnosticCode#WHERE_HAS_SUBQUERY_NOT_SUPPORTED} (no 3426 * new DELETE-side code) — consistent with slice-80 UPDATE WHERE 3427 * handling. Window functions in WHERE / ON reuse 3428 * {@link DiagnosticCode#CLAUSE_WINDOW_FUNCTION_LEAK}. 3429 * 3430 * <p>IR shape (slice 84 changes from slice 81): 3431 * <ul> 3432 * <li>{@code relations[]} — now carries TABLE-kind 3433 * {@link RelationSource}s for joined-DELETE FROM-side 3434 * sources, plus SUBQUERY-kind sources for {@code USING 3435 * (SELECT …)} extractions. Slice 81 left it empty. 3436 * Reference-identity filter excludes the target's own 3437 * TTable instance; the slice-82 walker-order swap (target 3438 * before relations[] in 3439 * {@link gudusoft.gsqlparser.ir.semantic.SqlSemanticAnalyzer#collectCatalogMissWarnings}) 3440 * handles same-qualified-name target+driver collisions 3441 * (e.g. MSSQL {@code DELETE FROM t FROM t spqh JOIN sp}).</li> 3442 * <li>{@code joinColumnRefs[]} — now carries ON-clause refs 3443 * collected from each JoinItem under a per-DELETE 3444 * {@link java.util.LinkedHashSet} for cross-JoinItem dedup 3445 * (slice-82 codex round-1 Q2 BLOCKING precedent).</li> 3446 * <li>The DELETE itself emits NO new cross-stmt 3447 * {@link LineageEdge}s — empty {@code outputColumns[]} 3448 * means there is no STATEMENT_OUTPUT(deleteIdx, …) anchor 3449 * for slice-83's SUBQUERY-kind emitter. 
Extracted 3450 * FROM-subqueries DO emit their own internal lineage edges 3451 * via {@code emitLineageForStatement} inside 3452 * {@link #processDirectSubqueryTable}.</li> 3453 * </ul> 3454 */ 3455 public static SemanticProgram buildDelete(TDeleteSqlStatement delete, 3456 NameBindingProvider provider) { 3457 if (delete == null) { 3458 throw new IllegalArgumentException("delete must not be null"); 3459 } 3460 if (provider == null) { 3461 throw new IllegalArgumentException("provider must not be null"); 3462 } 3463 3464 // 1) Slice 106 — admit top-level WITH on DELETE. Walks the CTE 3465 // list left-to-right, building each body as a preceding 3466 // StatementGraph and producing cteNameToStatementIndex + 3467 // ctePublishedColumns for the FROM-as-CTE branch in 3468 // buildDeleteRelation below. Mirrors the slice-105 UPDATE 3469 // walker. `stmts` / `lineage` allocated here (hoisted from the 3470 // prior slice-84 location) so the CTE walker can append. 3471 // DELETE_CTE_NOT_SUPPORTED stays declared-but-unreached 3472 // (slice 71/72/82/86/95/96/97/98/99/100/101/102/103/104/105 3473 // precedent). 3474 List<StatementGraph> stmts = new ArrayList<>(); 3475 List<LineageEdge> lineage = new ArrayList<>(); 3476 Map<String, List<String>> ctePublishedColumns = new LinkedHashMap<>(); 3477 Map<String, Integer> cteNameToStatementIndex = buildDeleteCteList( 3478 delete, provider, stmts, lineage, ctePublishedColumns); 3479 3480 // 2) Target table — defensive (parser usually rejects first). 
3481 TTable targetTable = delete.getTargetTable(); 3482 if (targetTable == null || targetTable.getTableName() == null) { 3483 throw new SemanticIRBuildException(Diagnostic.error( 3484 DiagnosticCode.DELETE_TARGET_MISSING, 3485 "DELETE statement has no resolvable target table", 3486 delete)); 3487 } 3488 String targetQName = targetTable.getTableName().toString(); 3489 if (targetQName == null || targetQName.isEmpty()) { 3490 throw new SemanticIRBuildException(Diagnostic.error( 3491 DiagnosticCode.DELETE_TARGET_MISSING, 3492 "DELETE target table name is empty", 3493 delete)); 3494 } 3495 3496 // 3) Slice 84 / Slice 92 — joined-DELETE discriminator. 3497 // Parser-probe-verified shapes: 3498 // - Admit (slice 84): PG `DELETE FROM t USING j` / MSSQL 3499 // `DELETE FROM t FROM t spqh JOIN sp` / MSSQL `DELETE spqh 3500 // FROM t spqh JOIN sp` / Snowflake DELETE-USING — all have 3501 // joins.size=0 and referenceJoins.size > 0. 3502 // - Admit (slice 92): MySQL `DELETE T1 FROM T1 [WHERE pred]` 3503 // self-reference — joins.size=1, refJoins.size=1, and all 3504 // three names (joins[0].table, refJoins[0].table, target) 3505 // agree case-insensitively. Semantically identical to 3506 // `DELETE FROM T1 [WHERE pred]`; produces the same IR shape. 3507 // - Reject (slice-81 code preserved for non-self-ref): 3508 // MySQL `DELETE T1, T2 FROM …` (joins.size=2) and 3509 // MySQL `DELETE FROM T1 USING T1, T2` (refJoins.size=2). 3510 // 3511 // Slice 84 drops the slice-81 `tables.size > 1` and 3512 // `fromSourceJoin != null` blanket rejects (both fire for 3513 // admit shapes; probe confirms no parser-reachable shape 3514 // needs them when joins.size == 0). Candidate (d) in §8.1.4 3515 // (Hive multi-insert) remains open for a future slice. 
3516 boolean mysqlSelfRef = false; 3517 if (delete.joins != null && delete.joins.size() > 0) { 3518 // Slice 92 — admit MySQL self-reference form: 3519 // DELETE T1 FROM T1 [WHERE …] 3520 // The check requires all three names to match (codex 3521 // plan-review rounds Q1+Q5 BLOCKING fix: checking only 3522 // joins[0] is insufficient — DELETE T1 FROM T2 would 3523 // incorrectly admit because joins[0]=T1=target but 3524 // refJoins[0]=T2≠target). 3525 mysqlSelfRef = isMysqlSelfReferenceDelete(delete, targetQName); 3526 if (!mysqlSelfRef) { 3527 throw new SemanticIRBuildException(Diagnostic.error( 3528 DiagnosticCode.DELETE_JOINED_NOT_SUPPORTED, 3529 "DELETE with multi-target / multi-USING clause is " 3530 + "not supported by SemanticIRBuilder.buildDelete; " 3531 + "slice 84 admits PG `DELETE FROM t USING j` and " 3532 + "MSSQL `DELETE FROM t FROM t JOIN s` shapes; " 3533 + "slice 92 admits MySQL " 3534 + "`DELETE T1 FROM T1 [WHERE …]` self-reference", 3535 delete)); 3536 } 3537 } 3538 3539 // 4) Slice 85 lifts the RETURNING / OUTPUT rejects on DELETE. 3540 // The cheap statement-level OUTPUT_INTO reject fires here so a 3541 // multi-violation shape routes to the cheaper structural code 3542 // first. {@code DELETE_RETURNING_CLAUSE_NOT_SUPPORTED} and 3543 // {@code DELETE_OUTPUT_CLAUSE_NOT_SUPPORTED} stay declared but 3544 // unreached (slice 71/72 retain-for-documentation precedent). 3545 if (delete.getOutputClause() != null 3546 && delete.getOutputClause().getIntoTable() != null) { 3547 throw new SemanticIRBuildException(Diagnostic.error( 3548 DiagnosticCode.OUTPUT_INTO_NOT_SUPPORTED, 3549 "DELETE OUTPUT ... 
INTO <target> writes a second target; " 3550 + "slice 85 admits projection-only OUTPUT", 3551 delete)); 3552 } 3553 if (delete.getOrderByClause() != null 3554 || delete.getLimitClause() != null) { 3555 throw new SemanticIRBuildException(Diagnostic.error( 3556 DiagnosticCode.DELETE_ORDER_BY_OR_LIMIT_NOT_SUPPORTED, 3557 "DELETE with ORDER BY / LIMIT (MySQL) is not " 3558 + "supported by SemanticIRBuilder.buildDelete; " 3559 + "slice 81 admits no row-pruning on DELETE", 3560 delete)); 3561 } 3562 3563 // 4.7) Slice 84 — extract FROM-subqueries from referenceJoins 3564 // (after slice 106's CTE walker so the CTE bodies precede any 3565 // extracted FROM-subquery in the program). Mirrors slice-83 3566 // UPDATE FROM-subquery extraction (which uses 3567 // update.getJoins()); here we use delete.getReferenceJoins(). 3568 // buildDelete owns fresh local stmts/lineage lists (allocated 3569 // in step 1 above) so exceptions propagate cleanly to the 3570 // caller — no snapshot/rollback wrapper. 3571 // 3572 // Slice 106 — forward cteNameToStatementIndex + 3573 // ctePublishedColumns so a nested SELECT inside an extracted 3574 // FROM-subquery body can resolve outer-WITH CTE references 3575 // (Resolver2 wires CTEScope; the maps are forwarded for parity 3576 // with the SELECT / MERGE / UPDATE call sites and so the 3577 // §N test for `USING (SELECT … FROM cte) sub` produces the 3578 // expected cross-stmt edge to the CTE body). 3579 // 3580 // Decorate the provider with the outer-WITH CTE name set so 3581 // the SELECT-side {@link #buildRelation} routes references to 3582 // those names through {@link RelationKind#CTE} (rather than 3583 // TABLE), which in turn makes 3584 // {@link #emitLineageForStatement} emit the cross-stmt 3585 // {@code STATEMENT_OUTPUT(subIdx,col) → 3586 // STATEMENT_OUTPUT(cteIdx,col)} edge required by §N. This 3587 // mirrors the SELECT-side outer-WITH walker 3588 // (see {@link #build}'s {@code outerProvider}). 
3589 NameBindingProvider providerWithCte = cteNameToStatementIndex.isEmpty() 3590 ? provider 3591 : provider.withCteContext(cteNameToStatementIndex.keySet()); 3592 Map<String, Integer> subqueryAliasToIndex = 3593 extractDeleteFromSubqueries(delete, providerWithCte, stmts, lineage, 3594 cteNameToStatementIndex, ctePublishedColumns); 3595 Map<String, List<String>> deleteInScope = buildDeleteInScopeMap( 3596 delete, subqueryAliasToIndex, stmts, 3597 cteNameToStatementIndex, ctePublishedColumns); 3598 NameBindingProvider providerWithStar = deleteInScope.isEmpty() 3599 ? providerWithCte 3600 : providerWithCte.withInScopeRelationColumns(deleteInScope); 3601 3602 // 5) WHERE refs — slice 111 lifts the slice-81 blanket subquery 3603 // reject by routing uncorrelated predicate-subquery wrappers 3604 // (IN-SELECT / EXISTS / NOT EXISTS / scalar comparison / 3605 // ANY-ALL-SOME) through the slice-23+ JOIN-ON extraction pipeline 3606 // refactored by slice 110 to take a PredicateClauseContext. The 3607 // new DELETE_WHERE constant carries clause-specific 3608 // DiagnosticCode IDs (8 new DELETE_WHERE_* codes) and a 3609 // "DELETE WHERE clause" label. Each extracted wrapper lands as 3610 // its own <predicate_subquery_<i>> StatementGraph BEFORE the 3611 // DELETE (deleteIdx below = stmts.size() naturally accounts for 3612 // them — slice-83 dynamic-index pattern, slice 110 UPDATE 3613 // precedent). Remaining non-subquery refs flow into 3614 // filterColumnRefs via collectColumnRefsSkipping (or 3615 // collectColumnRefsTolerant on the slice-92 MySQL self-ref 3616 // path). Window functions in non-subquery subtrees still reject 3617 // via rejectWindowFunctionInScopeSkipping. Slice 84 — 3618 // providerWithStar so WHERE refs against extracted subquery 3619 // aliases bind correctly (slice-83 precedent). 
3620 // 3621 // Slice 106 — providerWithCte (then providerWithStar on top of 3622 // it) already decorates the provider with the outer-WITH CTE 3623 // name set so the predicate body's inner SELECT routes 3624 // `FROM cte` refs through RelationKind.CTE and 3625 // emitLineageForStatement emits the 3626 // STATEMENT_OUTPUT(subIdx,col) → STATEMENT_OUTPUT(cteIdx,col) 3627 // cross-stmt edge (slice 110 UPDATE precedent). 3628 List<ColumnRef> filterRefs; 3629 TWhereClause where = delete.getWhereClause(); 3630 if (where == null || where.getCondition() == null) { 3631 filterRefs = Collections.<ColumnRef>emptyList(); 3632 } else { 3633 Set<TExpression> extractedWhereRoots = 3634 Collections.<TExpression>emptySet(); 3635 if (containsAnySubquery(where)) { 3636 extractedWhereRoots = 3637 extractUncorrelatedPredicateSubqueriesFromClause( 3638 where.getCondition(), providerWithStar, 3639 stmts, lineage, cteNameToStatementIndex, 3640 PredicateClauseContext.DELETE_WHERE); 3641 rejectAnyRemainingSubqueriesFromClause( 3642 where.getCondition(), extractedWhereRoots, 3643 PredicateClauseContext.DELETE_WHERE); 3644 } 3645 rejectWindowFunctionInScopeSkipping(where, "WHERE clause", 3646 extractedWhereRoots); 3647 // Codex diff-review P1 fix: for MySQL self-reference DELETE the 3648 // MySQL parser puts 3 T1 instances in stmt.tables (target + 3649 // joins[0] + refJoins[0]), making Resolver2's inferredCandidates 3650 // see 3 candidates for any unqualified column → NOT_FOUND → 3651 // COLUMN_BINDING_NON_EXACT. Use a tolerant collector for the 3652 // self-ref path: EXACT_MATCH bindings (qualified refs) are 3653 // preserved verbatim; non-exact bindings emit the column ref with 3654 // the SQL-written qualifier (null for unqualified refs) instead of 3655 // throwing. Qualified refs like WHERE T1.id = 1 still get full 3656 // EXACT_MATCH treatment; only WHERE id = 1 (no qualifier) falls 3657 // back to the tolerant path. 
Slice 111 — both helpers now skip 3658 // extracted predicate-subquery subtrees so inner refs do not 3659 // leak into outer filterColumnRefs. 3660 filterRefs = mysqlSelfRef 3661 ? collectColumnRefsTolerant(where, providerWithStar, 3662 targetQName, extractedWhereRoots) 3663 : collectColumnRefsSkipping(where, providerWithStar, 3664 extractedWhereRoots); 3665 } 3666 3667 // 5.5) Slice 84 — walk delete.getReferenceJoins() to populate 3668 // relations[] (TABLE-kind FROM-side sources, target excluded 3669 // by reference identity; SUBQUERY-kind for USING (SELECT …)) 3670 // and joinColumnRefs[] (ON-clause refs across all JoinItems). 3671 // Mirrors slice-82 buildUpdate's FROM walk, with the 3672 // `update.getJoins()` source replaced by 3673 // `delete.getReferenceJoins()`. 3674 // 3675 // Slice 92 — for MySQL self-reference DELETE T1 FROM T1, 3676 // refJoins[0] is the same table as the target; skip the loop 3677 // so relations[] stays empty (mirrors the slice-81 single-target 3678 // contract). Resolver2's ScopeBuilder has already registered 3679 // the FROM-clause table (including any alias) via the 3680 // `referenceJoins` walk in preVisit(TDeleteSqlStatement), so 3681 // WHERE refs resolve correctly even without a relations[] entry. 3682 List<RelationSource> relations = new ArrayList<>(); 3683 // Slice-82 codex round-1 Q2 BLOCKING precedent — joinRefs 3684 // accumulates across multiple JoinItems in chained-JOIN 3685 // shapes. LinkedHashSet ensures cross-JoinItem dedup. 
3686 java.util.LinkedHashSet<ColumnRef> joinRefsSet = 3687 new java.util.LinkedHashSet<>(); 3688 TJoinList refJoins = delete.getReferenceJoins(); 3689 if (!mysqlSelfRef && refJoins != null) { 3690 for (int ji = 0; ji < refJoins.size(); ji++) { 3691 TJoin join = refJoins.getJoin(ji); 3692 TTable leftTable = join.getTable(); 3693 // Slice 106 — threads cteNameToStatementIndex so the 3694 // FROM-driver buildDeleteRelation call can route 3695 // objectname-typed CTE references to a SUBQUERY-kind 3696 // RelationSource pointing at the CTE statement. 3697 buildDeleteRelation(leftTable, targetTable, relations, delete, 3698 cteNameToStatementIndex); 3699 TJoinItemList items = join.getJoinItems(); 3700 if (items == null) continue; 3701 for (int i = 0; i < items.size(); i++) { 3702 TJoinItem item = items.getJoinItem(i); 3703 // Slice 106 — threads cteNameToStatementIndex through 3704 // the JoinItem walker so JOIN-side CTE refs (MSSQL 3705 // `FROM target t JOIN cte ON …`) get SUBQUERY-kind 3706 // RelationSource emission. 3707 buildDeleteJoinItem(item, targetTable, providerWithStar, 3708 relations, joinRefsSet, delete, 3709 cteNameToStatementIndex); 3710 } 3711 } 3712 } 3713 List<ColumnRef> joinRefs = new ArrayList<>(joinRefsSet); 3714 3715 // 6) Build the DELETE outer. 3716 // - relations[] may be non-empty for joined DELETE (slice 84); 3717 // empty for single-target DELETE (slice 81 contract). 3718 // - target.columns empty by design — DELETE removes whole rows. 3719 RelationBinding targetBinding = new RelationBinding( 3720 RelationKind.TABLE, targetQName); 3721 TargetRelation target = new TargetRelation( 3722 targetBinding, Collections.<String>emptyList()); 3723 3724 // Slice 85 — build RETURNING / OUTPUT projection columns BEFORE 3725 // the StatementGraph so the new returningColumns slot can be 3726 // populated. deleteIdx mirrors the slice-84 stmts.size() pattern. 
3727 int deleteIdx = stmts.size(); 3728 // DELETE target alias = effective alias from the target's 3729 // TTable (slice-84 convention). 3730 String deleteTargetAlias = effectiveAliasOf(targetTable); 3731 if (deleteTargetAlias == null || deleteTargetAlias.isEmpty()) { 3732 deleteTargetAlias = targetQName; 3733 } 3734 List<OutputColumn> returningColumns = buildReturningColumns( 3735 delete.getReturningClause(), 3736 delete.getOutputClause(), 3737 "DELETE", 3738 targetQName, 3739 deleteTargetAlias, 3740 /*targetTable=*/ targetTable, 3741 relations, 3742 providerWithStar, 3743 deleteIdx, 3744 lineage, 3745 delete); 3746 3747 StatementGraph deleteStmt = new StatementGraph( 3748 /*name=*/ null, 3749 "DELETE", 3750 relations, 3751 /*outputColumns=*/ Collections.<OutputColumn>emptyList(), 3752 returningColumns, 3753 filterRefs, 3754 joinRefs, 3755 /*groupByColumnRefs=*/ Collections.<ColumnRef>emptyList(), 3756 /*havingColumnRefs=*/ Collections.<ColumnRef>emptyList(), 3757 /*orderByColumnRefs=*/ Collections.<ColumnRef>emptyList(), 3758 /*distinctOnColumnRefs=*/ Collections.<ColumnRef>emptyList(), 3759 /*distinct=*/ false, 3760 /*setOperator=*/ null, 3761 /*rowLimit=*/ null, 3762 target); 3763 3764 stmts.add(deleteStmt); 3765 // Slice 85 — extracted FROM-subqueries have already emitted 3766 // their own internal lineage edges into `lineage` via 3767 // processDirectSubqueryTable; buildReturningColumns also 3768 // already appended STATEMENT_OUTPUT(deleteIdx, retName) → 3769 // TABLE_COLUMN(targetQName, baseCol) edges above. No further 3770 // edges are needed. 3771 return new SemanticProgram(stmts, lineage); 3772 } 3773 3774 /** 3775 * Slice 94 — admit the single-target MERGE skeleton: 3776 * <pre> 3777 * MERGE INTO target [AS] tgt 3778 * USING (source_table | (SELECT ...) ) [AS] src 3779 * ON <join condition> 3780 * WHEN MATCHED [AND <cond>] THEN UPDATE SET c1 = expr1 [, ...] 3781 * WHEN NOT MATCHED [AND <cond>] THEN INSERT [(c1, ...)] VALUES (expr1, ...) 
3782 * WHEN MATCHED [AND <cond>] THEN DELETE 3783 * </pre> 3784 * 3785 * <p>Emits one {@code "MERGE"}-kind {@link StatementGraph} carrying: 3786 * <ul> 3787 * <li>{@link TargetRelation} on {@code getTarget()} only — slice 3788 * 78/80 contract: target lives on the dedicated target slot, 3789 * NOT in {@code relations[]}. The slice-77/79 catalog walker 3790 * fires the kind-discriminated "MERGE target relation 'X'" 3791 * message via {@code targetWarnMessage("MERGE")}.</li> 3792 * <li>{@code relations[]} = one entry for the USING source 3793 * (TABLE-kind base table or SUBQUERY-kind aliased subquery). 3794 * The slice-77 FROM walker fires "FROM relation 'X'" for 3795 * missing source.</li> 3796 * <li>{@code outputColumns[]} = empty (MERGE has no projection).</li> 3797 * <li>{@code joinColumnRefs[]} = ON condition refs + per-WHEN AND 3798 * condition refs, LinkedHashSet-deduplicated (slice 82 3799 * pattern).</li> 3800 * <li>{@code filterColumnRefs[]} = per-WHEN action WHERE refs 3801 * (UPDATE WHERE, UPDATE...DELETE WHERE, INSERT WHERE; slice 3802 * 95). Empty when no action WHERE is present.</li> 3803 * </ul> 3804 * 3805 * <p>Per-WHEN action lineage: 3806 * <ul> 3807 * <li>{@code WHEN MATCHED THEN UPDATE SET col_i = expr_i}: emit 3808 * one {@link LineageEdge} per (target col, RHS source ref) 3809 * pair as {@code TABLE_COLUMN(target,col) ← <ref>} — direct, 3810 * no STATEMENT_OUTPUT intermediate (MERGE has no SELECT 3811 * projection). Codex round-2 Q4 confirmed YES.</li> 3812 * <li>{@code WHEN NOT MATCHED THEN INSERT (c1, ...) VALUES (e1, ...)}: 3813 * same pattern — one edge per (insert col, source ref).</li> 3814 * <li>{@code WHEN MATCHED THEN DELETE}: no per-column lineage 3815 * (slice 81 DELETE contract).</li> 3816 * <li>{@code WHEN MATCHED [AND <cond>] THEN DO NOTHING} (PG 15+, 3817 * slice 96): admitted as a no-op action. No per-column 3818 * lineage (slice 81 DELETE precedent). 
Per-WHEN AND 3819 * condition refs still feed {@code joinColumnRefs[]} via 3820 * the pre-dispatch block.</li> 3821 * <li>{@code WHEN NOT MATCHED BY SOURCE [AND <cond>] THEN 3822 * UPDATE SET ... | DELETE} (SQL Server, slice 97): 3823 * admitted with the SQL Server semantic invariant that 3824 * SET RHS and per-WHEN AND cond may not reference USING 3825 * source columns (no source row exists when the action 3826 * fires). Source-side refs reject with 3827 * {@link DiagnosticCode#MERGE_NOT_MATCHED_BY_SOURCE_REFERENCES_SOURCE}. 3828 * INSERT on BY SOURCE is parser-admitted but semantically 3829 * invalid; rejects with 3830 * {@link DiagnosticCode#MERGE_NOT_MATCHED_BY_SOURCE_INSERT_NOT_VALID}. 3831 * UPDATE target self-refs ({@code t.a = t.b}) emit no 3832 * lineage edges (slice-94 alias-filter convention; codex 3833 * round-1 Q2 confirmed). PG 17+ BY SOURCE syntax still 3834 * parses as type 2 plain NOT MATCHED in parser 4.1.5.0 3835 * — that parser gap is not addressed in slice 97.</li> 3836 * </ul> 3837 * 3838 * <p>For USING-subquery, the inner SELECT is built via {@link #build} 3839 * and appended as a preceding {@link StatementGraph}; its inner 3840 * lineage edges are rebased by the current statement-list offset so 3841 * STATEMENT_OUTPUT indices stay valid (slice 78 INSERT pattern). 3842 * 3843 * <p>Resolver2 already handles MERGE via {@link gudusoft.gsqlparser.resolver2.scope.MergeScope} 3844 * — both USING base tables and USING subqueries surface as 3845 * {@code sourceTable + EXACT_MATCH} bindings on RHS / VALUES / 3846 * ON / WHEN-AND refs. Codex round-2 Q5 BLOCKING fix: we install 3847 * an explicit slice-83-style published-column map only for 3848 * USING subqueries (deterministic; cheap; matches the SELECT- 3849 * side FROM-subquery pattern even when redundant). 
3850 */ 3851 public static SemanticProgram buildMerge(TMergeSqlStatement merge, 3852 NameBindingProvider provider) { 3853 if (merge == null) { 3854 throw new IllegalArgumentException("merge must not be null"); 3855 } 3856 if (provider == null) { 3857 throw new IllegalArgumentException("provider must not be null"); 3858 } 3859 // Slice 94 — defensive UsingScope reset at entry. MERGE does 3860 // not produce its own UsingScope but a parent context might 3861 // (e.g. nested-statement contexts); mirrors slice 80 / 86 3862 // buildUpdate hygiene. 3863 provider = provider.withUsingScope(UsingScope.EMPTY); 3864 3865 // Slice 101 — hoist allocations earlier so buildMergeCteList can 3866 // append CTE bodies as preceding statements. The slice-94 reject 3867 // at this location is replaced by the CTE walker below. 3868 List<StatementGraph> stmts = new ArrayList<>(); 3869 List<LineageEdge> lineage = new ArrayList<>(); 3870 3871 // 1) Slice 101 — admit top-level WITH on MERGE. Walks CTE list 3872 // left-to-right, building each body as a preceding statement. 3873 // Produces cteNameToStatementIndex + ctePublishedColumns for the 3874 // USING-as-CTE branch below. Mirrors SELECT-side build() at 3875 // lines 516-653. `MERGE_CTE_NOT_SUPPORTED` stays declared-but- 3876 // unreached for API stability (slice 71/72/82/86/95/96/97/98/99/100 3877 // precedent). 3878 Map<String, List<String>> ctePublishedColumns = new LinkedHashMap<>(); 3879 Map<String, Integer> cteNameToStatementIndex = buildMergeCteList( 3880 merge, provider, stmts, lineage, ctePublishedColumns); 3881 3882 // 2) Target table — defensive. 
3883 TTable targetTable = merge.getTargetTable(); 3884 if (targetTable == null || targetTable.getTableName() == null) { 3885 throw new SemanticIRBuildException(Diagnostic.error( 3886 DiagnosticCode.MERGE_TARGET_MISSING, 3887 "MERGE statement has no resolvable target table", 3888 merge)); 3889 } 3890 String targetQName = targetTable.getTableName().toString(); 3891 if (targetQName == null || targetQName.isEmpty()) { 3892 throw new SemanticIRBuildException(Diagnostic.error( 3893 DiagnosticCode.MERGE_TARGET_MISSING, 3894 "MERGE target table name is empty", 3895 merge)); 3896 } 3897 3898 // 3) USING source — defensive. 3899 TTable usingTable = merge.getUsingTable(); 3900 if (usingTable == null) { 3901 throw new SemanticIRBuildException(Diagnostic.error( 3902 DiagnosticCode.MERGE_USING_SOURCE_MISSING, 3903 "MERGE statement has no USING source", 3904 merge)); 3905 } 3906 3907 // 4) ON condition — defensive (parser usually rejects first). 3908 TExpression onCondition = merge.getCondition(); 3909 if (onCondition == null) { 3910 throw new SemanticIRBuildException(Diagnostic.error( 3911 DiagnosticCode.MERGE_ON_CONDITION_MISSING, 3912 "MERGE statement has no ON condition", 3913 merge)); 3914 } 3915 3916 // 5) OUTPUT INTO / RETURNING / LIMIT / error logging rejects. 3917 // Slice 98 lifts MSSQL MERGE OUTPUT projection (non-INTO) via 3918 // the slice-85 buildReturningColumns walker; the actual call 3919 // is deferred until after step 8 because the walker needs the 3920 // populated relations[] (USING source). OUTPUT INTO continues 3921 // to reject (writes a second target). The RETURNING-clause 3922 // branch stays declared-but-unreached: PG parser drops 3923 // MERGE RETURNING silently, Oracle PARSE_FAILED, Couchbase 3924 // has no test reach (slice 71/72/82/86/95/96/97 precedent). 
3925 if (merge.getOutputClause() != null 3926 && merge.getOutputClause().getIntoTable() != null) { 3927 throw new SemanticIRBuildException(Diagnostic.error( 3928 DiagnosticCode.OUTPUT_INTO_NOT_SUPPORTED, 3929 "MERGE OUTPUT ... INTO <target> writes a second " 3930 + "target; slice 98 admits OUTPUT projection only", 3931 merge)); 3932 } 3933 if (merge.getReturningClause() != null) { 3934 throw new SemanticIRBuildException(Diagnostic.error( 3935 DiagnosticCode.MERGE_RETURNING_CLAUSE_NOT_SUPPORTED, 3936 "MERGE RETURNING projection (Oracle / Couchbase) is " 3937 + "not supported by SemanticIRBuilder.buildMerge", 3938 merge)); 3939 } 3940 if (merge.getLimitClause() != null) { 3941 throw new SemanticIRBuildException(Diagnostic.error( 3942 DiagnosticCode.MERGE_LIMIT_NOT_SUPPORTED, 3943 "MERGE with LIMIT (Couchbase) is not supported by " 3944 + "SemanticIRBuilder.buildMerge", 3945 merge)); 3946 } 3947 if (merge.getErrorLoggingClause() != null) { 3948 throw new SemanticIRBuildException(Diagnostic.error( 3949 DiagnosticCode.MERGE_ERROR_LOGGING_NOT_SUPPORTED, 3950 "MERGE LOG ERRORS INTO (Oracle) is not supported by " 3951 + "SemanticIRBuilder.buildMerge", 3952 merge)); 3953 } 3954 3955 // 6) Build USING source RelationSource. If USING is a subquery, 3956 // extract it as a preceding StatementGraph and emit a SUBQUERY- 3957 // kind RelationSource that points at it; slice-83 pattern. 3958 // Otherwise emit a TABLE-kind RelationSource. 3959 // Slice 101 — `stmts` / `lineage` were hoisted to the top of 3960 // buildMerge so the CTE walker can append its preceding CTE 3961 // body statements first. Do NOT re-declare them here. 3962 String usingAlias = effectiveAliasOf(usingTable); 3963 if (usingAlias == null || usingAlias.isEmpty()) { 3964 usingAlias = (usingTable.getName() == null 3965 || usingTable.getName().toString().isEmpty()) 3966 ? 
"__merge_using__" 3967 : usingTable.getName().toString(); 3968 } 3969 boolean usingIsSubquery = usingTable.getTableType() 3970 == gudusoft.gsqlparser.ETableSource.subquery; 3971 List<RelationSource> relations = new ArrayList<>(); 3972 Map<String, List<String>> mergeInScope = new LinkedHashMap<>(); 3973 NameBindingProvider providerWithStar = provider; 3974 // Slice 94 — alias resolution maps for the per-WHEN action 3975 // lineage emitter. TABLE-kind sources map alias → qualifiedName; 3976 // SUBQUERY-kind sources map alias → statement index of the 3977 // extracted inner SELECT. A SEPARATE `targetAliases` set 3978 // identifies refs whose relationAlias is the target alias 3979 // (codex round-1 diff Q1 BLOCKING — without this separation, 3980 // a self-merge where USING happens to share the target's name 3981 // would mis-classify the source alias as the target alias). 3982 Map<String, String> aliasToTableQName = new HashMap<>(); 3983 Map<String, Integer> aliasToSubIdx = new HashMap<>(); 3984 Set<String> targetAliases = new HashSet<>(); 3985 String targetAlias = effectiveAliasOf(targetTable); 3986 if (targetAlias != null && !targetAlias.isEmpty()) { 3987 targetAliases.add(targetAlias.toLowerCase(Locale.ROOT)); 3988 } 3989 targetAliases.add(targetQName.toLowerCase(Locale.ROOT)); 3990 if (usingIsSubquery) { 3991 TSelectSqlStatement usingSelect = usingTable.getSubquery(); 3992 if (usingSelect == null) { 3993 throw new SemanticIRBuildException(Diagnostic.error( 3994 DiagnosticCode.MERGE_SOURCE_NOT_SUPPORTED, 3995 "MERGE USING declared as subquery but no inner " 3996 + "SELECT statement was attached", 3997 merge)); 3998 } 3999 // Slice 110 — known parity gap with slice-110 UPDATE-side 4000 // and slice-106 DELETE-side fixes: when the outer MERGE has 4001 // a CTE (slice 101) and the USING subquery body references 4002 // it (`MERGE INTO t USING (SELECT ... FROM cte) s ON ...`), 4003 // the `provider` passed here lacks `withCteContext`. 
The 4004 // proper fix is non-trivial because `build()` is the public 4005 // entry and creates a fresh local `cteNameToStatementIndex` 4006 // for the inner SELECT — adding `withCteContext` here would 4007 // classify the inner `cte` ref as CTE-kind but 4008 // `emitLineageForStatement` would then fail because the 4009 // inner build's own `cteNameToStatementIndex` is empty. 4010 // Fixing this requires plumbing the outer's CTE name+index 4011 // map into the inner build, similar to slice-93's 4012 // `appendOneHiveInsert` / slice-108's 4013 // `buildSelectBodyAfterCteWalk`. Deferred to a follow-up 4014 // slice; the rare shape currently produces correct REF 4015 // classification (Resolver2's CTEScope binds correctly) 4016 // but may miss the cross-stmt STATEMENT_OUTPUT edge to the 4017 // CTE body. Codex round-2 Q4 NO BLOCKING; addressed by 4018 // explicit documentation here. 4019 SemanticProgram inner = build(usingSelect, provider); 4020 int offset = stmts.size(); 4021 stmts.addAll(inner.getStatements()); 4022 for (LineageEdge e : inner.getLineage()) { 4023 lineage.add(rebaseLineageEdge(e, offset)); 4024 } 4025 int subIdx = stmts.size() - 1; 4026 // Codex round-2 Q5 BLOCKING fix: install slice-83-style 4027 // in-scope map for USING subquery columns, scoped only to 4028 // the USING alias (codex round-3 Q2: ensure scoped to 4029 // USING alias only, no override of target / base-table). 
4030 StatementGraph usingOuter = stmts.get(subIdx); 4031 List<String> publishedCols = new ArrayList<>(); 4032 for (OutputColumn oc : usingOuter.getOutputColumns()) { 4033 if (oc.getName() != null && !oc.getName().isEmpty()) { 4034 publishedCols.add(oc.getName()); 4035 } 4036 } 4037 mergeInScope.put( 4038 usingAlias.toLowerCase(Locale.ROOT), publishedCols); 4039 providerWithStar = provider.withInScopeRelationColumns( 4040 mergeInScope); 4041 relations.add(new RelationSource(usingAlias, 4042 new RelationBinding(RelationKind.SUBQUERY, usingAlias))); 4043 aliasToSubIdx.put( 4044 usingAlias.toLowerCase(Locale.ROOT), subIdx); 4045 } else { 4046 // Slice 101 — USING-as-CTE detection. When MERGE has a WITH 4047 // clause and the USING bare name matches a CTE declared in 4048 // that WITH clause, route to a SUBQUERY-kind RelationSource 4049 // pointing at the CTE's already-built statement index. This 4050 // ensures: 4051 // (a) lineage edges flow to STATEMENT_OUTPUT(cteIdx, col), 4052 // not the fictitious TABLE_COLUMN(cteName, col); 4053 // (b) the slice-77 catalog-miss WARN walker (which walks 4054 // only TABLE-kind RelationSources) skips the CTE name; 4055 // (c) Resolver2-bound CTE refs (probe 2026-05-17: status 4056 // EXACT_MATCH with sourceTable=<cteName>) flow through 4057 // the same emitMergeLineageEdge dispatch. 4058 // Case-insensitive lookup matches SQL identifier semantics. 4059 String usingBareName = (usingTable.getName() == null) 4060 ? "" 4061 : usingTable.getName().toString(); 4062 String usingBareNameLower = 4063 usingBareName.toLowerCase(Locale.ROOT); 4064 Integer cteIdx = usingBareNameLower.isEmpty() 4065 ? null 4066 : cteNameToStatementIndex.get(usingBareNameLower); 4067 if (cteIdx != null) { 4068 // USING references a declared CTE. 
4069 List<String> publishedCols = ctePublishedColumns.get( 4070 usingBareNameLower); 4071 if (publishedCols == null) { 4072 publishedCols = new ArrayList<>(); 4073 } 4074 mergeInScope.put( 4075 usingAlias.toLowerCase(Locale.ROOT), 4076 publishedCols); 4077 providerWithStar = provider.withInScopeRelationColumns( 4078 mergeInScope); 4079 relations.add(new RelationSource(usingAlias, 4080 new RelationBinding( 4081 RelationKind.SUBQUERY, usingAlias))); 4082 aliasToSubIdx.put( 4083 usingAlias.toLowerCase(Locale.ROOT), cteIdx); 4084 // Also register the bare CTE name in case the SQL 4085 // omits the alias (e.g. `USING src ON ...` with no 4086 // trailing alias). Mirrors the TABLE-kind branch 4087 // (line below) which also registers the bare name. 4088 aliasToSubIdx.put(usingBareNameLower, cteIdx); 4089 } else { 4090 // TABLE-kind USING — use the source table's qualified name 4091 // as the binding's qualifiedName so the slice-77 catalog 4092 // walker can find it. 4093 String usingQName = (usingTable.getTableName() == null) 4094 ? usingAlias 4095 : usingTable.getTableName().toString(); 4096 relations.add(new RelationSource(usingAlias, 4097 new RelationBinding(RelationKind.TABLE, usingQName))); 4098 aliasToTableQName.put( 4099 usingAlias.toLowerCase(Locale.ROOT), usingQName); 4100 // Also register the bare name in case the SQL omits the 4101 // alias (e.g. `USING managers ON ...` without `s`). 4102 aliasToTableQName.put( 4103 usingQName.toLowerCase(Locale.ROOT), usingQName); 4104 } 4105 } 4106 4107 // 7) Walk ON condition + per-WHEN AND conditions to build 4108 // joinColumnRefs[] with LinkedHashSet dedup (slice 82 pattern). 4109 // Reject ON-side subqueries: not supported in this slice; users 4110 // can still use a USING subquery for complex source logic. 
4111 if (containsAnySubqueryExpression(onCondition)) { 4112 throw new SemanticIRBuildException(Diagnostic.error( 4113 DiagnosticCode.MERGE_WHEN_CONDITION_HAS_SUBQUERY_NOT_SUPPORTED, 4114 "MERGE ON condition contains a subquery; slice 94 " 4115 + "admits scalar-only ON conditions", 4116 merge)); 4117 } 4118 rejectWindowFunctionInScope(onCondition, "MERGE ON condition"); 4119 LinkedHashSet<ColumnRef> joinRefsSet = new LinkedHashSet<>(); 4120 joinRefsSet.addAll(collectColumnRefs(onCondition, providerWithStar)); 4121 // Slice 95 — per-WHEN action WHERE refs (UPDATE WHERE, 4122 // UPDATE...DELETE WHERE, INSERT WHERE) accumulate here. 4123 // Slice 94 left these refs silently dropped; slice 95 routes 4124 // them through filterColumnRefs[] (slice-80 UPDATE WHERE 4125 // precedent) — distinct from joinColumnRefs[] which holds 4126 // ON + WHEN-AND match conditions. 4127 LinkedHashSet<ColumnRef> filterRefsSet = new LinkedHashSet<>(); 4128 4129 // 8) Per-WHEN clause loop. Validate type, dispatch to action 4130 // builder, accumulate joinColumnRefs and lineage edges. 4131 TargetRelation targetRel = null; 4132 List<String> targetColumnNames = new ArrayList<>(); 4133 // Stable-order map: target col spelling (lower-cased) → 4134 // verbatim spelling encountered first. Iterating WHEN clauses 4135 // in order naturally produces SET-LHS first, then INSERT 4136 // column-list, matching the plan v3 column ordering rule. 4137 Map<String, String> seenTargetCols = new LinkedHashMap<>(); 4138 // LineageEdge dedup spans the whole MERGE on 4139 // (target column lower-case, source ref lower-case key). 
4140 Set<String> emittedEdgeKeys = new HashSet<>(); 4141 4142 if (merge.getWhenClauses() == null 4143 || merge.getWhenClauses().size() == 0) { 4144 throw new SemanticIRBuildException(Diagnostic.error( 4145 DiagnosticCode.MERGE_WHEN_NO_ACTION, 4146 "MERGE statement has no WHEN clauses", 4147 merge)); 4148 } 4149 // Slice 116 — providerWithCteForActionWhere decorates 4150 // providerWithStar with withCteContext so the predicate body's 4151 // inner SELECT's `FROM cte` refs route through 4152 // RelationKind.CTE (slice 110 documented this is required for 4153 // emitLineageForStatement to emit STATEMENT_OUTPUT → 4154 // STATEMENT_OUTPUT edges into the CTE body). Hoisted here once 4155 // (cteNameToStatementIndex is finalized by line 3856 well 4156 // before the per-WHEN loop; recomputing per-WHEN would be 4157 // wasteful — codex diff-review Q1 advisory). The decoration 4158 // is scoped to collectMergeActionWhere only; providerWithStar 4159 // elsewhere stays unchanged so the WHEN AND condition (line 4160 // ~4153) and per-action SET/INSERT walkers see the original 4161 // provider — they already have their own slice-94 subquery 4162 // rejects so no asymmetric resolution surfaces. 4163 final NameBindingProvider providerWithCteForActionWhere = 4164 cteNameToStatementIndex.isEmpty() 4165 ? providerWithStar 4166 : providerWithStar.withCteContext( 4167 cteNameToStatementIndex.keySet()); 4168 // Slice 118 — build the MERGE correlation scope once (target + 4169 // USING source + outer CTEs) and thread through every per-WHEN 4170 // action WHERE call so correlated predicate subqueries promote 4171 // outer-aliased refs into OUTER_REFERENCE relations instead of 4172 // rejecting them. Mirrors the slice-117 pattern (UPDATE 4173 // SET-RHS correlated scalars). The scope is null-safe — every 4174 // value type inside flows from already-computed buildMerge 4175 // state (targetTable / usingTable / aliasToSubIdx / 4176 // cteNameToStatementIndex). 
4177 final EnclosingScope mergeCorrelationScope = 4178 buildMergeEnclosingScope(merge, cteNameToStatementIndex, 4179 aliasToSubIdx); 4180 for (int wi = 0; wi < merge.getWhenClauses().size(); wi++) { 4181 TMergeWhenClause when = merge.getWhenClauses().getElement(wi); 4182 // Slice 97 — BY SOURCE variants (SQL Server admits parser 4183 // types 7 / 8) are now admitted. PG 17+ syntax still parses 4184 // as type 2 (parser gap; slice 97 does not address). The 4185 // legacy MERGE_WHEN_NOT_MATCHED_BY_SOURCE_NOT_SUPPORTED 4186 // code stays declared-but-unreached (slice 71/72/82/86/95/96 4187 // precedent). 4188 boolean isNotMatchedBySource = 4189 when.getType() == TMergeWhenClause.not_matched_by_source 4190 || when.getType() 4191 == TMergeWhenClause.not_matched_by_source_with_condition; 4192 // Per-WHEN AND condition (matched_with_condition, 4193 // not_matched_with_condition, not_matched_by_target_with_condition, 4194 // not_matched_by_source_with_condition). 4195 TExpression whenCond = when.getCondition(); 4196 if (whenCond != null) { 4197 if (containsAnySubqueryExpression(whenCond)) { 4198 throw new SemanticIRBuildException(Diagnostic.error( 4199 DiagnosticCode.MERGE_WHEN_CONDITION_HAS_SUBQUERY_NOT_SUPPORTED, 4200 "MERGE WHEN AND condition contains a subquery; " 4201 + "slice 94 admits scalar-only WHEN " 4202 + "conditions", 4203 merge)); 4204 } 4205 rejectWindowFunctionInScope(whenCond, "MERGE WHEN AND condition"); 4206 List<ColumnRef> condRefs = 4207 collectColumnRefs(whenCond, providerWithStar); 4208 // Slice 97 — BY SOURCE branches forbid source-side 4209 // refs in the AND condition (no source row exists). 4210 if (isNotMatchedBySource) { 4211 rejectSourceRefsForBySource(condRefs, aliasToTableQName, 4212 aliasToSubIdx, "MERGE WHEN NOT MATCHED BY SOURCE " 4213 + "AND condition", merge); 4214 } 4215 joinRefsSet.addAll(condRefs); 4216 } 4217 // Dispatch to action. 
Slice 96 — DO NOTHING is a no-op 4218 // action (PG 15+): no SET/INSERT VALUES, no per-column 4219 // lineage (slice-81 DELETE precedent). Per-WHEN AND 4220 // condition refs were already collected into joinRefsSet 4221 // above. MERGE_DO_NOTHING_NOT_SUPPORTED stays declared- 4222 // but-unreached for API stability (slice 71/72/82/86/95 4223 // precedent). 4224 boolean isDoNothingAction = when.getDoNothingClause() != null; 4225 TMergeUpdateClause upd = when.getUpdateClause(); 4226 TMergeInsertClause ins = when.getInsertClause(); 4227 boolean isDeleteAction = when.getDeleteClause() != null; 4228 if (upd == null && ins == null && !isDeleteAction 4229 && !isDoNothingAction) { 4230 throw new SemanticIRBuildException(Diagnostic.error( 4231 DiagnosticCode.MERGE_WHEN_NO_ACTION, 4232 "MERGE WHEN clause #" + (wi + 1) 4233 + " has no UPDATE / INSERT / DELETE / " 4234 + "DO NOTHING action", 4235 merge)); 4236 } 4237 // Slice 95 — collect per-WHEN action WHERE refs into 4238 // filterRefsSet. Slice 116 — uncorrelated predicate-subquery 4239 // wrappers in those WHEREs now extract through the slice-23+ 4240 // pipeline via PredicateClauseContext.MERGE_WHEN_WHERE 4241 // (mirrors slice 110-114 lifts on UPDATE / DELETE / SELECT / 4242 // set-op branch / CTE-body WHEREs). Window functions still 4243 // reject via rejectWindowFunctionInScopeSkipping (slice 95 4244 // contract preserved). MERGE_UPDATE_DELETE_WHERE_NOT_SUPPORTED 4245 // remains declared-but-unreached (slice 71/72/82/86 4246 // precedent). The providerWithCteForActionWhere decoration 4247 // is hoisted ABOVE the per-WHEN loop (codex diff-review Q1 4248 // advisory) since cteNameToStatementIndex is finalized 4249 // before the loop. 
4250 if (upd != null) { 4251 collectMergeActionWhere(upd.getUpdateWhereClause(), 4252 "MERGE WHEN action UPDATE WHERE", 4253 providerWithCteForActionWhere, filterRefsSet, 4254 stmts, lineage, cteNameToStatementIndex, merge, 4255 mergeCorrelationScope); 4256 collectMergeActionWhere(upd.getDeleteWhereClause(), 4257 "MERGE WHEN action DELETE WHERE", 4258 providerWithCteForActionWhere, filterRefsSet, 4259 stmts, lineage, cteNameToStatementIndex, merge, 4260 mergeCorrelationScope); 4261 } 4262 if (ins != null) { 4263 collectMergeActionWhere(ins.getInsertWhereClause(), 4264 "MERGE WHEN action INSERT WHERE", 4265 providerWithCteForActionWhere, filterRefsSet, 4266 stmts, lineage, cteNameToStatementIndex, merge, 4267 mergeCorrelationScope); 4268 } 4269 // Slice 97 — BY SOURCE forbids INSERT semantically. MSSQL 4270 // parser admits the shape, so Semantic IR rejects. 4271 if (isNotMatchedBySource && ins != null) { 4272 throw new SemanticIRBuildException(Diagnostic.error( 4273 DiagnosticCode.MERGE_NOT_MATCHED_BY_SOURCE_INSERT_NOT_VALID, 4274 "MERGE WHEN NOT MATCHED BY SOURCE THEN INSERT is " 4275 + "not a valid SQL Server action; INSERT only " 4276 + "applies when the source row has no target " 4277 + "match. Slice 97 admits UPDATE / DELETE on " 4278 + "BY SOURCE branches.", 4279 merge)); 4280 } 4281 if (upd != null) { 4282 // Slice 97 — pre-walk SET RHS refs for BY SOURCE 4283 // branches to reject source-side refs before lineage 4284 // emission (Q1 in plan-review: keep helper branch- 4285 // agnostic; double-collect cost is bounded since 4286 // BY SOURCE UPDATEs are small). 
4287 if (isNotMatchedBySource) { 4288 rejectBySourceSetRhsRefs(upd, providerWithStar, 4289 aliasToTableQName, aliasToSubIdx, merge); 4290 } 4291 buildMergeUpdateAction(upd, targetQName, targetTable, 4292 providerWithStar, seenTargetCols, lineage, 4293 emittedEdgeKeys, aliasToTableQName, 4294 aliasToSubIdx, targetAliases, merge); 4295 } 4296 if (ins != null) { 4297 buildMergeInsertAction(ins, targetQName, targetTable, 4298 providerWithStar, seenTargetCols, lineage, 4299 emittedEdgeKeys, aliasToTableQName, 4300 aliasToSubIdx, targetAliases, merge); 4301 } 4302 // DELETE action: no per-column lineage (slice 81 contract). 4303 } 4304 4305 // 9) Build TargetRelation from accumulated target column spellings. 4306 for (String spelling : seenTargetCols.values()) { 4307 targetColumnNames.add(spelling); 4308 } 4309 targetRel = new TargetRelation( 4310 new RelationBinding(RelationKind.TABLE, targetQName), 4311 targetColumnNames); 4312 4313 // 9.5) Slice 98 — MSSQL MERGE OUTPUT projection. Reuses the 4314 // slice-85 buildReturningColumns walker with dmlKind="MERGE": 4315 // Pass 1.5's INSERT/DELETE pseudo-table mismatch check naturally 4316 // skips (MERGE is action-polymorphic — INSERTED and DELETED 4317 // may both legitimately appear). The walker handles $action via 4318 // a slice-98-specific short-circuit (derived OutputColumn, no 4319 // sources, no edges). mergeIdx = stmts.size() because the MERGE 4320 // StatementGraph is appended below; for USING-subquery shapes, 4321 // step 6 has already appended the extracted SELECT so the index 4322 // points at the upcoming MERGE position (slice-83 dynamic-index 4323 // pattern). Pass relations[] as fromSideRelations so unique 4324 // USING-alias matches resolve to the source qname (codex Q4). 
4325 int mergeIdx = stmts.size(); 4326 String mergeTargetAlias = effectiveAliasOf(targetTable); 4327 if (mergeTargetAlias == null || mergeTargetAlias.isEmpty()) { 4328 mergeTargetAlias = targetQName; 4329 } 4330 List<OutputColumn> returningCols = buildReturningColumns( 4331 /*ret=*/ null, 4332 /*out=*/ merge.getOutputClause(), 4333 "MERGE", 4334 targetQName, 4335 mergeTargetAlias, 4336 targetTable, 4337 /*fromSideRelations=*/ relations, 4338 providerWithStar, 4339 mergeIdx, 4340 lineage, 4341 merge); 4342 4343 // 10) Emit StatementGraph. joinColumnRefs[] = ON + WHEN-AND refs. 4344 // Slice 95: filterColumnRefs[] = per-WHEN action WHERE refs 4345 // (UPDATE WHERE, UPDATE...DELETE WHERE, INSERT WHERE). 4346 // Slice 98: returningColumns[] = MERGE OUTPUT projection. 4347 List<ColumnRef> joinRefs = new ArrayList<>(joinRefsSet); 4348 List<ColumnRef> filterRefs = new ArrayList<>(filterRefsSet); 4349 StatementGraph mergeStmt = new StatementGraph( 4350 /*name=*/ null, 4351 "MERGE", 4352 relations, 4353 /*outputColumns=*/ Collections.<OutputColumn>emptyList(), 4354 returningCols, 4355 filterRefs, 4356 joinRefs, 4357 /*groupByColumnRefs=*/ Collections.<ColumnRef>emptyList(), 4358 /*havingColumnRefs=*/ Collections.<ColumnRef>emptyList(), 4359 /*orderByColumnRefs=*/ Collections.<ColumnRef>emptyList(), 4360 /*distinctOnColumnRefs=*/ Collections.<ColumnRef>emptyList(), 4361 /*distinct=*/ false, 4362 /*setOperator=*/ null, 4363 /*rowLimit=*/ null, 4364 targetRel); 4365 stmts.add(mergeStmt); 4366 4367 return new SemanticProgram(stmts, lineage); 4368 } 4369 4370 /** 4371 * Slice 95 — collect column refs from a per-WHEN action WHERE 4372 * predicate ({@code TMergeUpdateClause.updateWhereClause}, 4373 * {@code TMergeUpdateClause.deleteWhereClause}, or 4374 * {@code TMergeInsertClause.insertWhereClause}) into the supplied 4375 * {@code filterRefsSet}. 
4376 * 4377 * <p>Slice 116 — lifts the slice-95 blanket subquery reject by 4378 * routing uncorrelated predicate-subquery wrappers (IN-SELECT / 4379 * EXISTS / NOT EXISTS / scalar comparison / ANY-ALL-SOME) through 4380 * the slice-23+ JOIN-ON extraction pipeline refactored by slice 4381 * 110 to take a {@link PredicateClauseContext}. The 4382 * {@link PredicateClauseContext#MERGE_WHEN_WHERE} constant reuses 4383 * the {@code SELECT_WHERE_*} DiagnosticCode family (slice 113/114 4384 * precedent — a MERGE-action WHERE IS a SELECT WHERE in shape) so 4385 * the enum count stays at 279; only the {@code clauseLabel} 4386 * differs so diagnostic messages can identify the MERGE-action 4387 * host context. 4388 * 4389 * <p>Each extracted wrapper lands as its own 4390 * {@code <predicate_subquery_<i>>} StatementGraph BEFORE the 4391 * MERGE statement (so {@code mergeIdx = stmts.size()} below in 4392 * {@code buildMerge} already accounts for them — slice-83 4393 * dynamic-index pattern, slice 110/111 precedent). Remaining 4394 * non-subquery refs flow into {@code filterRefsSet} via 4395 * {@link #collectColumnRefsSkipping}. Window functions in 4396 * non-subquery subtrees still reject via 4397 * {@link #rejectWindowFunctionInScopeSkipping} (slice 95 4398 * window-function contract preserved). 4399 * 4400 * <p>The supplied {@code provider} must carry 4401 * {@code withCteContext(cteMap.keySet())} so the predicate body's 4402 * inner SELECT's {@code FROM cte} refs route through 4403 * {@link RelationKind#CTE} — without that decoration, 4404 * {@code emitLineageForStatement} would lose the 4405 * STATEMENT_OUTPUT → STATEMENT_OUTPUT edge to the CTE body 4406 * (slice 110 documented gap for UPDATE WHERE — same applies here). 4407 * {@code buildMerge} composes the decoration once before the 4408 * per-WHEN loop. 4409 * 4410 * <p>Null-safe: returns immediately when the WHERE expression is 4411 * absent (slice-94 default; most WHEN clauses have no action 4412 * WHERE). 
4413 */ 4414 private static void collectMergeActionWhere(TExpression expr, 4415 String label, 4416 NameBindingProvider provider, 4417 LinkedHashSet<ColumnRef> filterRefsSet, 4418 List<StatementGraph> stmts, 4419 List<LineageEdge> lineage, 4420 Map<String, Integer> cteMap, 4421 TMergeSqlStatement merge, 4422 EnclosingScope correlationScope) { 4423 if (expr == null) { 4424 return; 4425 } 4426 Set<TExpression> extractedRoots = Collections.<TExpression>emptySet(); 4427 if (containsAnySubqueryExpression(expr)) { 4428 extractedRoots = 4429 extractUncorrelatedPredicateSubqueriesFromClause( 4430 expr, provider, stmts, lineage, cteMap, 4431 PredicateClauseContext.MERGE_WHEN_WHERE, 4432 correlationScope); 4433 rejectAnyRemainingSubqueriesFromClause(expr, extractedRoots, 4434 PredicateClauseContext.MERGE_WHEN_WHERE); 4435 } 4436 rejectWindowFunctionInScopeSkipping(expr, label, extractedRoots); 4437 filterRefsSet.addAll(collectColumnRefsSkipping(expr, provider, 4438 extractedRoots)); 4439 } 4440 4441 /** 4442 * Slice 97 — reject any source-aliased column ref in a 4443 * WHEN NOT MATCHED BY SOURCE branch. SQL Server forbids 4444 * source-side references in this branch because there is no 4445 * matching source row when the action fires. 4446 * 4447 * <p>A ref is "source-aliased" iff its 4448 * {@code relationAlias.toLowerCase(Locale.ROOT)} appears in 4449 * either alias map (TABLE-kind or SUBQUERY-kind USING source). 4450 * Refs whose alias is unknown to both maps are assumed to be 4451 * target-bound (slice-94 alias-filter convention) and are 4452 * left alone. 4453 * 4454 * <p>Walks all refs so a source ref nested inside an arbitrary 4455 * function call (e.g. {@code COALESCE(s.code, t.code)}) is 4456 * caught — {@code collectColumnRefs} descends through arbitrary 4457 * scalar expressions (codex round-1 Q3 confirmed YES). 
4458 */ 4459 private static void rejectSourceRefsForBySource(List<ColumnRef> refs, 4460 Map<String, String> aliasToTableQName, 4461 Map<String, Integer> aliasToSubIdx, 4462 String label, 4463 TMergeSqlStatement merge) { 4464 if (refs == null || refs.isEmpty()) { 4465 return; 4466 } 4467 for (ColumnRef r : refs) { 4468 String alias = r.getRelationAlias(); 4469 if (alias == null || alias.isEmpty()) { 4470 continue; 4471 } 4472 String key = alias.toLowerCase(Locale.ROOT); 4473 if (aliasToTableQName.containsKey(key) 4474 || aliasToSubIdx.containsKey(key)) { 4475 throw new SemanticIRBuildException(Diagnostic.error( 4476 DiagnosticCode.MERGE_NOT_MATCHED_BY_SOURCE_REFERENCES_SOURCE, 4477 label + " references USING source column '" 4478 + r + "'; WHEN NOT MATCHED BY SOURCE " 4479 + "branches must only reference target " 4480 + "columns or constants.", 4481 merge)); 4482 } 4483 } 4484 } 4485 4486 /** 4487 * Slice 97 — pre-walk the SET RHS of every assignment in a BY SOURCE 4488 * UPDATE action and reject any source-side ref. Called before 4489 * {@link #buildMergeUpdateAction} so the existing slice-94 helper 4490 * remains BY-SOURCE-agnostic. 4491 * 4492 * <p>Skips assignments that aren't shaped as a simple 4493 * {@code assignment_t} expression; those defects are caught by 4494 * {@link #buildMergeUpdateAction} with 4495 * {@link DiagnosticCode#UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED}. 4496 */ 4497 private static void rejectBySourceSetRhsRefs(TMergeUpdateClause upd, 4498 NameBindingProvider provider, 4499 Map<String, String> aliasToTableQName, 4500 Map<String, Integer> aliasToSubIdx, 4501 TMergeSqlStatement merge) { 4502 TResultColumnList sets = upd.getUpdateColumnList(); 4503 if (sets == null || sets.size() == 0) { 4504 return; 4505 } 4506 for (int i = 0; i < sets.size(); i++) { 4507 TResultColumn rc = sets.getResultColumn(i); 4508 TExpression assignment = (rc == null) ? 
null : rc.getExpr(); 4509 if (assignment == null 4510 || assignment.getExpressionType() != EExpressionType.assignment_t) { 4511 continue; 4512 } 4513 TExpression rhs = assignment.getRightOperand(); 4514 if (rhs == null) { 4515 continue; 4516 } 4517 // Subquery RHS would short-circuit later with 4518 // UPDATE_SET_HAS_SUBQUERY_NOT_SUPPORTED; ignore here. 4519 if (containsAnySubqueryExpression(rhs)) { 4520 continue; 4521 } 4522 List<ColumnRef> rhsRefs = collectColumnRefs(rhs, provider); 4523 rejectSourceRefsForBySource(rhsRefs, aliasToTableQName, 4524 aliasToSubIdx, 4525 "MERGE WHEN NOT MATCHED BY SOURCE UPDATE SET " 4526 + "assignment #" + (i + 1) + " RHS", 4527 merge); 4528 } 4529 } 4530 4531 /** 4532 * Slice 94 — process one WHEN MATCHED THEN UPDATE SET action. 4533 * Each {@code TResultColumn} carries an assignment_t TExpression 4534 * whose leftOperand is the SET LHS (target column reference) and 4535 * whose rightOperand is the value expression. We emit one 4536 * {@link LineageEdge} per (target col, RHS source ref) pair. 4537 */ 4538 private static void buildMergeUpdateAction(TMergeUpdateClause upd, 4539 String targetQName, 4540 TTable targetTable, 4541 NameBindingProvider provider, 4542 Map<String, String> seenTargetCols, 4543 List<LineageEdge> lineage, 4544 Set<String> emittedEdgeKeys, 4545 Map<String, String> aliasToTableQName, 4546 Map<String, Integer> aliasToSubIdx, 4547 Set<String> targetAliases, 4548 TMergeSqlStatement merge) { 4549 TResultColumnList sets = upd.getUpdateColumnList(); 4550 if (sets == null || sets.size() == 0) { 4551 return; 4552 } 4553 for (int i = 0; i < sets.size(); i++) { 4554 TResultColumn rc = sets.getResultColumn(i); 4555 TExpression assignment = (rc == null) ? 
null : rc.getExpr(); 4556 if (assignment == null 4557 || assignment.getExpressionType() != EExpressionType.assignment_t) { 4558 throw new SemanticIRBuildException(Diagnostic.error( 4559 DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED, 4560 "MERGE WHEN MATCHED UPDATE SET assignment #" + (i + 1) 4561 + " is not a simple column-value assignment_t", 4562 merge)); 4563 } 4564 TExpression lhs = assignment.getLeftOperand(); 4565 TExpression rhs = assignment.getRightOperand(); 4566 if (lhs == null || rhs == null) { 4567 throw new SemanticIRBuildException(Diagnostic.error( 4568 DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED, 4569 "MERGE WHEN MATCHED UPDATE SET assignment #" + (i + 1) 4570 + " is missing an operand", 4571 merge)); 4572 } 4573 if (lhs.getExpressionType() == EExpressionType.list_t) { 4574 throw new SemanticIRBuildException(Diagnostic.error( 4575 DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED, 4576 "MERGE WHEN MATCHED UPDATE SET tuple assignment " 4577 + "'(a, b) = ...' is not supported", 4578 merge)); 4579 } 4580 if (lhs.getExpressionType() != EExpressionType.simple_object_name_t) { 4581 throw new SemanticIRBuildException(Diagnostic.error( 4582 DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED, 4583 "MERGE WHEN MATCHED UPDATE SET assignment #" + (i + 1) 4584 + " LHS is expressionType=" + lhs.getExpressionType() 4585 + "; slice 94 admits simple column references only", 4586 merge)); 4587 } 4588 TObjectName targetCol = lhs.getObjectOperand(); 4589 if (targetCol == null) { 4590 throw new SemanticIRBuildException(Diagnostic.error( 4591 DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED, 4592 "MERGE WHEN MATCHED UPDATE SET assignment #" + (i + 1) 4593 + " LHS has no TObjectName operand", 4594 merge)); 4595 } 4596 // Codex round-1 diff Q3 NO fix: SET LHS qualifier must be 4597 // either the target alias or the target qualified name. 4598 // A foreign qualifier (e.g. 
`s.name` on a SET LHS pointing 4599 // at source `s`) silently treated as a target column would 4600 // produce a wrong target column spelling. 4601 String rawSpelling = targetCol.toString(); 4602 String colSpelling = validateAndStripSetLhsQualifier( 4603 rawSpelling, targetTable, targetQName, merge); 4604 // Subquery / window on RHS — reuse existing codes per 4605 // plan v3 §B (codex round-1 Q2 NO fix). 4606 if (containsAnySubqueryExpression(rhs)) { 4607 throw new SemanticIRBuildException(Diagnostic.error( 4608 DiagnosticCode.UPDATE_SET_HAS_SUBQUERY_NOT_SUPPORTED, 4609 "MERGE WHEN MATCHED UPDATE SET assignment #" + (i + 1) 4610 + " right-hand side contains a subquery; " 4611 + "slice 94 admits scalar-only RHS expressions", 4612 merge)); 4613 } 4614 rejectWindowFunctionInScope(rhs, "MERGE WHEN MATCHED UPDATE SET RHS"); 4615 4616 String lowerKey = colSpelling.toLowerCase(Locale.ROOT); 4617 if (!seenTargetCols.containsKey(lowerKey)) { 4618 seenTargetCols.put(lowerKey, colSpelling); 4619 } 4620 // Per-WHEN action lineage: TABLE_COLUMN(target,col) ← <RHS ref> 4621 List<ColumnRef> rhsRefs = collectColumnRefs(rhs, provider); 4622 for (ColumnRef src : rhsRefs) { 4623 emitMergeLineageEdge(targetQName, colSpelling, src, 4624 lineage, emittedEdgeKeys, aliasToTableQName, 4625 aliasToSubIdx, targetAliases); 4626 } 4627 } 4628 } 4629 4630 /** 4631 * Slice 94 — process one WHEN NOT MATCHED THEN INSERT (cols) VALUES (exprs) 4632 * action. Emits one {@link LineageEdge} per (insert col, source ref) 4633 * pair, plus arity validation between the explicit column list and 4634 * VALUES list. If the column list is omitted, we cannot derive 4635 * target column names — the slice rejects defensively. 
4636 */ 4637 private static void buildMergeInsertAction(TMergeInsertClause ins, 4638 String targetQName, 4639 TTable targetTable, 4640 NameBindingProvider provider, 4641 Map<String, String> seenTargetCols, 4642 List<LineageEdge> lineage, 4643 Set<String> emittedEdgeKeys, 4644 Map<String, String> aliasToTableQName, 4645 Map<String, Integer> aliasToSubIdx, 4646 Set<String> targetAliases, 4647 TMergeSqlStatement merge) { 4648 TResultColumnList values = ins.getValuelist(); 4649 gudusoft.gsqlparser.nodes.TObjectNameList colList = ins.getColumnList(); 4650 if (values == null || values.size() == 0) { 4651 throw new SemanticIRBuildException(Diagnostic.error( 4652 DiagnosticCode.MERGE_INSERT_DEFAULT_VALUES_NOT_SUPPORTED, 4653 "MERGE WHEN NOT MATCHED INSERT has no VALUES list " 4654 + "(DEFAULT VALUES / row-type forms not supported)", 4655 merge)); 4656 } 4657 // If an explicit column list is present, validate arity. 4658 if (colList != null && colList.size() > 0) { 4659 if (colList.size() != values.size()) { 4660 throw new SemanticIRBuildException(Diagnostic.error( 4661 DiagnosticCode.INSERT_COLUMN_COUNT_MISMATCH, 4662 "MERGE WHEN NOT MATCHED INSERT column list has " 4663 + colList.size() + " column(s) but VALUES " 4664 + "list has " + values.size(), 4665 merge)); 4666 } 4667 } 4668 for (int i = 0; i < values.size(); i++) { 4669 TResultColumn rc = values.getResultColumn(i); 4670 TExpression rhs = (rc == null) ? 
null : rc.getExpr(); 4671 if (rhs == null) { 4672 throw new SemanticIRBuildException(Diagnostic.error( 4673 DiagnosticCode.MERGE_INSERT_DEFAULT_VALUES_NOT_SUPPORTED, 4674 "MERGE WHEN NOT MATCHED INSERT VALUES item #" 4675 + (i + 1) + " has no expression", 4676 merge)); 4677 } 4678 if (containsAnySubqueryExpression(rhs)) { 4679 throw new SemanticIRBuildException(Diagnostic.error( 4680 DiagnosticCode.MERGE_INSERT_VALUES_HAS_SUBQUERY_NOT_SUPPORTED, 4681 "MERGE WHEN NOT MATCHED INSERT VALUES item #" 4682 + (i + 1) + " contains a subquery; slice 94 " 4683 + "admits scalar-only VALUES expressions", 4684 merge)); 4685 } 4686 rejectWindowFunctionInScope(rhs, "MERGE INSERT VALUES"); 4687 4688 // Target column name. If the explicit column list is 4689 // omitted, slice 94 does not synthesize positional target 4690 // column names (the catalog is required to map by 4691 // position; the current builder does not consume catalog 4692 // ordering). We still emit lineage edges from a synth 4693 // "__merge_insert_pos_<i>__" target column so source refs 4694 // are observable; users with an explicit column list get 4695 // the verbatim spelling. 4696 String colSpelling; 4697 if (colList != null && colList.size() > 0) { 4698 TObjectName col = colList.getObjectName(i); 4699 // INSERT column list is conventionally bare (no 4700 // qualifier), but Oracle / SQL Server allow 4701 // target-qualified spellings; strip them and reject 4702 // foreign qualifiers (codex round-1 diff Q3 NO fix). 4703 colSpelling = (col == null) ? 
("__merge_insert_pos_" + i + "__") 4704 : validateAndStripSetLhsQualifier(col.toString(), 4705 targetTable, targetQName, merge); 4706 } else { 4707 colSpelling = "__merge_insert_pos_" + i + "__"; 4708 } 4709 String lowerKey = colSpelling.toLowerCase(Locale.ROOT); 4710 if (!seenTargetCols.containsKey(lowerKey)) { 4711 seenTargetCols.put(lowerKey, colSpelling); 4712 } 4713 List<ColumnRef> rhsRefs = collectColumnRefs(rhs, provider); 4714 for (ColumnRef src : rhsRefs) { 4715 emitMergeLineageEdge(targetQName, colSpelling, src, 4716 lineage, emittedEdgeKeys, aliasToTableQName, 4717 aliasToSubIdx, targetAliases); 4718 } 4719 } 4720 } 4721 4722 /** 4723 * Slice 94 — emit a single MERGE per-WHEN action lineage edge: 4724 * {@code TABLE_COLUMN(targetQName, colSpelling) ← <src ref>}. 4725 * Deduplicates on a lower-case key so the same (target column, 4726 * source ref) pair appearing in multiple WHEN clauses produces 4727 * one edge (codex round-2 Q4 confirmed YES on this dedup 4728 * strategy). 4729 */ 4730 private static void emitMergeLineageEdge(String targetQName, 4731 String colSpelling, 4732 ColumnRef src, 4733 List<LineageEdge> lineage, 4734 Set<String> emittedEdgeKeys, 4735 Map<String, String> aliasToTableQName, 4736 Map<String, Integer> aliasToSubIdx, 4737 Set<String> targetAliases) { 4738 if (src == null || colSpelling == null || colSpelling.isEmpty()) { 4739 return; 4740 } 4741 String srcAlias = src.getRelationAlias(); 4742 String srcCol = src.getColumnName(); 4743 if (srcCol == null || srcCol.isEmpty()) { 4744 return; 4745 } 4746 if (srcAlias == null) srcAlias = ""; 4747 String aliasKey = srcAlias.toLowerCase(Locale.ROOT); 4748 // Codex round-1 diff Q1 BLOCKING fix: skip only if the alias 4749 // is a target alias, NOT if the alias resolves to a same-named 4750 // table. Self-merge (USING with same name as target) must 4751 // distinguish target from USING by alias identity, not by 4752 // resolved table name. 
4753 if (targetAliases.contains(aliasKey)) { 4754 return; 4755 } 4756 // Map alias → LineageRef (TABLE_COLUMN or STATEMENT_OUTPUT). 4757 LineageRef toRef; 4758 if (aliasToSubIdx.containsKey(aliasKey)) { 4759 toRef = LineageRef.statementOutput( 4760 aliasToSubIdx.get(aliasKey), srcCol); 4761 } else if (aliasToTableQName.containsKey(aliasKey)) { 4762 toRef = LineageRef.tableColumn( 4763 aliasToTableQName.get(aliasKey), srcCol); 4764 } else { 4765 // Unknown alias — skip to avoid emitting a bogus edge. 4766 // The ref still surfaces on joinColumnRefs[] (ON / WHEN-AND). 4767 return; 4768 } 4769 // Codex round-1 diff Q2 NO fix: dedup on resolved LineageRef 4770 // identity (not raw alias), so `s.name` (alias) and 4771 // `managers.name` (qualified name) coming from the SAME 4772 // resolved source produce ONE edge. The key embeds the toRef's 4773 // canonical form: STATEMENT_OUTPUT(idx,col) or 4774 // TABLE_COLUMN(qname,col) — both are lower-cased. 4775 String resolvedKey; 4776 if (toRef.getKind() == LineageRef.Kind.STATEMENT_OUTPUT) { 4777 resolvedKey = "STMT_OUT::" + toRef.getStatementIndex() + "::" 4778 + (toRef.getOutputName() == null ? "" : toRef.getOutputName()); 4779 } else { 4780 resolvedKey = "TBL_COL::" 4781 + (toRef.getQualifiedName() == null ? "" : toRef.getQualifiedName()) 4782 + "::" 4783 + (toRef.getColumnName() == null ? "" : toRef.getColumnName()); 4784 } 4785 String key = (targetQName + "::" + colSpelling + "::" 4786 + resolvedKey).toLowerCase(Locale.ROOT); 4787 if (emittedEdgeKeys.add(key)) { 4788 lineage.add(new LineageEdge( 4789 LineageRef.tableColumn(targetQName, colSpelling), 4790 toRef)); 4791 } 4792 } 4793 4794 /** 4795 * Slice 94 — validate the SET LHS / INSERT column-list spelling 4796 * and strip a leading target qualifier. Codex round-1 diff Q3 NO 4797 * fix: previously the helper returned a foreign-qualified spelling 4798 * unchanged, which silently produced a wrong target column (e.g. 
4799 * {@code "s.name"} would land in the target columns list verbatim 4800 * instead of being rejected). 4801 * 4802 * <p>Admit rules: 4803 * <ul> 4804 * <li>Unqualified bare name: return unchanged.</li> 4805 * <li>Qualified by target alias or qualified name: strip.</li> 4806 * <li>Qualified by anything else: reject as 4807 * {@link DiagnosticCode#UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED} 4808 * (the same code used for slice-80 UPDATE-LHS shape rejects, 4809 * message text discriminates by mentioning the foreign 4810 * qualifier).</li> 4811 * </ul> 4812 */ 4813 private static String validateAndStripSetLhsQualifier(String spelling, 4814 TTable targetTable, 4815 String targetQName, 4816 TMergeSqlStatement merge) { 4817 if (spelling == null) return spelling; 4818 int dot = spelling.indexOf('.'); 4819 if (dot <= 0) return spelling; 4820 String qualifier = spelling.substring(0, dot); 4821 String bare = spelling.substring(dot + 1); 4822 String targetAlias = effectiveAliasOf(targetTable); 4823 if (targetAlias != null 4824 && qualifier.equalsIgnoreCase(targetAlias)) { 4825 return bare; 4826 } 4827 if (qualifier.equalsIgnoreCase(targetQName)) { 4828 return bare; 4829 } 4830 throw new SemanticIRBuildException(Diagnostic.error( 4831 DiagnosticCode.UPDATE_TUPLE_ASSIGNMENT_NOT_SUPPORTED, 4832 "MERGE SET LHS / INSERT column qualifier '" + qualifier 4833 + "' does not match the target table; slice 94 " 4834 + "admits target-qualified or unqualified target " 4835 + "column references only", 4836 merge)); 4837 } 4838 4839 /** 4840 * Slice 84 — process one FROM-side source table for joined 4841 * {@link #buildDelete}. Mirrors slice-82 {@link #buildUpdateRelation}. 4842 * Applies the slice-84 reject contract for nested-join wrappers, 4843 * then appends a TABLE-kind {@link RelationSource} unless the 4844 * table is the target (reference-identity filter — clean IR 4845 * semantics: relations[] models read-side sources only). 
For 4846 * subquery sources (already extracted in step 4.7), publishes a 4847 * SUBQUERY-kind {@link RelationSource} so the inScope-enhanced 4848 * provider can route {@code sub.col} references. 4849 * 4850 * <p>Null-driver guard: probed PG 4851 * {@code DELETE FROM e USING (t1 JOIN t2 ON …)} returns 4852 * {@code refJoin[0].getTable() == null}. Silent skip mirrors 4853 * slice-82's null guard for the analogous UPDATE case; 4854 * documented as a known limitation (parenthesized JOIN-in-USING 4855 * is opaque to {@code relations[]} though WHERE refs still bind 4856 * via Resolver2). 4857 */ 4858 private static void buildDeleteRelation(TTable t, TTable targetTable, 4859 List<RelationSource> relations, 4860 TDeleteSqlStatement delete, 4861 Map<String, Integer> cteNameToStatementIndex) { 4862 if (t == null) { 4863 return; // defensive — parenthesized JOIN-in-USING surfaces null 4864 } 4865 if (t.getTableType() == gudusoft.gsqlparser.ETableSource.subquery) { 4866 // Slice 84 — admit FROM-side subqueries. The inner SELECT 4867 // has already been extracted as its own StatementGraph by 4868 // extractDeleteFromSubqueries. Publish a SUBQUERY-kind 4869 // RelationSource so the inScope-enhanced provider routes 4870 // `sub.col` references correctly. Mirrors slice-83 4871 // buildUpdateRelation's subquery branch. 4872 String subAlias = effectiveAliasOf(t); 4873 if (subAlias != null && !subAlias.isEmpty()) { 4874 relations.add(new RelationSource(subAlias, 4875 new RelationBinding(RelationKind.SUBQUERY, subAlias))); 4876 } 4877 return; 4878 } 4879 if (t.getTableType() == gudusoft.gsqlparser.ETableSource.join) { 4880 // Defensive: TTable wrapping a TJoin. Not reached by any 4881 // observed parser path on supported dialects (slice-82 4882 // precedent — parenthesized JOIN-in-USING surfaces a null 4883 // driver, not a join-typed TTable). Distinct DiagnosticCode 4884 // per slice-80's message-text-discrimination contract. 
4885 throw new SemanticIRBuildException(Diagnostic.error( 4886 DiagnosticCode.DELETE_FROM_NESTED_JOIN_NOT_SUPPORTED, 4887 "DELETE FROM source is a nested join wrapper; " 4888 + "slice 84 admits simple table / subquery " 4889 + "FROM sources only", 4890 delete)); 4891 } 4892 // Reference-identity filter: target's own TTable instance is 4893 // excluded from relations[]. Different TTable instances with 4894 // the same qualified name (e.g. MSSQL `DELETE FROM t FROM t 4895 // spqh JOIN sp` where target identity A and FROM-driver 4896 // identity B share name "t") both stay — the catalog-miss 4897 // WARN walker's pass-1-target-then-pass-2-relations ordering 4898 // (slice 83) deduplicates by qualified name. 4899 if (t == targetTable) { 4900 return; 4901 } 4902 TObjectName tName = t.getTableName(); 4903 if (tName == null) { 4904 return; // defensive 4905 } 4906 // Slice 106 — FROM-side CTE detection. When the FROM-side table 4907 // is an objectname-typed reference whose bare name matches a 4908 // declared CTE in this DELETE's outer WITH clause, emit a 4909 // SUBQUERY-kind RelationSource pointing at the CTE statement 4910 // (mirrors slice-105 buildUpdateRelation). The slice-77 catalog- 4911 // miss WARN walker filters to RelationKind.TABLE so CTE-bound 4912 // relations are naturally skipped, even when the catalog also 4913 // declares the same name (slice-105 §G / §X precedent). 4914 // 4915 // Explicit objectname guard (codex round-1 NICE Q3): subquery / 4916 // join table types are handled by the early returns above; this 4917 // guard documents the contract and makes the branch resilient 4918 // if a future TTable type is added. 
4919 if (cteNameToStatementIndex != null 4920 && !cteNameToStatementIndex.isEmpty() 4921 && t.getTableType() 4922 == gudusoft.gsqlparser.ETableSource.objectname) { 4923 String bareName = tName.toString(); 4924 if (bareName != null && !bareName.isEmpty()) { 4925 String bareNameLower = bareName.toLowerCase(Locale.ROOT); 4926 if (cteNameToStatementIndex.containsKey(bareNameLower)) { 4927 String cteAlias = effectiveAliasOf(t); 4928 if (cteAlias == null || cteAlias.isEmpty()) { 4929 cteAlias = bareName; 4930 } 4931 relations.add(new RelationSource(cteAlias, 4932 new RelationBinding(RelationKind.SUBQUERY, cteAlias))); 4933 return; 4934 } 4935 } 4936 } 4937 relations.add(new RelationSource(effectiveAliasOf(t), 4938 new RelationBinding(RelationKind.TABLE, tName.toString()))); 4939 } 4940 4941 /** 4942 * Slice 84 — process one {@link TJoinItem} for joined 4943 * {@link #buildDelete}. Mirrors slice-82 {@link #buildUpdateJoinItem}. 4944 * Applies the slice-84 reject contract for USING / NATURAL / 4945 * subquery-in-ON, processes the right-side table via 4946 * {@link #buildDeleteRelation}, and collects ON-clause column 4947 * refs into {@code joinRefs} via the shared 4948 * {@link #collectColumnRefs} helper. 
4949 */ 4950 private static void buildDeleteJoinItem(TJoinItem item, TTable targetTable, 4951 NameBindingProvider provider, 4952 List<RelationSource> relations, 4953 java.util.LinkedHashSet<ColumnRef> joinRefs, 4954 TDeleteSqlStatement delete, 4955 Map<String, Integer> cteNameToStatementIndex) { 4956 if (item == null) return; 4957 if (item.getUsingColumns() != null && item.getUsingColumns().size() > 0) { 4958 throw new SemanticIRBuildException(Diagnostic.error( 4959 DiagnosticCode.DELETE_FROM_JOIN_USING_NOT_SUPPORTED, 4960 "DELETE FROM join uses USING(...); slice 84 admits " 4961 + "JOIN ON / CROSS JOIN / comma-FROM only", 4962 item)); 4963 } 4964 if (isNaturalJoinType(item.getJoinType())) { 4965 throw new SemanticIRBuildException(Diagnostic.error( 4966 DiagnosticCode.DELETE_FROM_JOIN_NATURAL_NOT_SUPPORTED, 4967 "DELETE FROM uses NATURAL JOIN; slice 84 admits " 4968 + "JOIN ON / CROSS JOIN / comma-FROM only", 4969 item)); 4970 } 4971 // Right-side table: apply the same source-shape rejects + 4972 // identity filter as the driver table. Slice 106 — threads 4973 // cteNameToStatementIndex so right-side CTE refs (MSSQL 4974 // `FROM target t JOIN cte ON …`) get SUBQUERY-kind emission. 4975 buildDeleteRelation(item.getTable(), targetTable, relations, delete, 4976 cteNameToStatementIndex); 4977 // ON-clause refs: subquery rejects with slice-84 code; 4978 // window function reuses CLAUSE_WINDOW_FUNCTION_LEAK via the 4979 // shared helper. CROSS JOIN has no ON; skip the walk entirely. 
4980 TExpression onCond = item.getOnCondition(); 4981 if (onCond == null) return; 4982 if (containsAnySubqueryExpression(onCond)) { 4983 throw new SemanticIRBuildException(Diagnostic.error( 4984 DiagnosticCode.DELETE_JOIN_ON_HAS_SUBQUERY_NOT_SUPPORTED, 4985 "DELETE FROM JOIN ON condition contains a subquery; " 4986 + "slice 84 admits scalar predicates only", 4987 item)); 4988 } 4989 rejectWindowFunctionInScope(onCond, "DELETE FROM JOIN ON"); 4990 joinRefs.addAll(collectColumnRefs(onCond, provider)); 4991 } 4992 4993 /** 4994 * Slice 84 — extract every FROM-side subquery in 4995 * {@code delete.getReferenceJoins()} as its own 4996 * {@link StatementGraph} appended to {@code stmts} before the 4997 * DELETE itself. Walks both the driver TTable of each TJoin AND 4998 * each JoinItem's right table. Returns an alias → stmts-index 4999 * map so the consuming DELETE can (a) build its in-scope column 5000 * map via {@link #buildDeleteInScopeMap}, and (b) bind 5001 * {@code sub.col} references in WHERE / ON via the 5002 * inScope-enhanced provider. 5003 * 5004 * <p>Mirrors slice-83 {@link #extractUpdateFromSubqueries} but 5005 * walks {@code delete.getReferenceJoins()} instead of 5006 * {@code update.getJoins()}. Reuses the SELECT-side 5007 * {@link #processDirectSubqueryTable} verbatim, forwarding the 5008 * outer-WITH {@code cteNameToStatementIndex} + 5009 * {@code ctePublishedColumns} (slice 106) so a nested SELECT in 5010 * an extracted FROM-subquery body can resolve outer-CTE refs. 5011 * Pre-slice-106 the maps were always empty because slice 81 5012 * rejected top-level WITH on DELETE 5013 * ({@link DiagnosticCode#DELETE_CTE_NOT_SUPPORTED}). 5014 * 5015 * <p>No mutation-guard wrapper: buildDelete owns fresh local 5016 * lists and exceptions propagate cleanly to the caller. 
5017 */ 5018 private static Map<String, Integer> extractDeleteFromSubqueries( 5019 TDeleteSqlStatement delete, 5020 NameBindingProvider provider, 5021 List<StatementGraph> stmts, 5022 List<LineageEdge> lineage, 5023 Map<String, Integer> cteNameToStatementIndex, 5024 Map<String, List<String>> ctePublishedColumns) { 5025 Map<String, Integer> aliasToIndex = new HashMap<>(); 5026 TJoinList refJoins = delete.getReferenceJoins(); 5027 if (refJoins == null) return aliasToIndex; 5028 // Slice 106 — forward the outer-WITH CTE maps so a nested SELECT 5029 // inside an extracted FROM-subquery body can resolve outer-WITH 5030 // CTE references. Resolver2 wires CTEScope; the maps are 5031 // forwarded for parity with the SELECT / MERGE / UPDATE call 5032 // sites and so the §N test for 5033 // `USING (SELECT … FROM cte) sub` produces the expected 5034 // cross-stmt lineage edge to the CTE body. 5035 Map<String, Integer> cteMap = cteNameToStatementIndex == null 5036 ? Collections.<String, Integer>emptyMap() 5037 : cteNameToStatementIndex; 5038 Map<String, List<String>> ctePublished = ctePublishedColumns == null 5039 ? Collections.<String, List<String>>emptyMap() 5040 : ctePublishedColumns; 5041 for (int ji = 0; ji < refJoins.size(); ji++) { 5042 TJoin join = refJoins.getJoin(ji); 5043 // Driver table — may be a subquery (PG / SF / BQ / RS 5044 // `DELETE FROM t USING (SELECT …) sub` shape). 5045 processDirectSubqueryTable(join.getTable(), provider, 5046 stmts, lineage, cteMap, ctePublished, aliasToIndex); 5047 TJoinItemList items = join.getJoinItems(); 5048 if (items == null) continue; 5049 for (int i = 0; i < items.size(); i++) { 5050 TJoinItem item = items.getJoinItem(i); 5051 if (item == null) continue; 5052 // Right-side table of a JoinItem — may be a subquery 5053 // (MSSQL / PG `DELETE FROM t FROM x JOIN (SELECT …) 5054 // sub ON …` shape). 
5055 processDirectSubqueryTable(item.getTable(), provider, 5056 stmts, lineage, cteMap, ctePublished, aliasToIndex); 5057 } 5058 } 5059 return aliasToIndex; 5060 } 5061 5062 /** 5063 * Slice 84 — build an effective-alias-keyed in-scope map publishing 5064 * each extracted DELETE FROM-subquery's output column names. 5065 * Mirrors slice-83 {@link #buildUpdateInScopeMap} but walks 5066 * {@code delete.getReferenceJoins()}. 5067 * 5068 * <p>Base-table FROM-side relations do not need an entry: their 5069 * column resolution stays on the Resolver2 catalog path 5070 * (probed correct for PG / MSSQL DELETE — see slice-84 plan 5071 * §Codex Q4 + Q11). 5072 */ 5073 private static Map<String, List<String>> buildDeleteInScopeMap( 5074 TDeleteSqlStatement delete, 5075 Map<String, Integer> subqueryAliasToIndex, 5076 List<StatementGraph> stmts, 5077 Map<String, Integer> cteNameToStatementIndex, 5078 Map<String, List<String>> ctePublishedColumns) { 5079 Map<String, List<String>> result = new HashMap<>(); 5080 boolean haveSubq = subqueryAliasToIndex != null 5081 && !subqueryAliasToIndex.isEmpty(); 5082 boolean haveCte = cteNameToStatementIndex != null 5083 && !cteNameToStatementIndex.isEmpty(); 5084 if (!haveSubq && !haveCte) { 5085 return result; 5086 } 5087 TJoinList refJoins = delete.getReferenceJoins(); 5088 if (refJoins == null) return result; 5089 for (int ji = 0; ji < refJoins.size(); ji++) { 5090 TJoin join = refJoins.getJoin(ji); 5091 addDeleteRelationToInScopeMap(join.getTable(), 5092 subqueryAliasToIndex, stmts, result, 5093 cteNameToStatementIndex, ctePublishedColumns); 5094 TJoinItemList items = join.getJoinItems(); 5095 if (items == null) continue; 5096 for (int i = 0; i < items.size(); i++) { 5097 TJoinItem item = items.getJoinItem(i); 5098 if (item == null) continue; 5099 addDeleteRelationToInScopeMap(item.getTable(), 5100 subqueryAliasToIndex, stmts, result, 5101 cteNameToStatementIndex, ctePublishedColumns); 5102 } 5103 } 5104 return result; 5105 } 5106 5107 
private static void addDeleteRelationToInScopeMap(TTable t, 5108 Map<String, Integer> subqueryAliasToIndex, 5109 List<StatementGraph> stmts, 5110 Map<String, List<String>> result, 5111 Map<String, Integer> cteNameToStatementIndex, 5112 Map<String, List<String>> ctePublishedColumns) { 5113 if (t == null) return; 5114 // Slice 106 — CTE-as-FROM-relation in-scope publication. When 5115 // the FROM-side table is an objectname-typed reference whose 5116 // bare name matches a declared outer CTE, publish the CTE's 5117 // own column names against the FROM-side effective alias so 5118 // WHERE / ON / RETURNING refs against the CTE alias bind 5119 // correctly. Mirrors slice-105 addUpdateRelationToInScopeMap. 5120 if (cteNameToStatementIndex != null 5121 && !cteNameToStatementIndex.isEmpty() 5122 && ctePublishedColumns != null 5123 && t.getTableType() 5124 == gudusoft.gsqlparser.ETableSource.objectname) { 5125 TObjectName tName = t.getTableName(); 5126 if (tName != null) { 5127 String bare = tName.toString(); 5128 if (bare != null && !bare.isEmpty()) { 5129 String bareLower = bare.toLowerCase(Locale.ROOT); 5130 if (cteNameToStatementIndex.containsKey(bareLower)) { 5131 String aliasKey = effectiveAliasLowerCaseOrNull(t); 5132 if (aliasKey == null) aliasKey = bareLower; 5133 List<String> cols = ctePublishedColumns.get(bareLower); 5134 if (cols != null) { 5135 result.put(aliasKey, cols); 5136 } 5137 return; 5138 } 5139 } 5140 } 5141 } 5142 if (t.getTableType() != gudusoft.gsqlparser.ETableSource.subquery) { 5143 return; 5144 } 5145 if (subqueryAliasToIndex == null) { 5146 return; 5147 } 5148 String key = effectiveAliasLowerCaseOrNull(t); 5149 if (key == null) return; 5150 Integer idx = subqueryAliasToIndex.get(key); 5151 if (idx == null) return; 5152 result.put(key, outputColumnNames(stmts.get(idx))); 5153 } 5154 5155 /** 5156 * Slice 92 — returns {@code true} when the MySQL DELETE statement is a 5157 * self-reference single-target form ({@code DELETE T1 FROM T1 [WHERE …]}) 
5158 * that is semantically equivalent to {@code DELETE FROM T1 [WHERE …]}. 5159 * 5160 * <p>The check requires ALL of the following (codex plan-review Q1+Q5 5161 * BLOCKING fix — checking only {@code joins[0]} is insufficient because 5162 * {@code DELETE T1 FROM T2} also has {@code joins.size==1} and 5163 * {@code joins[0].table=="T1"==targetQName}, yet the FROM clause points 5164 * to a different table): 5165 * <ol> 5166 * <li>{@code joins.size == 1} — exactly one MySQL target list entry.</li> 5167 * <li>{@code getReferenceJoins().size() == 1} — exactly one FROM clause 5168 * table.</li> 5169 * <li>{@code joins[0]} has no JoinItems — must be a plain table, not a 5170 * JOIN chain.</li> 5171 * <li>{@code refJoins[0]} has no JoinItems — same constraint.</li> 5172 * <li>{@code joins[0].table.name.toLowerCase() == targetQName.toLowerCase()}. 5173 * </li> 5174 * <li>{@code refJoins[0].table.name.toLowerCase() == targetQName.toLowerCase()}. 5175 * </li> 5176 * </ol> 5177 */ 5178 private static boolean isMysqlSelfReferenceDelete( 5179 TDeleteSqlStatement delete, String targetQName) { 5180 if (delete.joins == null || delete.joins.size() != 1) return false; 5181 TJoinList ref = delete.getReferenceJoins(); 5182 if (ref == null || ref.size() != 1) return false; 5183 TJoin join0 = delete.joins.getJoin(0); 5184 if (join0.getJoinItems() != null && join0.getJoinItems().size() > 0) { 5185 return false; 5186 } 5187 TJoin ref0 = ref.getJoin(0); 5188 if (ref0.getJoinItems() != null && ref0.getJoinItems().size() > 0) { 5189 return false; 5190 } 5191 TTable joinTable = join0.getTable(); 5192 if (joinTable == null || joinTable.getTableName() == null) return false; 5193 TTable refTable = ref0.getTable(); 5194 if (refTable == null || refTable.getTableName() == null) return false; 5195 String lowerTarget = targetQName.toLowerCase(java.util.Locale.ROOT); 5196 String joinName = joinTable.getTableName().toString() 5197 .toLowerCase(java.util.Locale.ROOT); 5198 String refName = 
refTable.getTableName().toString() 5199 .toLowerCase(java.util.Locale.ROOT); 5200 5201 // Codex diff-review P1 fix: MySQL allows the alias in the DELETE 5202 // target list instead of the table name: 5203 // DELETE t FROM T1 AS t WHERE t.id = 1 5204 // In this form joins[0].table.name = "t" (the alias used in the 5205 // delete-list) while targetQName = "T1" (from getTargetTable() 5206 // which the parser resolves to the real table). Accept the target 5207 // table's alias as a valid joins[0] match alongside the table name. 5208 String targetAlias = null; 5209 if (delete.getTargetTable() != null 5210 && delete.getTargetTable().getAliasClause() != null 5211 && delete.getTargetTable().getAliasClause().getAliasName() != null) { 5212 String a = delete.getTargetTable().getAliasClause() 5213 .getAliasName().toString(); 5214 if (a != null && !a.isEmpty()) { 5215 targetAlias = a.toLowerCase(java.util.Locale.ROOT); 5216 } 5217 } 5218 boolean joinMatchesTarget = joinName.equals(lowerTarget) 5219 || (targetAlias != null && joinName.equals(targetAlias)); 5220 // refJoins[0].table must always be the real table name (= targetQName). 5221 return joinMatchesTarget && refName.equals(lowerTarget); 5222 } 5223 5224 /** 5225 * Per-result-column metadata about an extracted scalar-subquery 5226 * projection (slice 11). {@link #statementIndex} points to the 5227 * inner body statement; {@link #innerOutputName} is the inner 5228 * SELECT's single projected output name (used to wire the 5229 * STATEMENT_OUTPUT → STATEMENT_OUTPUT lineage edge). 5230 */ 5231 private static final class ScalarInfo { 5232 final int statementIndex; 5233 final String innerOutputName; 5234 ScalarInfo(int statementIndex, String innerOutputName) { 5235 this.statementIndex = statementIndex; 5236 this.innerOutputName = innerOutputName; 5237 } 5238 } 5239 5240 /** 5241 * Walk the consuming SELECT's FROM list. 
For every {@link TTable} of 5242 * type {@link gudusoft.gsqlparser.ETableSource#subquery}, recursively 5243 * build the inner statement, append it to {@code stmts}, emit its own 5244 * lineage edges, and record alias→statementIndex. The returned map is 5245 * scoped to this single consuming statement so duplicate aliases 5246 * across different scopes do not collide. 5247 * 5248 * <p>Slice 17: extraction now walks BOTH sides of every JOIN 5249 * ({@code TJoin.getTable()} for the left, {@code joinItems[i].getTable()} 5250 * for each right) and recurses into nested FROM-subquery bodies. Each 5251 * recursive level pre-extracts its own children before calling 5252 * {@code buildSelectStatement}, preserving the 5253 * {@code BodyIndexes}-required ordering (innermost body before 5254 * its consumer). FROM-subquery bodies still recurse with 5255 * {@code allowScalarProjectionSubqueries=false} (slice-15 invariant 5256 * pinned by {@code Slice15Test.scalarProjectionInsideFromSubqueryBodyStillRejected}). 5257 * 5258 * <p>Slice 18 lifts CTE bodies (the non-set-op CTE-body branch in 5259 * {@link #build} now invokes this extractor with 5260 * {@code allowFromSubqueries=true}). Still rejected: subqueries with 5261 * no alias, FROM-subqueries inside a scalar body / set-op branch / 5262 * set-op CTE body (each enforced by the caller's 5263 * {@code allowFromSubqueries=false}), and predicate subqueries inside 5264 * the FROM-subquery body's WHERE / JOIN ON / GROUP BY (slice-17 5265 * helper {@link #rejectSubqueriesInFromSubqueryBodyClauses}). 
5266 */ 5267 private static Map<String, Integer> extractFromSubqueriesAsStatements( 5268 TSelectSqlStatement consumer, 5269 NameBindingProvider consumerProvider, 5270 List<StatementGraph> stmts, 5271 List<LineageEdge> lineage, 5272 Map<String, Integer> cteNameToStatementIndex, 5273 Map<String, List<String>> ctePublishedColumns) { 5274 Map<String, Integer> aliasToIndex = new HashMap<>(); 5275 if (consumer.joins == null) return aliasToIndex; 5276 // Slice 17 mutation-free preflight: walk the entire direct 5277 // FROM/JOIN list once and reject before any mutation of 5278 // stmts/lineage. Catches comma-FROM, anonymous subqueries, 5279 // unsupported join shapes, and ALL same-level alias collisions 5280 // (base AND subquery, since rejectDuplicateAliases inside 5281 // buildRelations only catches them later, after this level's 5282 // subquery body has already landed in stmts). 5283 preflightDirectFromList(consumer); 5284 5285 // Slice 17: walk both sides of every join and process each 5286 // direct subquery via the same helper so left/right can't drift. 5287 for (TJoin join : consumer.joins) { 5288 processDirectSubqueryTable(join.getTable(), 5289 consumerProvider, stmts, lineage, 5290 cteNameToStatementIndex, ctePublishedColumns, aliasToIndex); 5291 TJoinItemList items = join.getJoinItems(); 5292 if (items == null) continue; 5293 for (int i = 0; i < items.size(); i++) { 5294 TJoinItem item = items.getJoinItem(i); 5295 if (item == null) continue; 5296 processDirectSubqueryTable(item.getTable(), 5297 consumerProvider, stmts, lineage, 5298 cteNameToStatementIndex, ctePublishedColumns, aliasToIndex); 5299 } 5300 } 5301 return aliasToIndex; 5302 } 5303 5304 /** 5305 * Slice 17 mutation-free preflight for the consumer's direct 5306 * FROM/JOIN list. Validates structural invariants BEFORE any 5307 * subquery body is appended to {@code stmts} so a deferred 5308 * failure (e.g. on the second of two siblings) doesn't strand 5309 * earlier-sibling output in the program. 
5310 * 5311 * <p>Slice 62 (codex plan-review round 1): the comma-FROM 5312 * reject was removed here. The preflight runs only for 5313 * {@code allowFromSubqueries=true} paths (outer SELECT, CTE 5314 * body, FROM-subquery body recursion) — exactly the paths 5315 * that admit comma-FROM under slice 62. Synthetic body 5316 * contexts (scalar / set-op-branch / set-op-CTE / predicate) 5317 * do not run this preflight; they reach the gated reject in 5318 * {@link #buildRelations} (and predicate bodies hit the 5319 * earlier slice-62 reject inside 5320 * {@link #preflightExistsInnerShape}). 5321 */ 5322 private static void preflightDirectFromList(TSelectSqlStatement consumer) { 5323 if (consumer.joins == null) return; 5324 Set<String> seenSubqueryAliases = new HashSet<>(); 5325 Set<String> seenAllAliases = new HashSet<>(); 5326 for (TJoin join : consumer.joins) { 5327 preflightOneTable(join.getTable(), seenSubqueryAliases, seenAllAliases); 5328 TJoinItemList items = join.getJoinItems(); 5329 if (items == null) continue; 5330 for (int i = 0; i < items.size(); i++) { 5331 TJoinItem item = items.getJoinItem(i); 5332 if (item == null) continue; 5333 rejectUnsupportedJoinShape(item); 5334 preflightOneTable(item.getTable(), seenSubqueryAliases, seenAllAliases); 5335 } 5336 } 5337 } 5338 5339 /** 5340 * Slice 17: validate one direct FROM/JOIN-list TTable in the 5341 * mutation-free preflight. Effective alias is the SQL-written alias 5342 * if present, else the slice-74 synthetic alias for unaliased 5343 * FROM-subqueries (position-keyed), else the table name (matches 5344 * {@link #buildRelation}). 5345 * 5346 * <p>Slice 74: removed the {@code FROM_SUBQUERY_ALIAS_REQUIRED} reject 5347 * for anonymous subqueries; the slot is now filled by 5348 * {@link FromSubqueryNaming#synthAliasFor}. 
Two unaliased subqueries 5349 * at the same source location are theoretically impossible (the 5350 * parser would have to emit the same start token for both), but if 5351 * it ever happens the {@code DUPLICATE_FROM_SUBQUERY_ALIAS} branch 5352 * below catches it the same way as a literal user-written duplicate. 5353 * 5354 * <p>Still rejects: duplicate subquery aliases (whether user-written 5355 * or synthetic by collision), and any cross-kind alias collision 5356 * (base alias colliding with a subquery alias). 5357 */ 5358 private static void preflightOneTable(TTable t, 5359 Set<String> seenSubqueryAliases, 5360 Set<String> seenAllAliases) { 5361 if (t == null) return; 5362 boolean isSub = t.getTableType() == gudusoft.gsqlparser.ETableSource.subquery; 5363 String effective = effectiveAliasOf(t); 5364 if (effective == null || effective.isEmpty()) return; 5365 String lower = effective.toLowerCase(Locale.ROOT); 5366 if (isSub && !seenSubqueryAliases.add(lower)) { 5367 throw new SemanticIRBuildException( 5368 Diagnostic.error(DiagnosticCode.DUPLICATE_FROM_SUBQUERY_ALIAS, 5369 "duplicate FROM-clause subquery alias '" + effective + "'", (TParseTreeNode) null)); 5370 } 5371 if (!seenAllAliases.add(lower)) { 5372 throw new SemanticIRBuildException( 5373 Diagnostic.error(DiagnosticCode.DUPLICATE_RELATION_ALIAS, 5374 "duplicate relation alias '" + effective 5375 + "' is not supported (would make ColumnRef ambiguous)", (TParseTreeNode) null)); 5376 } 5377 } 5378 5379 /** 5380 * Slice 17: extract one direct subquery TTable as its own 5381 * StatementGraph. Recurses into the inner SELECT first 5382 * (innermost body lands in {@code stmts} BEFORE its consumer, as 5383 * {@code BodyIndexes} requires). Skips non-subquery tables (base 5384 * relations are bound later by {@code buildRelations}). 
5385 */ 5386 private static void processDirectSubqueryTable( 5387 TTable t, 5388 NameBindingProvider consumerProvider, 5389 List<StatementGraph> stmts, 5390 List<LineageEdge> lineage, 5391 Map<String, Integer> cteNameToStatementIndex, 5392 Map<String, List<String>> ctePublishedColumns, 5393 Map<String, Integer> aliasToIndex) { 5394 if (t == null) return; 5395 if (t.getTableType() != gudusoft.gsqlparser.ETableSource.subquery) return; 5396 // Alias presence/uniqueness already validated by the preflight. 5397 // Slice 74: anonymous (unaliased) subqueries get a synth name 5398 // from FromSubqueryNaming via effectiveAliasOf so the alias used 5399 // for the aliasToIndex map and inner-stmt name is non-null. 5400 String alias = effectiveAliasOf(t); 5401 String aliasLower = alias.toLowerCase(Locale.ROOT); 5402 TSelectSqlStatement inner = t.getSubquery(); 5403 if (inner == null) { 5404 throw new SemanticIRBuildException( 5405 Diagnostic.error(DiagnosticCode.FROM_SUBQUERY_NO_INNER_SELECT, 5406 "FROM-clause subquery '" + alias + "' has no inner SELECT", (TParseTreeNode) null)); 5407 } 5408 // Slice 17 leak guard: predicate subqueries inside the 5409 // FROM-subquery body's WHERE / JOIN ON / GROUP BY would 5410 // otherwise slip past `allowScalarProjectionSubqueries=false` 5411 // (which only guards buildOutputColumns) and leak inner refs 5412 // into the body's filter/join/group ref lists. 5413 rejectSubqueriesInFromSubqueryBodyClauses(inner, alias); 5414 // Recurse into the inner's own FROM-subqueries first so each 5415 // deeper body lands in stmts BEFORE the body that consumes it. 5416 // The recursive call uses `consumerProvider` because the inner 5417 // sees the same CTE-name set as the outer (CTEs are visible 5418 // through FROM-subquery bodies — pinned by 5419 // Slice5Test.cteVisibleInsideFromSubquery). 5420 // Slice 60: thread ctePublishedColumns down unchanged. 
The 5421 // inner's siblings get registered into innerSubAliasToIndex 5422 // here; below we build the per-level innerInScope BEFORE 5423 // calling buildSelectStatement. 5424 Map<String, Integer> innerSubAliasToIndex = 5425 extractFromSubqueriesAsStatements(inner, consumerProvider, 5426 stmts, lineage, cteNameToStatementIndex, 5427 ctePublishedColumns); 5428 // Slice 60 (codex diff-review): build the inner FROM-subquery 5429 // body's effective-alias-keyed in-scope map by walking the 5430 // inner SELECT's FROM list. Sibling isolation is preserved 5431 // because `innerSubAliasToIndex` contains ONLY this body's 5432 // own children — ancestor siblings are never visited by the 5433 // walk because they're not in the inner's FROM list. 5434 Map<String, List<String>> innerInScope = buildEffectiveAliasInScopeMap( 5435 inner, consumerProvider, ctePublishedColumns, 5436 innerSubAliasToIndex, stmts); 5437 NameBindingProvider innerProviderWithStar = consumerProvider 5438 .withInScopeRelationColumns(innerInScope); 5439 // Slice 120 — switch from the 7-arg buildSelectStatement to the 5440 // 14-arg buildSelectStatementImpl so the FROM-subquery body's 5441 // WHERE clause can extract uncorrelated predicate subqueries 5442 // (IN-SELECT / EXISTS / NOT EXISTS / scalar comparison / 5443 // ANY-ALL-SOME) as their own statements (mirrors the slice-114 5444 // CTE-body lift). JOIN-ON predicate subqueries stay rejected (the 5445 // two flags are independent per the slice-113 split) — the 5446 // slice-17 leak guard rejectSubqueriesInFromSubqueryBodyClauses 5447 // above still fires for the body's JOIN-ON / GROUP-BY clauses. 5448 // allowFromSubqueries=true so its buildRelations accepts 5449 // already-extracted subquery aliases; allowScalarProjectionSubqueries 5450 // =false (slice-15 invariant). 
The snapshot/rollback wrapper 5451 // mirrors the slice-114 CTE-body call site: if the build appends 5452 // predicate bodies and then a later reject fires, stmts/lineage 5453 // truncate back to the pre-call boundary so a partial extraction 5454 // does not leak into the program. processDirectSubqueryTable is 5455 // shared by the SELECT / UPDATE (slice 83) / DELETE (slice 84) 5456 // FROM-subquery extractors, so this single site lifts all three. 5457 int fromBodyStmtsSnapshot = stmts.size(); 5458 int fromBodyLineageSnapshot = lineage.size(); 5459 StatementGraph innerStmt; 5460 try { 5461 innerStmt = buildSelectStatementImpl(inner, innerProviderWithStar, alias, 5462 /*hasOuterCteListAlreadyProcessed=*/ false, 5463 /*allowFromSubqueries=*/ true, 5464 /*allowScalarProjectionSubqueries=*/ false, 5465 /*allowWindowProjection=*/ true, 5466 /*allowJoinOnPredicateSubqueries=*/ false, 5467 /*stmtsForExtraction=*/ stmts, 5468 /*lineageForExtraction=*/ lineage, 5469 /*cteMapForExtraction=*/ cteNameToStatementIndex, 5470 /*isPredicateBody=*/ false, 5471 /*whereClauseContext=*/ PredicateClauseContext.FROM_SUBQUERY_BODY_WHERE, 5472 /*allowWherePredicateSubqueries=*/ true); 5473 } catch (RuntimeException ex) { 5474 while (stmts.size() > fromBodyStmtsSnapshot) stmts.remove(stmts.size() - 1); 5475 while (lineage.size() > fromBodyLineageSnapshot) lineage.remove(lineage.size() - 1); 5476 throw ex; 5477 } 5478 int idx = stmts.size(); 5479 stmts.add(innerStmt); 5480 aliasToIndex.put(aliasLower, idx); 5481 // Emit lineage with the inner's own subquery alias map (so 5482 // STATEMENT_OUTPUT → STATEMENT_OUTPUT edges target the inner's 5483 // children, not the outer's). Scalar map stays empty because 5484 // FROM-subquery bodies still reject scalar projections 5485 // (slice-15 invariant). 
5486 emitLineageForStatement(innerStmt, idx, lineage, 5487 cteNameToStatementIndex, 5488 innerSubAliasToIndex, 5489 Collections.<Integer, ScalarInfo>emptyMap()); 5490 } 5491 5492 /** 5493 * Slice 17 leak guard: reject subqueries inside a FROM-subquery 5494 * body's JOIN ON / GROUP BY clauses. Mirrors 5495 * {@link #rejectSubqueriesInScalarBodyClauses} (slice 11) — the 5496 * {@code allowScalarProjectionSubqueries=false} flag only guards 5497 * {@code buildOutputColumns}, so without this helper a SQL like 5498 * {@code SELECT id FROM (SELECT id FROM e JOIN d ON e.x = d.x 5499 * AND EXISTS (...)) sub} would leak the EXISTS subquery's refs into 5500 * the body's join column refs via {@code collectColumnRefs}. 5501 * HAVING / ORDER BY subqueries are caught by the slice-9 / 10 5502 * deep-scan rejecters inside {@code buildSelectStatementImpl}. 5503 * 5504 * <p>Slice 120 — the WHERE branch was removed: uncorrelated WHERE-side 5505 * predicate subqueries in a FROM-subquery body are now extracted as 5506 * their own statements by {@code buildSelectStatementImpl} via 5507 * {@link PredicateClauseContext#FROM_SUBQUERY_BODY_WHERE} (see 5508 * {@link #processDirectSubqueryTable}). {@code FROM_SUBQUERY_INNER_SUBQUERY_IN_WHERE} 5509 * stays declared-but-unreached for public-API stability (slice 5510 * 71/72/82/86/95/101/.../114 retain-for-documentation precedent). 5511 */ 5512 private static void rejectSubqueriesInFromSubqueryBodyClauses( 5513 TSelectSqlStatement inner, String fromAlias) { 5514 if (inner.joins != null) { 5515 for (TJoin join : inner.joins) { 5516 TJoinItemList items = join.getJoinItems(); 5517 if (items == null) continue; 5518 for (int i = 0; i < items.size(); i++) { 5519 TJoinItem item = items.getJoinItem(i); 5520 TExpression onCond = item == null ? 
null : item.getOnCondition(); 5521 if (onCond != null && containsAnySubqueryExpression(onCond)) { 5522 throw new SemanticIRBuildException( 5523 Diagnostic.error(DiagnosticCode.FROM_SUBQUERY_INNER_SUBQUERY_IN_JOIN_ON, 5524 "FROM-clause subquery '" + fromAlias 5525 + "' has a subquery in a JOIN ON clause; not supported yet " 5526 + "(would leak inner refs)", (TParseTreeNode) null)); 5527 } 5528 } 5529 } 5530 } 5531 TGroupBy groupBy = inner.getGroupByClause(); 5532 if (groupBy != null) { 5533 TGroupByItemList items = groupBy.getItems(); 5534 if (items != null && containsAnySubquery(items)) { 5535 throw new SemanticIRBuildException( 5536 Diagnostic.error(DiagnosticCode.FROM_SUBQUERY_INNER_SUBQUERY_IN_GROUP_BY, 5537 "FROM-clause subquery '" + fromAlias 5538 + "' has a subquery in a GROUP BY clause; not supported yet " 5539 + "(would leak inner refs)", (TParseTreeNode) null)); 5540 } 5541 } 5542 } 5543 5544 /** 5545 * Walk the consuming SELECT's result-column list. For every 5546 * top-level {@link EExpressionType#subquery_t} projection, build 5547 * the inner SELECT as its own {@link StatementGraph} (mirroring 5548 * slice 5 FROM-subquery extraction), append it to {@code stmts}, 5549 * emit its own lineage edges, and record 5550 * {@code resultColumnOrdinal → ScalarInfo} so the consumer's 5551 * {@code emitLineageForStatement} can wire the 5552 * STATEMENT_OUTPUT → STATEMENT_OUTPUT edge. 5553 * 5554 * <p>Slice 11 disallows: scalar subqueries with no outer alias, 5555 * inner SELECTs that project more than one column, inner columns 5556 * with no alias and no direct column name, scalar subqueries 5557 * whose inner WHERE/JOIN/GROUP BY contains a subquery (predicate 5558 * leak guard), correlated scalar subqueries (inner refs that 5559 * resolve to outer aliases), and nested scalar subqueries. 5560 * 5561 * <p>Slice 14 lifted correlated TABLE-bound; slice 15 lifted CTE- 5562 * and SUBQUERY-bound. 
Slice 20 lifts <i>nested</i> scalar
     * projections inside a scalar body when the
     * {@code allowRecursiveScalarSubqueryExtraction} flag is true (passed
     * by the outer-build and CTE-body call sites). Set-op-branch call
     * sites pass false to keep the slice-12 / slice-16 boundary that
     * branch scalar bodies must not host another scalar projection.
     *
     * <p>Slice 20 wraps the body in a snapshot/rollback so a deeper
     * level's failure does not leak appended scalar-body statements at
     * shallower levels into {@code stmts}/{@code lineage}. The wrapper
     * mirrors slice-16's {@code buildSetOpProgram} and slice-17/18's
     * extract wrappers (§14.18 process lesson #21).
     *
     * @return the map produced by
     *         {@link #extractScalarSubqueriesAsStatementsInternal}; on any
     *         {@link RuntimeException} both {@code stmts} and
     *         {@code lineage} are truncated back to their pre-call sizes
     *         and the same exception is rethrown
     */
    private static Map<Integer, ScalarInfo> extractScalarSubqueriesAsStatements(
            TSelectSqlStatement consumer,
            NameBindingProvider consumerProvider,
            List<StatementGraph> stmts,
            List<LineageEdge> lineage,
            Map<String, Integer> cteNameToStatementIndex,
            EnclosingScope enclosingScope,
            boolean allowRecursiveScalarSubqueryExtraction) {
        // Slice 20: SET-OP-WIDE-style transactional rollback. A failure
        // anywhere inside the loop (or inside a recursive call) truncates
        // both lists back to the pre-extraction size. The wrapper closes
        // the class of "mutation-free check fires after partial mutation"
        // (codex round-3..5 finding on slice 16); the recursive scalar
        // extraction surfaced it here.
        int stmtsSnapshot = stmts.size();
        int lineageSnapshot = lineage.size();
        try {
            return extractScalarSubqueriesAsStatementsInternal(consumer,
                    consumerProvider, stmts, lineage,
                    cteNameToStatementIndex, enclosingScope,
                    allowRecursiveScalarSubqueryExtraction);
        } catch (RuntimeException ex) {
            // Drop everything appended since the snapshot, newest first,
            // then surface the original failure unchanged.
            while (stmts.size() > stmtsSnapshot) stmts.remove(stmts.size() - 1);
            while (lineage.size() > lineageSnapshot) lineage.remove(lineage.size() - 1);
            throw ex;
        }
    }

    /**
     * Internal body of {@link #extractScalarSubqueriesAsStatements}.
     * Wrapped with snapshot/rollback by the public entry point; do not
     * call directly from non-wrapper sites.
     *
     * <p>Works in three passes over the consumer's result-column list:
     * (1) detect whether any projection is a top-level
     * {@code subquery_t}; (2) if so, reject duplicate output names
     * (compared lower-cased); (3) for each scalar projection, validate
     * it, build the inner SELECT as its own statement, append it to
     * {@code stmts}, emit its lineage, and record its index.
     *
     * @return a map from the consumer's result-column ordinal to the
     *         {@code ScalarInfo} (statement index + inner output name) of
     *         the extracted scalar body; empty when the consumer has no
     *         result columns or no scalar projections
     * @throws SemanticIRBuildException on a duplicate output name, a
     *         scalar projection without an alias or inner SELECT, an inner
     *         SELECT that does not project exactly one column, an unnamed
     *         non-constant inner projection, or any rejection raised by
     *         the helpers called from here
     */
    private static Map<Integer, ScalarInfo> extractScalarSubqueriesAsStatementsInternal(
            TSelectSqlStatement consumer,
            NameBindingProvider consumerProvider,
            List<StatementGraph> stmts,
            List<LineageEdge> lineage,
            Map<String, Integer> cteNameToStatementIndex,
            EnclosingScope enclosingScope,
            boolean allowRecursiveScalarSubqueryExtraction) {
        Map<Integer, ScalarInfo> ordinalToInfo = new HashMap<>();
        TResultColumnList rcl = consumer.getResultColumnList();
        if (rcl == null || rcl.size() == 0) return ordinalToInfo;

        // Reject duplicate output aliases when a scalar projection is
        // present (codex impl-review round-1 SHOULD 1). Lineage refs are
        // keyed by (statementIndex, outputName); two outputs sharing
        // the same name would collapse their lineage chains and
        // silently merge the scalar dependency with another column's
        // dependency. The slice-11 boundary is the cleanest place to
        // enforce this since the issue is most acute when a scalar
        // body's STATEMENT_OUTPUT → STATEMENT_OUTPUT edge is in play.
        boolean hasScalar = false;
        for (int i = 0; i < rcl.size(); i++) {
            TResultColumn rc = rcl.getResultColumn(i);
            if (rc != null && rc.getExpr() != null
                    && rc.getExpr().getExpressionType() == EExpressionType.subquery_t) {
                hasScalar = true;
                break;
            }
        }
        if (hasScalar) {
            Set<String> seenOutputNames = new HashSet<>();
            for (int i = 0; i < rcl.size(); i++) {
                TResultColumn rc = rcl.getResultColumn(i);
                if (rc == null) continue;
                // Effective output name: explicit alias first, otherwise
                // the bare column name. Unnamed columns are not compared.
                String alias = rc.getColumnAlias();
                String colName = rc.getColumnNameOnly();
                String name = (alias != null && !alias.isEmpty())
                        ? alias
                        : colName;
                if (name == null || name.isEmpty()) continue;
                String lower = name.toLowerCase(Locale.ROOT);
                if (!seenOutputNames.add(lower)) {
                    throw new SemanticIRBuildException(
                            Diagnostic.error(DiagnosticCode.DUPLICATE_OUTPUT_NAME,
                                    "duplicate output name '" + name
                                            + "' in a SELECT containing a scalar subquery projection; "
                                            + "lineage refs are keyed by output name and would collide", rc));
                }
            }
        }

        for (int i = 0; i < rcl.size(); i++) {
            TResultColumn rc = rcl.getResultColumn(i);
            if (rc == null || rc.getExpr() == null) continue;
            // Only top-level subquery projections are extracted here.
            if (rc.getExpr().getExpressionType() != EExpressionType.subquery_t) {
                continue;
            }
            String outerAlias = rc.getColumnAlias();
            if (outerAlias == null || outerAlias.isEmpty()) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_ALIAS_REQUIRED,
                                "scalar subquery projection must have an alias", rc));
            }
            TSelectSqlStatement inner = rc.getExpr().getSubQuery();
            if (inner == null) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_NO_INNER_SELECT,
                                "scalar subquery projection '" + outerAlias
                                        + "' has no inner SELECT", rc));
            }
            // Pre-recursion validation (codex round-2 MUST 2): inspect
            // the inner SELECT's projected column count and naming
            // BEFORE recursive build so the rejection message is
            // scalar-specific instead of bubbling up from
            // effectiveOutputName.
            TResultColumnList innerRcl = inner.getResultColumnList();
            if (innerRcl == null || innerRcl.size() == 0) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_COLUMN_COUNT,
                                "scalar subquery '" + outerAlias
                                        + "' must project exactly one column, got 0", rc));
            }
            if (innerRcl.size() != 1) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_COLUMN_COUNT,
                                "scalar subquery '" + outerAlias
                                        + "' must project exactly one column, got "
                                        + innerRcl.size(), rc));
            }
            TResultColumn innerCol = innerRcl.getResultColumn(0);
            String innerAlias = innerCol.getColumnAlias();
            String innerColName = innerCol.getColumnNameOnly();
            boolean innerHasName =
                    (innerAlias != null && !innerAlias.isEmpty())
                            || (innerColName != null && !innerColName.isEmpty());
            // An unnamed inner projection is tolerated only when
            // isConstantExpression accepts its expression.
            if (!innerHasName && !isConstantExpression(innerCol.getExpr())) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_INNER_PROJECTION_UNNAMED,
                                "scalar subquery '" + outerAlias
                                        + "' inner projection has no alias and no column name; "
                                        + "add an explicit alias inside the subquery", rc));
            }
            // Pre-recursion deep-scan (codex round-3 MUST 5, round-4
            // MUST 1): reject nested predicate subqueries in the
            // scalar body's WHERE / JOIN ON / GROUP BY before
            // collectColumnRefs can descend.
            rejectSubqueriesInScalarBodyClauses(inner, outerAlias);

            // Slice 20: branch on allowRecursiveScalarSubqueryExtraction.
            //   - true (outer / CTE-body call sites): build the inner's
            //     own enclosing scope chained to this caller's; recursively
            //     extract the inner's scalar projections; then build the
            //     inner with allowScalarProjectionSubqueries=true;
            //     compute scalarName AFTER the recursive extraction so
            //     the digit suffix matches the post-extraction stmts.size()
            //     (slice-16 codex round-1 MUST 2 lesson).
            //   - false (set-op-branch call site): keep the slice-12 /
            //     slice-16 boundary — no recursive extraction; the inner
            //     scalar map stays empty; the inner builds with
            //     allowScalarProjectionSubqueries=false; promotion still
            //     uses the caller's enclosing scope so OUTER_REFERENCE-of-*
            //     correlation works at the branch level.
            EnclosingScope innerEnclosing;
            Map<Integer, ScalarInfo> innerScalarMap;
            String scalarName;
            if (allowRecursiveScalarSubqueryExtraction) {
                innerEnclosing = buildEnclosingScope(inner,
                        cteNameToStatementIndex,
                        Collections.<String, Integer>emptyMap(),
                        enclosingScope);
                // Recursion goes through the rollback wrapper, so a failure
                // at a deeper level also truncates what that level appended.
                innerScalarMap = extractScalarSubqueriesAsStatements(inner,
                        consumerProvider, stmts, lineage,
                        cteNameToStatementIndex, innerEnclosing,
                        /*allowRecursiveScalarSubqueryExtraction=*/ true);
                scalarName = SCALAR_BODY_PREFIX + stmts.size() + ">";
            } else {
                innerEnclosing = enclosingScope;
                innerScalarMap = Collections.<Integer, ScalarInfo>emptyMap();
                scalarName = SCALAR_BODY_PREFIX + stmts.size() + ">";
            }
            StatementGraph innerStmt = buildSelectStatement(inner, consumerProvider,
                    scalarName,
                    /*hasOuterCteListAlreadyProcessed=*/ false,
                    /*allowFromSubqueries=*/ false,
                    /*allowScalarProjectionSubqueries=*/ allowRecursiveScalarSubqueryExtraction,
                    /*allowWindowProjection=*/ false);
            // Slice 14: instead of rejecting correlated scalar
            // subqueries, promote outer-scope refs into synthesised
            // OUTER_REFERENCE relations on the inner statement.
            // Non-TABLE-bound outer refs (CTE / SUBQUERY) and
            // unknown aliases still throw.
            innerStmt = promoteCorrelatedRefsToOuterReference(
                    innerStmt, outerAlias, innerEnclosing);
            // idx is read immediately before the append, so it is the
            // position innerStmt occupies in stmts.
            int idx = stmts.size();
            stmts.add(innerStmt);
            String innerOutName = effectiveOutputName(innerCol);
            ordinalToInfo.put(i, new ScalarInfo(idx, innerOutName));
            // Slice 20: pass the chained subquery alias map so
            // OUTER_REFERENCE-of-SUBQUERY refs in deeply nested scalar
            // bodies resolve through ancestor FROM-subquery aliases.
            // Pass innerScalarMap (non-empty when recursive extraction
            // is allowed) so the inner's own STATEMENT_OUTPUT →
            // STATEMENT_OUTPUT edges land.
            emitLineageForStatement(innerStmt, idx, lineage,
                    cteNameToStatementIndex,
                    innerEnclosing.flattenSubqueryAliasToIndex(),
                    innerScalarMap);
        }
        return ordinalToInfo;
    }

    /**
     * Reject nested predicate subqueries inside a scalar body's
     * WHERE / JOIN ON / GROUP BY clauses. Scalar bodies are slice-11
     * scope; the pre-existing builder leaks predicate-subquery refs
     * into {@code filterColumnRefs}/etc. elsewhere, but the scalar-body
     * recursion path is guarded so slice-11 outputs stay clean.
5785 */ 5786 private static void rejectSubqueriesInScalarBodyClauses( 5787 TSelectSqlStatement inner, String outerAlias) { 5788 TWhereClause where = inner.getWhereClause(); 5789 if (where != null && containsAnySubquery(where)) { 5790 throw new SemanticIRBuildException( 5791 Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_INNER_SUBQUERY_IN_WHERE, 5792 "scalar subquery '" + outerAlias 5793 + "' has a subquery in its WHERE clause; not supported yet " 5794 + "(would leak inner refs)", (TParseTreeNode) null)); 5795 } 5796 if (inner.joins != null) { 5797 for (TJoin join : inner.joins) { 5798 TJoinItemList items = join.getJoinItems(); 5799 if (items == null) continue; 5800 for (int i = 0; i < items.size(); i++) { 5801 TJoinItem item = items.getJoinItem(i); 5802 TExpression onCond = item == null ? null : item.getOnCondition(); 5803 if (onCond != null && containsAnySubqueryExpression(onCond)) { 5804 throw new SemanticIRBuildException( 5805 Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_INNER_SUBQUERY_IN_JOIN_ON, 5806 "scalar subquery '" + outerAlias 5807 + "' has a subquery in a JOIN ON clause; not supported yet " 5808 + "(would leak inner refs)", (TParseTreeNode) null)); 5809 } 5810 } 5811 } 5812 } 5813 TGroupBy groupBy = inner.getGroupByClause(); 5814 if (groupBy != null) { 5815 TGroupByItemList items = groupBy.getItems(); 5816 if (items != null && containsAnySubquery(items)) { 5817 throw new SemanticIRBuildException( 5818 Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_INNER_SUBQUERY_IN_GROUP_BY, 5819 "scalar subquery '" + outerAlias 5820 + "' has a subquery in a GROUP BY clause; not supported yet " 5821 + "(would leak inner refs)", (TParseTreeNode) null)); 5822 } 5823 } 5824 // HAVING / ORDER BY subqueries are caught by the slice-9 / 10 5825 // deep-scan rejecters that fire during buildSelectStatement. 
5826 } 5827 5828 /** 5829 * True iff any descendant of {@code root} is a {@link TExpression} 5830 * with {@code subquery_t} type or a non-null 5831 * {@link TExpression#getSubQuery()}. Mirrors the slice 9/10 pattern. 5832 */ 5833 private static boolean containsAnySubquery(gudusoft.gsqlparser.nodes.TParseTreeNode root) { 5834 final boolean[] found = {false}; 5835 root.acceptChildren(new TParseTreeVisitor() { 5836 @Override 5837 public void preVisit(TExpression e) { 5838 if (found[0]) return; 5839 if (e.getExpressionType() == EExpressionType.subquery_t 5840 || e.getSubQuery() != null) { 5841 found[0] = true; 5842 } 5843 } 5844 }); 5845 return found[0]; 5846 } 5847 5848 /** Same as {@link #containsAnySubquery} but also checks the root expression itself. */ 5849 private static boolean containsAnySubqueryExpression(TExpression root) { 5850 if (root.getExpressionType() == EExpressionType.subquery_t 5851 || root.getSubQuery() != null) { 5852 return true; 5853 } 5854 return containsAnySubquery(root); 5855 } 5856 5857 /** 5858 * Slice 119 — collect every {@code subquery_t} TExpression node that 5859 * is a direct (top-level) subquery in the compound expression {@code 5860 * root}, without descending into any found subquery's own body. 5861 * 5862 * <p>Uses {@link TExpression#acceptChildren} with a depth counter to 5863 * handle all expression subtypes (arithmetic, CASE, function args) 5864 * without manual branch enumeration. The depth counter increments on 5865 * every {@code subquery_t} preVisit and decrements on every postVisit, 5866 * so nested subqueries inside a found body (e.g. an EXISTS inside a 5867 * scalar's WHERE) are tracked but NOT added to the result set. 5868 * 5869 * <p>Result list is in traversal (preVisit) order for deterministic 5870 * statement-graph numbering across identical SQL texts. 
5871 */ 5872 private static List<TExpression> collectNestedSubqueryExpressions(TExpression root) { 5873 if (root == null) return Collections.<TExpression>emptyList(); 5874 // Defensive: root itself is a subquery_t — callers of the 5875 // mixed-expression path should have already taken the slice-115 5876 // top-level branch, but guard anyway. 5877 if (root.getExpressionType() == EExpressionType.subquery_t) { 5878 return Collections.singletonList(root); 5879 } 5880 final List<TExpression> ordered = new ArrayList<>(); 5881 final int[] depth = {0}; 5882 root.acceptChildren(new TParseTreeVisitor() { 5883 @Override 5884 public void preVisit(TExpression e) { 5885 if (e.getExpressionType() == EExpressionType.subquery_t) { 5886 if (depth[0] == 0) ordered.add(e); // top-level only 5887 depth[0]++; 5888 } 5889 } 5890 @Override 5891 public void postVisit(TExpression e) { 5892 if (e.getExpressionType() == EExpressionType.subquery_t) { 5893 depth[0]--; 5894 } 5895 } 5896 }); 5897 return ordered; 5898 } 5899 5900 /** 5901 * Slice-14 enclosing-scope map for correlation promotion. Holds the 5902 * full outer {@link RelationSource} per alias visible from the 5903 * enclosing scope so {@link #promoteCorrelatedRefsToOuterReference} 5904 * can read the outer's binding kind and qualifiedName. 5905 * 5906 * <p>The map is keyed by lower-cased alias for case-insensitive 5907 * lookup. The synthesised OUTER_REFERENCE RelationSource's alias is 5908 * NOT taken from this map's value — it is taken from the inner 5909 * ref's spelling (see §4.3 of the slice-14 plan), because 5910 * {@link gudusoft.gsqlparser.ir.semantic.binding.Resolver2NameBindingProvider} 5911 * records the inner ref's alias verbatim and case-sensitive 5912 * alias-equality elsewhere relies on that spelling. 
5913 * 5914 * <p>Slice-15 adds {@link #subqueryAliasToIndex}: when a SUBQUERY-bound 5915 * outer alias is referenced from an inner correlated scalar, the 5916 * inner statement's lineage emission needs to look up the outer's 5917 * FROM-subquery body by alias to wire a STATEMENT_OUTPUT → 5918 * STATEMENT_OUTPUT edge. The map mirrors the outer's alias→index 5919 * map but is filtered to alias keys also present in 5920 * {@link #aliasLowerToOuter} as SUBQUERY entries so the two maps 5921 * cannot drift apart. 5922 * 5923 * <p>Slice-20 generalises slice-14/15's flat map into a chain of 5924 * ancestor scopes (the {@link #parent} field). Lookups walk innermost 5925 * → outermost via {@link #lookupAlias(String)}. The 5926 * {@link #flattenSubqueryAliasToIndex()} helper produces an innermost- 5927 * wins flattened map for {@code emitLineageForStatement} consumers 5928 * that resolve OUTER_REFERENCE-of-SUBQUERY through ancestor 5929 * FROM-subquery aliases. Worst-case asymptotic per build for a 5930 * scalar chain of depth D with K siblings per level is 5931 * {@code O(K · D²)}; in practice D ≤ 4-5 for human-written SQL so the 5932 * constant factor is negligible. If a real-world benchmark surfaces 5933 * the flatten as a hot path, the obvious fix is a memo keyed on 5934 * {@code EnclosingScope} identity — deferred until measured. 5935 */ 5936 private static final class EnclosingScope { 5937 final Map<String, RelationSource> aliasLowerToOuter; 5938 final Map<String, Integer> subqueryAliasToIndex; 5939 /** Slice-20 chain: parent enclosing scope; null at the root. 
*/ 5940 final EnclosingScope parent; 5941 5942 EnclosingScope(Map<String, RelationSource> aliasLowerToOuter, 5943 Map<String, Integer> subqueryAliasToIndex, 5944 EnclosingScope parent) { 5945 this.aliasLowerToOuter = aliasLowerToOuter; 5946 this.subqueryAliasToIndex = subqueryAliasToIndex; 5947 this.parent = parent; 5948 } 5949 5950 static EnclosingScope empty() { 5951 return new EnclosingScope( 5952 Collections.<String, RelationSource>emptyMap(), 5953 Collections.<String, Integer>emptyMap(), 5954 /*parent=*/ null); 5955 } 5956 5957 /** 5958 * Walk the chain innermost → outermost; first match wins 5959 * (shadowing). Defensive cycle guard via identity-keyed visited 5960 * set: the chain is a DAG by construction (parent links never 5961 * loop back), but the guard makes the invariant explicit. 5962 */ 5963 RelationSource lookupAlias(String aliasLower) { 5964 EnclosingScope cur = this; 5965 Set<EnclosingScope> visited = Collections.newSetFromMap( 5966 new IdentityHashMap<EnclosingScope, Boolean>()); 5967 while (cur != null && visited.add(cur)) { 5968 RelationSource r = cur.aliasLowerToOuter.get(aliasLower); 5969 if (r != null) return r; 5970 cur = cur.parent; 5971 } 5972 return null; 5973 } 5974 5975 /** 5976 * Innermost-wins flatten of the SUBQUERY alias → body-index chain. 5977 * Ancestors contribute their own FROM-subquery alias maps; if both 5978 * a parent and a child define the same alias, the child wins 5979 * (innermost shadows outermost). Cycle guard mirrors 5980 * {@link #lookupAlias(String)}. 5981 */ 5982 Map<String, Integer> flattenSubqueryAliasToIndex() { 5983 if (parent == null) return subqueryAliasToIndex; 5984 Deque<EnclosingScope> stack = new ArrayDeque<>(); 5985 Set<EnclosingScope> visited = Collections.newSetFromMap( 5986 new IdentityHashMap<EnclosingScope, Boolean>()); 5987 EnclosingScope cur = this; 5988 while (cur != null && visited.add(cur)) { 5989 stack.push(cur); 5990 cur = cur.parent; 5991 } 5992 // Stack top = outermost. 
Pop outermost first, emit, then 5993 // innermost overwrites via put(). Result: innermost wins. 5994 Map<String, Integer> out = new LinkedHashMap<>(); 5995 while (!stack.isEmpty()) { 5996 EnclosingScope s = stack.pop(); 5997 out.putAll(s.subqueryAliasToIndex); 5998 } 5999 return out; 6000 } 6001 } 6002 6003 /** 6004 * Build an {@link EnclosingScope} for the consuming SELECT by walking 6005 * its {@link TSelectSqlStatement#tables} list (FROM relations) and 6006 * classifying each entry. Mirrors how 6007 * {@link Resolver2NameBindingProvider#bindRelation} would classify 6008 * the same TTable but produces a full RelationSource per alias so 6009 * the slice-14 promotion can read both kind and qualifiedName. 6010 * 6011 * <p>Slice 20: the {@code parent} parameter chains the new scope to an 6012 * enclosing one so a doubly-nested scalar body can resolve grandparent 6013 * aliases. Top-level callers pass {@code null}; recursive scalar 6014 * extraction passes the caller's enclosing scope. 6015 * 6016 * <p>Classification (precedence on collision: CTE name beats 6017 * base-table name): 6018 * <ul> 6019 * <li>{@code TTable.getTableType() == subquery} AND alias matches 6020 * a key in {@code subqueryAliasToIndex} → SUBQUERY-bound.</li> 6021 * <li>{@code TTable.getName().toLowerCase()} matches a key in 6022 * {@code cteNameToStatementIndex} → CTE-bound.</li> 6023 * <li>Otherwise → TABLE-bound.</li> 6024 * </ul> 6025 */ 6026 private static EnclosingScope buildEnclosingScope(TSelectSqlStatement consumer, 6027 Map<String, Integer> cteNameToStatementIndex, 6028 Map<String, Integer> subqueryAliasToIndex, 6029 EnclosingScope parent) { 6030 if (consumer == null || consumer.tables == null || consumer.tables.size() == 0) { 6031 // Even an empty FROM contributes an empty scope so the chain's 6032 // shape reflects nesting depth uniformly. 
6033 return new EnclosingScope( 6034 Collections.<String, RelationSource>emptyMap(), 6035 Collections.<String, Integer>emptyMap(), 6036 parent); 6037 } 6038 Map<String, RelationSource> map = new LinkedHashMap<>(); 6039 Map<String, Integer> filteredSubqueryAliasToIndex = new LinkedHashMap<>(); 6040 for (int i = 0; i < consumer.tables.size(); i++) { 6041 TTable t = consumer.tables.getTable(i); 6042 if (t == null) continue; 6043 // Slice 74: route anonymous FROM-subqueries through 6044 // effectiveAliasOf so the synth name (instead of the 6045 // literal "subquery" returned by t.getName()) flows into the 6046 // OUTER_REFERENCE scope chain. 6047 String aliasOrName = effectiveAliasOf(t); 6048 if (aliasOrName == null || aliasOrName.isEmpty()) continue; 6049 String lower = aliasOrName.toLowerCase(Locale.ROOT); 6050 if (map.containsKey(lower)) continue; // first-occurrence wins (defensive) 6051 6052 if (t.getTableType() == gudusoft.gsqlparser.ETableSource.subquery) { 6053 if (subqueryAliasToIndex != null && subqueryAliasToIndex.containsKey(lower)) { 6054 map.put(lower, new RelationSource(aliasOrName, 6055 new RelationBinding(RelationKind.SUBQUERY, aliasOrName))); 6056 // Slice 15: keep a parallel filtered map of alias→index 6057 // so OUTER_REFERENCE-of-SUBQUERY emit-side dispatch can 6058 // resolve the outer body's statement index. 6059 filteredSubqueryAliasToIndex.put(lower, subqueryAliasToIndex.get(lower)); 6060 } 6061 continue; 6062 } 6063 if (t.getTableType() != gudusoft.gsqlparser.ETableSource.objectname) { 6064 continue; // function / rowList / etc. 
— not modelled 6065 } 6066 String name = t.getName(); 6067 if (name == null || name.isEmpty()) continue; 6068 String nameLower = name.toLowerCase(Locale.ROOT); 6069 if (cteNameToStatementIndex != null 6070 && cteNameToStatementIndex.containsKey(nameLower)) { 6071 map.put(lower, new RelationSource(aliasOrName, 6072 new RelationBinding(RelationKind.CTE, name))); 6073 } else { 6074 map.put(lower, new RelationSource(aliasOrName, 6075 new RelationBinding(RelationKind.TABLE, name))); 6076 } 6077 } 6078 return new EnclosingScope(map, filteredSubqueryAliasToIndex, parent); 6079 } 6080 6081 /** 6082 * Slice 117 — sibling to {@link #buildEnclosingScope} for UPDATE 6083 * SET-RHS scalar-subquery bodies. The UPDATE outer scope is the 6084 * target table (TABLE-bound) plus any FROM-side relations walked 6085 * via {@code update.getJoins()} (TABLE / CTE / SUBQUERY-classified). 6086 * 6087 * <p>The target is added FIRST so on alias collisions with a 6088 * FROM-side relation (e.g. {@code UPDATE t ... FROM other t}) the 6089 * target wins. The {@code first-occurrence wins} rule mirrors 6090 * {@link #buildEnclosingScope}. 6091 * 6092 * <p>Used only by 6093 * {@link #extractScalarSubqueriesFromUpdateSetRhsInternal}. 6094 */ 6095 private static EnclosingScope buildUpdateEnclosingScope( 6096 TUpdateSqlStatement update, 6097 Map<String, Integer> cteNameToStatementIndex, 6098 Map<String, Integer> subqueryAliasToIndex, 6099 EnclosingScope parent) { 6100 Map<String, RelationSource> map = new LinkedHashMap<>(); 6101 Map<String, Integer> filteredSubqueryAliasToIndex = new LinkedHashMap<>(); 6102 if (update == null) { 6103 return new EnclosingScope(map, filteredSubqueryAliasToIndex, parent); 6104 } 6105 // 1) Target — always TABLE-bound. effectiveAliasOf falls back to 6106 // the table's own name when no alias is present. 
6107 TTable target = update.getTargetTable(); 6108 if (target != null 6109 && target.getTableType() == gudusoft.gsqlparser.ETableSource.objectname) { 6110 String targetAlias = effectiveAliasOf(target); 6111 String targetName = target.getName(); 6112 if (targetAlias != null && !targetAlias.isEmpty() 6113 && targetName != null && !targetName.isEmpty()) { 6114 String aliasLower = targetAlias.toLowerCase(Locale.ROOT); 6115 map.put(aliasLower, new RelationSource(targetAlias, 6116 new RelationBinding(RelationKind.TABLE, targetName))); 6117 } 6118 } 6119 // 2) FROM-side joins. 6120 TJoinList joins = update.getJoins(); 6121 if (joins != null) { 6122 for (TJoin join : joins) { 6123 addUpdateRelationToEnclosingScope(join.getTable(), map, 6124 filteredSubqueryAliasToIndex, 6125 cteNameToStatementIndex, subqueryAliasToIndex); 6126 TJoinItemList items = join.getJoinItems(); 6127 if (items == null) continue; 6128 for (int i = 0; i < items.size(); i++) { 6129 TJoinItem item = items.getJoinItem(i); 6130 if (item == null) continue; 6131 addUpdateRelationToEnclosingScope(item.getTable(), map, 6132 filteredSubqueryAliasToIndex, 6133 cteNameToStatementIndex, subqueryAliasToIndex); 6134 } 6135 } 6136 } 6137 return new EnclosingScope(map, filteredSubqueryAliasToIndex, parent); 6138 } 6139 6140 /** 6141 * Slice 117 — classify one FROM-side TTable for 6142 * {@link #buildUpdateEnclosingScope}. SUBQUERY-typed tables go to 6143 * {@link RelationKind#SUBQUERY} if their alias was registered in 6144 * {@code subqueryAliasToIndex} (slice-83 extraction map); objectname 6145 * tables go to {@link RelationKind#CTE} if their bare name is a 6146 * declared CTE, otherwise {@link RelationKind#TABLE}. First- 6147 * occurrence wins. Function / rowList sources are silently skipped 6148 * (not modelled). 
6149 */ 6150 private static void addUpdateRelationToEnclosingScope(TTable t, 6151 Map<String, RelationSource> map, 6152 Map<String, Integer> filteredSubqueryAliasToIndex, 6153 Map<String, Integer> cteNameToStatementIndex, 6154 Map<String, Integer> subqueryAliasToIndex) { 6155 if (t == null) return; 6156 String aliasOrName = effectiveAliasOf(t); 6157 if (aliasOrName == null || aliasOrName.isEmpty()) return; 6158 String lower = aliasOrName.toLowerCase(Locale.ROOT); 6159 if (map.containsKey(lower)) return; // first-occurrence wins 6160 if (t.getTableType() == gudusoft.gsqlparser.ETableSource.subquery) { 6161 if (subqueryAliasToIndex != null 6162 && subqueryAliasToIndex.containsKey(lower)) { 6163 map.put(lower, new RelationSource(aliasOrName, 6164 new RelationBinding(RelationKind.SUBQUERY, aliasOrName))); 6165 filteredSubqueryAliasToIndex.put(lower, 6166 subqueryAliasToIndex.get(lower)); 6167 } 6168 return; 6169 } 6170 if (t.getTableType() != gudusoft.gsqlparser.ETableSource.objectname) { 6171 return; // function / rowList / etc. — not modelled 6172 } 6173 String name = t.getName(); 6174 if (name == null || name.isEmpty()) return; 6175 String nameLower = name.toLowerCase(Locale.ROOT); 6176 if (cteNameToStatementIndex != null 6177 && cteNameToStatementIndex.containsKey(nameLower)) { 6178 map.put(lower, new RelationSource(aliasOrName, 6179 new RelationBinding(RelationKind.CTE, name))); 6180 } else { 6181 map.put(lower, new RelationSource(aliasOrName, 6182 new RelationBinding(RelationKind.TABLE, name))); 6183 } 6184 } 6185 6186 /** 6187 * Slice 118 — sibling to {@link #buildUpdateEnclosingScope} for MERGE 6188 * per-WHEN action WHERE correlated predicate subqueries. Builds the 6189 * enclosing scope's relation map covering MERGE's target table, USING 6190 * source, and any outer CTEs declared on the MERGE itself. 
Used only 6191 * by {@link #collectMergeActionWhere}; produced once per MERGE in 6192 * {@link #buildMerge} and threaded through the predicate-subquery 6193 * extractor. 6194 * 6195 * <p>Classification mirrors {@code buildMerge} step 6: 6196 * <ul> 6197 * <li>{@code merge.getTargetTable()} — TABLE-bound (qualifiedName = 6198 * target's table name). First-occurrence wins.</li> 6199 * <li>{@code merge.getUsingTable()}: 6200 * <ul> 6201 * <li>SUBQUERY-typed → SUBQUERY-bound with index pulled from 6202 * {@code aliasToSubIdx}.</li> 6203 * <li>objectname-typed AND name in 6204 * {@code cteNameToStatementIndex} → CTE-bound (slice-101 6205 * USING-as-CTE).</li> 6206 * <li>Else objectname → TABLE-bound.</li> 6207 * </ul></li> 6208 * </ul> 6209 */ 6210 private static EnclosingScope buildMergeEnclosingScope( 6211 TMergeSqlStatement merge, 6212 Map<String, Integer> cteNameToStatementIndex, 6213 Map<String, Integer> aliasToSubIdx) { 6214 Map<String, RelationSource> map = new LinkedHashMap<>(); 6215 Map<String, Integer> filteredSubqueryAliasToIndex = new LinkedHashMap<>(); 6216 if (merge == null) { 6217 return new EnclosingScope(map, filteredSubqueryAliasToIndex, 6218 /*parent=*/ null); 6219 } 6220 // 1) Target — always TABLE-bound. effectiveAliasOf falls back to 6221 // the table's own name when no alias is present. 6222 TTable target = merge.getTargetTable(); 6223 if (target != null 6224 && target.getTableType() == gudusoft.gsqlparser.ETableSource.objectname) { 6225 String targetAlias = effectiveAliasOf(target); 6226 String targetName = target.getName(); 6227 if (targetAlias != null && !targetAlias.isEmpty() 6228 && targetName != null && !targetName.isEmpty()) { 6229 String aliasLower = targetAlias.toLowerCase(Locale.ROOT); 6230 map.put(aliasLower, new RelationSource(targetAlias, 6231 new RelationBinding(RelationKind.TABLE, targetName))); 6232 } 6233 } 6234 // 2) USING source. 
6235 TTable using = merge.getUsingTable(); 6236 if (using != null) { 6237 String usingAlias = effectiveAliasOf(using); 6238 // Fall back to the USING source's bare name when no alias is 6239 // present — matches buildMerge's `usingAlias` initialisation. 6240 if (usingAlias == null || usingAlias.isEmpty()) { 6241 usingAlias = (using.getName() == null 6242 || using.getName().toString().isEmpty()) 6243 ? "__merge_using__" 6244 : using.getName().toString(); 6245 } 6246 String usingAliasLower = usingAlias.toLowerCase(Locale.ROOT); 6247 // First-occurrence wins (defensive — target's alias could 6248 // theoretically shadow if user named USING source identically; 6249 // matches buildUpdateEnclosingScope behaviour). 6250 if (!map.containsKey(usingAliasLower)) { 6251 if (using.getTableType() 6252 == gudusoft.gsqlparser.ETableSource.subquery) { 6253 if (aliasToSubIdx != null 6254 && aliasToSubIdx.containsKey(usingAliasLower)) { 6255 map.put(usingAliasLower, new RelationSource(usingAlias, 6256 new RelationBinding(RelationKind.SUBQUERY, 6257 usingAlias))); 6258 filteredSubqueryAliasToIndex.put(usingAliasLower, 6259 aliasToSubIdx.get(usingAliasLower)); 6260 } 6261 } else if (using.getTableType() 6262 == gudusoft.gsqlparser.ETableSource.objectname) { 6263 String usingName = using.getName(); 6264 if (usingName != null && !usingName.isEmpty()) { 6265 String usingNameLower = 6266 usingName.toLowerCase(Locale.ROOT); 6267 Integer cteIdx = (cteNameToStatementIndex == null) 6268 ? null 6269 : cteNameToStatementIndex.get(usingNameLower); 6270 if (cteIdx != null) { 6271 // Slice 101 USING-as-CTE branch — model as 6272 // SUBQUERY-bound so the promoter classifies 6273 // outerKind=SUBQUERY (mirrors the slice-117 6274 // SUBQUERY classifier for UPDATE FROM-CTE 6275 // sources, also via aliasToSubIdx). 
6276 map.put(usingAliasLower, new RelationSource(usingAlias, 6277 new RelationBinding(RelationKind.SUBQUERY, 6278 usingAlias))); 6279 // The MERGE caller pre-populates aliasToSubIdx 6280 // with the CTE's statement index for both 6281 // usingAlias AND bare CTE name. Use whichever 6282 // exists (aliasToSubIdx is keyed on lower-cased 6283 // aliases — matches buildMerge step 6). 6284 if (aliasToSubIdx != null 6285 && aliasToSubIdx.containsKey(usingAliasLower)) { 6286 filteredSubqueryAliasToIndex.put(usingAliasLower, 6287 aliasToSubIdx.get(usingAliasLower)); 6288 } 6289 } else { 6290 map.put(usingAliasLower, new RelationSource(usingAlias, 6291 new RelationBinding(RelationKind.TABLE, usingName))); 6292 } 6293 } 6294 } 6295 // function / rowList sources are silently skipped (not 6296 // modelled — same convention as buildUpdateEnclosingScope). 6297 } 6298 } 6299 return new EnclosingScope(map, filteredSubqueryAliasToIndex, 6300 /*parent=*/ null); 6301 } 6302 6303 /** 6304 * Slice 117 — precompute the lowercased set of inner local FROM 6305 * aliases for a SELECT statement, used by the tolerant-outer- 6306 * binding fallback in 6307 * {@link gudusoft.gsqlparser.ir.semantic.binding.Resolver2NameBindingProvider#bindColumn}. 6308 * Walks {@code select.tables} (which includes both the FROM driver 6309 * and any JOIN sides) and collects each table's effective alias. 6310 * 6311 * <p>Computed BEFORE the inner build's {@code buildRelations} JOIN- 6312 * ON collector runs so the tolerant provider is already scope-aware 6313 * when the collector first calls {@code bindColumn} (codex round-5 6314 * ordering fix). 
6315 */ 6316 private static Set<String> precomputeInnerLocalAliases( 6317 TSelectSqlStatement select) { 6318 Set<String> aliases = new HashSet<>(); 6319 if (select == null || select.tables == null) return aliases; 6320 for (int i = 0; i < select.tables.size(); i++) { 6321 TTable t = select.tables.getTable(i); 6322 if (t == null) continue; 6323 String alias = effectiveAliasOf(t); 6324 if (alias != null && !alias.isEmpty()) { 6325 aliases.add(alias.toLowerCase(Locale.ROOT)); 6326 } 6327 } 6328 return aliases; 6329 } 6330 6331 /** 6332 * Slice-14 correlation promotion (slice-15 extended). Walk every 6333 * column ref in the already-built inner statement; any ref whose 6334 * alias is NOT in the inner's local relations is "correlated." 6335 * Look it up in the enclosing scope: 6336 * 6337 * <ul> 6338 * <li>Found AND TABLE / CTE / SUBQUERY-bound → synthesise an 6339 * OUTER_REFERENCE RelationSource with {@code outerKind} 6340 * set to the resolved outer kind, and add to 6341 * {@code inner.relations}.</li> 6342 * <li>Found AND UNION / UNKNOWN-bound → throw defensively 6343 * (no current builder path produces these from a FROM 6344 * relation).</li> 6345 * <li>Not found anywhere → throw (Resolver2 should not give us 6346 * such a ref; defensive).</li> 6347 * </ul> 6348 * 6349 * <p>The synthesised RelationSource's alias is the inner ref's 6350 * spelling (case-sensitive equality is preserved). The 6351 * qualifiedName comes from the outer's existing binding: 6352 * - TABLE: outer's table name. 6353 * - CTE: outer's CTE name (NOT alias — see 6354 * {@code correlatedToCteBoundOuterWithCteAlias}). 6355 * - SUBQUERY: outer's alias (matching slice-14 SUBQUERY 6356 * convention, where {@code buildEnclosingScope} sets 6357 * {@code qualifiedName=aliasOrName}). 6358 * No lower-casing happens here — that happens at projector 6359 * emit-time per slice-1 convention. Multiple inner refs with 6360 * case-variant spellings inherit the same pre-existing slice-1 6361 * limitation. 
6362 */ 6363 private static StatementGraph promoteCorrelatedRefsToOuterReference( 6364 StatementGraph innerStmt, String outerAlias, 6365 EnclosingScope enclosingScope) { 6366 Set<String> innerAliasesLower = new HashSet<>(); 6367 for (RelationSource r : innerStmt.getRelations()) { 6368 innerAliasesLower.add(r.getAlias().toLowerCase(Locale.ROOT)); 6369 } 6370 // Per alias-lower: outer's RelationSource (for binding) and the 6371 // inner ref's exact alias spelling (for the synthesised alias). 6372 LinkedHashMap<String, RelationSource> outerByLower = new LinkedHashMap<>(); 6373 LinkedHashMap<String, String> firstRefSpellingByLower = new LinkedHashMap<>(); 6374 6375 for (ColumnRef ref : collectAllInnerRefs(innerStmt)) { 6376 String aliasLower = ref.getRelationAlias().toLowerCase(Locale.ROOT); 6377 if (innerAliasesLower.contains(aliasLower)) continue; // local — not correlated 6378 if (outerByLower.containsKey(aliasLower)) continue; // already promoted 6379 6380 // Slice 20: chain-walking lookup. Walks the EnclosingScope 6381 // chain innermost → outermost; first match wins (shadowing 6382 // semantics). slice 14/15 used a single-level get(); slice 6383 // 20 generalises so inner-inner scalars can resolve 6384 // grandparent (and deeper) aliases. 6385 RelationSource outerRel = enclosingScope.lookupAlias(aliasLower); 6386 if (outerRel == null) { 6387 throw new SemanticIRBuildException( 6388 Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_UNKNOWN_RELATION_ALIAS, 6389 "scalar subquery '" + outerAlias 6390 + "' references unknown alias '" + ref.getRelationAlias() 6391 + "' (column '" + ref.getRelationAlias() + "." 
6392 + ref.getColumnName() 6393 + "'); not in inner relations or any enclosing scope", (TParseTreeNode) null)); 6394 } 6395 RelationKind outerKind = outerRel.getBinding().getKind(); 6396 if (outerKind != RelationKind.TABLE 6397 && outerKind != RelationKind.CTE 6398 && outerKind != RelationKind.SUBQUERY) { 6399 throw new SemanticIRBuildException( 6400 Diagnostic.error(DiagnosticCode.CORRELATED_SCALAR_SUBQUERY_UNKNOWN_OUTER_BINDING, 6401 "correlated scalar subquery '" + outerAlias 6402 + "' references outer alias '" + ref.getRelationAlias() 6403 + "' bound to a " + outerKind 6404 + "; only TABLE / CTE / SUBQUERY-bound outer correlations supported", (TParseTreeNode) null)); 6405 } 6406 outerByLower.put(aliasLower, outerRel); 6407 firstRefSpellingByLower.put(aliasLower, ref.getRelationAlias()); 6408 } 6409 6410 if (outerByLower.isEmpty()) return innerStmt; 6411 6412 List<RelationSource> augmented = new ArrayList<>(innerStmt.getRelations()); 6413 for (String key : outerByLower.keySet()) { 6414 RelationSource outerRel = outerByLower.get(key); 6415 augmented.add(new RelationSource( 6416 firstRefSpellingByLower.get(key), 6417 new RelationBinding(RelationKind.OUTER_REFERENCE, 6418 outerRel.getBinding().getQualifiedName(), 6419 outerRel.getBinding().getKind()))); 6420 } 6421 return rebuildStatementGraphWithRelations(innerStmt, augmented); 6422 } 6423 6424 /** 6425 * Collect every {@link ColumnRef} reachable from {@code innerStmt}'s 6426 * seven clause-bearing fields. Used by 6427 * {@link #promoteCorrelatedRefsToOuterReference} (slice-11 scalar 6428 * bodies) and the JOIN-ON EXISTS predicate-body correlation walker 6429 * (slice 23+); mirrors the old rejecter's clause coverage exactly 6430 * (output sources, filter, join, groupBy, having, orderBy, and 6431 * slice-73 distinctOn). 
6432 */ 6433 private static List<ColumnRef> collectAllInnerRefs(StatementGraph innerStmt) { 6434 List<ColumnRef> all = new ArrayList<>(); 6435 for (OutputColumn out : innerStmt.getOutputColumns()) { 6436 all.addAll(out.getSources()); 6437 } 6438 all.addAll(innerStmt.getFilterColumnRefs()); 6439 all.addAll(innerStmt.getJoinColumnRefs()); 6440 all.addAll(innerStmt.getGroupByColumnRefs()); 6441 all.addAll(innerStmt.getHavingColumnRefs()); 6442 all.addAll(innerStmt.getOrderByColumnRefs()); 6443 all.addAll(innerStmt.getDistinctOnColumnRefs()); 6444 return all; 6445 } 6446 6447 /** 6448 * Copy a {@link StatementGraph} replacing only its relations list. 6449 * StatementGraph is otherwise immutable. 6450 */ 6451 private static StatementGraph rebuildStatementGraphWithRelations( 6452 StatementGraph stmt, List<RelationSource> relations) { 6453 return new StatementGraph( 6454 stmt.getName(), 6455 stmt.getKind(), 6456 relations, 6457 stmt.getOutputColumns(), 6458 stmt.getFilterColumnRefs(), 6459 stmt.getJoinColumnRefs(), 6460 stmt.getGroupByColumnRefs(), 6461 stmt.getHavingColumnRefs(), 6462 stmt.getOrderByColumnRefs(), 6463 stmt.getDistinctOnColumnRefs(), 6464 stmt.isDistinct(), 6465 stmt.getSetOperator(), 6466 stmt.getRowLimit()); 6467 } 6468 6469 /** 6470 * Slice 16 preflight: reject any FROM-clause subquery directly on a 6471 * set-op branch's FROM/JOIN list, BEFORE 6472 * {@link #extractScalarSubqueriesAsStatements} can append scalar-body 6473 * statements to {@code stmts}/{@code lineage}. Without this preflight, 6474 * a branch with both a FROM-subquery and a scalar projection would 6475 * extract the scalar (mutating shared state) and then fail later in 6476 * {@code buildSelectStatement}, producing a confusing scalar-correlation 6477 * error message instead of the slice-12 FROM-subquery boundary. 
6478 * 6479 * <p>Inspects only the branch's DIRECT FROM/JOIN entries — does NOT 6480 * recurse into result-column expressions or scalar-body inner SELECTs 6481 * (those are handled by the recursive scalar-body build's own 6482 * {@code allowFromSubqueries=false} guard). The error message contains 6483 * both {@code set-op branch} and {@code FROM-clause subquery} keywords 6484 * so the slice-12 boundary is surfaced explicitly. 6485 */ 6486 private static void rejectFromSubqueriesInSetOpBranch(TSelectSqlStatement br) { 6487 if (br.joins == null) return; 6488 for (TJoin join : br.joins) { 6489 TTable left = join.getTable(); 6490 if (left != null 6491 && left.getTableType() == gudusoft.gsqlparser.ETableSource.subquery) { 6492 throw new SemanticIRBuildException( 6493 Diagnostic.error(DiagnosticCode.FROM_SUBQUERY_IN_SET_OP_BRANCH_FROM, 6494 "FROM-clause subquery directly in a set-op branch FROM is not " 6495 + "supported yet (set-op branch with FROM-clause subquery)", (TParseTreeNode) null)); 6496 } 6497 TJoinItemList items = join.getJoinItems(); 6498 if (items == null) continue; 6499 for (int i = 0; i < items.size(); i++) { 6500 TTable r = items.getJoinItem(i).getTable(); 6501 if (r != null 6502 && r.getTableType() == gudusoft.gsqlparser.ETableSource.subquery) { 6503 throw new SemanticIRBuildException( 6504 Diagnostic.error(DiagnosticCode.FROM_SUBQUERY_ON_JOIN_SIDE_IN_SET_OP_BRANCH, 6505 "FROM-clause subquery on a JOIN side in a set-op branch is " 6506 + "not supported yet (set-op branch with FROM-clause subquery)", (TParseTreeNode) null)); 6507 } 6508 } 6509 } 6510 } 6511 6512 // ==================================================================== 6513 // Slice 12: Set operations (UNION / INTERSECT / MINUS / EXCEPT). 
6514 // 6515 // Algorithm: each branch of the set-op tree is built as its own 6516 // synthetically-named StatementGraph; the outer set-op statement has 6517 // empty relations and per-output `STATEMENT_OUTPUT → STATEMENT_OUTPUT` 6518 // lineage edges to each branch's corresponding output. The flatten 6519 // walks the left-leaning AST iteratively (CLAUDE.md mandates no 6520 // recursion on leftStmt/rightStmt; would StackOverflow on 2000+ UNIONs). 6521 // 6522 // Internal-node modifiers (ORDER BY / row-limits) are rejected on every 6523 // set-op node visited (root + internal), not just the root, because 6524 // parenthesized inner combined nodes can carry those modifiers (per 6525 // /tmp/SetOpInnerModifierProbe: `(A UNION B ORDER BY id) UNION C` 6526 // attaches ORDER BY to the inner node in Oracle / PostgreSQL / MSSQL; 6527 // PostgreSQL maps inner FETCH FIRST → LIMIT). 6528 // ==================================================================== 6529 6530 /** 6531 * Build a set-op program: flatten branches, build each as its own 6532 * StatementGraph, construct the outer set-op statement, emit lineage. 6533 * Returns the outer set-op statement's index in {@code stmts}. 6534 * 6535 * @param setOp the {@link TSelectSqlStatement} carrying 6536 * {@code setOperatorType != none}. 6537 * @param setOpName non-null when the set-op is a CTE body (the outer 6538 * statement is named with the CTE name); null when the set-op is 6539 * the program's top-level outer. 6540 * @param hasOuterCteListAlreadyProcessed true when the caller already 6541 * processed the set-op root's CTE list (top-level dispatch); 6542 * false when this is a recursive context (set-op CTE body) where 6543 * a non-empty CTE list on the set-op root is rejected as a 6544 * nested WITH. 
6545 */ 6546 private static int buildSetOpProgram(TSelectSqlStatement setOp, 6547 NameBindingProvider provider, 6548 List<StatementGraph> stmts, 6549 List<LineageEdge> lineage, 6550 Map<String, Integer> cteNameToStatementIndex, 6551 String setOpName, 6552 boolean hasOuterCteListAlreadyProcessed) { 6553 // Nested-WITH guard (rd-5 MUST 1): when called from a CTE body 6554 // context, the set-op root must not carry its own CTE list. 6555 if (!hasOuterCteListAlreadyProcessed 6556 && setOp.getCteList() != null 6557 && setOp.getCteList().size() > 0) { 6558 throw new SemanticIRBuildException( 6559 Diagnostic.error(DiagnosticCode.NESTED_WITH_NOT_SUPPORTED, 6560 "nested WITH/CTE inside a CTE body or subquery is not supported yet " 6561 + "(set-op CTE body has its own CTE list)", (TParseTreeNode) null)); 6562 } 6563 // Slice 21: ORDER BY is now collected from the outer set-op 6564 // (see buildSetOpOuterOrderByColumnRefs). 6565 // Slice 72: outer row-limit lifted via buildSetOpRowLimit. 6566 // The internal-node reject (parenthesized inner set-ops with 6567 // row-limit) fires inside flattenSetOpTreeIteratively. Compute 6568 // BEFORE the snapshot block so a defensive throw (Hive/Vertica/ 6569 // ANSI-DB2 guards, MSSQL null-valued TOrderBy slots) propagates 6570 // without leaving stmts/lineage partially populated. 6571 RowLimit setOpRowLimit = buildSetOpRowLimit(setOp); 6572 6573 // Slice 16: SET-OP-WIDE TRANSACTIONAL ROLLBACK (codex rounds 3-5 6574 // adversarial findings). Snapshot `stmts.size()` and 6575 // `lineage.size()` BEFORE any branch mutation. On any 6576 // SemanticIRBuildException thrown by the per-branch loop or 6577 // post-build validation, truncate both lists back to the snapshot 6578 // and rethrow. 
This addresses the full class of "mutation-free 6579 // branch validation that fires after earlier-branch mutations": 6580 // FROM-subquery preflight (round 3), column-count check (round 4), 6581 // multi-scalar-projection per-branch validation (round 5), branch 6582 // duplicate-output-names check (round 5), and any future check 6583 // that may join the same class. The pre-loop preflight below 6584 // remains for fast-fail with better error messages on common 6585 // shapes, but the rollback is the safety net. 6586 int stmtsSnapshot = stmts.size(); 6587 int lineageSnapshot = lineage.size(); 6588 try { 6589 return buildSetOpProgramInternal(setOp, provider, stmts, lineage, 6590 cteNameToStatementIndex, setOpName, 6591 hasOuterCteListAlreadyProcessed, setOpRowLimit); 6592 } catch (RuntimeException e) { 6593 while (stmts.size() > stmtsSnapshot) stmts.remove(stmts.size() - 1); 6594 while (lineage.size() > lineageSnapshot) lineage.remove(lineage.size() - 1); 6595 throw e; 6596 } 6597 } 6598 6599 /** 6600 * Internal body of {@link #buildSetOpProgram}. Wrapped with 6601 * snapshot/rollback by the public entry point; do not call directly. 
6602 */ 6603 private static int buildSetOpProgramInternal(TSelectSqlStatement setOp, 6604 NameBindingProvider provider, 6605 List<StatementGraph> stmts, 6606 List<LineageEdge> lineage, 6607 Map<String, Integer> cteNameToStatementIndex, 6608 String setOpName, 6609 boolean hasOuterCteListAlreadyProcessed, 6610 RowLimit setOpRowLimit) { 6611 6612 SetOperator setOpKind = resolveSetOperator(setOp); 6613 List<TSelectSqlStatement> branches = flattenSetOpTreeIteratively(setOp, setOpKind); 6614 if (branches.size() < 2) { 6615 throw new SemanticIRBuildException( 6616 Diagnostic.error(DiagnosticCode.SET_OP_BRANCH_COUNT_TOO_FEW, 6617 "set-op flatten produced " + branches.size() 6618 + " branches; expected at least 2", (TParseTreeNode) null)); 6619 } 6620 6621 // Slice 16: PRE-LOOP PREFLIGHT (codex round-3 + round-4 adversarial 6622 // findings — medium). Run all mutation-free branch validation across 6623 // EVERY branch BEFORE the main build loop runs. Without this, a 6624 // scalar-bearing earlier branch can append scalar-body statements to 6625 // `stmts`/`lineage` BEFORE a later branch's rejection fires, leaving 6626 // partial state on the rejection path. The slice-16 safety claim is 6627 // "no half-built scalar bodies leak when a branch is rejected"; that 6628 // claim only holds when ALL mutation-free branch checks are 6629 // set-op-wide. 6630 // 6631 // Checks bundled here (each is AST-only and side-effect-free): 6632 // - Defensive nested-set-op leaf check (slice 12). 6633 // - Direct branch FROM-subquery rejection (slice 16 round 3). 6634 // - Result-column-count compatibility across branches (slice 12, 6635 // moved here in slice 16 round 4 — uses AST 6636 // getResultColumnList().size() since BUILT outputColumns.size() 6637 // is unavailable pre-build; the post-loop check on BUILT 6638 // outputs stays as a defensive backup if AST-vs-built ever 6639 // diverges for a future shape). 
6640 int expectedAstCols = -1; 6641 for (int i = 0; i < branches.size(); i++) { 6642 TSelectSqlStatement br = branches.get(i); 6643 if (br.getSetOperatorType() != null 6644 && br.getSetOperatorType() != ESetOperatorType.none) { 6645 throw new SemanticIRBuildException( 6646 Diagnostic.error(DiagnosticCode.SET_OP_BRANCH_IS_SET_OP, 6647 "set-op branch is itself a set operation; nested set " 6648 + "operations in branches are not supported yet", (TParseTreeNode) null)); 6649 } 6650 rejectFromSubqueriesInSetOpBranch(br); 6651 int astCols = br.getResultColumnList() == null 6652 ? 0 6653 : br.getResultColumnList().size(); 6654 if (i == 0) { 6655 expectedAstCols = astCols; 6656 } else if (astCols != expectedAstCols) { 6657 throw new SemanticIRBuildException( 6658 Diagnostic.error(DiagnosticCode.SET_OP_BRANCH_COLUMN_COUNT_MISMATCH, 6659 "set-op branch column-count mismatch: branch[0] has " 6660 + expectedAstCols + " columns, branch[" + i + "] has " 6661 + astCols, (TParseTreeNode) null)); 6662 } 6663 } 6664 6665 int[] branchIdxs = new int[branches.size()]; 6666 for (int i = 0; i < branches.size(); i++) { 6667 TSelectSqlStatement br = branches.get(i); 6668 6669 // Slice 16: per-branch enclosing scope. allowFromSubqueries 6670 // stays false at the branch level so subqueryAliasToIndex 6671 // is empty (no OUTER_REFERENCE-of-SUBQUERY in branches). 6672 // CTE-aware so a branch's scalar can correlate to a CTE 6673 // visible in scope (top-level set-ops have all outer CTEs 6674 // indexed; CTE-body set-ops have only prior visible CTEs 6675 // indexed — the current CTE is registered AFTER this method 6676 // returns, by design, mirroring the non-set-op CTE body path). 6677 EnclosingScope branchEnclosing = buildEnclosingScope(br, 6678 cteNameToStatementIndex, 6679 Collections.<String, Integer>emptyMap(), 6680 /*parent=*/ null); 6681 // Slice 20: pass `false` so the slice-12 / slice-16 boundary 6682 // holds — branch scalar bodies must NOT host another scalar 6683 // projection. 
The branch's TOP-LEVEL scalar projection is 6684 // still allowed (slice 16); deeper recursion is not. 6685 Map<Integer, ScalarInfo> branchScalarMap = 6686 extractScalarSubqueriesAsStatements(br, provider, 6687 stmts, lineage, cteNameToStatementIndex, branchEnclosing, 6688 /*allowRecursiveScalarSubqueryExtraction=*/ false); 6689 6690 // Slice 16: compute branchName AFTER scalar extraction so the 6691 // digit suffix in `<set_op_branch_<idx>>` matches the branch's 6692 // final position in `stmts`. Scalar bodies appended by 6693 // extractScalarSubqueriesAsStatements come BEFORE the branch 6694 // in `stmts`, so pre-extraction `stmts.size()` would be wrong 6695 // by (number of scalar bodies in this branch) — breaking the 6696 // slice-12 invariant that branch synthetic names round-trip 6697 // to their statement index. 6698 // 6699 // Slice 113 — predicate bodies extracted from the branch's 6700 // WHERE clause via {@link PredicateClauseContext#SET_OP_BRANCH_WHERE} 6701 // also land in {@code stmts} BEFORE the branch, INSIDE the 6702 // {@code buildSelectStatementImpl} call below. So the 6703 // pre-build {@code stmts.size()} can still understate the 6704 // branch's final position. The slice-12/16 invariant is 6705 // preserved by computing a tentative name pre-build (best 6706 // guess used by inner consumers that need a non-null name), 6707 // then rebuilding the StatementGraph with the corrected 6708 // name AFTER the build via {@link #withRenamedTo} if any 6709 // predicate body was extracted. 
6710 int preBuildStmtsSize = stmts.size(); 6711 String tentativeBranchName = SET_OP_BRANCH_PREFIX + preBuildStmtsSize + ">"; 6712 StatementGraph branchStmt = buildSelectStatementImpl(br, provider, tentativeBranchName, 6713 /*hasOuterCteListAlreadyProcessed=*/ false, 6714 /*allowFromSubqueries=*/ false, 6715 /*allowScalarProjectionSubqueries=*/ true, // ← slice-16 lift 6716 /*allowWindowProjection=*/ true, 6717 // Slice 113 keeps JOIN-ON predicate subqueries rejected 6718 // in set-op branches (slice 23 / 26 contract pinned by 6719 // existsInSetOpBranchJoinOnStillRejected / 6720 // lhsSubqueryInSetOpBranchRejected) — the lift is 6721 // WHERE-only. The two flags are now independent. 6722 /*allowJoinOnPredicateSubqueries=*/ false, 6723 /*stmtsForExtraction=*/ stmts, // ← slice-113 6724 /*lineageForExtraction=*/ lineage, // ← slice-113 6725 /*cteMapForExtraction=*/ cteNameToStatementIndex, // ← slice-113 6726 /*isPredicateBody=*/ false, 6727 /*whereClauseContext=*/ PredicateClauseContext.SET_OP_BRANCH_WHERE, 6728 /*allowWherePredicateSubqueries=*/ true); // ← slice-113 lift 6729 int idx = stmts.size(); 6730 if (idx != preBuildStmtsSize) { 6731 // Slice 113 — predicate bodies were appended during 6732 // the branch build. Rebuild the StatementGraph with 6733 // the corrected name so the slice-12/16 invariant 6734 // (digit suffix == final position) survives. The 6735 // rebuild copies all 15 fields; no LineageRef is 6736 // affected because they are idx-based, not name-based 6737 // (codex round-1 Q4 resolution). 6738 String finalBranchName = SET_OP_BRANCH_PREFIX + idx + ">"; 6739 branchStmt = withRenamedTo(branchStmt, finalBranchName); 6740 } 6741 rejectDuplicateOutputNames(branchStmt, branchStmt.getName()); 6742 branchIdxs[i] = idx; 6743 stmts.add(branchStmt); 6744 // Branch's own per-output, filter, and join lineage. Pass 6745 // the branch's scalar map so STATEMENT_OUTPUT → 6746 // STATEMENT_OUTPUT edges to scalar bodies are emitted. 
6747 // subqueryAliasToIndex stays empty (allowFromSubqueries=false 6748 // for branches, so no FROM-subquery aliases exist at this scope). 6749 emitLineageForStatement(branchStmt, idx, lineage, 6750 cteNameToStatementIndex, 6751 Collections.<String, Integer>emptyMap(), 6752 branchScalarMap); 6753 } 6754 6755 // Validate column-count alignment via BUILT statements. 6756 int expectedCols = stmts.get(branchIdxs[0]).getOutputColumns().size(); 6757 for (int i = 1; i < branches.size(); i++) { 6758 int n = stmts.get(branchIdxs[i]).getOutputColumns().size(); 6759 if (n != expectedCols) { 6760 throw new SemanticIRBuildException( 6761 Diagnostic.error(DiagnosticCode.SET_OP_BRANCH_COLUMN_COUNT_MISMATCH, 6762 "set-op branch column-count mismatch: branch[0] has " 6763 + expectedCols + " columns, branch[" + i + "] has " + n, (TParseTreeNode) null)); 6764 } 6765 } 6766 6767 // Build outer outputs from branch[0]'s built outputs. 6768 StatementGraph branch0 = stmts.get(branchIdxs[0]); 6769 List<OutputColumn> outerOutputs = new ArrayList<>(expectedCols); 6770 Set<String> seenOuter = new HashSet<>(); 6771 for (int i = 0; i < expectedCols; i++) { 6772 OutputColumn b0 = branch0.getOutputColumns().get(i); 6773 String name = b0.getName(); 6774 if (name == null || name.isEmpty()) { 6775 throw new SemanticIRBuildException( 6776 Diagnostic.error(DiagnosticCode.SET_OP_BRANCH_OUTPUT_NAME_UNUSABLE, 6777 "set-op output position " + i + " has no usable name in branch[0]; " 6778 + "add an alias to the SELECT-list expression", (TParseTreeNode) null)); 6779 } 6780 if (!seenOuter.add(name.toLowerCase(Locale.ROOT))) { 6781 throw new SemanticIRBuildException( 6782 Diagnostic.error(DiagnosticCode.SET_OP_DUPLICATE_OUTER_OUTPUT_NAME, 6783 "set-op outer output name '" + name + "' is duplicated " 6784 + "(branch[0] has duplicate output names)", null)); 6785 } 6786 outerOutputs.add(new OutputColumn(name, 6787 /*derived=*/ true, 6788 /*aggregate=*/ false, 6789 /*sources=*/ 
Collections.<ColumnRef>emptyList(), 6790 /*windowSpec=*/ null)); 6791 } 6792 6793 // Slice 21: collect outer ORDER BY refs from branches' base sources. 6794 // A throw here unwinds via the slice-16 snapshot/rollback wrapper 6795 // in buildSetOpProgram(), so partially-built branches/scalar-bodies 6796 // do not leak into stmts/lineage on rejection. 6797 List<ColumnRef> outerOrderByRefs = buildSetOpOuterOrderByColumnRefs( 6798 setOp, outerOutputs, stmts, branchIdxs); 6799 6800 StatementGraph outer = new StatementGraph(setOpName, "SELECT", 6801 /*relations=*/ Collections.<RelationSource>emptyList(), 6802 outerOutputs, 6803 /*filterColumnRefs=*/ Collections.<ColumnRef>emptyList(), 6804 /*joinColumnRefs=*/ Collections.<ColumnRef>emptyList(), 6805 /*groupByColumnRefs=*/Collections.<ColumnRef>emptyList(), 6806 /*havingColumnRefs=*/ Collections.<ColumnRef>emptyList(), 6807 /*orderByColumnRefs=*/outerOrderByRefs, 6808 /*distinctOnColumnRefs=*/Collections.<ColumnRef>emptyList(), 6809 /*distinct=*/ false, 6810 /*setOperator=*/ setOpKind, 6811 /*rowLimit=*/ setOpRowLimit); 6812 int outerIdx = stmts.size(); 6813 stmts.add(outer); 6814 6815 // Lineage: outer.outputs[i] → each branch.outputs[i] (in branch order). 6816 for (int i = 0; i < expectedCols; i++) { 6817 OutputColumn out = outer.getOutputColumns().get(i); 6818 for (int b = 0; b < branches.size(); b++) { 6819 StatementGraph branchStmt = stmts.get(branchIdxs[b]); 6820 String branchOutName = branchStmt.getOutputColumns().get(i).getName(); 6821 lineage.add(new LineageEdge( 6822 LineageRef.statementOutput(outerIdx, out.getName()), 6823 LineageRef.statementOutput(branchIdxs[b], branchOutName))); 6824 } 6825 } 6826 return outerIdx; 6827 } 6828 6829 /** 6830 * Reject row-limit clauses on an INTERNAL (non-root) set-op node. 6831 * Slice 9 / 12 rationale: with ORDER BY they decide which rows 6832 * survive, so the canonical-model exclusion of ORDER BY is only 6833 * sound when no row-limit is present. 
6834 * 6835 * <p>Slice 21 split this from {@code rejectSetOpInternalOrderBy} 6836 * because the OUTER set-op node lifts its ORDER BY (collected via 6837 * {@link #buildSetOpOuterOrderByColumnRefs}). Slice 72 narrows the 6838 * row-limit guard the same way: the OUTER set-op node now lifts 6839 * row-limit metadata (collected via {@link #buildSetOpRowLimit}), 6840 * while parenthesized inner combined operations carrying a 6841 * row-limit (e.g. {@code (A UNION B LIMIT 3) UNION C}) remain 6842 * rejected because the intermediate limit is destroyed by the 6843 * outer set operation. 6844 */ 6845 private static void rejectSetOpRowLimit(TSelectSqlStatement node) { 6846 if (node.getLimitClause() != null) { 6847 throw new SemanticIRBuildException( 6848 Diagnostic.error(DiagnosticCode.SET_OP_ROW_LIMIT_NOT_SUPPORTED, 6849 "row-limit clause LIMIT on a non-root set-op node is not supported yet", (TParseTreeNode) null)); 6850 } 6851 if (node.getTopClause() != null) { 6852 throw new SemanticIRBuildException( 6853 Diagnostic.error(DiagnosticCode.SET_OP_ROW_LIMIT_NOT_SUPPORTED, 6854 "row-limit clause TOP on a non-root set-op node is not supported yet", (TParseTreeNode) null)); 6855 } 6856 if (node.getFetchFirstClause() != null) { 6857 throw new SemanticIRBuildException( 6858 Diagnostic.error(DiagnosticCode.SET_OP_ROW_LIMIT_NOT_SUPPORTED, 6859 "row-limit clause FETCH FIRST on a non-root set-op node is not supported yet", (TParseTreeNode) null)); 6860 } 6861 if (node.getOffsetClause() != null) { 6862 throw new SemanticIRBuildException( 6863 Diagnostic.error(DiagnosticCode.SET_OP_ROW_LIMIT_NOT_SUPPORTED, 6864 "row-limit clause OFFSET on a non-root set-op node is not supported yet", (TParseTreeNode) null)); 6865 } 6866 } 6867 6868 /** 6869 * Reject ORDER BY on an INTERNAL (non-root) set-op node. 
Slice 21 6870 * lifted ORDER BY on the OUTER (root) set-op, but parenthesized 6871 * inner combined nodes like 6872 * {@code (A UNION B ORDER BY id) UNION C} remain rejected: the 6873 * intermediate sort is destroyed by the outer set operation 6874 * (UNION does not preserve order), so the inner ORDER BY has no 6875 * observable effect. Lifting requires modelling intermediate sort 6876 * semantics — a future slice. 6877 */ 6878 private static void rejectSetOpInternalOrderBy(TSelectSqlStatement node) { 6879 if (node.getOrderbyClause() != null) { 6880 throw new SemanticIRBuildException( 6881 Diagnostic.error(DiagnosticCode.SET_OP_NON_ROOT_ORDER_BY_NOT_SUPPORTED, 6882 "ORDER BY on a non-root set-op node is not supported yet " 6883 + "(intermediate sort would be discarded by the outer set operation)", (TParseTreeNode) null)); 6884 } 6885 } 6886 6887 /** 6888 * Collect physical column refs for the outer set-op's ORDER BY 6889 * clause. Slice 21 lifts the slice-12 rejection on set-op outer 6890 * ORDER BY using the slice-9 single-SELECT pattern, generalised: 6891 * 6892 * <ul> 6893 * <li>Each sort-key item passes the same shape rejections as 6894 * {@link #buildOrderByColumnRefs} (ordinals, constants, 6895 * scalar / predicate subqueries, window functions, ORDER 6896 * SIBLINGS BY, RESET WHEN, in-clause OFFSET/FETCH). 6897 * <li>Each {@link TObjectName} reference dispatches via a 6898 * four-case fail-closed taxonomy: {@code column_alias} → 6899 * lookup via {@code toString()}; unqualified {@code column} 6900 * → lookup via {@code getColumnNameOnly()}; qualified 6901 * {@code column} → reject (set-op outer scope is the 6902 * unioned outputs, not branches' tables); other 6903 * {@code dbObjectType} → reject as unsupported. 6904 * <li>The lookup is positional against {@code outerOutputs} (= 6905 * branch[0].outputs by slice-12 design) — NOT per-branch 6906 * name search. 
Per-branch name search would mis-bind 6907 * swapped-name branches and silently accept names present 6908 * only in non-branch[0]. Slice-21 codex rounds 1-2 MUSTs. 6909 * <li>Each branch contributes its 6910 * {@code outputColumns[pos].sources} for the matched 6911 * position. Branches with empty sources at the matched 6912 * position (scalar / fully-derived projection) reject the 6913 * sort key with a tuned message — silent omission would 6914 * lose dependency information (slice-21 codex round 1 6915 * MUST 5). 6916 * <li>Each branch-local {@link ColumnRef} is normalised to its 6917 * catalog name via {@link RelationSource#getBinding()}'s 6918 * {@code qualifiedName}. The set-op outer's relations list 6919 * is empty, so branch-local aliases are not resolvable in 6920 * the owning statement; normalisation yields self-contained 6921 * refs (slice-21 codex round 1 MUST 4 + round 2 MUST 1). 6922 * <li>A per-item empty-refs guard rejects sort keys that 6923 * contributed zero physical refs (e.g. {@code ORDER BY 6924 * 1+0}, {@code ORDER BY UPPER('x')}). Mirrors slice-9 6925 * single-SELECT invariant. Operates on a per-item local 6926 * set, so duplicate cross-item refs ({@code ORDER BY id, 6927 * id}) survive global LinkedHashSet de-duplication 6928 * (slice-21 codex round 4 MUST 1). 6929 * </ul> 6930 * 6931 * <p>Like slice-9 ORDER BY for single-SELECT, this list does NOT 6932 * contribute to the canonical model — it is presentation metadata 6933 * only. The dlineage XML probe ({@code /tmp/SetOpOrderByLimitProbe}) 6934 * confirmed dlineage emits no parity edges for set-op outer 6935 * ORDER BY. 
6936 */ 6937 private static List<ColumnRef> buildSetOpOuterOrderByColumnRefs( 6938 TSelectSqlStatement setOp, 6939 List<OutputColumn> outerOutputs, 6940 List<StatementGraph> stmts, 6941 int[] branchIdxs) { 6942 TOrderBy orderBy = setOp.getOrderbyClause(); 6943 if (orderBy == null) { 6944 return new ArrayList<>(); 6945 } 6946 if (orderBy.isSiblings()) { 6947 throw new SemanticIRBuildException( 6948 Diagnostic.error(DiagnosticCode.ORDER_SIBLINGS_BY_NOT_SUPPORTED, 6949 "ORDER SIBLINGS BY is not supported yet " 6950 + "(Oracle hierarchical ordering)", orderBy)); 6951 } 6952 if (orderBy.getResetWhenCondition() != null) { 6953 throw new SemanticIRBuildException( 6954 Diagnostic.error(DiagnosticCode.ORDER_BY_RESET_WHEN_NOT_SUPPORTED, 6955 "ORDER BY ... RESET WHEN is not supported yet " 6956 + "(Teradata window-style restart)", orderBy)); 6957 } 6958 // Slice 72: TOrderBy in-clause OFFSET/FETCH slots are admitted 6959 // for MSSQL set-op outer via buildSetOpRowLimit's TOrderBy 6960 // fallback (the MSSQL parser routes set-op outer OFFSET/FETCH 6961 // EXCLUSIVELY onto TOrderBy, not duplicated onto the SELECT 6962 // node as in single-SELECT). Removing the previous defensive 6963 // throws here so the slice-72 admit shapes aren't false- 6964 // rejected. The unused codes 6965 // ORDER_BY_FETCH_FIRST_NOT_SUPPORTED and 6966 // ORDER_BY_OFFSET_NOT_SUPPORTED stay as documentation of a 6967 // known reject taxonomy. 6968 TOrderByItemList items = orderBy.getItems(); 6969 if (items == null || items.size() == 0) { 6970 return new ArrayList<>(); 6971 } 6972 LinkedHashSet<ColumnRef> refs = new LinkedHashSet<>(); 6973 for (int i = 0; i < items.size(); i++) { 6974 TOrderByItem item = items.getOrderByItem(i); 6975 if (item == null) continue; 6976 TExpression sortKey = item.getSortKey(); 6977 if (sortKey == null) continue; 6978 // Same shape rejections as slice-9 single-SELECT. 
6979 rejectOrderByOrdinalOrConstant(sortKey); 6980 rejectOrderByScalarSubquery(sortKey); 6981 rejectOrderByWindowFunction(sortKey); 6982 // (NOT rejectOrderByAliasReference — alias refs are valid 6983 // at set-op outer scope; they ARE the branch-output names, 6984 // looked up positionally below.) 6985 6986 // Per-item local set so the empty-refs guard counts refs 6987 // FOUND for this sort key, not refs ADDED to the global 6988 // set after de-dup. Otherwise `ORDER BY id, id` would 6989 // false-reject the second item (slice-21 codex round 4 MUST 1). 6990 LinkedHashSet<ColumnRef> itemRefs = new LinkedHashSet<>(); 6991 collectSetOpOuterRefsForSortKey(sortKey, outerOutputs, 6992 stmts, branchIdxs, itemRefs); 6993 if (itemRefs.isEmpty()) { 6994 throw new SemanticIRBuildException( 6995 Diagnostic.error(DiagnosticCode.SET_OP_OUTER_ORDER_BY_NO_PHYSICAL_COLUMN_REFS, 6996 "ORDER BY sort key '" + sortKey 6997 + "' has no physical column references at set-op " 6998 + "outer (constant or non-column expressions are " 6999 + "not supported yet)", sortKey)); 7000 } 7001 refs.addAll(itemRefs); 7002 } 7003 return new ArrayList<>(refs); 7004 } 7005 7006 /** 7007 * Collect refs for one set-op outer ORDER BY sort key. Walks the 7008 * sort-key expression for {@link TObjectName} nodes and dispatches 7009 * each through {@link #processSetOpOrderByObjectName}. Includes a 7010 * top-level fast path for the common {@code ORDER BY x} case where 7011 * the entire sort key IS the {@link TObjectName}. 7012 * 7013 * <p>The visitor filters its dispatch to {@code column}, 7014 * {@code column_alias}, and {@code unknown} dbObjectTypes — these 7015 * are the shapes that represent sort-key column references. Other 7016 * TObjectName nodes (function names, schema qualifications) are 7017 * part of the surrounding expression structure and skipped 7018 * silently. 
The {@code unknown} case is included so the four-case 7019 * fail-closed taxonomy in {@link #processSetOpOrderByObjectName} 7020 * rejects vendor-typed unknown qualified refs (e.g. 7021 * {@code foo.id + id}, slice-21 codex round 2 MUST 2). 7022 */ 7023 private static void collectSetOpOuterRefsForSortKey( 7024 TExpression sortKey, 7025 final List<OutputColumn> outerOutputs, 7026 final List<StatementGraph> stmts, 7027 final int[] branchIdxs, 7028 final LinkedHashSet<ColumnRef> outRefs) { 7029 // Top-level fast path: the visitor's `acceptChildren` may not 7030 // visit the root TObjectName when the sort key is itself a 7031 // bare TObjectName. Mirrors slice-9 rejectOrderByAliasReference. 7032 if (sortKey.getExpressionType() == EExpressionType.simple_object_name_t) { 7033 TObjectName op = sortKey.getObjectOperand(); 7034 if (op != null) { 7035 processSetOpOrderByObjectName(op, outerOutputs, stmts, 7036 branchIdxs, outRefs); 7037 return; 7038 } 7039 } 7040 sortKey.acceptChildren(new TParseTreeVisitor() { 7041 @Override 7042 public void preVisit(TObjectName node) { 7043 EDbObjectType ot = node.getDbObjectType(); 7044 // Skip non-column-like TObjectNames (function names, 7045 // schema/server qualifications). The four-case 7046 // fail-closed taxonomy still runs for column / 7047 // column_alias / unknown to handle the slice-21 codex 7048 // round 2 MUST 2 partial-accept case (e.g. 7049 // `foo.id + id` rejects via `foo.id`'s `unknown` 7050 // dbObjectType). 7051 if (ot != EDbObjectType.column 7052 && ot != EDbObjectType.column_alias 7053 && ot != EDbObjectType.unknown) { 7054 return; 7055 } 7056 processSetOpOrderByObjectName(node, outerOutputs, stmts, 7057 branchIdxs, outRefs); 7058 } 7059 }); 7060 } 7061 7062 /** 7063 * Resolve one {@link TObjectName} sort-key reference at set-op 7064 * outer scope. Four-case fail-closed taxonomy (slice-21 codex 7065 * round 2 MUST 2): column_alias / unqualified column / qualified 7066 * column / other. 
7067 */ 7068 private static void processSetOpOrderByObjectName( 7069 TObjectName node, 7070 List<OutputColumn> outerOutputs, 7071 List<StatementGraph> stmts, 7072 int[] branchIdxs, 7073 LinkedHashSet<ColumnRef> outRefs) { 7074 EDbObjectType ot = node.getDbObjectType(); 7075 String name; 7076 if (ot == EDbObjectType.column_alias) { 7077 // Aliases at set-op outer carry tableToken=alias-name (the 7078 // /tmp/SetOpQualifiedRefProbe finding); accept regardless. 7079 name = node.toString(); 7080 } else if (ot == EDbObjectType.column) { 7081 if (node.getTableToken() != null) { 7082 throw new SemanticIRBuildException( 7083 Diagnostic.error(DiagnosticCode.ORDER_BY_QUALIFIED_REFERENCE_NOT_SUPPORTED, 7084 "qualified column reference '" + node 7085 + "' in set-op outer ORDER BY not supported " 7086 + "(scope is the unioned outputs, not branches' tables)", node)); 7087 } 7088 name = node.getColumnNameOnly(); 7089 } else { 7090 throw new SemanticIRBuildException( 7091 Diagnostic.error(DiagnosticCode.ORDER_BY_OBJECT_REFERENCE_UNSUPPORTED, 7092 "unsupported ORDER BY object reference '" + node 7093 + "' (dbObjectType=" + ot + ") in set-op outer", node)); 7094 } 7095 if (name == null || name.isEmpty() || "*".equals(name)) { 7096 throw new SemanticIRBuildException( 7097 Diagnostic.error(DiagnosticCode.ORDER_BY_OBJECT_REFERENCE_NO_USABLE_NAME, 7098 "ORDER BY object reference '" + node + "' has no usable name", node)); 7099 } 7100 String key = name.toLowerCase(Locale.ROOT); 7101 int pos = -1; 7102 for (int i = 0; i < outerOutputs.size(); i++) { 7103 String outName = outerOutputs.get(i).getName(); 7104 if (outName != null && outName.toLowerCase(Locale.ROOT).equals(key)) { 7105 pos = i; 7106 break; 7107 } 7108 } 7109 if (pos < 0) { 7110 throw new SemanticIRBuildException( 7111 Diagnostic.error(DiagnosticCode.ORDER_BY_NAME_NOT_MATCHED_IN_SET_OP_OUTPUT, 7112 "ORDER BY '" + name + "' does not match any set-op output " 7113 + "column (set-op outer column names come from branch[0])", 
node)); 7114 } 7115 for (int b = 0; b < branchIdxs.length; b++) { 7116 StatementGraph br = stmts.get(branchIdxs[b]); 7117 OutputColumn oc = br.getOutputColumns().get(pos); 7118 if (oc.getSources().isEmpty()) { 7119 throw new SemanticIRBuildException( 7120 Diagnostic.error(DiagnosticCode.SET_OP_ORDER_BY_BRANCH_OUTPUT_NO_SOURCES, 7121 "ORDER BY '" + name + "' references branch[" + b 7122 + "] output '" + oc.getName() 7123 + "' which has no physical sources " 7124 + "(derived/scalar projection); cannot " 7125 + "capture this dependency yet", node)); 7126 } 7127 for (ColumnRef cr : oc.getSources()) { 7128 outRefs.add(normaliseSetOpBranchRef(cr, br)); 7129 } 7130 } 7131 } 7132 7133 /** 7134 * Normalise a branch-local {@link ColumnRef} to a self-contained 7135 * ref using the underlying {@link RelationBinding#getQualifiedName()}. 7136 * 7137 * <p>Slice-21 invariant (codex round 2 MUST 1): the set-op outer's 7138 * {@code relations} list is empty. Branch-local aliases (like 7139 * {@code e} for {@code FROM employees e}) are not resolvable in 7140 * the outer statement, so {@code orderByColumnRefs} normalises to 7141 * the catalog name. Fail-closed if no matching RelationSource — 7142 * this would indicate corrupt branch lineage state. 
7143 */ 7144 private static ColumnRef normaliseSetOpBranchRef(ColumnRef cr, 7145 StatementGraph branch) { 7146 String alias = cr.getRelationAlias(); 7147 String aliasKey = alias.toLowerCase(Locale.ROOT); 7148 for (RelationSource rs : branch.getRelations()) { 7149 if (rs.getAlias().toLowerCase(Locale.ROOT).equals(aliasKey)) { 7150 return new ColumnRef(rs.getBinding().getQualifiedName(), 7151 cr.getColumnName()); 7152 } 7153 } 7154 throw new SemanticIRBuildException( 7155 Diagnostic.error(DiagnosticCode.BRANCH_COLUMN_REF_UNKNOWN_RELATION, 7156 "internal: branch ColumnRef relationAlias '" + alias 7157 + "' does not match any RelationSource in the " 7158 + "branch's relations list", null)); 7159 } 7160 7161 /** 7162 * Reject duplicate output names within a single statement. 7163 * Lineage refs are keyed by {@code (statementIndex, outputName)}; two 7164 * outputs sharing a name silently merge their lineage chains. 7165 */ 7166 private static void rejectDuplicateOutputNames(StatementGraph stmt, String label) { 7167 Set<String> seen = new HashSet<>(); 7168 for (OutputColumn c : stmt.getOutputColumns()) { 7169 String name = c.getName(); 7170 if (name == null || name.isEmpty()) continue; 7171 if (!seen.add(name.toLowerCase(Locale.ROOT))) { 7172 throw new SemanticIRBuildException( 7173 Diagnostic.error(DiagnosticCode.SET_OP_BRANCH_DUPLICATE_OUTPUT_NAME, 7174 "set-op branch '" + label + "' has duplicate output name '" 7175 + name + "'; lineage refs are keyed by output name " 7176 + "and would collide", null)); 7177 } 7178 } 7179 } 7180 7181 /** 7182 * Map ({@link ESetOperatorType}, {@code isAll()}) to the IR 7183 * {@link SetOperator} enum. The exhaustive switch makes a future 7184 * {@code ESetOperatorType} value fail loudly at build time 7185 * (mirrors slice-8 {@code resolveDistinctFlag} pattern). 
7186 */ 7187 private static SetOperator resolveSetOperator(TSelectSqlStatement setOp) { 7188 ESetOperatorType type = setOp.getSetOperatorType(); 7189 if (type == null) { 7190 throw new SemanticIRBuildException( 7191 Diagnostic.error(DiagnosticCode.SET_OP_ROOT_TYPE_NULL, 7192 "expected non-null set-op type on the set-op root", (TParseTreeNode) null)); 7193 } 7194 boolean all = setOp.isAll(); 7195 switch (type) { 7196 case union: return all ? SetOperator.UNION_ALL : SetOperator.UNION; 7197 case intersect: return all ? SetOperator.INTERSECT_ALL : SetOperator.INTERSECT; 7198 case minus: return all ? SetOperator.MINUS_ALL : SetOperator.MINUS; 7199 case except: return all ? SetOperator.EXCEPT_ALL : SetOperator.EXCEPT; 7200 case none: 7201 throw new SemanticIRBuildException( 7202 Diagnostic.error(DiagnosticCode.SET_OP_ROOT_TYPE_NONE, 7203 "expected non-none set operator type on the set-op root", (TParseTreeNode) null)); 7204 default: 7205 throw new SemanticIRBuildException( 7206 Diagnostic.error(DiagnosticCode.SET_OP_UNKNOWN_OPERATOR_TYPE, 7207 "unknown set operator type: " + type, (TParseTreeNode) null)); 7208 } 7209 } 7210 7211 /** 7212 * Iteratively flatten the left-leaning set-op tree into a list of 7213 * leaf SELECT statements (CLAUDE.md mandates no recursion on 7214 * {@code leftStmt}/{@code rightStmt}; would StackOverflow on 2000+ 7215 * UNIONs). 7216 * 7217 * <p>On every internal set-op node visited: 7218 * <ol> 7219 * <li>Reject row-limit modifiers ({@link #rejectSetOpRowLimit}) 7220 * on every node (root + internal). Slice 12 + 7221 * {@code /tmp/SetOpInnerModifierProbe}: parenthesized inner 7222 * combined nodes can carry row-limits in Oracle / PostgreSQL / 7223 * MSSQL.</li> 7224 * <li>Reject ORDER BY ({@link #rejectSetOpInternalOrderBy}) only 7225 * on INTERNAL (non-root) nodes. 
Slice 21 lifted ORDER BY on 7226 * the root via {@link #buildSetOpOuterOrderByColumnRefs}; an 7227 * internal {@code (A UNION B ORDER BY id) UNION C} sort is 7228 * still discarded by the outer set operation, so it has no 7229 * observable effect and remains rejected.</li> 7230 * <li>Reject mixed-operator and mixed-{@code ALL} chains by checking 7231 * the resolved kind matches the root's kind.</li> 7232 * <li>Hard-reject malformed AST (null left/right child).</li> 7233 * </ol> 7234 * 7235 * <p>Push order is right-then-left so leaves emerge in left-to-right 7236 * declaration order. 7237 */ 7238 private static List<TSelectSqlStatement> flattenSetOpTreeIteratively( 7239 TSelectSqlStatement root, SetOperator expected) { 7240 List<TSelectSqlStatement> leaves = new ArrayList<>(); 7241 Deque<TSelectSqlStatement> stack = new ArrayDeque<>(); 7242 stack.push(root); 7243 while (!stack.isEmpty()) { 7244 TSelectSqlStatement cur = stack.pop(); 7245 ESetOperatorType t = cur.getSetOperatorType(); 7246 if (t != null && t != ESetOperatorType.none) { 7247 // Slice 21: ORDER BY guard fires only on INTERNAL nodes. 7248 // The root (`cur == root`) lifts ORDER BY; the collection 7249 // happens in buildSetOpOuterOrderByColumnRefs. 7250 // Slice 72: row-limit guard ALSO fires only on INTERNAL 7251 // nodes. The root lifts via buildSetOpRowLimit (called 7252 // by buildSetOpProgram before this method). 
7253 if (cur != root) { 7254 rejectSetOpRowLimit(cur); 7255 rejectSetOpInternalOrderBy(cur); 7256 } 7257 SetOperator curKind = resolveSetOperator(cur); 7258 if (curKind != expected) { 7259 throw new SemanticIRBuildException( 7260 Diagnostic.error(DiagnosticCode.MIXED_SET_OPERATORS_NOT_SUPPORTED, 7261 "mixed set operators in a single chain are not supported yet " 7262 + "(root=" + expected + ", inner=" + curKind + ")", (TParseTreeNode) null)); 7263 } 7264 if (cur.getLeftStmt() == null || cur.getRightStmt() == null) { 7265 throw new SemanticIRBuildException( 7266 Diagnostic.error(DiagnosticCode.MALFORMED_SET_OP_AST, 7267 "malformed set-op AST: null left/right child", (TParseTreeNode) null)); 7268 } 7269 stack.push(cur.getRightStmt()); 7270 stack.push(cur.getLeftStmt()); 7271 } else { 7272 leaves.add(cur); 7273 } 7274 } 7275 return leaves; 7276 } 7277 7278 private static Set<String> collectCteNames(TCTEList cteList) { 7279 if (cteList == null || cteList.size() == 0) return Collections.emptySet(); 7280 Set<String> names = new HashSet<>(); 7281 for (int i = 0; i < cteList.size(); i++) { 7282 String name = cteList.getCTE(i).getTableName().toString(); 7283 if (name != null && !name.isEmpty()) { 7284 names.add(name.toLowerCase(Locale.ROOT)); 7285 } 7286 } 7287 return names; 7288 } 7289 7290 /** 7291 * Slice 107 — return the first CTE name shared between the outer-WITH 7292 * and inner-WITH CTE lists on an INSERT (case-insensitive, lowercase 7293 * via {@code toLowerCase(Locale.ROOT)} matching the slice-15/103 7294 * duplicate-name walker convention), or {@code null} if the name sets 7295 * are disjoint. Used by {@code buildInsert} to keep the 7296 * shared-name case rejecting (PG/Oracle/Snowflake nested-WITH 7297 * inner-shadows-outer semantics not yet supported) while admitting 7298 * the disjoint case via flat-merge. 
7299 * 7300 * <p>Pathological edge case (codex round-2 diff-review Q3): if one of 7301 * the two lists also contains an INTRA-list duplicate AND that 7302 * duplicated name happens to also appear in the other list, this 7303 * helper short-circuits with 7304 * INSERT_MIXED_OUTER_AND_INNER_WITH_NOT_SUPPORTED and masks the 7305 * more precise same-scope DUPLICATE_CTE_NAME the slice-103 walker 7306 * would have emitted. Accepted limitation — the diagnostic still 7307 * tells the user the shape is unsupported, and both codes point at 7308 * the same offending name. A future slice can pre-walk each list 7309 * for intra-list duplicates before the boundary check if a 7310 * customer reports confusion. 7311 */ 7312 private static String findFirstSharedCteName(TCTEList outer, TCTEList inner) { 7313 Set<String> outerNames = new HashSet<>(); 7314 for (int i = 0; i < outer.size(); i++) { 7315 outerNames.add(outer.getCTE(i).getTableName().toString().toLowerCase(Locale.ROOT)); 7316 } 7317 for (int i = 0; i < inner.size(); i++) { 7318 String name = inner.getCTE(i).getTableName().toString(); 7319 if (outerNames.contains(name.toLowerCase(Locale.ROOT))) { 7320 return name; 7321 } 7322 } 7323 return null; 7324 } 7325 7326 /** 7327 * Reject the case where a CTE body references a sibling CTE declared 7328 * <i>after</i> it. SQL chain semantics only allow left-to-right 7329 * references, but the bind-by-name provider would happily classify a 7330 * forward-declared CTE name as a base {@code TABLE} (because it's not 7331 * yet in {@code visibleSoFar}). Catching it here turns the silent 7332 * mislabeling into a clear error. 
7333 */ 7334 private static void rejectForwardCteReferences(final TCTE cte, 7335 final Set<String> allCteNames, 7336 final Set<String> visibleSoFar) { 7337 TSelectSqlStatement body = cte.getSubquery(); 7338 if (body == null) return; 7339 final String selfName = cte.getTableName().toString().toLowerCase(Locale.ROOT); 7340 final List<String> forwards = new ArrayList<>(); 7341 body.acceptChildren(new TParseTreeVisitor() { 7342 @Override 7343 public void preVisit(TTable t) { 7344 String tname = bareName(t); 7345 if (tname == null) return; 7346 String lower = tname.toLowerCase(Locale.ROOT); 7347 if (allCteNames.contains(lower) 7348 && !visibleSoFar.contains(lower) 7349 && !lower.equals(selfName)) { 7350 forwards.add(tname); 7351 } 7352 } 7353 }); 7354 if (!forwards.isEmpty()) { 7355 throw new SemanticIRBuildException( 7356 Diagnostic.error(DiagnosticCode.CTE_FORWARD_REFERENCE, 7357 "CTE '" + cte.getTableName() + "' forward-references later CTE(s) " 7358 + forwards + "; only left-to-right CTE chains are supported", cte)); 7359 } 7360 } 7361 7362 private static String bareName(TTable t) { 7363 if (t == null) return null; 7364 if (t.getTableType() != gudusoft.gsqlparser.ETableSource.objectname) return null; 7365 return t.getName(); 7366 } 7367 7368 /** 7369 * Reject {@code WITH RECURSIVE}. Slice 4 supports chained 7370 * (forward-referencing) CTEs; recursion is left for a later slice that 7371 * can model the fixpoint semantics. 
7372 */ 7373 private static void rejectRecursiveCtes(TCTEList cteList) { 7374 if (cteList == null) return; 7375 for (int i = 0; i < cteList.size(); i++) { 7376 TCTE cte = cteList.getCTE(i); 7377 if (cte.isRecursive()) { 7378 throw new SemanticIRBuildException( 7379 Diagnostic.error(DiagnosticCode.CTE_WITH_RECURSIVE_NOT_SUPPORTED, 7380 "WITH RECURSIVE is not supported yet (CTE: " + cte.getTableName() + ")", cte)); 7381 } 7382 } 7383 } 7384 7385 /** 7386 * Slice 101 — walk the WITH clause on a MERGE statement and append 7387 * each CTE body to {@code stmts} as a preceding statement. Mirrors 7388 * the SELECT-side build() at lines ~516-653. 7389 * 7390 * <p>Returns a {@code cteNameToStatementIndex} map keyed by 7391 * lower-cased CTE name. {@code ctePublishedColumnsOut} is populated 7392 * with each CTE's output column names so the {@code buildMerge} 7393 * USING-as-CTE branch can install them via 7394 * {@link NameBindingProvider#withInScopeRelationColumns}. 7395 * 7396 * <p>Rejects (chronological): 7397 * <ol> 7398 * <li>WITH RECURSIVE — reuses {@link DiagnosticCode#CTE_WITH_RECURSIVE_NOT_SUPPORTED}. 7399 * Currently no admitting vendor (PG parser PARSE_FAILED, probe 7400 * 2026-05-17); defensive reject for forward compatibility.</li> 7401 * <li>CTE with explicit column list — rejects with new 7402 * {@link DiagnosticCode#MERGE_CTE_EXPLICIT_COLUMN_LIST_NOT_SUPPORTED}. 7403 * PG and MSSQL parsers admit this shape; slice 101 defers 7404 * because the inner CTE body output names ≠ user-visible CTE 7405 * column names.</li> 7406 * <li>Duplicate CTE name — reuses {@link DiagnosticCode#DUPLICATE_CTE_NAME}.</li> 7407 * <li>Forward CTE reference — reuses {@link DiagnosticCode#CTE_FORWARD_REFERENCE}.</li> 7408 * </ol> 7409 * 7410 * <p>Set-op CTE bodies route through {@link #buildSetOpProgram}; 7411 * non-set-op CTE bodies route through {@link #buildSelectStatement}. 
7412 * Each CTE's published columns are added to {@code ctePublishedColumnsOut} 7413 * after its body is built so a CTE cannot self-reference (mirrors 7414 * SELECT-side slice 60). 7415 */ 7416 private static Map<String, Integer> buildMergeCteList( 7417 TMergeSqlStatement merge, 7418 NameBindingProvider provider, 7419 List<StatementGraph> stmts, 7420 List<LineageEdge> lineage, 7421 Map<String, List<String>> ctePublishedColumnsOut) { 7422 TCTEList cteList = merge.getCteList(); 7423 Map<String, Integer> cteNameToStatementIndex = new HashMap<>(); 7424 if (cteList == null || cteList.size() == 0) { 7425 return cteNameToStatementIndex; 7426 } 7427 rejectRecursiveCtes(cteList); 7428 // Slice 102 — explicit-column-list shapes (PG/MSSQL `WITH cte(a, b) AS 7429 // (...) MERGE ...`) are admitted by rebuilding the body's 7430 // StatementGraph with the explicit-list names and rewriting outgoing 7431 // STATEMENT_OUTPUT lineage refs. The slice-101 upfront reject is 7432 // replaced by per-CTE rename application below. The slice-101 code 7433 // (MERGE_CTE_EXPLICIT_COLUMN_LIST_NOT_SUPPORTED) stays declared for 7434 // API stability. 
7435 Set<String> allCteNames = collectCteNames(cteList); 7436 Set<String> visibleSoFar = new HashSet<>(); 7437 for (int i = 0; i < cteList.size(); i++) { 7438 TCTE cte = cteList.getCTE(i); 7439 String cteName = cte.getTableName().toString(); 7440 String cteNameLower = cteName.toLowerCase(Locale.ROOT); 7441 if (visibleSoFar.contains(cteNameLower)) { 7442 throw new SemanticIRBuildException( 7443 Diagnostic.error(DiagnosticCode.DUPLICATE_CTE_NAME, 7444 "duplicate CTE name '" + cteName 7445 + "' in WITH clause; CTE names must be unique", 7446 cte)); 7447 } 7448 rejectForwardCteReferences(cte, allCteNames, visibleSoFar); 7449 NameBindingProvider bodyProvider = 7450 provider.withCteContext(visibleSoFar); 7451 // Slice 102 — snapshot the lineage size BEFORE either branch so 7452 // the rename helper can rewrite outgoing STATEMENT_OUTPUT refs 7453 // in [lineageSize0, lineage.size()) without touching prior CTE 7454 // bodies' edges. Covers BOTH set-op and non-set-op branches 7455 // (codex round-1 plan-review BLOCKING). 
7456 int lineageSize0 = lineage.size(); 7457 int bodyIdx; 7458 TSelectSqlStatement cteBody = cte.getSubquery(); 7459 if (cteBody != null 7460 && cteBody.getSetOperatorType() != null 7461 && cteBody.getSetOperatorType() != ESetOperatorType.none) { 7462 bodyIdx = buildSetOpProgram(cteBody, bodyProvider, stmts, 7463 lineage, cteNameToStatementIndex, cteName, 7464 /*hasOuterCteListAlreadyProcessed=*/ false); 7465 cteNameToStatementIndex.put(cteNameLower, bodyIdx); 7466 } else { 7467 int cteStmtsSize0 = stmts.size(); 7468 int cteLineageSize0 = lineage.size(); 7469 Map<String, Integer> cteSubqueryAliasToIndex; 7470 try { 7471 cteSubqueryAliasToIndex = 7472 extractFromSubqueriesAsStatements(cteBody, 7473 bodyProvider, stmts, lineage, 7474 cteNameToStatementIndex, 7475 ctePublishedColumnsOut); 7476 } catch (RuntimeException ex) { 7477 while (stmts.size() > cteStmtsSize0) { 7478 stmts.remove(stmts.size() - 1); 7479 } 7480 while (lineage.size() > cteLineageSize0) { 7481 lineage.remove(lineage.size() - 1); 7482 } 7483 throw ex; 7484 } 7485 EnclosingScope cteEnclosing = buildEnclosingScope(cteBody, 7486 cteNameToStatementIndex, cteSubqueryAliasToIndex, 7487 /*parent=*/ null); 7488 Map<Integer, ScalarInfo> cteScalarMap = 7489 extractScalarSubqueriesAsStatements(cteBody, 7490 bodyProvider, stmts, lineage, 7491 cteNameToStatementIndex, cteEnclosing, 7492 /*allowRecursiveScalarSubqueryExtraction=*/ true); 7493 Map<String, List<String>> cteBodyInScope = 7494 buildEffectiveAliasInScopeMap(cteBody, bodyProvider, 7495 ctePublishedColumnsOut, 7496 cteSubqueryAliasToIndex, stmts); 7497 NameBindingProvider cteBodyProviderWithStar = bodyProvider 7498 .withInScopeRelationColumns(cteBodyInScope); 7499 // Slice 114 — switch to buildSelectStatementImpl with 7500 // snapshot/rollback (see the matching SELECT-side 7501 // CTE site for full rationale). 
7502 int cteBodyStmtsSnapshot = stmts.size(); 7503 int cteBodyLineageSnapshot = lineage.size(); 7504 StatementGraph body; 7505 try { 7506 body = buildSelectStatementImpl(cteBody, 7507 cteBodyProviderWithStar, cteName, 7508 /*hasOuterCteListAlreadyProcessed=*/ false, 7509 /*allowFromSubqueries=*/ true, 7510 /*allowScalarProjectionSubqueries=*/ true, 7511 /*allowWindowProjection=*/ true, 7512 /*allowJoinOnPredicateSubqueries=*/ false, 7513 /*stmtsForExtraction=*/ stmts, 7514 /*lineageForExtraction=*/ lineage, 7515 /*cteMapForExtraction=*/ cteNameToStatementIndex, 7516 /*isPredicateBody=*/ false, 7517 /*whereClauseContext=*/ PredicateClauseContext.CTE_BODY_WHERE, 7518 /*allowWherePredicateSubqueries=*/ true); 7519 } catch (RuntimeException ex) { 7520 while (stmts.size() > cteBodyStmtsSnapshot) stmts.remove(stmts.size() - 1); 7521 while (lineage.size() > cteBodyLineageSnapshot) lineage.remove(lineage.size() - 1); 7522 throw ex; 7523 } 7524 bodyIdx = stmts.size(); 7525 stmts.add(body); 7526 cteNameToStatementIndex.put(cteNameLower, bodyIdx); 7527 emitLineageForStatement(body, bodyIdx, lineage, 7528 cteNameToStatementIndex, cteSubqueryAliasToIndex, 7529 cteScalarMap); 7530 } 7531 // Slice 102 — apply explicit-column-list rename if present. 7532 // Rebuilds stmts[bodyIdx] with renamed OutputColumns and 7533 // rewrites STATEMENT_OUTPUT(bodyIdx, oldName) refs in 7534 // lineage[lineageSize0..) to use the renamed name. Returns the 7535 // published column list (renamed if explicit list applied, 7536 // else inner names from the body). 7537 List<String> publishedCols = applyExplicitCteColumnListRename( 7538 cte, stmts, lineage, bodyIdx, lineageSize0, "MERGE"); 7539 ctePublishedColumnsOut.put(cteNameLower, publishedCols); 7540 visibleSoFar.add(cteNameLower); 7541 } 7542 return cteNameToStatementIndex; 7543 } 7544 7545 /** 7546 * Slice 105 — walk the WITH clause on an UPDATE statement and append 7547 * each CTE body to {@code stmts} as a preceding statement. 
Mirrors 7548 * the slice-101 MERGE walker {@link #buildMergeCteList} verbatim 7549 * except for the source of the CTE list and the 7550 * {@link #applyExplicitCteColumnListRename} {@code dmlKind} argument. 7551 * 7552 * <p>Returns a {@code cteNameToStatementIndex} map keyed by 7553 * lower-cased CTE name. {@code ctePublishedColumnsOut} is populated 7554 * with each CTE's output column names so {@link #buildUpdateRelation} 7555 * + {@link #buildUpdateInScopeMap} can route FROM-side references to 7556 * the matching CTE as SUBQUERY-kind relations with the CTE's columns 7557 * published into the in-scope map. 7558 * 7559 * <p>The slice-103 SELECT-side CTE walker contract is reused via the 7560 * {@link #applyExplicitCteColumnListRename} helper with 7561 * {@code dmlKind="SELECT"} so the SELECT-side 7562 * {@link DiagnosticCode#CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH} code 7563 * fires on arity mismatch (codex round-1 Q2 confirmed YES — UPDATE is 7564 * closer to ordinary SELECT than to MERGE for CTE rename semantics). 7565 * 7566 * <p>Rejects (chronological): 7567 * <ol> 7568 * <li>{@code WITH RECURSIVE} — {@link DiagnosticCode#CTE_WITH_RECURSIVE_NOT_SUPPORTED}. 
7569 * Currently no admitting vendor (Oracle PARSE_FAILED on outer-WITH-UPDATE).</li> 7570 * <li>Duplicate CTE name — {@link DiagnosticCode#DUPLICATE_CTE_NAME}.</li> 7571 * <li>Forward CTE reference — {@link DiagnosticCode#CTE_FORWARD_REFERENCE}.</li> 7572 * <li>Explicit-column-list arity mismatch — handled by 7573 * {@link #applyExplicitCteColumnListRename} via 7574 * {@link DiagnosticCode#CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH}.</li> 7575 * </ol> 7576 */ 7577 private static Map<String, Integer> buildUpdateCteList( 7578 TUpdateSqlStatement update, 7579 NameBindingProvider provider, 7580 List<StatementGraph> stmts, 7581 List<LineageEdge> lineage, 7582 Map<String, List<String>> ctePublishedColumnsOut) { 7583 TCTEList cteList = update.getCteList(); 7584 Map<String, Integer> cteNameToStatementIndex = new HashMap<>(); 7585 if (cteList == null || cteList.size() == 0) { 7586 return cteNameToStatementIndex; 7587 } 7588 rejectRecursiveCtes(cteList); 7589 Set<String> allCteNames = collectCteNames(cteList); 7590 Set<String> visibleSoFar = new HashSet<>(); 7591 for (int i = 0; i < cteList.size(); i++) { 7592 TCTE cte = cteList.getCTE(i); 7593 String cteName = cte.getTableName().toString(); 7594 String cteNameLower = cteName.toLowerCase(Locale.ROOT); 7595 if (visibleSoFar.contains(cteNameLower)) { 7596 throw new SemanticIRBuildException( 7597 Diagnostic.error(DiagnosticCode.DUPLICATE_CTE_NAME, 7598 "duplicate CTE name '" + cteName 7599 + "' in WITH clause; CTE names must be unique", 7600 cte)); 7601 } 7602 rejectForwardCteReferences(cte, allCteNames, visibleSoFar); 7603 NameBindingProvider bodyProvider = 7604 provider.withCteContext(visibleSoFar); 7605 int lineageSize0 = lineage.size(); 7606 int bodyIdx; 7607 TSelectSqlStatement cteBody = cte.getSubquery(); 7608 if (cteBody != null 7609 && cteBody.getSetOperatorType() != null 7610 && cteBody.getSetOperatorType() != ESetOperatorType.none) { 7611 bodyIdx = buildSetOpProgram(cteBody, bodyProvider, stmts, 7612 lineage, 
cteNameToStatementIndex, cteName, 7613 /*hasOuterCteListAlreadyProcessed=*/ false); 7614 cteNameToStatementIndex.put(cteNameLower, bodyIdx); 7615 } else { 7616 int cteStmtsSize0 = stmts.size(); 7617 int cteLineageSize0 = lineage.size(); 7618 Map<String, Integer> cteSubqueryAliasToIndex; 7619 try { 7620 cteSubqueryAliasToIndex = 7621 extractFromSubqueriesAsStatements(cteBody, 7622 bodyProvider, stmts, lineage, 7623 cteNameToStatementIndex, 7624 ctePublishedColumnsOut); 7625 } catch (RuntimeException ex) { 7626 while (stmts.size() > cteStmtsSize0) { 7627 stmts.remove(stmts.size() - 1); 7628 } 7629 while (lineage.size() > cteLineageSize0) { 7630 lineage.remove(lineage.size() - 1); 7631 } 7632 throw ex; 7633 } 7634 EnclosingScope cteEnclosing = buildEnclosingScope(cteBody, 7635 cteNameToStatementIndex, cteSubqueryAliasToIndex, 7636 /*parent=*/ null); 7637 Map<Integer, ScalarInfo> cteScalarMap = 7638 extractScalarSubqueriesAsStatements(cteBody, 7639 bodyProvider, stmts, lineage, 7640 cteNameToStatementIndex, cteEnclosing, 7641 /*allowRecursiveScalarSubqueryExtraction=*/ true); 7642 Map<String, List<String>> cteBodyInScope = 7643 buildEffectiveAliasInScopeMap(cteBody, bodyProvider, 7644 ctePublishedColumnsOut, 7645 cteSubqueryAliasToIndex, stmts); 7646 NameBindingProvider cteBodyProviderWithStar = bodyProvider 7647 .withInScopeRelationColumns(cteBodyInScope); 7648 // Slice 114 — switch to buildSelectStatementImpl with 7649 // snapshot/rollback (see the matching SELECT-side 7650 // CTE site for full rationale). 
7651 int cteBodyStmtsSnapshot = stmts.size(); 7652 int cteBodyLineageSnapshot = lineage.size(); 7653 StatementGraph body; 7654 try { 7655 body = buildSelectStatementImpl(cteBody, 7656 cteBodyProviderWithStar, cteName, 7657 /*hasOuterCteListAlreadyProcessed=*/ false, 7658 /*allowFromSubqueries=*/ true, 7659 /*allowScalarProjectionSubqueries=*/ true, 7660 /*allowWindowProjection=*/ true, 7661 /*allowJoinOnPredicateSubqueries=*/ false, 7662 /*stmtsForExtraction=*/ stmts, 7663 /*lineageForExtraction=*/ lineage, 7664 /*cteMapForExtraction=*/ cteNameToStatementIndex, 7665 /*isPredicateBody=*/ false, 7666 /*whereClauseContext=*/ PredicateClauseContext.CTE_BODY_WHERE, 7667 /*allowWherePredicateSubqueries=*/ true); 7668 } catch (RuntimeException ex) { 7669 while (stmts.size() > cteBodyStmtsSnapshot) stmts.remove(stmts.size() - 1); 7670 while (lineage.size() > cteBodyLineageSnapshot) lineage.remove(lineage.size() - 1); 7671 throw ex; 7672 } 7673 bodyIdx = stmts.size(); 7674 stmts.add(body); 7675 cteNameToStatementIndex.put(cteNameLower, bodyIdx); 7676 emitLineageForStatement(body, bodyIdx, lineage, 7677 cteNameToStatementIndex, cteSubqueryAliasToIndex, 7678 cteScalarMap); 7679 } 7680 // Slice 105 — explicit column-list rename uses dmlKind="SELECT" 7681 // so the SELECT-side CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH 7682 // code fires (codex Q2 confirmed YES — UPDATE is closer to 7683 // ordinary SELECT than MERGE for CTE rename semantics). 7684 List<String> publishedCols = applyExplicitCteColumnListRename( 7685 cte, stmts, lineage, bodyIdx, lineageSize0, "SELECT"); 7686 ctePublishedColumnsOut.put(cteNameLower, publishedCols); 7687 visibleSoFar.add(cteNameLower); 7688 } 7689 return cteNameToStatementIndex; 7690 } 7691 7692 /** 7693 * Slice 106 — walk the WITH clause on a DELETE statement and append 7694 * each CTE body to {@code stmts} as a preceding statement. 
Mirrors 7695 * the slice-105 UPDATE walker {@link #buildUpdateCteList} verbatim 7696 * except for the source of the CTE list ({@code delete.getCteList()}). 7697 * 7698 * <p>Returns a {@code cteNameToStatementIndex} map keyed by 7699 * lower-cased CTE name. {@code ctePublishedColumnsOut} is populated 7700 * with each CTE's output column names so {@link #buildDeleteRelation} 7701 * + {@link #buildDeleteInScopeMap} can route FROM-side references to 7702 * the matching CTE as SUBQUERY-kind relations with the CTE's columns 7703 * published into the in-scope map. 7704 * 7705 * <p>The slice-103 SELECT-side CTE walker contract is reused via the 7706 * {@link #applyExplicitCteColumnListRename} helper with 7707 * {@code dmlKind="SELECT"} so the SELECT-side 7708 * {@link DiagnosticCode#CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH} code 7709 * fires on arity mismatch (slice-105 precedent: UPDATE/DELETE are 7710 * closer to ordinary SELECT than to MERGE for CTE rename semantics). 7711 * 7712 * <p>Rejects (chronological): 7713 * <ol> 7714 * <li>{@code WITH RECURSIVE} — 7715 * {@link DiagnosticCode#CTE_WITH_RECURSIVE_NOT_SUPPORTED}. 
7716 * PG / MySQL admit the parse shape but slice 106 rejects at the 7717 * semantic layer (mirrors slice-105 boundary).</li> 7718 * <li>Duplicate CTE name — {@link DiagnosticCode#DUPLICATE_CTE_NAME}.</li> 7719 * <li>Forward CTE reference — {@link DiagnosticCode#CTE_FORWARD_REFERENCE}.</li> 7720 * <li>Explicit-column-list arity mismatch — handled by 7721 * {@link #applyExplicitCteColumnListRename} via 7722 * {@link DiagnosticCode#CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH}.</li> 7723 * </ol> 7724 */ 7725 private static Map<String, Integer> buildDeleteCteList( 7726 TDeleteSqlStatement delete, 7727 NameBindingProvider provider, 7728 List<StatementGraph> stmts, 7729 List<LineageEdge> lineage, 7730 Map<String, List<String>> ctePublishedColumnsOut) { 7731 TCTEList cteList = delete.getCteList(); 7732 Map<String, Integer> cteNameToStatementIndex = new HashMap<>(); 7733 if (cteList == null || cteList.size() == 0) { 7734 return cteNameToStatementIndex; 7735 } 7736 rejectRecursiveCtes(cteList); 7737 Set<String> allCteNames = collectCteNames(cteList); 7738 Set<String> visibleSoFar = new HashSet<>(); 7739 for (int i = 0; i < cteList.size(); i++) { 7740 TCTE cte = cteList.getCTE(i); 7741 String cteName = cte.getTableName().toString(); 7742 String cteNameLower = cteName.toLowerCase(Locale.ROOT); 7743 if (visibleSoFar.contains(cteNameLower)) { 7744 throw new SemanticIRBuildException( 7745 Diagnostic.error(DiagnosticCode.DUPLICATE_CTE_NAME, 7746 "duplicate CTE name '" + cteName 7747 + "' in WITH clause; CTE names must be unique", 7748 cte)); 7749 } 7750 rejectForwardCteReferences(cte, allCteNames, visibleSoFar); 7751 NameBindingProvider bodyProvider = 7752 provider.withCteContext(visibleSoFar); 7753 int lineageSize0 = lineage.size(); 7754 int bodyIdx; 7755 TSelectSqlStatement cteBody = cte.getSubquery(); 7756 if (cteBody != null 7757 && cteBody.getSetOperatorType() != null 7758 && cteBody.getSetOperatorType() != ESetOperatorType.none) { 7759 bodyIdx = buildSetOpProgram(cteBody, 
bodyProvider, stmts, 7760 lineage, cteNameToStatementIndex, cteName, 7761 /*hasOuterCteListAlreadyProcessed=*/ false); 7762 cteNameToStatementIndex.put(cteNameLower, bodyIdx); 7763 } else { 7764 int cteStmtsSize0 = stmts.size(); 7765 int cteLineageSize0 = lineage.size(); 7766 Map<String, Integer> cteSubqueryAliasToIndex; 7767 try { 7768 cteSubqueryAliasToIndex = 7769 extractFromSubqueriesAsStatements(cteBody, 7770 bodyProvider, stmts, lineage, 7771 cteNameToStatementIndex, 7772 ctePublishedColumnsOut); 7773 } catch (RuntimeException ex) { 7774 while (stmts.size() > cteStmtsSize0) { 7775 stmts.remove(stmts.size() - 1); 7776 } 7777 while (lineage.size() > cteLineageSize0) { 7778 lineage.remove(lineage.size() - 1); 7779 } 7780 throw ex; 7781 } 7782 EnclosingScope cteEnclosing = buildEnclosingScope(cteBody, 7783 cteNameToStatementIndex, cteSubqueryAliasToIndex, 7784 /*parent=*/ null); 7785 Map<Integer, ScalarInfo> cteScalarMap = 7786 extractScalarSubqueriesAsStatements(cteBody, 7787 bodyProvider, stmts, lineage, 7788 cteNameToStatementIndex, cteEnclosing, 7789 /*allowRecursiveScalarSubqueryExtraction=*/ true); 7790 Map<String, List<String>> cteBodyInScope = 7791 buildEffectiveAliasInScopeMap(cteBody, bodyProvider, 7792 ctePublishedColumnsOut, 7793 cteSubqueryAliasToIndex, stmts); 7794 NameBindingProvider cteBodyProviderWithStar = bodyProvider 7795 .withInScopeRelationColumns(cteBodyInScope); 7796 // Slice 114 — switch to buildSelectStatementImpl with 7797 // snapshot/rollback (see the matching SELECT-side 7798 // CTE site for full rationale). 
7799 int cteBodyStmtsSnapshot = stmts.size(); 7800 int cteBodyLineageSnapshot = lineage.size(); 7801 StatementGraph body; 7802 try { 7803 body = buildSelectStatementImpl(cteBody, 7804 cteBodyProviderWithStar, cteName, 7805 /*hasOuterCteListAlreadyProcessed=*/ false, 7806 /*allowFromSubqueries=*/ true, 7807 /*allowScalarProjectionSubqueries=*/ true, 7808 /*allowWindowProjection=*/ true, 7809 /*allowJoinOnPredicateSubqueries=*/ false, 7810 /*stmtsForExtraction=*/ stmts, 7811 /*lineageForExtraction=*/ lineage, 7812 /*cteMapForExtraction=*/ cteNameToStatementIndex, 7813 /*isPredicateBody=*/ false, 7814 /*whereClauseContext=*/ PredicateClauseContext.CTE_BODY_WHERE, 7815 /*allowWherePredicateSubqueries=*/ true); 7816 } catch (RuntimeException ex) { 7817 while (stmts.size() > cteBodyStmtsSnapshot) stmts.remove(stmts.size() - 1); 7818 while (lineage.size() > cteBodyLineageSnapshot) lineage.remove(lineage.size() - 1); 7819 throw ex; 7820 } 7821 bodyIdx = stmts.size(); 7822 stmts.add(body); 7823 cteNameToStatementIndex.put(cteNameLower, bodyIdx); 7824 emitLineageForStatement(body, bodyIdx, lineage, 7825 cteNameToStatementIndex, cteSubqueryAliasToIndex, 7826 cteScalarMap); 7827 } 7828 // Slice 106 — explicit column-list rename uses dmlKind="SELECT" 7829 // so the SELECT-side CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH 7830 // code fires (slice-105 precedent: UPDATE/DELETE are closer 7831 // to ordinary SELECT than MERGE for CTE rename semantics). 
7832 List<String> publishedCols = applyExplicitCteColumnListRename( 7833 cte, stmts, lineage, bodyIdx, lineageSize0, "SELECT"); 7834 ctePublishedColumnsOut.put(cteNameLower, publishedCols); 7835 visibleSoFar.add(cteNameLower); 7836 } 7837 return cteNameToStatementIndex; 7838 } 7839 7840 /** 7841 * Slice 105 — combine the slice-83 subqueryAliasToIndex with the 7842 * slice-105 CTE-as-FROM-relation alias→cteIdx entries so 7843 * {@link #emitUpdateSubquerySourceEdges} produces cross-stmt 7844 * lineage edges for SET RHS references resolving to a CTE column. 7845 * 7846 * <p>Without this merge the visible {@link OutputColumn#getSources} 7847 * stays correct (CTE refs surface as {@link ColumnRef}s) but 7848 * {@code lineage[]} silently loses the canonical 7849 * {@code STATEMENT_OUTPUT(update,col) → STATEMENT_OUTPUT(cte,col)} 7850 * edge (codex round-2 Q5 silent-correctness bug). 7851 * 7852 * <p>Walks {@code update.getJoins()} the same way 7853 * {@link #buildUpdateRelation} does to keep the alias resolution 7854 * identical: CTE-bound FROM-side relations are detected by their 7855 * bare name (case-insensitive) and registered under their effective 7856 * alias. Subquery aliases stay keyed lowercase to match the 7857 * slice-83 contract. 
7858 */ 7859 private static Map<String, Integer> buildUpdateCombinedAliasToSubIdx( 7860 TUpdateSqlStatement update, 7861 Map<String, Integer> subqueryAliasToIndex, 7862 Map<String, Integer> cteNameToStatementIndex) { 7863 Map<String, Integer> combined = new HashMap<>(); 7864 if (subqueryAliasToIndex != null) { 7865 combined.putAll(subqueryAliasToIndex); 7866 } 7867 if (cteNameToStatementIndex == null 7868 || cteNameToStatementIndex.isEmpty()) { 7869 return combined; 7870 } 7871 TJoinList joins = update.getJoins(); 7872 if (joins == null) return combined; 7873 for (TJoin join : joins) { 7874 addCteAliasToCombinedMap(join.getTable(), 7875 cteNameToStatementIndex, combined); 7876 TJoinItemList items = join.getJoinItems(); 7877 if (items == null) continue; 7878 for (int i = 0; i < items.size(); i++) { 7879 TJoinItem item = items.getJoinItem(i); 7880 if (item == null) continue; 7881 addCteAliasToCombinedMap(item.getTable(), 7882 cteNameToStatementIndex, combined); 7883 } 7884 } 7885 return combined; 7886 } 7887 7888 private static void addCteAliasToCombinedMap(TTable t, 7889 Map<String, Integer> cteNameToStatementIndex, 7890 Map<String, Integer> combined) { 7891 if (t == null) return; 7892 if (t.getTableType() != gudusoft.gsqlparser.ETableSource.objectname) { 7893 return; 7894 } 7895 TObjectName tName = t.getTableName(); 7896 if (tName == null) return; 7897 String bare = tName.toString(); 7898 if (bare == null || bare.isEmpty()) return; 7899 String bareLower = bare.toLowerCase(Locale.ROOT); 7900 Integer cteIdx = cteNameToStatementIndex.get(bareLower); 7901 if (cteIdx == null) return; 7902 String aliasKey = effectiveAliasLowerCaseOrNull(t); 7903 if (aliasKey == null) aliasKey = bareLower; 7904 combined.put(aliasKey, cteIdx); 7905 } 7906 7907 /** 7908 * Slice 102 / Slice 103 — when a WITH-clause CTE declares an explicit 7909 * column list ({@code WITH cte(a, b) AS (SELECT x, y FROM t)}), rebuild 7910 * {@code stmts[bodyIdx]} so its {@link OutputColumn} names match the 
7911 * explicit list at each ordinal and rewrite outgoing 7912 * {@link LineageRef.Kind#STATEMENT_OUTPUT} refs in 7913 * {@code lineage[lineageSize0..lineage.size())} so the inner-projection 7914 * names are replaced by the explicit-list names. 7915 * 7916 * <p>Returns the published column list for the caller's 7917 * {@code ctePublishedColumns} map: the renamed list when an explicit list 7918 * is present; otherwise the body's inner names (matching pre-slice-102 7919 * behavior). Slice 103 reuses this helper from the outer SELECT CTE 7920 * walker via {@code dmlKind="SELECT"} (slice-100 cross-DML reuse 7921 * precedent). 7922 * 7923 * <p>Rejects: 7924 * <ul> 7925 * <li>Arity mismatch — explicit-list size != body output count → 7926 * {@link DiagnosticCode#MERGE_CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH} 7927 * when {@code dmlKind="MERGE"}, otherwise 7928 * {@link DiagnosticCode#CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH}. 7929 * Slice 103 cannot rename the MERGE-side code (it is pinned by 7930 * {@code Slice102Test.valueOfPinsResolveBothCodes} and adopting 7931 * it on the SELECT side would also miswire the message text); 7932 * the SELECT-side gets its own parallel code (codex round-1 7933 * plan-review BLOCKING).</li> 7934 * <li>Duplicate explicit name ({@code WITH cte(a, a) AS ...}) → 7935 * {@link DiagnosticCode#DUPLICATE_OUTPUT_NAME}. STATEMENT_OUTPUT 7936 * refs are keyed by output name; duplicates would collide 7937 * (codex round-2 plan-review advisory).</li> 7938 * </ul> 7939 * 7940 * <p>{@link OutputColumn} and {@link StatementGraph} are immutable; the 7941 * rebuild uses the slice-85 15-arg primary constructor copying every 7942 * field unchanged except {@code outputColumns}. {@link LineageEdge} and 7943 * {@link LineageRef} are immutable; the rewrite walker constructs new 7944 * instances and replaces them in the mutable {@code lineage} list via 7945 * {@link List#set}. 
7946 */ 7947 private static List<String> applyExplicitCteColumnListRename( 7948 TCTE cte, 7949 List<StatementGraph> stmts, 7950 List<LineageEdge> lineage, 7951 int bodyIdx, 7952 int lineageSize0, 7953 String dmlKind) { 7954 StatementGraph body = stmts.get(bodyIdx); 7955 if (cte.getColumnList() == null || cte.getColumnList().size() == 0) { 7956 return outputColumnNames(body); 7957 } 7958 // Materialize the explicit list of renamed names (in declaration order). 7959 boolean isMerge = "MERGE".equals(dmlKind); 7960 String dmlLabel = isMerge ? "MERGE CTE" : "CTE"; 7961 String withClauseLabel = isMerge ? "MERGE WITH clause CTE" : "WITH clause CTE"; 7962 List<String> renamed = new ArrayList<>(cte.getColumnList().size()); 7963 Set<String> seenLower = new HashSet<>(); 7964 for (int k = 0; k < cte.getColumnList().size(); k++) { 7965 TObjectName col = cte.getColumnList().getObjectName(k); 7966 String name = (col == null) ? null : col.getColumnNameOnly(); 7967 if (name == null || name.isEmpty()) { 7968 // Defensive — parser normally fills these; if not, fall 7969 // back to a synthetic name so the constructor invariant 7970 // (non-empty name) holds, and the arity check still works. 7971 name = "col" + (k + 1); 7972 } 7973 String lower = name.toLowerCase(Locale.ROOT); 7974 if (!seenLower.add(lower)) { 7975 throw new SemanticIRBuildException(Diagnostic.error( 7976 DiagnosticCode.DUPLICATE_OUTPUT_NAME, 7977 "duplicate column name '" + name + "' in " + dmlLabel + " '" 7978 + cte.getTableName() 7979 + "' explicit column list; output names must " 7980 + "be unique within a CTE published column list", 7981 cte)); 7982 } 7983 renamed.add(name); 7984 } 7985 List<OutputColumn> bodyOutputs = body.getOutputColumns(); 7986 if (bodyOutputs.size() != renamed.size()) { 7987 DiagnosticCode arityCode = isMerge 7988 ? 
DiagnosticCode.MERGE_CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH 7989 : DiagnosticCode.CTE_EXPLICIT_COLUMN_LIST_ARITY_MISMATCH; 7990 throw new SemanticIRBuildException(Diagnostic.error( 7991 arityCode, 7992 withClauseLabel + " '" + cte.getTableName() 7993 + "' declares " + renamed.size() 7994 + " explicit column(s) but the body's SELECT " 7995 + "publishes " + bodyOutputs.size() + " column(s); " 7996 + "the explicit list must have exactly one entry " 7997 + "per body output column", 7998 cte)); 7999 } 8000 // Capture the old → new name mapping by ordinal BEFORE building the 8001 // new OutputColumns, so the lineage rewrite can look up the 8002 // substitution by old (inner) name. Codex round-1 diff-review 8003 // (non-blocking → upgraded to defensive guard): if the body has 8004 // duplicate inner output names (e.g. `SELECT id, id`), name-keyed 8005 // rewrite collapses both old refs to the last mapping and 8006 // produces wrong lineage. The IR contract already states output 8007 // names must be unique (see DUPLICATE_OUTPUT_NAME javadoc and the 8008 // line-4378 scalar-subquery guard) but is not enforced 8009 // generically. Reject here so explicit-rename paths cannot 8010 // silently break lineage. 
8011 Set<String> seenInnerLower = new HashSet<>(); 8012 for (OutputColumn oc : bodyOutputs) { 8013 String n = oc.getName(); 8014 if (n == null || n.isEmpty()) continue; 8015 String lower = n.toLowerCase(Locale.ROOT); 8016 if (!seenInnerLower.add(lower)) { 8017 throw new SemanticIRBuildException(Diagnostic.error( 8018 DiagnosticCode.DUPLICATE_OUTPUT_NAME, 8019 dmlLabel + " '" + cte.getTableName() 8020 + "' body publishes duplicate inner column " 8021 + "name '" + n + "'; the explicit column " 8022 + "list rename requires unique inner names " 8023 + "because lineage refs are keyed by output " 8024 + "name and would collide", 8025 cte)); 8026 } 8027 } 8028 Map<String, String> oldToNewLower = new HashMap<>(); 8029 List<OutputColumn> newOutputs = new ArrayList<>(bodyOutputs.size()); 8030 for (int k = 0; k < bodyOutputs.size(); k++) { 8031 OutputColumn oc = bodyOutputs.get(k); 8032 String oldName = oc.getName(); 8033 String newName = renamed.get(k); 8034 if (oldName != null && !oldName.isEmpty()) { 8035 oldToNewLower.put(oldName.toLowerCase(Locale.ROOT), newName); 8036 } 8037 newOutputs.add(new OutputColumn(newName, oc.isDerived(), 8038 oc.isAggregate(), oc.getSources(), oc.getWindowSpec())); 8039 } 8040 // Rebuild the body's StatementGraph using the slice-85 15-arg primary 8041 // constructor — copies every field (including the slice-85 8042 // returningColumns slot per codex round-1 plan-review BLOCKING) 8043 // except outputColumns. 8044 StatementGraph renamedBody = new StatementGraph( 8045 body.getName(), body.getKind(), body.getRelations(), 8046 newOutputs, body.getReturningColumns(), 8047 body.getFilterColumnRefs(), body.getJoinColumnRefs(), 8048 body.getGroupByColumnRefs(), body.getHavingColumnRefs(), 8049 body.getOrderByColumnRefs(), body.getDistinctOnColumnRefs(), 8050 body.isDistinct(), body.getSetOperator(), body.getRowLimit(), 8051 body.getTarget()); 8052 stmts.set(bodyIdx, renamedBody); 8053 // Rewrite outgoing STATEMENT_OUTPUT refs in the window. 
Both `from` 8054 // and `to` are checked because edges can place the body-output ref 8055 // on either side (producer-side: from=TABLE_COLUMN, to=STATEMENT_OUTPUT; 8056 // consumer-side from a deeper inner stmt: from=STATEMENT_OUTPUT, 8057 // to=STATEMENT_OUTPUT — neither shape today places bodyIdx on 8058 // `from` for THIS body, but the symmetric check is cheap and 8059 // future-proof). LineageRef and LineageEdge are immutable, so new 8060 // instances are constructed and `lineage.set` replaces in place. 8061 for (int idx = lineageSize0; idx < lineage.size(); idx++) { 8062 LineageEdge edge = lineage.get(idx); 8063 LineageRef from = edge.getFrom(); 8064 LineageRef to = edge.getTo(); 8065 LineageRef newFrom = maybeRewriteStatementOutputRef( 8066 from, bodyIdx, oldToNewLower); 8067 LineageRef newTo = maybeRewriteStatementOutputRef( 8068 to, bodyIdx, oldToNewLower); 8069 if (newFrom != from || newTo != to) { 8070 lineage.set(idx, new LineageEdge(newFrom, newTo)); 8071 } 8072 } 8073 return Collections.unmodifiableList(renamed); 8074 } 8075 8076 /** 8077 * Slice 113 — copy a {@link StatementGraph} with a new {@code name} 8078 * field. Every other field is preserved verbatim. Used by the 8079 * set-op branch loop to assign the synthetic 8080 * {@code <set_op_branch_<idx>>} name AFTER the branch build, in case 8081 * the branch's WHERE-side predicate-subquery extraction 8082 * (slice 113 via {@link PredicateClauseContext#SET_OP_BRANCH_WHERE}) 8083 * appended predicate-body statements to {@code stmts}, which would 8084 * otherwise leave the pre-computed digit suffix lagging behind the 8085 * branch's final position. 8086 * 8087 * <p>The rebuild is purely cosmetic on the {@link StatementGraph#getName()} 8088 * field. No {@link LineageRef} is affected because all lineage refs 8089 * are idx-based (see {@link LineageRef#statementOutput(int, String)}), 8090 * not name-based. 
{@code outputColumns}, {@code relations}, 8091 * {@code filterColumnRefs}, {@code joinColumnRefs} and every other 8092 * field are reused unchanged. 8093 */ 8094 private static StatementGraph withRenamedTo(StatementGraph s, String newName) { 8095 return new StatementGraph(newName, s.getKind(), 8096 s.getRelations(), s.getOutputColumns(), s.getReturningColumns(), 8097 s.getFilterColumnRefs(), s.getJoinColumnRefs(), 8098 s.getGroupByColumnRefs(), s.getHavingColumnRefs(), 8099 s.getOrderByColumnRefs(), s.getDistinctOnColumnRefs(), 8100 s.isDistinct(), s.getSetOperator(), s.getRowLimit(), 8101 s.getTarget()); 8102 } 8103 8104 /** 8105 * Slice 102 — return a new STATEMENT_OUTPUT {@link LineageRef} with the 8106 * output name substituted when {@code ref} targets {@code bodyIdx} and 8107 * its current output name is a key in {@code oldToNewLower}. Otherwise 8108 * return {@code ref} unchanged (identity-comparable so the caller can 8109 * skip the {@code lineage.set} for no-op rewrites). 8110 */ 8111 private static LineageRef maybeRewriteStatementOutputRef( 8112 LineageRef ref, int bodyIdx, 8113 Map<String, String> oldToNewLower) { 8114 if (ref == null) return null; 8115 if (ref.getKind() != LineageRef.Kind.STATEMENT_OUTPUT) return ref; 8116 if (ref.getStatementIndex() != bodyIdx) return ref; 8117 String oldName = ref.getOutputName(); 8118 if (oldName == null || oldName.isEmpty()) return ref; 8119 String newName = oldToNewLower.get(oldName.toLowerCase(Locale.ROOT)); 8120 if (newName == null) return ref; 8121 return LineageRef.statementOutput(bodyIdx, newName); 8122 } 8123 8124 /** 8125 * Emit one lineage edge per (output, source) pair. Edges target a 8126 * {@link LineageRef.Kind#STATEMENT_OUTPUT} when the source's relation 8127 * is a CTE or a FROM-clause subquery, or a 8128 * {@link LineageRef.Kind#TABLE_COLUMN} when it's a base table. 8129 * Multi-source derived columns produce one edge per source. 
8130 * 8131 * <p>{@code subqueryAliasToStatementIndex} is statement-local; the 8132 * caller supplies the alias map for this statement's own FROM list. 8133 * That avoids cross-scope alias collisions. 8134 */ 8135 private static void emitLineageForStatement(StatementGraph stmt, 8136 int statementIndex, 8137 List<LineageEdge> lineage, 8138 Map<String, Integer> cteNameToStatementIndex, 8139 Map<String, Integer> subqueryAliasToStatementIndex, 8140 Map<Integer, ScalarInfo> ordinalToScalarInfo) { 8141 // Slice 87: lowercase alias keys so SQL identifiers written with 8142 // different casing in the FROM clause vs. SELECT qualifier resolve 8143 // correctly (e.g. `SELECT t.name FROM employees T`). Mirrors the 8144 // same fix in emitUpdateSubquerySourceEdges (slice 83). When two 8145 // relations collide after lowercasing (unusual, but not guaranteed 8146 // caught by the duplicate-alias preflight in all call paths per 8147 // codex Q1 advisory), last-write-wins — the same policy as slice 83. 8148 Map<String, RelationSource> aliasToRelation = new HashMap<>(); 8149 for (RelationSource r : stmt.getRelations()) { 8150 String key = r.getAlias(); 8151 if (key == null || key.isEmpty()) continue; 8152 aliasToRelation.put(key.toLowerCase(Locale.ROOT), r); 8153 } 8154 List<OutputColumn> outputs = stmt.getOutputColumns(); 8155 for (int outOrdinal = 0; outOrdinal < outputs.size(); outOrdinal++) { 8156 OutputColumn out = outputs.get(outOrdinal); 8157 // Slice 11: scalar-subquery projections have empty sources 8158 // by construction; their lineage edge is a single 8159 // STATEMENT_OUTPUT → STATEMENT_OUTPUT pointing at the 8160 // extracted scalar body's only output. Emit it once and 8161 // skip the per-source loop (which would be a no-op anyway). 
8162 ScalarInfo scalar = ordinalToScalarInfo.get(outOrdinal); 8163 if (scalar != null) { 8164 lineage.add(new LineageEdge( 8165 LineageRef.statementOutput(statementIndex, out.getName()), 8166 LineageRef.statementOutput(scalar.statementIndex, 8167 scalar.innerOutputName))); 8168 continue; 8169 } 8170 for (ColumnRef src : out.getSources()) { 8171 String srcAlias = src.getRelationAlias(); 8172 RelationSource rel = aliasToRelation.get( 8173 srcAlias == null ? null : srcAlias.toLowerCase(Locale.ROOT)); 8174 if (rel == null) { 8175 throw new SemanticIRBuildException( 8176 Diagnostic.error(DiagnosticCode.OUTPUT_REFERENCES_UNKNOWN_RELATION, 8177 "output '" + out.getName() + "' references unknown relation '" 8178 + src.getRelationAlias() + "'", null)); 8179 } 8180 LineageRef from = LineageRef.statementOutput(statementIndex, out.getName()); 8181 LineageRef to; 8182 // Slice 15: resolved-kind dispatch. For OUTER_REFERENCE 8183 // bindings the underlying outerKind decides which 8184 // table-column or statement-output edge we emit. 8185 // Codex round-1 MUST 2 / round-2 MUST 1: exhaustive 8186 // dispatch instead of catch-all. 8187 RelationKind kind = rel.getBinding().getKind(); 8188 RelationKind resolvedKind = (kind == RelationKind.OUTER_REFERENCE) 8189 ? 
rel.getBinding().getOuterKind() 8190 : kind; 8191 if (resolvedKind == RelationKind.CTE) { 8192 Integer cteIndex = cteNameToStatementIndex.get( 8193 rel.getBinding().getQualifiedName().toLowerCase(Locale.ROOT)); 8194 if (cteIndex == null) { 8195 throw new SemanticIRBuildException( 8196 Diagnostic.error(DiagnosticCode.CTE_BODY_MISSING, 8197 "CTE '" + rel.getBinding().getQualifiedName() + "' has no body statement", null)); 8198 } 8199 to = LineageRef.statementOutput(cteIndex, src.getColumnName()); 8200 } else if (resolvedKind == RelationKind.SUBQUERY) { 8201 Integer subIndex = subqueryAliasToStatementIndex.get( 8202 rel.getAlias().toLowerCase(Locale.ROOT)); 8203 if (subIndex == null) { 8204 throw new SemanticIRBuildException( 8205 Diagnostic.error(DiagnosticCode.FROM_SUBQUERY_BINDING_UNRESOLVED, 8206 "FROM-clause subquery '" + rel.getAlias() 8207 + "' has no body statement registered", null)); 8208 } 8209 to = LineageRef.statementOutput(subIndex, src.getColumnName()); 8210 } else if (resolvedKind == RelationKind.TABLE) { 8211 to = LineageRef.tableColumn( 8212 rel.getBinding().getQualifiedName(), 8213 src.getColumnName()); 8214 } else { 8215 throw new SemanticIRBuildException( 8216 Diagnostic.error(DiagnosticCode.OUTPUT_REFERENCES_UNSUPPORTED_BINDING_KIND, 8217 "output '" + out.getName() 8218 + "' references relation '" + rel.getAlias() 8219 + "' with unsupported binding kind " + kind 8220 + (kind == RelationKind.OUTER_REFERENCE 8221 ? " (outerKind=" + rel.getBinding().getOuterKind() + ")" 8222 : ""), null)); 8223 } 8224 lineage.add(new LineageEdge(from, to)); 8225 } 8226 } 8227 } 8228 8229 /** 8230 * Build one SELECT statement (CTE body or outer). The {@code name} 8231 * argument is non-null for a CTE body, null otherwise. 
When 8232 * {@code hasOuterCteListAlreadyProcessed} is true, the SELECT's own 8233 * {@code getCteList()} is not rejected because the caller has already 8234 * extracted those CTEs into separate statements; in all other cases a 8235 * non-empty CTE list on this node is rejected (so nested WITH inside 8236 * a CTE body does not silently slip through). 8237 */ 8238 private static StatementGraph buildSelectStatement(TSelectSqlStatement select, 8239 NameBindingProvider provider, 8240 String name, 8241 boolean hasOuterCteListAlreadyProcessed, 8242 boolean allowFromSubqueries, 8243 boolean allowScalarProjectionSubqueries, 8244 boolean allowWindowProjection) { 8245 // Slice 23: legacy 7-arg call site. Predicate-subquery extraction is 8246 // disabled (allowJoinOnPredicateSubqueries=false) and this is not a 8247 // predicate body itself (isPredicateBody=false). All non-outer call 8248 // sites use this overload — the slice-17 `rejectSubqueriesInJoinOn` 8249 // continues to fire at every non-outer JOIN-ON site. 8250 return buildSelectStatementImpl(select, provider, name, 8251 hasOuterCteListAlreadyProcessed, 8252 allowFromSubqueries, 8253 allowScalarProjectionSubqueries, 8254 allowWindowProjection, 8255 /*allowJoinOnPredicateSubqueries=*/ false, 8256 /*stmtsForExtraction=*/ null, 8257 /*lineageForExtraction=*/ null, 8258 /*cteMapForExtraction=*/ null, 8259 /*isPredicateBody=*/ false, 8260 /*whereClauseContext=*/ PredicateClauseContext.SELECT_WHERE, 8261 /*allowWherePredicateSubqueries=*/ false); 8262 } 8263 8264 /** 8265 * Internal body shared between the legacy 7-arg overload and the 8266 * outer-SELECT entry point used by {@link #build}. Slice 23 added two new 8267 * concepts; slice 24 added one more. 
8268 * <ul> 8269 * <li>{@code allowJoinOnPredicateSubqueries} + {@code stmts}/{@code lineage} 8270 * — when {@code allow...} is {@code true}, JOIN-ON uncorrelated 8271 * EXISTS subqueries are extracted as their own 8272 * {@code <predicate_subquery_<i>>} statements appended to 8273 * {@code stmts} (slice-11/12 synthetic-name pattern). Outer-SELECT 8274 * entry only.</li> 8275 * <li>{@code isPredicateBody} — when {@code true}, this statement IS 8276 * the inner SELECT of an extracted EXISTS body. The constant-only 8277 * projection rejection in {@link #buildOutputColumns} is bypassed 8278 * and a single synthetic OutputColumn is emitted in its place.</li> 8279 * <li>{@code cteMapForExtraction} (slice 24) — outer's CTE 8280 * name-to-statement-index map, plumbed in only when 8281 * {@code allowJoinOnPredicateSubqueries=true}. Required so the 8282 * slice-24 column-bearing inner projection can emit 8283 * STATEMENT_OUTPUT → STATEMENT_OUTPUT lineage edges into outer- 8284 * visible CTE bodies. Non-outer call sites pass {@code null}.</li> 8285 * </ul> 8286 */ 8287 private static StatementGraph buildSelectStatementImpl( 8288 TSelectSqlStatement select, 8289 NameBindingProvider provider, 8290 String name, 8291 boolean hasOuterCteListAlreadyProcessed, 8292 boolean allowFromSubqueries, 8293 boolean allowScalarProjectionSubqueries, 8294 boolean allowWindowProjection, 8295 boolean allowJoinOnPredicateSubqueries, 8296 List<StatementGraph> stmtsForExtraction, 8297 List<LineageEdge> lineageForExtraction, 8298 Map<String, Integer> cteMapForExtraction, 8299 boolean isPredicateBody, 8300 PredicateClauseContext whereClauseContext, 8301 boolean allowWherePredicateSubqueries) { 8302 rejectUnsupportedShape(select, hasOuterCteListAlreadyProcessed); 8303 boolean distinct = resolveDistinctFlag(select); 8304 // Slice 65 — reset using scope at entry so a parent SELECT's 8305 // scope cannot leak into recursive nested builds. 
The using 8306 // scope for THIS SELECT is installed only AFTER buildRelations 8307 // completes (see below) so the predicate-subquery extraction 8308 // walk inside buildRelations does not inherit the outer scope 8309 // (codex slice-65 diff-review round-1 P2 #1: an inner 8310 // {@code EXISTS (SELECT SUM(x.v) FILTER (WHERE k > 0) FROM x)} 8311 // would have its bare `k` expand to outer's merged sources, 8312 // causing a valid uncorrelated body to be rejected as 8313 // correlated). The slice-64 → 65 JOIN-ON merged-key reject 8314 // also runs BEFORE buildRelations so ON-clause refs aren't 8315 // collected with a stale or future scope. 8316 provider = provider.withUsingScope(UsingScope.EMPTY); 8317 rejectUnqualifiedMergedKeyInJoinOn(select, provider); 8318 List<ColumnRef> joinRefs = new ArrayList<>(); 8319 List<RelationSource> relations; 8320 if (isSetOpBranchSyntheticName(name) 8321 && hasNoFromSource(select) 8322 && allResultColumnsAreConstantExpressions(select)) { 8323 // Slice 61: allow FROM-less constant-only set-op branches 8324 // such as SELECT 1 UNION ALL SELECT 2. The general SELECT 8325 // boundary remains unchanged: non-branch SELECT 1 still 8326 // fails in buildRelations with "must have at least one 8327 // FROM source". 8328 relations = Collections.emptyList(); 8329 } else { 8330 relations = buildRelations(select, provider, joinRefs, 8331 allowFromSubqueries, 8332 allowJoinOnPredicateSubqueries, 8333 stmtsForExtraction, lineageForExtraction, cteMapForExtraction); 8334 } 8335 // Slice 65 — install this SELECT's own using scope AFTER 8336 // buildRelations / predicate-subquery extraction. From here 8337 // forward the clause collectors (output / filter / groupBy / 8338 // having / orderBy) see the merged-key scope for THIS SELECT. 
8339 UsingScope ownScope = buildUsingScope(select, provider); 8340 if (!ownScope.isEmpty()) { 8341 provider = provider.withUsingScope(ownScope); 8342 } 8343 List<OutputColumn> outputColumns = buildOutputColumns(select, provider, 8344 allowScalarProjectionSubqueries, allowWindowProjection, 8345 isPredicateBody, name); 8346 // Slice 112 — thread the SELECT path's outer extraction context 8347 // through buildFilterColumnRefs so top-level SELECT WHERE can 8348 // lift uncorrelated predicate-subquery wrappers via the 8349 // slice-23+ extraction pipeline (PredicateClauseContext.SELECT_WHERE). 8350 // Slice 113 — the same threading extends to set-op branch WHERE 8351 // via PredicateClauseContext.SET_OP_BRANCH_WHERE, distinguished 8352 // only by clauseLabel for diagnostic messages (codes are shared). 8353 // 8354 // {@code allowWherePredicateSubqueries} is INDEPENDENT of 8355 // {@code allowJoinOnPredicateSubqueries} (slice 113 split): 8356 // set-op branches admit WHERE-side predicate subqueries while 8357 // KEEPING JOIN-ON predicate subqueries rejected (slice 23 / 26 8358 // contract — pinned by Slice23Test#existsInSetOpBranchJoinOnStillRejected 8359 // and Slice26Test#lhsSubqueryInSetOpBranchRejected). Nested 8360 // SELECTs without extraction context 8361 // (allowWherePredicateSubqueries=false) keep the slice-80 8362 // blanket reject inside buildFilterColumnRefs. 
8363 List<ColumnRef> filterRefs = buildFilterColumnRefs(select, provider, 8364 allowWherePredicateSubqueries, 8365 stmtsForExtraction, lineageForExtraction, cteMapForExtraction, 8366 whereClauseContext); 8367 List<ColumnRef> groupByRefs = buildGroupByColumnRefs(select, provider); 8368 List<ColumnRef> havingRefs = buildHavingColumnRefs(select, provider); 8369 List<ColumnRef> orderByRefs = buildOrderByColumnRefs(select, provider, outputColumns); 8370 // Slice 73: DISTINCT ON refs collected here so they observe the 8371 // same {@code provider} (with UsingScope already installed) used 8372 // by buildGroupByColumnRefs / buildHavingColumnRefs / 8373 // buildOrderByColumnRefs. This keeps `DISTINCT ON (k)` over 8374 // `JOIN ... USING (k)` consistent with slice-65 merged-key 8375 // semantics and prevents parent-scope leakage into nested 8376 // builds. 8377 List<ColumnRef> distinctOnRefs = buildDistinctOnColumnRefs(select, provider); 8378 RowLimit rowLimit = buildRowLimit(select); 8379 return new StatementGraph(name, "SELECT", relations, outputColumns, 8380 filterRefs, joinRefs, groupByRefs, havingRefs, orderByRefs, 8381 distinctOnRefs, 8382 distinct, 8383 /*setOperator=*/ null, 8384 rowLimit); 8385 } 8386 8387 /** 8388 * Slices 70 and 71: build per-statement row-limit metadata from 8389 * {@code TLimitClause}, {@code TTopClause}, {@code TOffsetClause}, 8390 * or {@code TFetchFirstClause}. Returns {@code null} when no 8391 * row-limit clause is present. All admit / reject decisions for 8392 * single-SELECT row-limit clauses live here; the set-op outer 8393 * row-limit path is rejected separately by 8394 * {@link #rejectSetOpRowLimit} (slice 72 lifts). 8395 * 8396 * <h4>Admitted shapes</h4> 8397 * <ul> 8398 * <li>{@link RowLimitKind#LIMIT} — {@code TLimitClause} with 8399 * non-null {@code getRow_count()}. 
Offset is populated when 8400 * {@code TLimitClause.getOffset() != null} (PG / MySQL / 8401 * SQLite / BigQuery / Snowflake / Redshift inline 8402 * {@code LIMIT N OFFSET M}, MySQL old-style {@code LIMIT M, N}, 8403 * Informix {@code SKIP m LIMIT n}).</li> 8404 * <li>{@link RowLimitKind#FETCH_FIRST} — {@code TLimitClause} with 8405 * non-null {@code getSelectFetchFirstValue()} (PG 8406 * {@code FETCH FIRST}, Informix {@code FIRST n}). Offset is 8407 * populated when present (PG 8408 * {@code OFFSET m FETCH FIRST n}, Informix 8409 * {@code SKIP m FIRST n}). Also fires for 8410 * {@code TFetchFirstClause} with non-null 8411 * {@code getFetchValue()} (Oracle / SQL Server 8412 * {@code FETCH FIRST/NEXT N ROWS ONLY}) when no 8413 * {@code TOffsetClause} is present.</li> 8414 * <li>{@link RowLimitKind#TOP} — {@code TTopClause} with non-null 8415 * {@code getExpr()} and neither {@code isPercent()} nor 8416 * {@code isWithties()} set. SQL Server {@code SELECT TOP N}.</li> 8417 * <li>{@link RowLimitKind#OFFSET_FETCH} — Oracle / SQL Server 8418 * {@code OFFSET m ROWS [FETCH NEXT n ROWS ONLY]} routed via 8419 * the dedicated {@code TOffsetClause} + {@code TFetchFirstClause} 8420 * pair, and PG offset-only {@code OFFSET m} routed via 8421 * {@code TLimitClause.getOffset()} when {@code row_count} and 8422 * {@code selectFetchFirstValue} are both null. 8423 * {@link RowLimit#getCount()} may be {@code null} for 8424 * offset-only forms.</li> 8425 * </ul> 8426 * 8427 * <h4>Rejects</h4> 8428 * <ul> 8429 * <li>{@link DiagnosticCode#ROW_LIMIT_TOP_PERCENT_NOT_SUPPORTED} 8430 * — {@code TOP N PERCENT}. The sampling semantics differ from 8431 * fixed-row {@code LIMIT} enough to warrant a dedicated slice.</li> 8432 * <li>{@link DiagnosticCode#ROW_LIMIT_TOP_WITH_TIES_NOT_SUPPORTED} 8433 * — {@code TOP N WITH TIES}. 
Requires modeling the ORDER BY 8434 * tie-handling interaction; deferred.</li> 8435 * <li>{@link DiagnosticCode#ROW_LIMIT_HIVE_LIMIT_GRAMMAR_QUIRK} — 8436 * Hive single-argument {@code LIMIT N} parser routes the 8437 * count through {@code TLimitClause.getOffset()} with 8438 * {@code row_count == null}, which is indistinguishable at 8439 * the AST level from PG offset-only {@code OFFSET m}. Pinning 8440 * this with a vendor-specific guard prevents emitting 8441 * semantically-wrong {@code OFFSET_FETCH} metadata for what 8442 * the SQL author wrote as a LIMIT. A future grammar fix 8443 * should route the count through {@code getRow_count()}; this 8444 * guard can be removed then.</li> 8445 * <li>{@link DiagnosticCode#ROW_LIMIT_LIMIT_NOT_SUPPORTED} — 8446 * Vertica TIMESERIES windowing on {@code TLimitClause} 8447 * ({@code getWindowDef() != null}). Defensive; not modeled.</li> 8448 * <li>{@link DiagnosticCode#ROW_LIMIT_COUNT_UNRESOLVED} — the 8449 * parser constructed a row-limit clause node but did not 8450 * populate any count slot: 8451 * <ul> 8452 * <li>{@code TLimitClause} with {@code row_count}, 8453 * {@code selectFetchFirstValue}, and {@code offset} all 8454 * null (defensive; not observed in probe runs).</li> 8455 * <li>{@code TFetchFirstClause} with null fetchValue — 8456 * ANSI / DB2 grammar incompleteness (the parser 8457 * constructs the clause node but does not populate the 8458 * count). Future grammar fix can lift this.</li> 8459 * <li>{@code TTopClause} with null expression (defensive).</li> 8460 * </ul></li> 8461 * </ul> 8462 */ 8463 private static RowLimit buildRowLimit(TSelectSqlStatement select) { 8464 TLimitClause limit = select.getLimitClause(); 8465 if (limit != null) { 8466 // Vertica TIMESERIES window on TLimitClause — defensive; rare. 8467 // Pre-empts the row_count / fff branches because the windowed 8468 // form is its own semantic surface. 
8469 if (limit.getWindowDef() != null) { 8470 throw new SemanticIRBuildException( 8471 Diagnostic.error(DiagnosticCode.ROW_LIMIT_LIMIT_NOT_SUPPORTED, 8472 "row-limit clause LIMIT with Vertica TIMESERIES window " 8473 + "is not supported yet", limit)); 8474 } 8475 8476 TExpression rc = limit.getRow_count(); 8477 TExpression off = limit.getOffset(); 8478 TExpression fff = limit.getSelectFetchFirstValue(); 8479 8480 // Hive single-argument LIMIT parser quirk: the count ends 8481 // up on offset with row_count=null. Vendor-conditional 8482 // because the same AST shape is legitimate PG offset-only. 8483 if (select.dbvendor == EDbVendor.dbvhive 8484 && rc == null && off != null && fff == null) { 8485 throw new SemanticIRBuildException( 8486 Diagnostic.error(DiagnosticCode.ROW_LIMIT_HIVE_LIMIT_GRAMMAR_QUIRK, 8487 "Hive single-argument LIMIT N is currently mis-routed " 8488 + "by the parser (count appears on TLimitClause.getOffset() " 8489 + "with row_count=null); fix the Hive grammar to route " 8490 + "the count through getRow_count() to lift this guard", limit)); 8491 } 8492 8493 if (rc != null) { 8494 // LIMIT N with optional OFFSET M (PG/MySQL/SQLite/ 8495 // BigQuery/Snowflake/Redshift inline LIMIT-OFFSET, 8496 // MySQL old-style LIMIT M,N, Informix SKIP m LIMIT n). 8497 return new RowLimit(RowLimitKind.LIMIT, 8498 rc.toString(), 8499 off != null ? off.toString() : null); 8500 } 8501 if (fff != null) { 8502 // FETCH FIRST via the PG/Informix routing through 8503 // TLimitClause, with optional OFFSET (PG 8504 // OFFSET m FETCH FIRST n; Informix SKIP m FIRST n). 8505 return new RowLimit(RowLimitKind.FETCH_FIRST, 8506 fff.toString(), 8507 off != null ? off.toString() : null); 8508 } 8509 if (off != null) { 8510 // Offset-only via TLimitClause (PG OFFSET m [ROWS]). 8511 return new RowLimit(RowLimitKind.OFFSET_FETCH, 8512 /*count=*/ null, 8513 off.toString()); 8514 } 8515 // Defensive: TLimitClause present with all four slots null. 
8516 throw new SemanticIRBuildException( 8517 Diagnostic.error(DiagnosticCode.ROW_LIMIT_COUNT_UNRESOLVED, 8518 "row-limit clause LIMIT is present but no count, offset, " 8519 + "or FETCH FIRST value is populated on the parser AST", limit)); 8520 } 8521 8522 TTopClause top = select.getTopClause(); 8523 if (top != null) { 8524 if (top.isPercent()) { 8525 throw new SemanticIRBuildException( 8526 Diagnostic.error(DiagnosticCode.ROW_LIMIT_TOP_PERCENT_NOT_SUPPORTED, 8527 "row-limit clause TOP N PERCENT is not supported yet; " 8528 + "sampling semantics warrant a dedicated slice", top)); 8529 } 8530 if (top.isWithties()) { 8531 throw new SemanticIRBuildException( 8532 Diagnostic.error(DiagnosticCode.ROW_LIMIT_TOP_WITH_TIES_NOT_SUPPORTED, 8533 "row-limit clause TOP N WITH TIES is not supported yet; " 8534 + "tie-handling semantics warrant a dedicated slice", top)); 8535 } 8536 TExpression e = top.getExpr(); 8537 if (e == null) { 8538 throw new SemanticIRBuildException( 8539 Diagnostic.error(DiagnosticCode.ROW_LIMIT_COUNT_UNRESOLVED, 8540 "row-limit clause TOP is present but the count expression " 8541 + "is not populated on the parser AST", top)); 8542 } 8543 return new RowLimit(RowLimitKind.TOP, e.toString(), /*offset=*/ null); 8544 } 8545 8546 TOffsetClause offClause = select.getOffsetClause(); 8547 TFetchFirstClause fetch = select.getFetchFirstClause(); 8548 if (offClause != null) { 8549 // Oracle / SQL Server OFFSET m ROWS [FETCH NEXT n ROWS ONLY]. 8550 // The optional FETCH NEXT counterpart populates 8551 // TFetchFirstClause when present. 8552 String offsetText = offClause.getSelectOffsetValue() != null 8553 ? 
offClause.getSelectOffsetValue().toString() 8554 : null; 8555 if (offsetText == null) { 8556 throw new SemanticIRBuildException( 8557 Diagnostic.error(DiagnosticCode.ROW_LIMIT_COUNT_UNRESOLVED, 8558 "row-limit clause OFFSET is present but the offset value " 8559 + "is not populated on the parser AST", offClause)); 8560 } 8561 String countText = null; 8562 if (fetch != null && fetch.getFetchValue() != null) { 8563 countText = fetch.getFetchValue().toString(); 8564 } 8565 return new RowLimit(RowLimitKind.OFFSET_FETCH, countText, offsetText); 8566 } 8567 if (fetch != null) { 8568 if (fetch.getFetchValue() != null) { 8569 // Oracle / SQL Server FETCH FIRST/NEXT N ROWS ONLY 8570 // without OFFSET (TOffsetClause was null above). 8571 return new RowLimit(RowLimitKind.FETCH_FIRST, 8572 fetch.getFetchValue().toString(), /*offset=*/ null); 8573 } 8574 // ANSI / DB2: TFetchFirstClause is non-null but fetchValue 8575 // is null because the grammar does not pass the count into 8576 // the node initializer. Reject so the gap is visible. 8577 throw new SemanticIRBuildException( 8578 Diagnostic.error(DiagnosticCode.ROW_LIMIT_COUNT_UNRESOLVED, 8579 "row-limit clause FETCH FIRST is present but the count " 8580 + "expression is not populated on the parser AST " 8581 + "(ANSI / DB2 grammar gap)", fetch)); 8582 } 8583 return null; 8584 } 8585 8586 /** 8587 * Slice 72: build the OUTER set-op statement's row-limit. Same 8588 * decision tree as {@link #buildRowLimit} for SELECT-level routing 8589 * (PG/MySQL/SQLite/BigQuery/Snowflake/Redshift via 8590 * {@code TLimitClause}, plus Hive/Vertica/ANSI-DB2 defensives), 8591 * with an additional MSSQL-only fallback that reads the OFFSET / 8592 * FETCH FIRST clauses off the outer {@code TOrderBy} node. 
8593 * 8594 * <p>Empirical AST shapes (probed against the current parser): 8595 * <ul> 8596 * <li>PG / MySQL / SQLite / BigQuery / Snowflake / Redshift route 8597 * set-op outer LIMIT / OFFSET / FETCH FIRST onto 8598 * {@code setOp.getLimitClause()} — handled by the primary 8599 * {@code buildRowLimit} path.</li> 8600 * <li>MSSQL routes set-op outer {@code OFFSET m ROWS [FETCH NEXT 8601 * n ROWS ONLY]} EXCLUSIVELY onto 8602 * {@code setOp.getOrderbyClause().getOffsetClause()} / 8603 * {@code .getFetchFirstClause()} — NOT duplicated onto the 8604 * SELECT node (opposite of single-SELECT MSSQL where slice 71 8605 * saw duplication onto both). The TOrderBy fallback below 8606 * handles this.</li> 8607 * <li>Oracle drops set-op outer OFFSET / FETCH from both SELECT 8608 * and TOrderBy slots silently; nothing for slice 72 to 8609 * emit. A future Oracle grammar fix can lift this.</li> 8610 * </ul> 8611 * 8612 * <p>The TOrderBy fallback is vendor-gated to MSSQL to avoid 8613 * over-admitting on unprobed dialects (per codex round-1 B1). 8614 * Kind mapping mirrors {@link #buildRowLimit}'s single-SELECT 8615 * decision tree: 8616 * <ul> 8617 * <li>{@code TOffsetClause} + {@code TFetchFirstClause} both 8618 * populated → {@code OFFSET_FETCH/count/offset}</li> 8619 * <li>{@code TOffsetClause} only → {@code OFFSET_FETCH/null/offset}</li> 8620 * <li>{@code TFetchFirstClause} only → {@code FETCH_FIRST/count/null} 8621 * (unreachable via current MSSQL grammar which requires 8622 * OFFSET before FETCH; retained as defensive routing-shape 8623 * parity with single-SELECT)</li> 8624 * <li>Defensive null-value rejects mirror single-SELECT 8625 * {@link #buildRowLimit}: a present {@code TOffsetClause} 8626 * with a null offset value throws 8627 * {@code ROW_LIMIT_COUNT_UNRESOLVED}; a present bare 8628 * {@code TFetchFirstClause} (no companion OFFSET) with a 8629 * null fetch value throws the same code. 
When both clauses 8630 * are present, only the offset slot must be populated; a 8631 * null fetch value is silently treated as offset-only 8632 * (matches the single-SELECT 8633 * {@code TOffsetClause + TFetchFirstClause} branch in 8634 * {@code buildRowLimit}).</li> 8635 * </ul> 8636 */ 8637 private static RowLimit buildSetOpRowLimit(TSelectSqlStatement setOp) { 8638 // Primary path: SELECT-level routing. Covers PG/MySQL/SQLite/ 8639 // BigQuery/Snowflake/Redshift via TLimitClause, plus inherited 8640 // Hive / Vertica / ANSI-DB2 defensives from buildRowLimit. 8641 RowLimit fromSelect = buildRowLimit(setOp); 8642 if (fromSelect != null) { 8643 return fromSelect; 8644 } 8645 // MSSQL-only TOrderBy fallback (codex B1 vendor gate). 8646 if (setOp.dbvendor != EDbVendor.dbvmssql) { 8647 return null; 8648 } 8649 TOrderBy orderBy = setOp.getOrderbyClause(); 8650 if (orderBy == null) return null; 8651 TOffsetClause oc = orderBy.getOffsetClause(); 8652 TFetchFirstClause fc = orderBy.getFetchFirstClause(); 8653 if (oc == null && fc == null) return null; 8654 8655 String offsetText = (oc != null && oc.getSelectOffsetValue() != null) 8656 ? oc.getSelectOffsetValue().toString() : null; 8657 String countText = (fc != null && fc.getFetchValue() != null) 8658 ? fc.getFetchValue().toString() : null; 8659 8660 if (oc != null && fc != null) { 8661 if (offsetText == null) { 8662 // Mirrors single-SELECT buildRowLimit TOffsetClause path: 8663 // a present OFFSET clause must populate its value (the 8664 // FETCH NEXT counterpart is optional; null countText is 8665 // silently treated as offset-only). 
8666 throw new SemanticIRBuildException( 8667 Diagnostic.error(DiagnosticCode.ROW_LIMIT_COUNT_UNRESOLVED, 8668 "MSSQL set-op outer OFFSET clause present on TOrderBy " 8669 + "but offset value is not populated on the parser AST", orderBy)); 8670 } 8671 return new RowLimit(RowLimitKind.OFFSET_FETCH, countText, offsetText); 8672 } 8673 if (oc != null) { 8674 if (offsetText == null) { 8675 throw new SemanticIRBuildException( 8676 Diagnostic.error(DiagnosticCode.ROW_LIMIT_COUNT_UNRESOLVED, 8677 "MSSQL set-op outer OFFSET clause present on TOrderBy " 8678 + "but offset value is not populated on the parser AST", orderBy)); 8679 } 8680 return new RowLimit(RowLimitKind.OFFSET_FETCH, /*count=*/ null, offsetText); 8681 } 8682 // fc only (oc == null). Defensive: not reachable via current 8683 // MSSQL grammar which requires OFFSET before FETCH NEXT. 8684 if (countText == null) { 8685 throw new SemanticIRBuildException( 8686 Diagnostic.error(DiagnosticCode.ROW_LIMIT_COUNT_UNRESOLVED, 8687 "MSSQL set-op outer FETCH FIRST clause present on TOrderBy " 8688 + "but fetch value is not populated on the parser AST", orderBy)); 8689 } 8690 return new RowLimit(RowLimitKind.FETCH_FIRST, countText, /*offset=*/ null); 8691 } 8692 8693 /** 8694 * Resolve a {@link TSelectSqlStatement}'s row-filter clause to the IR's 8695 * {@code distinct} flag. Mapping: 8696 * 8697 * <ul> 8698 * <li>no clause / {@code urfNone} / {@code urfAll}: {@code false}</li> 8699 * <li>{@code urfDistinct}: {@code true}</li> 8700 * <li>{@code urfUnique}: {@code true} — Oracle treats 8701 * {@code SELECT UNIQUE} as a deprecated synonym for 8702 * {@code SELECT DISTINCT}; both produce the same row-set.</li> 8703 * <li>{@code urfDistinctOn}: admits (slice 73). 
Returns 8704 * {@code true} for the boolean flag; the 8705 * {@code DISTINCT ON (cols)} partition keys are collected 8706 * separately by 8707 * {@link #buildDistinctOnColumnRefs(TSelectSqlStatement, 8708 * NameBindingProvider)} so the column-ref collection runs 8709 * AFTER {@code UsingScope} is installed (matching the timing 8710 * of {@link #buildGroupByColumnRefs} and friends).</li> 8711 * <li>{@code urfDistinctRow}, {@code urfNormalize}: rejected 8712 * (vendor-specific; not yet a documented IR shape).</li> 8713 * <li>null filter on a non-null {@code TSelectDistinct}, or a 8714 * new enum value the switch hasn't seen yet: rejected, so a 8715 * future {@code EUniqueRowFilterType} addition fails loudly 8716 * rather than silently classifying as {@code distinct=false}.</li> 8717 * </ul> 8718 */ 8719 private static boolean resolveDistinctFlag(TSelectSqlStatement select) { 8720 TSelectDistinct sd = select.getSelectDistinct(); 8721 if (sd == null) return false; 8722 EUniqueRowFilterType urf = sd.getUniqueRowFilter(); 8723 if (urf == null) { 8724 throw new SemanticIRBuildException( 8725 Diagnostic.error(DiagnosticCode.SELECT_ROW_FILTER_NULL, 8726 "SELECT row-filter is null; expected one of " 8727 + "{none, all, distinct, unique}", select)); 8728 } 8729 switch (urf) { 8730 case urfNone: 8731 case urfAll: 8732 return false; 8733 case urfDistinct: 8734 case urfUnique: // Oracle deprecated synonym for DISTINCT 8735 case urfDistinctOn: // slice 73: refs collected separately 8736 return true; 8737 case urfDistinctRow: 8738 case urfNormalize: 8739 throw new SemanticIRBuildException( 8740 Diagnostic.error(DiagnosticCode.SELECT_ROW_FILTER_NOT_SUPPORTED, 8741 "SELECT row-filter " + urf + " is not supported yet", select)); 8742 default: 8743 throw new SemanticIRBuildException( 8744 Diagnostic.error(DiagnosticCode.SELECT_ROW_FILTER_UNKNOWN, 8745 "unknown SELECT row-filter " + urf, select)); 8746 } 8747 } 8748 8749 /** 8750 * Slice 73: collect physical column references from a 
8751 * {@code SELECT DISTINCT ON (cols)} expression list. Returns the 8752 * empty list for plain {@code DISTINCT}, {@code UNIQUE}, 8753 * {@code ALL}, and the no-filter case. Only PostgreSQL and 8754 * Greenplum expose {@code urfDistinctOn} with a populated 8755 * {@link TSelectDistinct#getExpressionList()}; Oracle, MySQL, and 8756 * Redshift silently drop the {@code ON (...)} clause and parse the 8757 * SELECT as plain {@code DISTINCT}, so this helper returns 8758 * {@code []} for those vendors regardless of the surface SQL. 8759 * 8760 * <p>Mirrors {@link #buildGroupByColumnRefs}: subqueries and window 8761 * functions in the expression list are rejected BEFORE 8762 * {@link #collectColumnRefs} descends, so inner-scope refs cannot 8763 * leak into {@code distinctOnColumnRefs}. Compound expressions 8764 * ({@code a + b}, {@code CASE WHEN ...}) and aggregate arguments 8765 * ({@code COUNT(x)}) are descended into so the underlying column 8766 * refs are captured. 8767 */ 8768 private static List<ColumnRef> buildDistinctOnColumnRefs( 8769 TSelectSqlStatement select, NameBindingProvider provider) { 8770 TSelectDistinct sd = select.getSelectDistinct(); 8771 if (sd == null 8772 || sd.getUniqueRowFilter() != EUniqueRowFilterType.urfDistinctOn) { 8773 return new ArrayList<>(); 8774 } 8775 TExpressionList el = sd.getExpressionList(); 8776 if (el == null || el.size() == 0) { 8777 // PG grammar requires at least one expression after 8778 // DISTINCT ON (; this branch is defensive — surface a 8779 // clear diagnostic rather than silently emit []. 8780 throw new SemanticIRBuildException( 8781 Diagnostic.error(DiagnosticCode.DISTINCT_ON_EMPTY_COLUMN_LIST, 8782 "DISTINCT ON requires at least one expression but the " 8783 + "AST exposes an empty list", sd)); 8784 } 8785 // Iterate items explicitly so each per-expression reject 8786 // diagnostic points at the offending expression. 
Equivalent 8787 // to running containsAnySubquery / rejectWindowFunctionInScope 8788 // / collectColumnRefs on the whole list (TExpressionList 8789 // inherits TParseTreeNodeList.acceptChildren which already 8790 // iterates element children), but the loop body gives 8791 // clearer rejection sites and lets us dedup refs across 8792 // expressions in declaration order. 8793 List<ColumnRef> refs = new ArrayList<>(); 8794 for (int i = 0; i < el.size(); i++) { 8795 TExpression expr = el.getExpression(i); 8796 if (containsAnySubqueryExpression(expr)) { 8797 throw new SemanticIRBuildException( 8798 Diagnostic.error(DiagnosticCode.DISTINCT_ON_HAS_SUBQUERY_NOT_SUPPORTED, 8799 "DISTINCT ON expression list contains a subquery; " 8800 + "subqueries in DISTINCT ON are not supported yet", sd)); 8801 } 8802 rejectWindowFunctionInScope(expr, "DISTINCT ON expression list"); 8803 for (ColumnRef ref : collectColumnRefs(expr, provider)) { 8804 if (!refs.contains(ref)) refs.add(ref); 8805 } 8806 } 8807 return refs; 8808 } 8809 8810 private static boolean hasNoFromSource(TSelectSqlStatement select) { 8811 return select.joins == null || select.joins.size() == 0; 8812 } 8813 8814 private static boolean allResultColumnsAreConstantExpressions(TSelectSqlStatement select) { 8815 TResultColumnList rcl = select.getResultColumnList(); 8816 if (rcl == null || rcl.size() == 0) return false; 8817 for (int i = 0; i < rcl.size(); i++) { 8818 TResultColumn rc = rcl.getResultColumn(i); 8819 if (rc == null || rc.getExpr() == null || !isConstantExpression(rc.getExpr())) { 8820 return false; 8821 } 8822 } 8823 return true; 8824 } 8825 8826 private static List<ColumnRef> buildGroupByColumnRefs(TSelectSqlStatement select, NameBindingProvider provider) { 8827 TGroupBy groupBy = select.getGroupByClause(); 8828 if (groupBy == null || groupBy.getItems() == null || groupBy.getItems().size() == 0) { 8829 return new ArrayList<>(); 8830 } 8831 TGroupByItemList items = groupBy.getItems(); 8832 // Slice 61: reject 
subqueries in GROUP BY before collectColumnRefs 8833 // descends into them. Pre-slice-61, queries such as `SELECT 1 8834 // FROM employees GROUP BY (SELECT id FROM departments)` reached 8835 // the constant-only projection guard and failed there; with the 8836 // slice-61 lift the projection now builds and the GROUP BY 8837 // visitor would leak `departments.id` into groupByColumnRefs 8838 // even though `departments` is not in {@code relations}, breaking 8839 // the IR invariant that column refs reference an in-scope 8840 // relation. Mirrors the WHERE / HAVING / ORDER BY subquery 8841 // guards. 8842 if (containsAnySubquery(items)) { 8843 throw new SemanticIRBuildException( 8844 Diagnostic.error(DiagnosticCode.GROUP_BY_HAS_SUBQUERY_NOT_SUPPORTED, 8845 "GROUP BY clause contains a subquery; subqueries in " 8846 + "GROUP BY are not supported yet", groupBy)); 8847 } 8848 // Slice 13: reject window functions in GROUP BY before 8849 // collectColumnRefs descends. 8850 rejectWindowFunctionInScope(items, "GROUP BY clause"); 8851 // Visitor-based collection ensures column refs in any nested 8852 // expression (e.g. GROUP BY date_trunc('day', t)) are captured. 8853 return collectColumnRefs(items, provider); 8854 } 8855 8856 /** 8857 * Collect physical column references from the {@code HAVING} clause. 8858 * 8859 * <p>HAVING is supported regardless of whether {@code GROUP BY} is 8860 * present: standard SQL allows {@code HAVING} without {@code GROUP BY} 8861 * (the whole result set is treated as a single group), and the parser 8862 * still attaches a {@link TGroupBy} node with empty 8863 * {@code getItems()} in that case. Both shapes flow through the same 8864 * collection path. 
8865 * 8866 * <p>Per-shape rejections fire <i>before</i> {@link #collectColumnRefs} 8867 * so subquery / OVER children never enter the visitor and can't leak 8868 * inner-scope refs into {@code havingColumnRefs} (mirrors slice-9 8869 * ORDER BY guards): 8870 * 8871 * <ul> 8872 * <li>Scalar subqueries ({@link EExpressionType#subquery_t}) and 8873 * predicate subqueries ({@code EXISTS}, {@code IN (SELECT ...)}, 8874 * {@code ANY/ALL/SOME}) — checked via both expression-type and 8875 * {@link TExpression#getSubQuery()}, deep-scanned through the 8876 * whole HAVING expression subtree.</li> 8877 * <li>Window functions ({@code OVER (...)}) — standard SQL forbids 8878 * window functions in HAVING, but defense in depth: the 8879 * deep-scan rejecter ensures PARTITION BY / OVER ORDER BY refs 8880 * can't leak.</li> 8881 * </ul> 8882 * 8883 * <p>Aggregate functions in HAVING are <i>not</i> rejected — they're 8884 * the most common HAVING shape ({@code HAVING SUM(salary) > 1000}). 8885 * The visitor walks into the aggregate's argument list and captures 8886 * the underlying column ref ({@code salary}) the same way slice 6 8887 * does for projection-side aggregate args. 8888 */ 8889 private static List<ColumnRef> buildHavingColumnRefs(TSelectSqlStatement select, 8890 NameBindingProvider provider) { 8891 TGroupBy groupBy = select.getGroupByClause(); 8892 if (groupBy == null) return new ArrayList<>(); 8893 TExpression having = groupBy.getHavingClause(); 8894 if (having == null) return new ArrayList<>(); 8895 rejectHavingScalarSubquery(having); 8896 rejectHavingWindowFunction(having); 8897 return collectColumnRefs(having, provider); 8898 } 8899 8900 /** 8901 * Reject HAVING expressions that contain a subquery anywhere in the 8902 * subtree. Catches both: 8903 * 8904 * <ul> 8905 * <li>Scalar subqueries ({@link EExpressionType#subquery_t}) — 8906 * e.g. 
{@code HAVING (SELECT MAX(salary) FROM employees) > 0}.</li> 8907 * <li>Predicate subqueries ({@code EXISTS}, {@code IN (SELECT ...)}, 8908 * {@code ANY/ALL/SOME (SELECT ...)}) — these don't appear as 8909 * {@code subquery_t} expression nodes but carry a non-null 8910 * {@link TExpression#getSubQuery()}, e.g. 8911 * {@code HAVING EXISTS (SELECT 1 FROM ...)} or 8912 * {@code HAVING d.id IN (SELECT id FROM ...)}.</li> 8913 * </ul> 8914 * 8915 * <p>Mirrors {@link #rejectOrderByScalarSubquery}: top-level fast 8916 * path + visitor deep-scan over {@link TExpression#acceptChildren}. 8917 * The deep scan is required for nested cases like 8918 * {@code HAVING flag = 1 AND EXISTS (SELECT ...)} or 8919 * {@code HAVING CASE WHEN d.id IN (SELECT ...) THEN 1 ELSE 0 END > 0}. 8920 */ 8921 private static void rejectHavingScalarSubquery(TExpression having) { 8922 if (having.getExpressionType() == EExpressionType.subquery_t 8923 || having.getSubQuery() != null) { 8924 throw new SemanticIRBuildException( 8925 Diagnostic.error(DiagnosticCode.HAVING_SUBQUERY_NOT_SUPPORTED, 8926 "HAVING subquery '" + having + "' is not supported yet " 8927 + "(subqueries in HAVING would leak inner column refs)", having)); 8928 } 8929 final boolean[] found = {false}; 8930 having.acceptChildren(new TParseTreeVisitor() { 8931 @Override 8932 public void preVisit(TExpression e) { 8933 if (found[0]) return; 8934 if (e.getExpressionType() == EExpressionType.subquery_t 8935 || e.getSubQuery() != null) { 8936 found[0] = true; 8937 } 8938 } 8939 }); 8940 if (found[0]) { 8941 throw new SemanticIRBuildException( 8942 Diagnostic.error(DiagnosticCode.HAVING_HAS_SUBQUERY_NOT_SUPPORTED, 8943 "HAVING expression '" + having + "' contains a subquery " 8944 + "(scalar, EXISTS, IN, or ANY/ALL/SOME); not supported yet", having)); 8945 } 8946 } 8947 8948 /** 8949 * Reject HAVING expressions that contain a window function. 
Standard 8950 * SQL forbids window functions in HAVING (analytic functions are 8951 * computed after HAVING), but defense in depth: the visitor would 8952 * descend into {@code OVER (PARTITION BY ... ORDER BY ...)} and the 8953 * inner-scope refs would otherwise leak into {@code havingColumnRefs}. 8954 * Mirrors the projection-side {@link #rejectWindowFunctions} and the 8955 * ORDER BY-side {@link #rejectOrderByWindowFunction}. 8956 */ 8957 private static void rejectHavingWindowFunction(TExpression having) { 8958 final boolean[] found = {false}; 8959 having.acceptChildren(new TParseTreeVisitor() { 8960 @Override 8961 public void preVisit(TFunctionCall fn) { 8962 if (found[0]) return; 8963 if (fn.getWindowDef() != null) found[0] = true; 8964 } 8965 }); 8966 if (!found[0] && having.getExpressionType() == EExpressionType.function_t) { 8967 TFunctionCall fn = having.getFunctionCall(); 8968 if (fn != null && fn.getWindowDef() != null) found[0] = true; 8969 } 8970 if (found[0]) { 8971 throw new SemanticIRBuildException( 8972 Diagnostic.error(DiagnosticCode.HAVING_WINDOW_FUNCTION_NOT_SUPPORTED, 8973 "HAVING window function '" + having + "' is not supported yet " 8974 + "(window OVER (...) refs would leak into havingColumnRefs)", having)); 8975 } 8976 } 8977 8978 /** 8979 * Collect physical column references from {@code ORDER BY} sort keys. 8980 * 8981 * <p>Per-item validation rejects shapes that would otherwise vanish 8982 * silently into an empty ref list, leak inner-scope refs, or 8983 * misrepresent presentation as a dependency: 8984 * 8985 * <ul> 8986 * <li>Ordinal references ({@code ORDER BY 1}) — the sort key is a 8987 * {@link EExpressionType#simple_constant_t}; its meaning is 8988 * "first projected column" which depends on the SELECT list, 8989 * not on a base column. 
A future slice can model output-position 8990 * references explicitly.</li> 8991 * <li>Constant sort keys other than ordinals ({@code ORDER BY 'x'}, 8992 * and the compound {@code ORDER BY (1)} / {@code ORDER BY 1+0} 8993 * caught by the generic no-physical-column-refs check).</li> 8994 * <li>Projection-alias references ({@code ORDER BY x} where 8995 * {@code x} is a SELECT alias) — {@link TOrderByItem#doParse} 8996 * retypes the operand to {@link TObjectName#ttobjColumnAlias}, 8997 * which lowers {@link TObjectName#getDbObjectType()} to 8998 * {@link EDbObjectType#column_alias}. Without explicit 8999 * rejection the visitor would skip it and the IR would lose 9000 * the dependency entirely. The deep-scan version of this 9001 * check catches alias nodes nested inside expressions.</li> 9002 * <li>Subqueries in sort keys — scalar 9003 * ({@link EExpressionType#subquery_t}) and predicate 9004 * ({@code EXISTS}, {@code IN (SELECT ...)}, {@code ANY/ALL/SOME}) 9005 * — would otherwise leak inner-scope column refs into the 9006 * outer statement's {@code orderByColumnRefs}.</li> 9007 * <li>Window functions in sort keys ({@code ORDER BY ROW_NUMBER() 9008 * OVER (...)}) — the OVER clause descends through the visitor 9009 * and would leak its PARTITION BY / ORDER BY refs.</li> 9010 * </ul> 9011 * 9012 * <p>Sub-clauses that change row-set semantics are also rejected 9013 * here: Oracle {@code ORDER SIBLINGS BY} (hierarchical, not yet 9014 * modelled), Teradata {@code RESET WHEN} (window-style restart), 9015 * and the {@link TOrderBy}-level {@code FETCH FIRST}/{@code OFFSET} 9016 * defensive guards (in fresh parses the SELECT-level row-limit 9017 * guards in {@link #rejectUnsupportedShape} fire first because 9018 * {@code TSelectSqlNode.setOrderbyClause()} copies in-clause OFFSET/ 9019 * FETCH onto the SELECT node). 
9020 * 9021 * <p>For everything else (qualified column refs, expressions like 9022 * {@code UPPER(name)}), {@link #collectColumnRefs} runs over each 9023 * sort key and aggregates the physical column refs. A per-item 9024 * empty-refs check catches anything that slipped past the explicit 9025 * shape rejections (e.g. {@code ORDER BY (1)}, 9026 * {@code ORDER BY 1 + 0}). Sort direction ({@code ASC}/{@code DESC}) 9027 * and null placement ({@code NULLS FIRST}/{@code NULLS LAST}) are 9028 * presentation metadata and are not modelled. 9029 */ 9030 private static List<ColumnRef> buildOrderByColumnRefs(TSelectSqlStatement select, 9031 NameBindingProvider provider, 9032 List<OutputColumn> outputColumns) { 9033 TOrderBy orderBy = select.getOrderbyClause(); 9034 if (orderBy == null) { 9035 return new ArrayList<>(); 9036 } 9037 if (orderBy.isSiblings()) { 9038 throw new SemanticIRBuildException( 9039 Diagnostic.error(DiagnosticCode.ORDER_SIBLINGS_BY_NOT_SUPPORTED, 9040 "ORDER SIBLINGS BY is not supported yet " 9041 + "(Oracle hierarchical ordering)", orderBy)); 9042 } 9043 if (orderBy.getResetWhenCondition() != null) { 9044 throw new SemanticIRBuildException( 9045 Diagnostic.error(DiagnosticCode.ORDER_BY_RESET_WHEN_NOT_SUPPORTED, 9046 "ORDER BY ... RESET WHEN is not supported yet " 9047 + "(Teradata window-style restart)", orderBy)); 9048 } 9049 // Slice 71: the in-clause OFFSET/FETCH on TOrderBy is no longer 9050 // rejected. MSSQL parsers duplicate OFFSET/FETCH onto BOTH the 9051 // SELECT node AND the TOrderBy node; slice 71 admits at the 9052 // SELECT level via buildRowLimit, so the TOrderBy duplicates 9053 // are simply ignored here. Oracle parsers populate only the 9054 // SELECT-level fields, so the TOrderBy fields are typically 9055 // null there. 
9056 TOrderByItemList items = orderBy.getItems(); 9057 if (items == null || items.size() == 0) { 9058 return new ArrayList<>(); 9059 } 9060 // Validate + collect per item so a sort key contributing zero 9061 // column refs (e.g. constant arithmetic, parenthesised constant) 9062 // is rejected with an item-specific message instead of silently 9063 // disappearing. 9064 LinkedHashSet<ColumnRef> all = new LinkedHashSet<>(); 9065 for (int i = 0; i < items.size(); i++) { 9066 TOrderByItem item = items.getOrderByItem(i); 9067 if (item == null) continue; 9068 TExpression sortKey = item.getSortKey(); 9069 if (sortKey == null) continue; 9070 // Slice 68: positive-integer ordinals admit. The helper: 9071 // - returns null when sortKey is not a positive-integer 9072 // literal (caller falls through to the existing 9073 // constant / alias / subquery / window rejecters and the 9074 // standard ref collection); 9075 // - returns the matching output column's sources list when 9076 // sortKey IS a positive-integer literal in range; 9077 // - throws ORDER_BY_ORDINAL_OUT_OF_RANGE when the ordinal 9078 // is 0 or exceeds the output column count. 9079 // The sourceless-output case (e.g. SELECT 1 FROM t ORDER BY 1 9080 // or SELECT COUNT(*) FROM t ORDER BY 1) returns an empty 9081 // list and falls through to the per-item empty-refs guard 9082 // below, mirroring the existing ORDER BY COUNT(*) / 9083 // ORDER BY 1 + 0 rejection. 
9084 List<ColumnRef> ordinalSources = tryResolveOrderByOrdinal(sortKey, outputColumns); 9085 if (ordinalSources != null) { 9086 if (ordinalSources.isEmpty()) { 9087 throw new SemanticIRBuildException( 9088 Diagnostic.error(DiagnosticCode.ORDER_BY_NO_PHYSICAL_COLUMN_REFS, 9089 "ORDER BY ordinal '" + sortKey 9090 + "' resolves to output column with no physical column references " 9091 + "(constant or sourceless aggregate output)", sortKey)); 9092 } 9093 all.addAll(ordinalSources); 9094 continue; 9095 } 9096 // Slice 69: top-level projection-alias references admit. The 9097 // helper returns null for non-alias shapes; an empty list 9098 // (alias of a constant / sourceless aggregate) falls through 9099 // to ORDER_BY_NO_PHYSICAL_COLUMN_REFS, mirroring the slice-68 9100 // sourceless-ordinal handling. Deep-scan alias references 9101 // (e.g. ORDER BY UPPER(<alias>)) are still caught by 9102 // rejectOrderByAliasReference below. 9103 List<ColumnRef> aliasSources = tryResolveOrderByProjectionAlias(sortKey, outputColumns); 9104 if (aliasSources != null) { 9105 if (aliasSources.isEmpty()) { 9106 throw new SemanticIRBuildException( 9107 Diagnostic.error(DiagnosticCode.ORDER_BY_NO_PHYSICAL_COLUMN_REFS, 9108 "ORDER BY projection alias '" + sortKey 9109 + "' resolves to output column with no physical column references " 9110 + "(constant or sourceless aggregate output)", sortKey)); 9111 } 9112 all.addAll(aliasSources); 9113 continue; 9114 } 9115 // Slice 68: non-ordinal constants stay rejected. The original 9116 // helper is preserved for the set-op outer path (which keeps 9117 // its ordinal/constant rejection until slice 72). 9118 rejectOrderByNonOrdinalConstant(sortKey); 9119 // Slice 69: top-level bare alias references are consumed by 9120 // tryResolveOrderByProjectionAlias above; this helper now only 9121 // catches DEEP alias references nested inside compound 9122 // expressions (e.g. ORDER BY UPPER(<alias>)). 
9123 rejectOrderByAliasReference(sortKey); 9124 // Reject scalar subqueries and window functions BEFORE 9125 // collecting refs. The visitor descends into both, so without 9126 // these guards `ORDER BY (SELECT MAX(salary) FROM employees)` 9127 // and `ORDER BY ROW_NUMBER() OVER (ORDER BY salary)` would 9128 // leak inner refs into orderByColumnRefs as if the outer 9129 // statement physically depended on them. 9130 rejectOrderByScalarSubquery(sortKey); 9131 rejectOrderByWindowFunction(sortKey); 9132 List<ColumnRef> itemRefs = collectColumnRefs(item, provider); 9133 if (itemRefs.isEmpty()) { 9134 // Anything else that produces no physical column refs: 9135 // ORDER BY (1), ORDER BY 1+0, ORDER BY NULL, ORDER BY 9136 // CASE WHEN 1=1 THEN 'a' END, etc. Reject so the IR 9137 // doesn't silently emit empty refs. 9138 throw new SemanticIRBuildException( 9139 Diagnostic.error(DiagnosticCode.ORDER_BY_NO_PHYSICAL_COLUMN_REFS, 9140 "ORDER BY sort key '" + sortKey + "' has no physical column references " 9141 + "(constant or non-column expressions are not supported yet)", sortKey)); 9142 } 9143 all.addAll(itemRefs); 9144 } 9145 return new ArrayList<>(all); 9146 } 9147 9148 /** 9149 * Slice 68: resolve a positive-integer ORDER BY ordinal to the matching 9150 * output column's sources. 
Returns:
     *
     * <ul>
     *   <li>{@code null} if {@code sortKey} is not a bare all-digit
     *       literal (caller continues with the constant / alias / subquery
     *       / window rejecters and the standard ref collection);</li>
     *   <li>a {@link List} of {@link ColumnRef}s — the source list of the
     *       output column at position {@code v - 1} (1-based ordinals);</li>
     *   <li>throws {@link SemanticIRBuildException} with
     *       {@code ORDER_BY_ORDINAL_OUT_OF_RANGE} when {@code v} is 0,
     *       exceeds the output column count, or has too many digits to
     *       fit in a {@code long}.</li>
     * </ul>
     *
     * <p>{@code sortKey.getExpressionType() == simple_constant_t} for bare
     * positive integers; negative integers parse as a {@code unary_minus_t}
     * over a {@code simple_constant_t} and are not handled here. Compound
     * constant expressions ({@code ORDER BY 1 + 0}, {@code ORDER BY (1)})
     * are {@code arithmetic_*_t} / {@code parenthesis_t} respectively and
     * fall through to the per-item empty-refs guard.
     *
     * <p>The empty-list case (output column resolved with
     * {@link OutputColumn#getSources()} empty — constant projections,
     * {@code COUNT(*)}, sourceless aggregates) is returned to the caller
     * which fires {@code ORDER_BY_NO_PHYSICAL_COLUMN_REFS}. Slice 68
     * boundary.
     *
     * <p>Sort direction (ASC/DESC) and null placement (NULLS FIRST/LAST)
     * are presentation metadata on {@link TOrderByItem}, not on the sort
     * key expression; this helper doesn't inspect them (slice 9 decision).
9179 */ 9180 private static List<ColumnRef> tryResolveOrderByOrdinal(TExpression sortKey, 9181 List<OutputColumn> outputs) { 9182 if (sortKey.getExpressionType() != EExpressionType.simple_constant_t) { 9183 return null; 9184 } 9185 String txt = sortKey.toString(); 9186 if (txt == null || !txt.matches("\\d+")) { 9187 return null; 9188 } 9189 long v; 9190 try { 9191 v = Long.parseLong(txt); 9192 } catch (NumberFormatException e) { 9193 // Very-long-digit text overflows long; definitely out of range. 9194 throw new SemanticIRBuildException( 9195 Diagnostic.error(DiagnosticCode.ORDER_BY_ORDINAL_OUT_OF_RANGE, 9196 "ORDER BY ordinal '" + sortKey + "' is out of range " 9197 + "(must be between 1 and " + outputs.size() + ")", sortKey)); 9198 } 9199 if (v < 1 || v > outputs.size()) { 9200 throw new SemanticIRBuildException( 9201 Diagnostic.error(DiagnosticCode.ORDER_BY_ORDINAL_OUT_OF_RANGE, 9202 "ORDER BY ordinal '" + sortKey + "' is out of range " 9203 + "(must be between 1 and " + outputs.size() + ")", sortKey)); 9204 } 9205 return outputs.get((int) v - 1).getSources(); 9206 } 9207 9208 /** 9209 * Slice 69: resolve a top-level bare projection-alias ORDER BY sort 9210 * key to the matching output column's sources. 
Returns:
     *
     * <ul>
     *   <li>{@code null} if {@code sortKey} is not a top-level bare
     *       {@code simple_object_name_t} whose object operand has
     *       {@code dbObjectType == EDbObjectType.column_alias} (caller
     *       continues with {@link #rejectOrderByAliasReference} for the
     *       deep-scan case and the standard column-ref collection);</li>
     *   <li>a {@link List} of {@link ColumnRef}s — the matching output's
     *       source list (which may be empty when the aliased projection
     *       is a constant or sourceless aggregate; the caller fires
     *       {@code ORDER_BY_NO_PHYSICAL_COLUMN_REFS} in that case);</li>
     *   <li>throws {@link SemanticIRBuildException} with
     *       {@code ORDER_BY_PROJECTION_ALIAS_NOT_SUPPORTED} when the
     *       parser retyped the operand to {@code column_alias} but no
     *       matching output exists by case-insensitive name (defensive;
     *       theoretically unreachable for parsable SQL).</li>
     * </ul>
     *
     * <p>Match strategy: case-insensitive ({@link Locale#ROOT}) on
     * {@link OutputColumn#getName()}, returning the FIRST match. This
     * mirrors the set-op outer alias matcher in
     * {@link #processSetOpOrderByObjectName} (which uses the identical
     * {@code toLowerCase(Locale.ROOT)} pattern and {@code break}s on
     * first match) and follows MySQL / PostgreSQL ORDER BY alias
     * resolution semantics. Duplicate aliases (e.g. {@code SELECT a AS x,
     * b AS x FROM t ORDER BY x}) resolve to the leftmost matching
     * projection; this is the documented slice-69 boundary.
     *
     * <p>The set-op outer alias path
     * ({@link #buildSetOpOuterOrderByColumnRefs} →
     * {@link #processSetOpOrderByObjectName}) was already admitted by
     * slice 21 and is independent of this helper.
9243 * 9244 * <p>Deep-scan alias references inside compound expressions 9245 * ({@code ORDER BY UPPER(<alias>)}) are NOT handled here — the 9246 * parser only retypes the top-level operand to {@code column_alias}; 9247 * inside nested expressions the alias may or may not be retyped by 9248 * resolver2 depending on schema heuristics. The slice-9 9249 * {@code orderByNestedAliasReferenceIsHandledSafely} contract is 9250 * preserved: deep alias refs are caught by 9251 * {@link #rejectOrderByAliasReference} with 9252 * {@code ORDER_BY_UNSUPPORTED_SORT_KEY_SHAPE} or by a binding 9253 * failure. 9254 */ 9255 private static List<ColumnRef> tryResolveOrderByProjectionAlias( 9256 TExpression sortKey, List<OutputColumn> outputs) { 9257 if (sortKey.getExpressionType() != EExpressionType.simple_object_name_t) { 9258 return null; 9259 } 9260 TObjectName op = sortKey.getObjectOperand(); 9261 if (op == null || op.getDbObjectType() != EDbObjectType.column_alias) { 9262 return null; 9263 } 9264 String name = op.toString(); 9265 if (name == null || name.isEmpty()) { 9266 return null; 9267 } 9268 String key = name.toLowerCase(Locale.ROOT); 9269 for (OutputColumn oc : outputs) { 9270 String outName = oc.getName(); 9271 if (outName != null && outName.toLowerCase(Locale.ROOT).equals(key)) { 9272 return oc.getSources(); 9273 } 9274 } 9275 throw new SemanticIRBuildException( 9276 Diagnostic.error(DiagnosticCode.ORDER_BY_PROJECTION_ALIAS_NOT_SUPPORTED, 9277 "ORDER BY projection alias '" + sortKey 9278 + "' does not match any output column " 9279 + "(defensive — parser retyped to column_alias " 9280 + "but no output by that name)", sortKey)); 9281 } 9282 9283 /** 9284 * Slice 68: reject ORDER BY sort keys that are constants but NOT 9285 * positive-integer ordinals. The positive-integer ordinal case is 9286 * admitted separately by {@link #tryResolveOrderByOrdinal} which maps 9287 * the ordinal to the matching output column's sources. 
This helper 9288 * handles the remaining constant shapes ({@code ORDER BY 'x'}, 9289 * {@code ORDER BY 3.14}) — none of which reference an output position 9290 * and so contribute no column dependency. 9291 * 9292 * <p>The set-op outer ORDER BY path 9293 * ({@link #buildSetOpOuterOrderByColumnRefs}) keeps the original 9294 * {@link #rejectOrderByOrdinalOrConstant} helper so ordinals at that 9295 * scope stay rejected (slice 68 lifts only the single-SELECT case; 9296 * slice 72 will lift set-op outer). 9297 */ 9298 private static void rejectOrderByNonOrdinalConstant(TExpression sortKey) { 9299 if (sortKey.getExpressionType() != EExpressionType.simple_constant_t) { 9300 return; 9301 } 9302 String txt = sortKey.toString(); 9303 boolean looksOrdinal = txt != null && txt.matches("\\d+"); 9304 if (looksOrdinal) { 9305 // Admitted by tryResolveOrderByOrdinal; this helper is a no-op 9306 // for positive-integer ordinals. 9307 return; 9308 } 9309 throw new SemanticIRBuildException( 9310 Diagnostic.error(DiagnosticCode.ORDER_BY_CONSTANT_NOT_SUPPORTED, 9311 "ORDER BY constant '" + sortKey + "' is not supported yet " 9312 + "(constant sort keys add no column dependency)", sortKey)); 9313 } 9314 9315 /** 9316 * Reject ORDER BY sort keys that contain a subquery anywhere in the 9317 * subtree. Catches both: 9318 * 9319 * <ul> 9320 * <li>Scalar subqueries ({@link EExpressionType#subquery_t}) — 9321 * e.g. {@code ORDER BY (SELECT MAX(salary) FROM employees)}.</li> 9322 * <li>Predicate subqueries ({@code EXISTS}, {@code IN (SELECT ...)}, 9323 * {@code ANY/ALL/SOME (SELECT ...)}) — these don't appear as a 9324 * {@code subquery_t} expression but carry a non-null 9325 * {@link TExpression#getSubQuery()}, e.g. 9326 * {@code ORDER BY CASE WHEN EXISTS (SELECT 1 FROM t WHERE ...) 
9327 * THEN 0 ELSE 1 END}.</li> 9328 * </ul> 9329 * 9330 * <p>The visitor descends into the subquery body, so without an 9331 * explicit reject the inner-scope refs would leak into the outer 9332 * statement's {@code orderByColumnRefs}. The same restriction is 9333 * applied to scalar subqueries in projection (see 9334 * {@link #buildOutputColumns}). 9335 */ 9336 private static void rejectOrderByScalarSubquery(TExpression sortKey) { 9337 // Top-level fast path: scalar-subquery message for the common case. 9338 if (sortKey.getExpressionType() == EExpressionType.subquery_t 9339 || sortKey.getSubQuery() != null) { 9340 throw new SemanticIRBuildException( 9341 Diagnostic.error(DiagnosticCode.ORDER_BY_SUBQUERY_NOT_SUPPORTED, 9342 "ORDER BY subquery '" + sortKey + "' is not supported yet " 9343 + "(subqueries in sort keys would leak inner column refs)", sortKey)); 9344 } 9345 // Deep scan: any nested expression that owns a subquery (scalar, 9346 // EXISTS, IN (SELECT ...), ANY/ALL/SOME) makes the sort key 9347 // out of scope. 9348 final boolean[] found = {false}; 9349 sortKey.acceptChildren(new TParseTreeVisitor() { 9350 @Override 9351 public void preVisit(TExpression e) { 9352 if (found[0]) return; 9353 if (e.getExpressionType() == EExpressionType.subquery_t 9354 || e.getSubQuery() != null) { 9355 found[0] = true; 9356 } 9357 } 9358 }); 9359 if (found[0]) { 9360 throw new SemanticIRBuildException( 9361 Diagnostic.error(DiagnosticCode.ORDER_BY_HAS_SUBQUERY_NOT_SUPPORTED, 9362 "ORDER BY sort key '" + sortKey + "' contains a subquery " 9363 + "(scalar, EXISTS, IN, or ANY/ALL/SOME); not supported yet", sortKey)); 9364 } 9365 } 9366 9367 /** 9368 * Reject ORDER BY sort keys that contain a window function. Window 9369 * functions descend through {@link TFunctionCall#acceptChildren()} so 9370 * their PARTITION BY / ORDER BY column refs would otherwise leak into 9371 * the outer statement's {@code orderByColumnRefs}. 
Mirrors the 9372 * projection-side {@link #rejectWindowFunctions}, but wired through 9373 * the ORDER BY item-walk instead of the result-column list. 9374 */ 9375 private static void rejectOrderByWindowFunction(TExpression sortKey) { 9376 final boolean[] found = {false}; 9377 sortKey.acceptChildren(new TParseTreeVisitor() { 9378 @Override 9379 public void preVisit(TFunctionCall fn) { 9380 if (found[0]) return; 9381 if (fn.getWindowDef() != null) found[0] = true; 9382 } 9383 }); 9384 if (!found[0] && sortKey.getExpressionType() == EExpressionType.function_t) { 9385 TFunctionCall fn = sortKey.getFunctionCall(); 9386 if (fn != null && fn.getWindowDef() != null) found[0] = true; 9387 } 9388 if (found[0]) { 9389 throw new SemanticIRBuildException( 9390 Diagnostic.error(DiagnosticCode.ORDER_BY_WINDOW_FUNCTION_NOT_SUPPORTED, 9391 "ORDER BY window function '" + sortKey + "' is not supported yet " 9392 + "(window OVER (...) refs would leak into orderByColumnRefs)", sortKey)); 9393 } 9394 } 9395 9396 /** 9397 * Reject ORDER BY sort keys that are bare constants. Splits the 9398 * message between integer ordinals (which reference the SELECT 9399 * position) and other constants (which add no column dependency). The 9400 * generic no-physical-column-refs check in 9401 * {@link #buildOrderByColumnRefs} catches compound cases like 9402 * {@code ORDER BY (1)} or {@code ORDER BY 1 + 0}. 9403 * 9404 * <p><b>Slice 68:</b> the single-SELECT call site no longer uses this 9405 * helper because positive-integer ordinals now resolve to the matching 9406 * output column's sources (see {@link #tryResolveOrderByOrdinal}). 9407 * This helper remains for {@link #buildSetOpOuterOrderByColumnRefs}, 9408 * where ordinal lifting is deferred to slice 72 (set-op outer 9409 * ORDER BY needs output-position references against the set-op output 9410 * row type, not the single-SELECT output column list). 
9411 */ 9412 private static void rejectOrderByOrdinalOrConstant(TExpression sortKey) { 9413 if (sortKey.getExpressionType() != EExpressionType.simple_constant_t) { 9414 return; 9415 } 9416 String txt = sortKey.toString(); 9417 boolean looksOrdinal = txt != null && txt.matches("\\d+"); 9418 if (looksOrdinal) { 9419 throw new SemanticIRBuildException( 9420 Diagnostic.error(DiagnosticCode.ORDER_BY_ORDINAL_NOT_SUPPORTED, 9421 "ORDER BY ordinal '" + sortKey + "' is not supported yet " 9422 + "(reference the column or expression directly)", sortKey)); 9423 } 9424 throw new SemanticIRBuildException( 9425 Diagnostic.error(DiagnosticCode.ORDER_BY_CONSTANT_NOT_SUPPORTED, 9426 "ORDER BY constant '" + sortKey + "' is not supported yet " 9427 + "(constant sort keys add no column dependency)", sortKey)); 9428 } 9429 9430 /** 9431 * Reject ORDER BY sort keys that contain a projection-alias reference 9432 * NESTED inside a compound expression (e.g. 9433 * {@code ORDER BY UPPER(<alias>)}). The visitor in 9434 * {@link #collectColumnRefs} skips column-alias nodes, so without an 9435 * explicit reject the IR would emit no column refs for them. 9436 * 9437 * <p><b>Slice 69:</b> the top-level bare-alias case (e.g. 9438 * {@code ORDER BY <alias>}) is now consumed by 9439 * {@link #tryResolveOrderByProjectionAlias} BEFORE this helper runs. 9440 * Only the deep-scan branch remains here; the top-level fast-path 9441 * was removed because it became unreachable. 9442 * 9443 * <p>{@link TOrderByItem#doParse} only retypes the top-level operand 9444 * to {@link TObjectName#ttobjColumnAlias} → dbObjectType 9445 * {@link EDbObjectType#column_alias}; inside nested expressions the 9446 * alias may or may not be retyped by resolver2 depending on schema 9447 * heuristics. 
Slice 9's 9448 * {@code orderByNestedAliasReferenceIsHandledSafely} documents the 9449 * three acceptable outcomes for deep aliases (reject by binding 9450 * failure, reject by this deep scan, or accept with a real column 9451 * dependency captured). 9452 */ 9453 private static void rejectOrderByAliasReference(TExpression sortKey) { 9454 // Deep scan: an alias node nested inside an expression 9455 // (e.g. ORDER BY UPPER(x) where x is an alias) would otherwise be 9456 // silently dropped by the column-only visitor. The top-level 9457 // bare-alias case is consumed earlier by 9458 // tryResolveOrderByProjectionAlias (slice 69 lift). 9459 final boolean[] foundAlias = {false}; 9460 final String[] aliasName = {null}; 9461 sortKey.acceptChildren(new TParseTreeVisitor() { 9462 @Override 9463 public void preVisit(TObjectName node) { 9464 if (foundAlias[0]) return; 9465 if (node.getDbObjectType() == EDbObjectType.column_alias) { 9466 foundAlias[0] = true; 9467 aliasName[0] = node.toString(); 9468 } 9469 } 9470 }); 9471 if (foundAlias[0]) { 9472 throw new SemanticIRBuildException( 9473 Diagnostic.error(DiagnosticCode.ORDER_BY_UNSUPPORTED_SORT_KEY_SHAPE, 9474 "ORDER BY sort key '" + sortKey 9475 + "' contains a projection alias reference '" 9476 + aliasName[0] + "'; not supported yet " 9477 + "(reference the underlying column directly)", sortKey)); 9478 } 9479 } 9480 9481 /** 9482 * Reject SELECT shapes outside current builder scope. The 9483 * {@code skipCteListCheck} flag is true only for the outer SELECT of a 9484 * WITH-bearing query whose CTEs were already extracted by 9485 * {@link #build}; nested WITH inside a CTE body is still rejected. 9486 */ 9487 private static void rejectUnsupportedShape(TSelectSqlStatement select, boolean skipCteListCheck) { 9488 // Slice 12: top-level set-ops and CTE-body set-ops are dispatched 9489 // by build() to buildSetOpProgram BEFORE buildSelectStatement is 9490 // called. 
This rejection still fires when buildSelectStatement 9491 // is called from a recursive context (FROM-subquery / scalar-body 9492 // extraction) where the inner SELECT happens to be a set-op — 9493 // those nested cases remain out of scope. 9494 if (select.getSetOperatorType() != null && select.getSetOperatorType() != ESetOperatorType.none) { 9495 throw new SemanticIRBuildException(Diagnostic.error(DiagnosticCode.SET_OPERATION_NOT_SUPPORTED_IN_CONTEXT, "set operations (UNION/INTERSECT/MINUS) are not supported in this context yet", select)); 9496 } 9497 if (!skipCteListCheck && select.getCteList() != null && select.getCteList().size() > 0) { 9498 throw new SemanticIRBuildException( 9499 Diagnostic.error(DiagnosticCode.NESTED_WITH_NOT_SUPPORTED, 9500 "nested WITH/CTE inside a CTE body or subquery is not supported yet", select)); 9501 } 9502 // DISTINCT / UNIQUE / ALL handling is done in resolveDistinctFlag() 9503 // (called from buildSelectStatement). Only rejected row-filter shapes 9504 // bubble up as a SemanticIRBuildException; the rest become the 9505 // StatementGraph.distinct flag. 9506 // Slice 6 lifted GROUP BY; slice 10 lifted HAVING. The HAVING 9507 // expression itself (and the per-shape rejections for subqueries 9508 // and window functions inside it) are handled in 9509 // buildHavingColumnRefs so the rejection messages can mention the 9510 // specific shape. 9511 // Slices 70 and 71: all single-SELECT row-limit admit/reject 9512 // decisions live in buildRowLimit(select). rejectUnsupportedShape 9513 // no longer carries any row-limit logic. Set-op outer row-limits 9514 // remain handled by rejectSetOpRowLimit (slice 72 lifts). 9515 // Slice 13 codex impl-review round-2 MUST 3: reject Teradata 9516 // QUALIFY clause. QUALIFY filters rows based on window-function 9517 // results (e.g. `QUALIFY ROW_NUMBER() OVER (...) 
= 1`); without 9518 // this guard a window-function projection paired with a QUALIFY 9519 // clause would silently ignore the row-filter and produce an 9520 // incomplete IR. Lifting requires modeling row-filter semantics 9521 // similar to slice-9's row-limit canonical-model exclusion. 9522 if (select.getQualifyClause() != null) { 9523 throw new SemanticIRBuildException( 9524 Diagnostic.error(DiagnosticCode.QUALIFY_NOT_SUPPORTED, 9525 "QUALIFY clause is not supported yet; row-filter on " 9526 + "window-function results requires modelling alongside " 9527 + "the slice-13 window-function projection support", select)); 9528 } 9529 // ORDER BY itself is lifted in slice 9; see buildOrderByColumnRefs. 9530 // Vendor- and clause-level guards are checked there so the rejection 9531 // message can mention the specific sub-clause. 9532 } 9533 9534 /** 9535 * Walk the FROM clause: each top-level {@link TJoin} contributes its 9536 * base table; each chained {@link TJoinItem} contributes one more base 9537 * table plus the column refs found in its ON-condition expression. 9538 * Comma-separated FROM lists (multiple top-level TJoins) and 9539 * single-source SELECTs both reduce to the same loop. 9540 */ 9541 private static List<RelationSource> buildRelations(TSelectSqlStatement select, 9542 NameBindingProvider provider, 9543 List<ColumnRef> joinRefsOut, 9544 boolean allowFromSubqueries) { 9545 return buildRelations(select, provider, joinRefsOut, allowFromSubqueries, 9546 /*allowJoinOnPredicateSubqueries=*/ false, 9547 /*stmtsForExtraction=*/ null, 9548 /*lineageForExtraction=*/ null, 9549 /*cteMapForExtraction=*/ null); 9550 } 9551 9552 /** 9553 * Slice-23/24 overload of {@link #buildRelations}. 
When 9554 * {@code allowJoinOnPredicateSubqueries} is {@code true} (outer-SELECT 9555 * call site only), uncorrelated EXISTS subqueries inside JOIN ON 9556 * predicates are extracted as their own {@code <predicate_subquery_<i>>} 9557 * StatementGraphs appended to {@code stmtsForExtraction}. The extracted 9558 * subtrees are then skipped by the JOIN-ON window-function rejecter and 9559 * the JOIN-ON ref collector so their inner refs do not leak into outer 9560 * {@code joinColumnRefs}. 9561 * 9562 * <p>Slice 24: {@code cteMapForExtraction} carries outer's 9563 * CTE-name-to-statement-index map so the extracted predicate body can 9564 * emit STATEMENT_OUTPUT → STATEMENT_OUTPUT edges into outer-visible CTE 9565 * bodies via {@link #emitLineageForStatement}. Non-outer call sites 9566 * (where {@code allowJoinOnPredicateSubqueries=false}) pass {@code null}. 9567 */ 9568 private static List<RelationSource> buildRelations(TSelectSqlStatement select, 9569 NameBindingProvider provider, 9570 List<ColumnRef> joinRefsOut, 9571 boolean allowFromSubqueries, 9572 boolean allowJoinOnPredicateSubqueries, 9573 List<StatementGraph> stmtsForExtraction, 9574 List<LineageEdge> lineageForExtraction, 9575 Map<String, Integer> cteMapForExtraction) { 9576 if (select.joins == null || select.joins.size() == 0) { 9577 throw new SemanticIRBuildException(Diagnostic.error(DiagnosticCode.SELECT_NO_FROM_SOURCE, "SELECT must have at least one FROM source", select)); 9578 } 9579 // Slice 62: comma-separated FROM lists (e.g. `FROM a, b`) 9580 // parse as multiple top-level TJoin elements. We admit them at 9581 // outer / CTE-body / FROM-subquery-body call sites (where 9582 // {@code allowFromSubqueries=true}) and build them as an ordered 9583 // cross-product relation graph with empty {@code joinColumnRefs} 9584 // (WHERE-side predicates feed {@code filterColumnRefs} as 9585 // usual). 
Synthetic body contexts (scalar / set-op-branch / 9586 // set-op-CTE / predicate) call this method with 9587 // {@code allowFromSubqueries=false} and stay rejected — that 9588 // is exactly the discriminator we need. Predicate bodies also 9589 // hit an earlier shape-specific reject inside 9590 // {@link #preflightExistsInnerShape}. 9591 if (!allowFromSubqueries && select.joins.size() > 1) { 9592 throw new SemanticIRBuildException( 9593 Diagnostic.error(DiagnosticCode.COMMA_FROM_IN_BODY_NOT_SUPPORTED, 9594 "comma-separated FROM list (implicit cross join) is not supported " 9595 + "inside scalar / set-op-branch / set-op-CTE / predicate body " 9596 + "contexts yet; use explicit JOIN ... ON", select)); 9597 } 9598 // Slice 63: explicit CROSS JOIN admits at outer / CTE-body / 9599 // FROM-subquery-body call sites (allowFromSubqueries=true) but 9600 // stays rejected inside synthetic body contexts (scalar / 9601 // set-op-branch / set-op-CTE / predicate) because the body's 9602 // shape contract (single column for scalar; column-count parity 9603 // for set-op branches; constant or single column-ref for 9604 // predicates) cannot host a cross-product relation graph 9605 // safely. Predicate bodies also hit an earlier shape-specific 9606 // reject inside {@link #preflightExistsInnerShape} so the 9607 // user-visible diagnostic mentions EXISTS / IN-SELECT context. 9608 if (!allowFromSubqueries) { 9609 for (TJoin join : select.joins) { 9610 TJoinItemList items = join.getJoinItems(); 9611 if (items == null) continue; 9612 for (int i = 0; i < items.size(); i++) { 9613 TJoinItem item = items.getJoinItem(i); 9614 if (item == null) continue; 9615 if (item.getJoinType() == EJoinType.cross) { 9616 throw new SemanticIRBuildException( 9617 Diagnostic.error(DiagnosticCode.CROSS_JOIN_IN_BODY_NOT_SUPPORTED, 9618 "CROSS JOIN is not supported inside scalar / " 9619 + "set-op-branch / set-op-CTE / predicate " 9620 + "body contexts yet; rewrite as INNER " 9621 + "JOIN ... 
ON in the body", item)); 9622 } 9623 // Slice 64: USING admitted at outer / CTE-body / 9624 // FROM-subquery-body call sites but rejected inside 9625 // synthetic body contexts. The body's shape contract 9626 // (single column for scalar, column-count parity for 9627 // set-op branches, constant/column-ref for predicate 9628 // bodies) cannot host the merged-key semantics safely. 9629 if (item.getUsingColumns() != null 9630 && item.getUsingColumns().size() > 0) { 9631 throw new SemanticIRBuildException( 9632 Diagnostic.error(DiagnosticCode.USING_IN_BODY_NOT_SUPPORTED, 9633 "JOIN ... USING (...) is not supported inside " 9634 + "scalar / set-op-branch / set-op-CTE / " 9635 + "predicate body contexts yet; rewrite " 9636 + "as JOIN ... ON in the body", item)); 9637 } 9638 // Slice 66: NATURAL JOIN inside synthetic body 9639 // contexts is rejected with a tuned diagnostic. 9640 // Predicate bodies hit preflightExistsInnerShape 9641 // first which emits an EXISTS-tuned message; this 9642 // reject fires for scalar / set-op-branch / 9643 // set-op-CTE bodies (and as defense-in-depth for 9644 // predicate bodies if the preflight didn't catch 9645 // a vendor-specific variant). 9646 if (isNaturalJoinType(item.getJoinType())) { 9647 throw new SemanticIRBuildException( 9648 Diagnostic.error(DiagnosticCode.NATURAL_IN_BODY_NOT_SUPPORTED, 9649 "NATURAL JOIN is not supported inside " 9650 + "scalar / set-op-branch / set-op-CTE / " 9651 + "predicate body contexts yet; rewrite " 9652 + "as JOIN ... 
ON in the body", item)); 9653 } 9654 } 9655 } 9656 } 9657 List<RelationSource> relations = new ArrayList<>(); 9658 for (TJoin join : select.joins) { 9659 TTable leftTable = join.getTable(); 9660 if (leftTable == null) { 9661 throw new SemanticIRBuildException(Diagnostic.error(DiagnosticCode.FROM_SOURCE_NO_TABLE, "FROM source has no table", join)); 9662 } 9663 relations.add(buildRelation(leftTable, provider, allowFromSubqueries)); 9664 9665 // Slice 66: per top-level TJoin LeftOutputState. Seeded 9666 // with the top-left table's catalog; updated as JoinItems 9667 // walk left-to-right. NATURAL JoinItems consume this state 9668 // (catalog intersection) and update it (merge right's 9669 // shared keys into existing slots, append non-shared right 9670 // columns). Reset between top-level TJoins so comma-FROM 9671 // groups stay independent. 9672 LeftOutputState leftState = new LeftOutputState(); 9673 seedLeftOutput(leftState, leftTable, provider); 9674 9675 TJoinItemList items = join.getJoinItems(); 9676 if (items == null) continue; 9677 for (int i = 0; i < items.size(); i++) { 9678 TJoinItem item = items.getJoinItem(i); 9679 rejectUnsupportedJoinShape(item); 9680 TTable rightTable = item.getTable(); 9681 if (rightTable == null) { 9682 throw new SemanticIRBuildException(Diagnostic.error(DiagnosticCode.JOIN_ITEM_NO_TABLE, "JOIN item has no table", item)); 9683 } 9684 // Slice 17/18: subqueries on a JOIN side are extracted as 9685 // their own statements by extractFromSubqueriesAsStatements 9686 // before buildRelations runs (when allowFromSubqueries=true 9687 // — outer build path, non-set-op CTE body, AND nested 9688 // FROM-subquery body recursion). 9689 // Scalar-body / set-op-branch / set-op-CTE-body builds pass 9690 // allowFromSubqueries=false; buildRelation rejects there. 9691 relations.add(buildRelation(rightTable, provider, allowFromSubqueries)); 9692 // Slice 66: NATURAL admission via catalog inference. 
9693 // Computes shared keys against the running LeftOutputState 9694 // (which carries the accumulated row type so far) and 9695 // routes through the shared emitMergedJoinRefs helper. 9696 // Rejects with a side-specific diagnostic when catalog 9697 // is missing on either side. 9698 if (isNaturalJoinType(item.getJoinType())) { 9699 NaturalKeyResult r = naturalSharedKeys(leftState, rightTable, provider); 9700 if (r.kind != NaturalKeyResult.Kind.SUCCESS) { 9701 throw new SemanticIRBuildException( 9702 Diagnostic.error(DiagnosticCode.NATURAL_CATALOG_REQUIRED, 9703 formatNaturalCatalogReject(r), item)); 9704 } 9705 emitMergedJoinRefs(JoinKind.NATURAL, r.keys, join, items, i, 9706 rightTable, provider, joinRefsOut); 9707 mergeRightIntoLeftOutput(leftState, rightTable, provider, r.keys); 9708 continue; 9709 } 9710 // Slice 64: USING and ON are mutually exclusive 9711 // (enforced by rejectUnsupportedJoinShape). Populate 9712 // per-key joinColumnRefs from USING here, then skip the 9713 // onCond branch since it cannot be both. 9714 TObjectNameList usingCols = item.getUsingColumns(); 9715 if (usingCols != null && usingCols.size() > 0) { 9716 populateUsingJoinRefs(join, items, i, rightTable, 9717 usingCols, provider, joinRefsOut); 9718 // Slice 66: USING JoinItems also merge right into 9719 // the LeftOutputState so subsequent NATURAL JoinItems 9720 // see the accumulated row type (including the merged 9721 // USING keys at their original slots). 
9722 List<String> usingKeyNames = new ArrayList<>(usingCols.size()); 9723 for (int k = 0; k < usingCols.size(); k++) { 9724 TObjectName usingKey = usingCols.getObjectName(k); 9725 if (usingKey == null) continue; 9726 String keyName = usingKey.getColumnNameOnly(); 9727 if (keyName != null && !keyName.isEmpty()) { 9728 usingKeyNames.add(keyName); 9729 } 9730 } 9731 mergeRightIntoLeftOutput(leftState, rightTable, provider, usingKeyNames); 9732 continue; 9733 } 9734 // Slice 66: ON / CROSS JoinItem — append right's catalog 9735 // columns to the running LeftOutputState. NATURAL JoinItems 9736 // that follow will see the accumulated row type. 9737 appendRightToLeftOutput(leftState, rightTable, provider); 9738 TExpression onCond = item.getOnCondition(); 9739 if (onCond != null) { 9740 if (allowJoinOnPredicateSubqueries) { 9741 // Slice 23/24/25: outer-SELECT JOIN ON path — 9742 // extract uncorrelated predicate-subquery wrappers 9743 // (EXISTS / NOT EXISTS / IN-SELECT / NOT IN-SELECT / 9744 // scalar comparison subquery / ANY-ALL-SOME) as 9745 // their own <predicate_subquery_<i>> 9746 // StatementGraphs and skip their subtrees during 9747 // the slice-13 window guard and the JOIN-ON ref 9748 // collection. The remaining subquery-leak guard 9749 // rejects everything that is NOT an extracted 9750 // wrapper (correlated bodies, multi-column inner 9751 // projection, expression LHS, tuple LHS, subquery 9752 // on left side, etc.). 9753 Set<TExpression> extractedRoots = 9754 extractUncorrelatedPredicateSubqueriesFromJoinOn(onCond, provider, 9755 stmtsForExtraction, lineageForExtraction, 9756 cteMapForExtraction); 9757 rejectAnyRemainingSubqueriesInJoinOn(onCond, extractedRoots); 9758 rejectWindowFunctionInScopeSkipping(onCond, 9759 "JOIN ON condition", extractedRoots); 9760 joinRefsOut.addAll( 9761 collectColumnRefsSkipping(onCond, provider, extractedRoots)); 9762 } else { 9763 // Slice 13: reject window functions in JOIN ON before 9764 // collectColumnRefs descends. 
9765 rejectWindowFunctionInScope(onCond, "JOIN ON condition"); 9766 // Slice 17: predicate subqueries inside JOIN ON would otherwise 9767 // slip through collectColumnRefs and produce an incomplete IR. 9768 // Hardens the boundary at every non-outer `buildRelations` call site 9769 // (FROM-subquery body, CTE body, scalar body, set-op branch). 9770 // The slice-23 outer-SELECT path replaces this rejection with 9771 // selective EXISTS extraction (see the if branch above). 9772 rejectSubqueriesInJoinOn(onCond); 9773 joinRefsOut.addAll(collectColumnRefs(onCond, provider)); 9774 } 9775 } 9776 } 9777 } 9778 rejectDuplicateAliases(relations); 9779 return relations; 9780 } 9781 9782 /** 9783 * Slice 17: reject predicate subqueries (EXISTS, IN-SELECT, 9784 * scalar-subquery comparisons, etc.) inside a JOIN ON expression. 9785 * Without this guard, slice-17's expanded JOIN surface (relation 9786 * subqueries on either side) would let predicate subqueries slip 9787 * past {@code collectColumnRefs} and produce incomplete IR. Applies 9788 * to every {@code buildRelations} call site; the slice-11 9789 * {@link #rejectSubqueriesInScalarBodyClauses} and slice-17 9790 * {@link #rejectSubqueriesInFromSubqueryBodyClauses} fire BEFORE 9791 * the recursive {@code buildSelectStatement}, so their context- 9792 * specific messages preempt this one. 9793 */ 9794 private static void rejectSubqueriesInJoinOn(TExpression onCond) { 9795 if (containsAnySubqueryExpression(onCond)) { 9796 throw new SemanticIRBuildException( 9797 Diagnostic.error(DiagnosticCode.JOIN_ON_TOP_LEVEL_SUBQUERY_NOT_SUPPORTED, 9798 "subquery in a top-level JOIN ON predicate is not supported yet", onCond)); 9799 } 9800 } 9801 9802 // ==================================================================== 9803 // Slice 23: uncorrelated EXISTS subqueries in top-level outer-SELECT 9804 // JOIN ON. 
9805 // 9806 // Approach: walk the JOIN-ON expression looking for `exists_t` nodes 9807 // (and `not_t(exists_t(...))` for NOT EXISTS); validate each as 9808 // uncorrelated with a constant-only inner projection; build the inner 9809 // SELECT as its own `<predicate_subquery_<i>>` StatementGraph; record 9810 // the extracted `exists_t` root in a Set so the JOIN-ON window guard 9811 // and ref collector can skip its subtree. Predicate bodies are 9812 // unreachable from outer (no relation, no lineage edge) so they 9813 // contribute zero canonical edges — matching dlineage's behaviour for 9814 // EXISTS-in-JOIN-ON shapes that project a constant (the inner-shape 9815 // preflight enforces constant-only projection so this invariant 9816 // holds). 9817 // 9818 // Process: codex round 1 + round 2 plan reviews; v3 plan locked. 9819 // See roadmap §14.25 (slice-23 entry). 9820 // ==================================================================== 9821 9822 /** 9823 * True iff {@code e} is the root of an EXISTS predicate that slice 23 9824 * may extract: either an {@code exists_t} expression, or a 9825 * {@code logical_not_t} whose <b>right</b> operand is {@code exists_t}. 9826 * NOT EXISTS unwraps to its inner {@code exists_t}. 9827 * 9828 * <p>Note: the GSP parser puts the operand of {@code logical_not_t} in 9829 * {@link TExpression#getRightOperand()}, not {@code getLeftOperand()} 9830 * (verified across Oracle / PostgreSQL / MSSQL / MySQL / BigQuery). 9831 * The root fast-path for {@code NOT EXISTS} is therefore "dead" in the 9832 * sense that the descendant walker on the wrapping {@code logical_not_t} 9833 * already visits the child {@code exists_t} — we still keep it here so 9834 * the symmetry between root EXISTS and root NOT EXISTS is explicit. 
9835 * 9836 * <p>Slice 25 (kept as a dedicated helper for slice-23/24 callers and 9837 * for clarity): the slice-25 generalisation lives in 9838 * {@link #unwrapToInnerExtractableSubquery(TExpression)} which 9839 * recognises four wrapper shapes — including the two EXISTS shapes 9840 * here. 9841 */ 9842 private static boolean isExistsRoot(TExpression e) { 9843 if (e == null) return false; 9844 if (e.getExpressionType() == EExpressionType.exists_t) return true; 9845 if (e.getExpressionType() == EExpressionType.logical_not_t 9846 && e.getRightOperand() != null 9847 && e.getRightOperand().getExpressionType() == EExpressionType.exists_t) { 9848 return true; 9849 } 9850 return false; 9851 } 9852 9853 /** Return the actual {@code exists_t} node — unwrap a {@code logical_not_t} parent if present. */ 9854 private static TExpression unwrapExistsRoot(TExpression e) { 9855 if (e.getExpressionType() == EExpressionType.exists_t) return e; 9856 return e.getRightOperand(); 9857 } 9858 9859 /** 9860 * Slice 25 / Slice 26: pure shape-recogniser for the predicate- 9861 * subquery wrappers admitted in TOP-LEVEL JOIN ON. Returns the inner 9862 * extractable node ({@code subquery_t} or {@code exists_t}) for the 9863 * wrapper shapes; null otherwise. Pure — performs NO validation and 9864 * throws NO exceptions. 9865 * 9866 * <p>Recognised wrappers: 9867 * <ul> 9868 * <li>{@code exists_t} (slice-23 EXISTS) — returns {@code e}.</li> 9869 * <li>{@code logical_not_t} with rightOperand {@code exists_t} 9870 * (slice-23 NOT EXISTS) — returns the inner exists_t.</li> 9871 * <li>{@code in_t} with rightOperand {@code subquery_t} 9872 * (slice 25 IN-SELECT / NOT IN-SELECT) — returns the 9873 * rightOperand. 
LHS-subquery {@code in_t} returns null 9874 * (slice 26 boundary: dlineage's {@code fdr clause="on"} 9875 * sources omit the outer column for IN-LHS, so admitting on 9876 * the IR side would manufacture canonical-model divergence).</li> 9877 * <li>{@code simple_comparison_t} (slice 25 + slice 26 scalar 9878 * comparison) — returns the operand on whichever side is a 9879 * {@code subquery_t}. RHS-subquery (slice 25) and LHS- 9880 * subquery (slice 26) are both admitted; both-sides subquery 9881 * returns null and falls through to {@link #findSubqueryOnLeftWrapper}'s 9882 * new "both subqueries" rejection branch.</li> 9883 * <li>{@code group_comparison_t} with rightOperand 9884 * {@code subquery_t} AND non-null {@code getQuantifier()} 9885 * (slice 25 ANY/ALL/SOME) — returns the rightOperand. 9886 * LHS-subquery {@code group_comparison_t} returns null 9887 * (slice 26 boundary: borderline grammar; not probed).</li> 9888 * </ul> 9889 * 9890 * <p>For null returns, the wrapper either is not a recognised shape 9891 * (falls through to the slice-23 generic remaining-subquery rejection 9892 * in {@link #rejectAnyRemainingSubqueriesInJoinOn}) OR has the right 9893 * outer shape but the LHS / RHS positioning is unsupported (subquery 9894 * on left side of IN/quantifier, both sides subquery for cmp, tuple 9895 * LHS / RHS, expression LHS / RHS). The walker validates the 9896 * non-subquery side via {@link #isAdmittedOuterLhsShape} or 9897 * {@link #isAdmittedOuterRhsShape} and throws a slice-25 / slice-26 9898 * tuned message before calling this helper for extraction. 9899 * 9900 * <p>The slice-23/24 EXISTS callers ({@code isExistsRoot} and 9901 * {@code unwrapExistsRoot}) remain in place — both are simple 9902 * boolean / unwrap helpers; this method consolidates the slice-25 / 9903 * slice-26 shape decision in one place. 
9904 */ 9905 private static TExpression unwrapToInnerExtractableSubquery(TExpression e) { 9906 if (e == null) return null; 9907 EExpressionType t = e.getExpressionType(); 9908 if (t == EExpressionType.exists_t) return e; 9909 if (t == EExpressionType.logical_not_t 9910 && e.getRightOperand() != null 9911 && e.getRightOperand().getExpressionType() == EExpressionType.exists_t) { 9912 return e.getRightOperand(); 9913 } 9914 TExpression l = e.getLeftOperand(); 9915 TExpression r = e.getRightOperand(); 9916 boolean lhsIsSubq = l != null && l.getExpressionType() == EExpressionType.subquery_t; 9917 boolean rhsIsSubq = r != null && r.getExpressionType() == EExpressionType.subquery_t; 9918 if (t == EExpressionType.in_t) { 9919 return rhsIsSubq ? r : null; 9920 } 9921 if (t == EExpressionType.simple_comparison_t) { 9922 // Slice 26: admit subquery on either single side. Both sides 9923 // → null (rejected via findSubqueryOnLeftWrapper's new 9924 // "both subqueries" branch — see isSubqueryOnLeftOfWrapper). 9925 if (lhsIsSubq && rhsIsSubq) return null; 9926 if (rhsIsSubq) return r; 9927 if (lhsIsSubq) return l; 9928 return null; 9929 } 9930 if (t == EExpressionType.group_comparison_t 9931 && e.getQuantifier() != null) { 9932 return rhsIsSubq ? r : null; 9933 } 9934 return null; 9935 } 9936 9937 /** 9938 * Slice 25: admitted LHS shapes for non-EXISTS predicate-subquery 9939 * wrappers ({@code in_t} / {@code simple_comparison_t} / 9940 * {@code group_comparison_t}) when the subquery is on the RHS. 9941 * 9942 * <p>Admits ONLY {@link EExpressionType#simple_object_name_t} — 9943 * a single column reference, qualified or unqualified. Rejects: 9944 * tuple expressions ({@code (a, b) IN (...)}), parenthesized 9945 * wrapping ({@code (e.col) IN (...)}), arithmetic 9946 * ({@code e.col + 1 IN (...)}), function calls 9947 * ({@code UPPER(e.col) IN (...)}), scalar subqueries on LHS, and 9948 * any other non-column shape. 
9949 * 9950 * <p>Slice-25 boundary; future slice may admit parenthesized 9951 * column refs (slice 26+). 9952 */ 9953 private static boolean isAdmittedOuterLhsShape(TExpression lhs) { 9954 return lhs != null 9955 && lhs.getExpressionType() == EExpressionType.simple_object_name_t; 9956 } 9957 9958 /** 9959 * Slice 26: admitted RHS shapes for {@code simple_comparison_t} 9960 * with subquery on the LHS. Mirror of 9961 * {@link #isAdmittedOuterLhsShape}: admits ONLY 9962 * {@link EExpressionType#simple_object_name_t} — a single column 9963 * reference, qualified or unqualified. Rejects tuple, parenthesized, 9964 * arithmetic, function-call, and subquery (the "both subqueries" 9965 * shape is rejected separately via 9966 * {@link #isSubqueryOnLeftOfWrapper}'s new 9967 * {@code simple_comparison_t} both-sides branch). 9968 * 9969 * <p>Slice-26 boundary: only {@code simple_comparison_t} reaches 9970 * this helper (the walker dispatches on which side is the 9971 * subquery). {@code in_t} / {@code group_comparison_t} with LHS 9972 * subquery return null from 9973 * {@link #unwrapToInnerExtractableSubquery} so they never reach 9974 * here. 9975 */ 9976 private static boolean isAdmittedOuterRhsShape(TExpression rhs) { 9977 return rhs != null 9978 && rhs.getExpressionType() == EExpressionType.simple_object_name_t; 9979 } 9980 9981 /** 9982 * Slice 25 (impl-review M1-fix): true iff {@code e} is a 9983 * {@code logical_not_t} wrapping a slice-25 IN / scalar-cmp / 9984 * ANY-ALL-SOME wrapper (i.e. NOT applied to an admitted slice-25 9985 * shape that ISN'T an EXISTS). The descendant walker would 9986 * otherwise traverse INTO this {@code logical_not_t} and find the 9987 * child wrapper, accidentally admitting 9988 * {@code NOT (e.col IN (SELECT ...))} which is NOT a slice-25 9989 * recognised shape ({@code unwrapToInnerExtractableSubquery} 9990 * matches {@code logical_not_t} only when the inner is 9991 * {@code exists_t}). 
9992 * 9993 * <p>This helper is consulted by the extraction walker BEFORE it 9994 * descends into the children of a {@code logical_not_t}, so the 9995 * rejection happens at the wrapper level with a tuned message 9996 * pointing at the slice-25 boundary. 9997 */ 9998 private static boolean isLogicalNotOverNonExistsWrapper(TExpression e) { 9999 if (e == null) return false; 10000 if (e.getExpressionType() != EExpressionType.logical_not_t) return false; 10001 TExpression r = e.getRightOperand(); 10002 if (r == null) return false; 10003 // Strip parenthesis_t chain. The Oracle parser wraps 10004 // `NOT (e.col IN (SELECT...))` as 10005 // logical_not_t → parenthesis_t → in_t, so the immediate 10006 // right child is parenthesis_t. Descend through any chain of 10007 // parens to find the actual subject. Note: 10008 // {@code parenthesis_t} stores its child on 10009 // {@link TExpression#getLeftOperand()} (mirroring 10010 // {@link #isConstantExpression}'s descent). 10011 TExpression subject = r; 10012 while (subject != null 10013 && subject.getExpressionType() == EExpressionType.parenthesis_t) { 10014 subject = subject.getLeftOperand(); 10015 } 10016 if (subject == null) return false; 10017 if (subject.getExpressionType() == EExpressionType.exists_t) return false; 10018 // Either an in_t / simple_comparison_t / group_comparison_t 10019 // with subquery RHS, or any of those types directly. 10020 return unwrapToInnerExtractableSubquery(subject) != null; 10021 } 10022 10023 /** 10024 * Slice 25 / Slice 26: build a tuned outer-shape rejection message 10025 * for a non-EXISTS predicate-subquery wrapper. Called from the 10026 * extraction walker when {@link #unwrapToInnerExtractableSubquery} 10027 * returns non-null for an in_t / simple_comparison_t / 10028 * group_comparison_t but the non-subquery side is not admitted by 10029 * {@link #isAdmittedOuterLhsShape} (slice 25 — subquery on RHS) or 10030 * {@link #isAdmittedOuterRhsShape} (slice 26 — subquery on LHS). 
10031 * 10032 * <p>{@code isLhsSubquery} indicates which side of the wrapper 10033 * carries the subquery: {@code true} = subquery on LHS (slice 26 10034 * path; we validate the wrapper's RHS), {@code false} = subquery 10035 * on RHS (slice 25 path; we validate the wrapper's LHS). 10036 * 10037 * <p>Uses the slice-25 outer-shape error prefix 10038 * "predicate subquery in JOIN ON:" so end users distinguish 10039 * outer-shape failures from the slice-23/24 inner-shape failures 10040 * (which keep the "EXISTS in JOIN ON:" prefix). 10041 */ 10042 private static String buildOuterShapeRejectionMessage(TExpression wrapper, 10043 boolean isLhsSubquery, 10044 PredicateClauseContext ctx) { 10045 EExpressionType t = wrapper.getExpressionType(); 10046 String shapeLabel; 10047 if (t == EExpressionType.in_t) shapeLabel = "IN"; 10048 else if (t == EExpressionType.simple_comparison_t) shapeLabel = "comparison"; 10049 else if (t == EExpressionType.group_comparison_t) shapeLabel = "ANY/ALL/SOME"; 10050 else shapeLabel = String.valueOf(t); 10051 // Validate the side that does NOT carry the subquery. 10052 TExpression nonSubquerySide = isLhsSubquery 10053 ? wrapper.getRightOperand() 10054 : wrapper.getLeftOperand(); 10055 String sideLabel = isLhsSubquery ? "RHS" : "LHS"; 10056 EExpressionType sideType = nonSubquerySide == null 10057 ? null : nonSubquerySide.getExpressionType(); 10058 String detail; 10059 if (nonSubquerySide == null) { 10060 detail = "missing " + sideLabel; 10061 } else if (sideType == EExpressionType.list_t) { 10062 detail = "tuple " + sideLabel; 10063 } else if (sideType == EExpressionType.parenthesis_t) { 10064 detail = "parenthesized " + sideLabel; 10065 } else if (sideType == EExpressionType.simple_object_name_t) { 10066 // Defensive: should not be reached when the corresponding 10067 // admitted-shape helper returns true. 
10068 detail = "unexpected admitted " + sideLabel + " shape"; 10069 } else { 10070 detail = "expression " + sideLabel + " (" + sideType + ")"; 10071 } 10072 String boundary = isLhsSubquery ? "slice 26 boundary" : "slice 25 boundary"; 10073 return "predicate subquery in " + ctx.clauseLabel + ": " + shapeLabel 10074 + " wrapper has unsupported " + sideLabel + " shape (" 10075 + detail + "); only a single column reference " 10076 + "(simple_object_name_t) is admitted on the " 10077 + sideLabel 10078 + " of a comparison / IN / ANY-ALL-SOME " 10079 + "predicate subquery when the other side is a " 10080 + "subquery (" + boundary + ")"; 10081 } 10082 10083 /** 10084 * Slice 25 (rename of slice-23 10085 * {@code extractUncorrelatedExistsFromJoinOn}): walk the JOIN-ON 10086 * expression, extract every uncorrelated predicate-subquery wrapper 10087 * (EXISTS / NOT EXISTS / IN-SELECT / NOT IN-SELECT / scalar 10088 * comparison subquery / ANY-ALL-SOME) as its own 10089 * {@code <predicate_subquery_<i>>} StatementGraph, and return the 10090 * set of extracted inner nodes (the {@code exists_t} or 10091 * {@code subquery_t}, NOT the wrapping {@code in_t} / 10092 * {@code simple_comparison_t} / {@code group_comparison_t} / 10093 * {@code logical_not_t}) keyed on identity. 10094 * 10095 * <p>The set is consumed by the JOIN-ON window-function guard, the 10096 * JOIN-ON ref collector, and the slice-17 remaining-subquery 10097 * rejecter — each of those skips INTO / PAST these subtrees so 10098 * inner refs do not leak into outer joinColumnRefs. Critically, 10099 * the wrapper itself (e.g. an {@code in_t} whose RHS is the 10100 * {@code subquery_t}) is NOT in the set — this lets the LHS column 10101 * reference (e.g. {@code e.dept_id} in 10102 * {@code e.dept_id IN (SELECT ...)}) be collected normally into 10103 * outer's {@code joinColumnRefs}. 
10104 * 10105 * <p>The walker handles BOTH the root-position case (the entire ON 10106 * IS one of the four wrappers, which {@code acceptChildren} would 10107 * not visit as a node) AND descendant positions (e.g. 10108 * {@code e.id = d.id AND e.dept_id IN (SELECT ...)}). Multiple 10109 * wrappers in one ON, multiple ON across multiple JOINs, and mixed 10110 * EXISTS / IN / cmp / ANY-ALL combinations are all handled. 10111 * 10112 * <p>Slice 25 / Slice 26 outer-shape validation: for non-EXISTS 10113 * wrappers, the side opposite the subquery must be a single 10114 * {@code simple_object_name_t} column ref. Slice 25 admits subquery 10115 * on RHS only and validates LHS via 10116 * {@link #isAdmittedOuterLhsShape}. Slice 26 lifts {@code 10117 * simple_comparison_t} to also admit subquery on LHS and validates 10118 * RHS via {@link #isAdmittedOuterRhsShape}. Tuple / parenthesized / 10119 * expression / function-call shapes on the validated side throw 10120 * {@link SemanticIRBuildException} with a tuned message via 10121 * {@link #buildOuterShapeRejectionMessage}. The EXISTS branch has 10122 * no outer-shape gate (slice-23 carryover). 10123 * 10124 * <p>Snapshot/rollback wrapper at the outer-SELECT call site 10125 * ({@link #build}) catches a partial extraction (e.g. third 10126 * wrapper rejected after first two extracted) and truncates 10127 * {@code stmts}/{@code lineage} back to the snapshot. 10128 */ 10129 /** 10130 * Slice 110 — context bag threading clause-specific 10131 * {@link DiagnosticCode}s and a clause-label into the slice-23+ 10132 * predicate-subquery extraction pipeline so the same walker code can 10133 * power JOIN-ON (slice 23–33+) and UPDATE WHERE (slice 110) without 10134 * code duplication. 
10135 * 10136 * <p>Two static instances exist: 10137 * <ul> 10138 * <li>{@link #JOIN_ON} — preserves slice-23+ JOIN-ON behavior 10139 * byte-for-byte (same codes, same "JOIN ON" labels).</li> 10140 * <li>{@link #UPDATE_WHERE} — slice 110 UPDATE WHERE call site 10141 * (parallel {@code UPDATE_WHERE_*} codes; "UPDATE WHERE clause" 10142 * label).</li> 10143 * </ul> 10144 * 10145 * <p>Codes per clause are intentionally parallel (slice-80 10146 * granular-codes contract: each semantic reject reason gets its own 10147 * stable API code rather than an umbrella code with discriminating 10148 * message text). 10149 */ 10150 private static final class PredicateClauseContext { 10151 /** Used as the "in <label>" piece of every diagnostic message. */ 10152 final String clauseLabel; 10153 final DiagnosticCode existsBodyMissing; 10154 final DiagnosticCode existsInnerRelationUnknown; 10155 final DiagnosticCode existsCorrelatedUnknownOuterAlias; 10156 final DiagnosticCode predicateNotNot; 10157 final DiagnosticCode outerShapeRejected; 10158 final DiagnosticCode scalarComparisonBothSides; 10159 final DiagnosticCode predicateSubqueryOnLeft; 10160 final DiagnosticCode genericSubqueryNotSupported; 10161 10162 private PredicateClauseContext(String clauseLabel, 10163 DiagnosticCode existsBodyMissing, 10164 DiagnosticCode existsInnerRelationUnknown, 10165 DiagnosticCode existsCorrelatedUnknownOuterAlias, 10166 DiagnosticCode predicateNotNot, 10167 DiagnosticCode outerShapeRejected, 10168 DiagnosticCode scalarComparisonBothSides, 10169 DiagnosticCode predicateSubqueryOnLeft, 10170 DiagnosticCode genericSubqueryNotSupported) { 10171 this.clauseLabel = clauseLabel; 10172 this.existsBodyMissing = existsBodyMissing; 10173 this.existsInnerRelationUnknown = existsInnerRelationUnknown; 10174 this.existsCorrelatedUnknownOuterAlias = existsCorrelatedUnknownOuterAlias; 10175 this.predicateNotNot = predicateNotNot; 10176 this.outerShapeRejected = outerShapeRejected; 10177 
this.scalarComparisonBothSides = scalarComparisonBothSides; 10178 this.predicateSubqueryOnLeft = predicateSubqueryOnLeft; 10179 this.genericSubqueryNotSupported = genericSubqueryNotSupported; 10180 } 10181 10182 static final PredicateClauseContext JOIN_ON = new PredicateClauseContext( 10183 "JOIN ON", 10184 DiagnosticCode.JOIN_ON_EXISTS_BODY_MISSING, 10185 DiagnosticCode.JOIN_ON_EXISTS_INNER_RELATION_UNKNOWN, 10186 DiagnosticCode.JOIN_ON_EXISTS_CORRELATED_UNKNOWN_OUTER_ALIAS, 10187 DiagnosticCode.JOIN_ON_PREDICATE_NOT_NOT_SUPPORTED, 10188 DiagnosticCode.JOIN_ON_OUTER_SHAPE_REJECTED, 10189 DiagnosticCode.JOIN_ON_PREDICATE_SCALAR_COMPARISON_NOT_LIFTABLE, 10190 DiagnosticCode.JOIN_ON_PREDICATE_NOT_LIFTABLE, 10191 DiagnosticCode.JOIN_ON_PREDICATE_GENERIC_NOT_SUPPORTED); 10192 10193 static final PredicateClauseContext UPDATE_WHERE = new PredicateClauseContext( 10194 "UPDATE WHERE clause", 10195 DiagnosticCode.UPDATE_WHERE_EXISTS_BODY_MISSING, 10196 DiagnosticCode.UPDATE_WHERE_EXISTS_INNER_RELATION_UNKNOWN, 10197 DiagnosticCode.UPDATE_WHERE_EXISTS_CORRELATED_UNKNOWN_OUTER_ALIAS, 10198 DiagnosticCode.UPDATE_WHERE_PREDICATE_NOT_NOT_SUPPORTED, 10199 DiagnosticCode.UPDATE_WHERE_OUTER_SHAPE_REJECTED, 10200 DiagnosticCode.UPDATE_WHERE_PREDICATE_SCALAR_COMPARISON_NOT_LIFTABLE, 10201 DiagnosticCode.UPDATE_WHERE_PREDICATE_NOT_LIFTABLE, 10202 DiagnosticCode.UPDATE_WHERE_PREDICATE_GENERIC_NOT_SUPPORTED); 10203 10204 static final PredicateClauseContext DELETE_WHERE = new PredicateClauseContext( 10205 "DELETE WHERE clause", 10206 DiagnosticCode.DELETE_WHERE_EXISTS_BODY_MISSING, 10207 DiagnosticCode.DELETE_WHERE_EXISTS_INNER_RELATION_UNKNOWN, 10208 DiagnosticCode.DELETE_WHERE_EXISTS_CORRELATED_UNKNOWN_OUTER_ALIAS, 10209 DiagnosticCode.DELETE_WHERE_PREDICATE_NOT_NOT_SUPPORTED, 10210 DiagnosticCode.DELETE_WHERE_OUTER_SHAPE_REJECTED, 10211 DiagnosticCode.DELETE_WHERE_PREDICATE_SCALAR_COMPARISON_NOT_LIFTABLE, 10212 DiagnosticCode.DELETE_WHERE_PREDICATE_NOT_LIFTABLE, 10213 
DiagnosticCode.DELETE_WHERE_PREDICATE_GENERIC_NOT_SUPPORTED); 10214 10215 static final PredicateClauseContext SELECT_WHERE = new PredicateClauseContext( 10216 "SELECT WHERE clause", 10217 DiagnosticCode.SELECT_WHERE_EXISTS_BODY_MISSING, 10218 DiagnosticCode.SELECT_WHERE_EXISTS_INNER_RELATION_UNKNOWN, 10219 DiagnosticCode.SELECT_WHERE_EXISTS_CORRELATED_UNKNOWN_OUTER_ALIAS, 10220 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_NOT_SUPPORTED, 10221 DiagnosticCode.SELECT_WHERE_OUTER_SHAPE_REJECTED, 10222 DiagnosticCode.SELECT_WHERE_PREDICATE_SCALAR_COMPARISON_NOT_LIFTABLE, 10223 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_LIFTABLE, 10224 DiagnosticCode.SELECT_WHERE_PREDICATE_GENERIC_NOT_SUPPORTED); 10225 10226 /** 10227 * Slice 113 — uncorrelated WHERE-side predicate subqueries on 10228 * set-op branches (UNION / INTERSECT / EXCEPT / MINUS branches). 10229 * Reuses every {@link DiagnosticCode} from {@link #SELECT_WHERE} 10230 * because a branch IS a SELECT — the shape rejects are 10231 * semantically identical to top-level SELECT WHERE. Only the 10232 * {@code clauseLabel} differs so diagnostic messages distinguish 10233 * the nested context (helpful when a multi-branch query reports 10234 * a reject and the user needs to know which branch). Keeping the 10235 * codes shared frees consumers from a new code-family migration 10236 * and preserves the enum count at 279. 
10237 */ 10238 static final PredicateClauseContext SET_OP_BRANCH_WHERE = new PredicateClauseContext( 10239 "set-op branch WHERE clause", 10240 DiagnosticCode.SELECT_WHERE_EXISTS_BODY_MISSING, 10241 DiagnosticCode.SELECT_WHERE_EXISTS_INNER_RELATION_UNKNOWN, 10242 DiagnosticCode.SELECT_WHERE_EXISTS_CORRELATED_UNKNOWN_OUTER_ALIAS, 10243 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_NOT_SUPPORTED, 10244 DiagnosticCode.SELECT_WHERE_OUTER_SHAPE_REJECTED, 10245 DiagnosticCode.SELECT_WHERE_PREDICATE_SCALAR_COMPARISON_NOT_LIFTABLE, 10246 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_LIFTABLE, 10247 DiagnosticCode.SELECT_WHERE_PREDICATE_GENERIC_NOT_SUPPORTED); 10248 10249 /** 10250 * Slice 114 — uncorrelated WHERE-side predicate subqueries 10251 * inside a non-set-op CTE body (the SELECT body of a single CTE 10252 * in a WITH list on SELECT / MERGE / UPDATE / DELETE). Reuses 10253 * every {@link DiagnosticCode} from {@link #SELECT_WHERE} 10254 * because a CTE body IS a SELECT — the shape rejects are 10255 * semantically identical to top-level SELECT WHERE. Only the 10256 * {@code clauseLabel} differs so a reject in a 10257 * {@code WITH cte AS (SELECT ... WHERE NOT (...))} shape can 10258 * identify the CTE-body host context in the diagnostic message. 10259 * Keeping the codes shared (slice 113 precedent) preserves the 10260 * enum count at 279 and frees consumers from another 10261 * code-family migration. 
10262 */ 10263 static final PredicateClauseContext CTE_BODY_WHERE = new PredicateClauseContext( 10264 "CTE body WHERE clause", 10265 DiagnosticCode.SELECT_WHERE_EXISTS_BODY_MISSING, 10266 DiagnosticCode.SELECT_WHERE_EXISTS_INNER_RELATION_UNKNOWN, 10267 DiagnosticCode.SELECT_WHERE_EXISTS_CORRELATED_UNKNOWN_OUTER_ALIAS, 10268 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_NOT_SUPPORTED, 10269 DiagnosticCode.SELECT_WHERE_OUTER_SHAPE_REJECTED, 10270 DiagnosticCode.SELECT_WHERE_PREDICATE_SCALAR_COMPARISON_NOT_LIFTABLE, 10271 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_LIFTABLE, 10272 DiagnosticCode.SELECT_WHERE_PREDICATE_GENERIC_NOT_SUPPORTED); 10273 10274 /** 10275 * Slice 120 — uncorrelated WHERE-side predicate subqueries inside 10276 * a FROM-subquery body (the inner SELECT of a {@code FROM (...)} 10277 * derived table). Reuses every {@link DiagnosticCode} from 10278 * {@link #SELECT_WHERE} (slice 113/114/116 precedent) because a 10279 * FROM-subquery body IS a SELECT — the shape rejects are 10280 * semantically identical to top-level SELECT WHERE. Only the 10281 * {@code clauseLabel} differs so a reject inside a 10282 * {@code FROM (SELECT ... WHERE NOT (...)) sub} shape can identify 10283 * the FROM-subquery host context in the diagnostic message. The 10284 * FROM-subquery body builder {@code processDirectSubqueryTable} is 10285 * shared by the SELECT, UPDATE (slice 83), and DELETE (slice 84) 10286 * FROM-subquery extractors, so this single context lifts all three. 10287 * Keeping the codes shared preserves the enum count at 279 and 10288 * frees consumers from another code-family migration. 
10289 */ 10290 static final PredicateClauseContext FROM_SUBQUERY_BODY_WHERE = new PredicateClauseContext( 10291 "FROM-subquery body WHERE clause", 10292 DiagnosticCode.SELECT_WHERE_EXISTS_BODY_MISSING, 10293 DiagnosticCode.SELECT_WHERE_EXISTS_INNER_RELATION_UNKNOWN, 10294 DiagnosticCode.SELECT_WHERE_EXISTS_CORRELATED_UNKNOWN_OUTER_ALIAS, 10295 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_NOT_SUPPORTED, 10296 DiagnosticCode.SELECT_WHERE_OUTER_SHAPE_REJECTED, 10297 DiagnosticCode.SELECT_WHERE_PREDICATE_SCALAR_COMPARISON_NOT_LIFTABLE, 10298 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_LIFTABLE, 10299 DiagnosticCode.SELECT_WHERE_PREDICATE_GENERIC_NOT_SUPPORTED); 10300 10301 /** 10302 * Slice 116 — uncorrelated WHERE-side predicate subqueries on 10303 * MERGE per-WHEN action WHEREs ({@code TMergeUpdateClause.updateWhereClause}, 10304 * {@code TMergeUpdateClause.deleteWhereClause}, 10305 * {@code TMergeInsertClause.insertWhereClause}). Reuses every 10306 * {@link DiagnosticCode} from {@link #SELECT_WHERE} (slice 113/114 10307 * precedent) because a MERGE-action WHERE predicate IS a SELECT 10308 * WHERE in shape — the shape rejects are semantically identical to 10309 * top-level SELECT WHERE. Only the {@code clauseLabel} differs so a 10310 * reject inside a MERGE WHEN can identify the host context in the 10311 * diagnostic message. Keeping the codes shared preserves the enum 10312 * count at 279 and frees consumers from another code-family 10313 * migration. 
10314 */ 10315 static final PredicateClauseContext MERGE_WHEN_WHERE = new PredicateClauseContext( 10316 "MERGE WHEN action WHERE clause", 10317 DiagnosticCode.SELECT_WHERE_EXISTS_BODY_MISSING, 10318 DiagnosticCode.SELECT_WHERE_EXISTS_INNER_RELATION_UNKNOWN, 10319 DiagnosticCode.SELECT_WHERE_EXISTS_CORRELATED_UNKNOWN_OUTER_ALIAS, 10320 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_NOT_SUPPORTED, 10321 DiagnosticCode.SELECT_WHERE_OUTER_SHAPE_REJECTED, 10322 DiagnosticCode.SELECT_WHERE_PREDICATE_SCALAR_COMPARISON_NOT_LIFTABLE, 10323 DiagnosticCode.SELECT_WHERE_PREDICATE_NOT_LIFTABLE, 10324 DiagnosticCode.SELECT_WHERE_PREDICATE_GENERIC_NOT_SUPPORTED); 10325 } 10326 10327 /** 10328 * Slice 110 — preserved entry-point alias for the JOIN-ON walker. 10329 * Delegates to {@link #extractUncorrelatedPredicateSubqueriesFromClause} 10330 * with {@link PredicateClauseContext#JOIN_ON} so existing JOIN-ON 10331 * callers (single site in {@code buildRelations}) need no change and 10332 * the slice-23+ diagnostic byte-shape is preserved exactly. 10333 */ 10334 private static Set<TExpression> extractUncorrelatedPredicateSubqueriesFromJoinOn( 10335 TExpression onCond, 10336 final NameBindingProvider provider, 10337 final List<StatementGraph> stmts, 10338 final List<LineageEdge> lineage, 10339 final Map<String, Integer> cteMapForExtraction) { 10340 return extractUncorrelatedPredicateSubqueriesFromClause( 10341 onCond, provider, stmts, lineage, cteMapForExtraction, 10342 PredicateClauseContext.JOIN_ON, 10343 /*correlationScope=*/ null); 10344 } 10345 10346 /** 10347 * Slice 118 — overload preserved for the slice-110 / 111 / 112 / 113 / 10348 * 114 / 116 call sites that don't admit correlation. Delegates to the 10349 * 8-arg form with {@code correlationScope=null}. 
10350 */ 10351 private static Set<TExpression> extractUncorrelatedPredicateSubqueriesFromClause( 10352 TExpression onCond, 10353 final NameBindingProvider provider, 10354 final List<StatementGraph> stmts, 10355 final List<LineageEdge> lineage, 10356 final Map<String, Integer> cteMapForExtraction, 10357 final PredicateClauseContext ctx) { 10358 return extractUncorrelatedPredicateSubqueriesFromClause(onCond, 10359 provider, stmts, lineage, cteMapForExtraction, ctx, 10360 /*correlationScope=*/ null); 10361 } 10362 10363 /** 10364 * Slice 118 — same as the 7-arg overload but threads an optional 10365 * {@code correlationScope} (target + USING source + outer CTEs) into 10366 * {@link #extractOnePredicateSubqueryBody}. When non-null, the inner 10367 * predicate-body build uses tolerant outer binding and the post-build 10368 * correlation walk PROMOTES outer-aliased refs into synthesised 10369 * OUTER_REFERENCE relations instead of rejecting them. The FILTER and 10370 * WITHIN GROUP correlation walks remain active (codex round-1 Q2 10371 * BLOCKING fix) so refs hidden inside FILTER subtrees or PG 10372 * {@code fn.withinGroup.orderBy} continue to reject. 10373 * 10374 * <p>All non-MERGE callers pass {@code correlationScope=null} and 10375 * therefore see byte-identical behaviour. Only 10376 * {@link #collectMergeActionWhere} passes a non-null scope (built once 10377 * per MERGE in {@code buildMerge} via 10378 * {@link #buildMergeEnclosingScope}). 
*/
    private static Set<TExpression> extractUncorrelatedPredicateSubqueriesFromClause(
            TExpression onCond,
            final NameBindingProvider provider,
            final List<StatementGraph> stmts,
            final List<LineageEdge> lineage,
            final Map<String, Integer> cteMapForExtraction,
            final PredicateClauseContext ctx,
            final EnclosingScope correlationScope) {
        // Defensive state checks. An absent clause (onCond == null) is not
        // an error: it yields an empty, identity-based set. A present
        // clause requires stmts / lineage / cteMap; a null there means a
        // call site was wired through this path without supplying the
        // required state, which is reported as a caller misconfiguration.
        // NOTE(review): the earlier wording said this path is reachable
        // only from buildRelations with allowJoinOnPredicateSubqueries=true.
        // The method is now parameterised by ctx and also has a MERGE
        // caller, so that statement looks stale — confirm against callers.
        // ctx itself is dereferenced unguarded (ctx.clauseLabel), so it
        // must be non-null.
        if (onCond == null) {
            return Collections.newSetFromMap(new java.util.IdentityHashMap<TExpression, Boolean>());
        }
        if (stmts == null || lineage == null || cteMapForExtraction == null) {
            throw new IllegalStateException(
                    "extractUncorrelatedPredicateSubqueriesFromClause("
                            + ctx.clauseLabel + ") activated without required state — "
                            + "stmts=" + (stmts == null ? "null" : "ok")
                            + " lineage=" + (lineage == null ? "null" : "ok")
                            + " cteMap=" + (cteMapForExtraction == null ? "null" : "ok")
                            + "; caller misconfiguration");
        }
        // Identity-based set: membership is by node reference, not by
        // equals(), so two structurally similar subquery nodes are tracked
        // separately.
        final Set<TExpression> extractedRoots =
                Collections.newSetFromMap(new java.util.IdentityHashMap<TExpression, Boolean>());
        // Slice 25 (impl-review M1-fix): explicit reject for
        // {@code logical_not_t} over a slice-25 IN / scalar-cmp /
        // ANY-ALL-SOME wrapper at the root. The slice-23/24
        // {@code logical_not_t} over {@code exists_t} (NOT EXISTS)
        // remains admitted by unwrapToInnerExtractableSubquery.
        if (isLogicalNotOverNonExistsWrapper(onCond)) {
            throw new SemanticIRBuildException(
                    Diagnostic.error(ctx.predicateNotNot,
                            "predicate subquery in " + ctx.clauseLabel + ": NOT applied to "
                                    + "a non-EXISTS predicate subquery wrapper "
                                    + "(" + onCond.getRightOperand().getExpressionType()
                                    + ") is not supported yet — the slice-25 boundary "
                                    + "admits NOT only over EXISTS; "
                                    + "rewrite e.g. NOT (a IN (SELECT ...)) as "
                                    + "a NOT IN (SELECT ...)", onCond));
        }
        // Root fast path: acceptChildren never visits the root node, so
        // a clause whose entire expression IS a wrapper would be missed
        // by the descendant walker.
        TExpression rootExtractable = unwrapToInnerExtractableSubquery(onCond);
        if (rootExtractable != null) {
            // M1-fix + slice-26 dual-side: validate the non-subquery
            // side of non-EXISTS wrappers BEFORE extracting (so partial
            // extraction never lands). Slice 25 carryover: subquery on
            // RHS → validate LHS via isAdmittedOuterLhsShape.
            // Slice 26 NEW: subquery on LHS (simple_comparison_t only)
            // → validate RHS via isAdmittedOuterRhsShape.
            if (onCond.getExpressionType() != EExpressionType.exists_t
                    && onCond.getExpressionType() != EExpressionType.logical_not_t) {
                // Reference comparison: the extractable node IS the left
                // operand exactly when the subquery sits on the LHS.
                boolean isLhsSubquery = (rootExtractable == onCond.getLeftOperand());
                boolean nonSubquerySideOk = isLhsSubquery
                        ? isAdmittedOuterRhsShape(onCond.getRightOperand())
                        : isAdmittedOuterLhsShape(onCond.getLeftOperand());
                if (!nonSubquerySideOk) {
                    throw new SemanticIRBuildException(
                            Diagnostic.error(ctx.outerShapeRejected,
                                    buildOuterShapeRejectionMessage(onCond, isLhsSubquery, ctx),
                                    onCond));
                }
            }
            extractOnePredicateSubqueryBody(rootExtractable, provider, stmts, lineage,
                    cteMapForExtraction, ctx, correlationScope);
            extractedRoots.add(rootExtractable);
        }
        // Descendant walk: find every wrapper at any depth. Skip into
        // already-extracted subtrees (so we don't re-enter the body
        // looking for nested wrappers — covered by the inner-shape
        // preflight's "no nested predicate subqueries in body"
        // rejection).
        onCond.acceptChildren(new TParseTreeVisitor() {
            // Track depth into already-extracted roots and into wrapper
            // subtrees we've extracted. preVisit increments on the
            // wrapper (the parent that contained the inner extractable);
            // postVisit decrements on either the inner extractable
            // (extractedRoots.contains) or the wrapper
            // (unwrapToInnerExtractableSubquery != null). The
            // {@code skipDepth > 0} guard prevents the second
            // decrement from going negative when both apply (e.g. NOT
            // EXISTS — both the logical_not_t wrapper and the inner
            // exists_t fire).
            int skipDepth = 0;

            @Override
            public void preVisit(TExpression e) {
                // Inside an extracted subtree: ignore everything until
                // postVisit brings the depth back to zero.
                if (skipDepth > 0) return;
                if (extractedRoots.contains(e)) {
                    // Already-extracted inner being re-visited shouldn't
                    // happen in normal traversal but defensive guard
                    // avoids double-extraction if it ever did.
                    skipDepth++;
                    return;
                }
                // Slice 25 (impl-review M1-fix): explicit reject for
                // {@code logical_not_t} over a slice-25 wrapper at any
                // depth. Without this, the visitor would descend into
                // the wrapper child and silently extract — admitting
                // a shape (`NOT (a IN (SELECT ...))`) that the
                // slice-25 boundary does NOT admit.
                if (isLogicalNotOverNonExistsWrapper(e)) {
                    throw new SemanticIRBuildException(
                            Diagnostic.error(ctx.predicateNotNot,
                                    "predicate subquery in " + ctx.clauseLabel + ": NOT applied to "
                                            + "a non-EXISTS predicate subquery wrapper "
                                            + "(" + e.getRightOperand().getExpressionType()
                                            + ") is not supported yet — the slice-25 "
                                            + "boundary admits NOT only over EXISTS; "
                                            + "rewrite e.g. NOT (a IN (SELECT ...)) as "
                                            + "a NOT IN (SELECT ...)", e));
                }
                TExpression toExtract = unwrapToInnerExtractableSubquery(e);
                if (toExtract != null) {
                    // M1-fix + slice-26 dual-side: validate the non-
                    // subquery side BEFORE extracting (so partial
                    // extraction never lands). The slice-23/24
                    // NOT-EXISTS path uses logical_not_t, which has no
                    // outer-shape gate.
                    if (e.getExpressionType() != EExpressionType.exists_t
                            && e.getExpressionType() != EExpressionType.logical_not_t) {
                        boolean isLhsSubquery = (toExtract == e.getLeftOperand());
                        boolean nonSubquerySideOk = isLhsSubquery
                                ? isAdmittedOuterRhsShape(e.getRightOperand())
                                : isAdmittedOuterLhsShape(e.getLeftOperand());
                        if (!nonSubquerySideOk) {
                            throw new SemanticIRBuildException(
                                    Diagnostic.error(ctx.outerShapeRejected,
                                            buildOuterShapeRejectionMessage(e, isLhsSubquery, ctx),
                                            e));
                        }
                    }
                    // The inner node was already extracted (e.g. by the
                    // root fast path): do not extract it twice. Note this
                    // early return leaves skipDepth untouched.
                    if (extractedRoots.contains(toExtract)) return;
                    extractOnePredicateSubqueryBody(toExtract, provider, stmts, lineage,
                            cteMapForExtraction, ctx, correlationScope);
                    extractedRoots.add(toExtract);
                    skipDepth++;
                }
            }

            @Override
            public void postVisit(TExpression e) {
                // M2-fix: decrement on EITHER the extracted inner
                // (extractedRoots.contains) OR the wrapper
                // (unwrapToInnerExtractableSubquery != null). The
                // {@code skipDepth > 0} guard prevents going negative
                // when both apply.
                if (skipDepth > 0
                        && (extractedRoots.contains(e)
                        || unwrapToInnerExtractableSubquery(e) != null)) {
                    skipDepth--;
                }
            }
        });
        // Every inner extractable node that was turned into its own
        // statement, keyed by identity.
        return extractedRoots;
    }

    /**
     * Slice 25 (rename of slice-23 {@code extractOneExistsBody}):
     * extract a single predicate-subquery body's inner SELECT as its
     * own {@code <predicate_subquery_<i>>} StatementGraph. Runs the
     * inner-shape preflight before recursive build, then post-build
     * correlation check.
     *
     * <p>{@code extractableNode} is either an {@code exists_t}
     * (slice-23 EXISTS / slice-24 column-bearing EXISTS) or a
     * {@code subquery_t} (slice-25 IN-SELECT / scalar comparison /
     * ANY-ALL-SOME). Both expose the inner SELECT via
     * {@link TExpression#getSubQuery()}.
*/
    private static void extractOnePredicateSubqueryBody(TExpression extractableNode,
                                                        NameBindingProvider provider,
                                                        List<StatementGraph> stmts,
                                                        List<LineageEdge> lineage,
                                                        Map<String, Integer> cteMapForExtraction,
                                                        PredicateClauseContext ctx,
                                                        EnclosingScope correlationScope) {
        // Overall order (each step may throw SemanticIRBuildException):
        //   1. fetch the inner SELECT and run the shape preflight;
        //   2. pick the provider (tolerant-decorated under correlation);
        //   3. build the inner statement;
        //   4. relation-kind walk, then correlation handling
        //      (promote under correlationScope, reject otherwise);
        //   5. FILTER and WITHIN GROUP correlation walks;
        //   6. append the statement to stmts and emit its lineage.
        // Nothing is appended to stmts until step 6, so a rejection in
        // steps 1-5 leaves stmts untouched by this method.
        TSelectSqlStatement inner = extractableNode.getSubQuery();
        if (inner == null) {
            // Degenerate node with no subquery; defensive.
            // NOTE(review): the message says "EXISTS" although the node
            // may also be a subquery_t wrapper — cosmetic only.
            throw new SemanticIRBuildException(
                    Diagnostic.error(ctx.existsBodyMissing,
                            "EXISTS in " + ctx.clauseLabel + ": subquery body is missing", null));
        }
        // (a–g) Inner-shape preflight (slice-23 boundary; slice 24 widens
        // (e) to admit single column-ref projection in addition to constant).
        preflightExistsInnerShape(inner);

        // Slice 118 — when an enclosing correlation scope is supplied
        // (MERGE per-WHEN action WHERE only), decorate `provider` with
        // tolerant outer binding so the inner build admits qualified refs
        // to outer aliases (target / USING source / outer CTEs) as
        // synthetic EXACT_MATCH bindings instead of rejecting them as
        // COLUMN_BINDING_NON_EXACT. Mirrors the slice-117 pattern for
        // UPDATE SET-RHS correlated scalars. Computed BEFORE
        // buildSelectStatementImpl so the inner build's bindColumn calls
        // see the tolerant fallback already populated (codex round-5
        // ordering fix from slice 117). Qualifiers IN the inner's local
        // FROM aliases still strict-reject so real typos (`o.bad_col`
        // where `o` IS the inner FROM alias) still surface as
        // COLUMN_BINDING_NON_EXACT.
        final NameBindingProvider effectiveProvider;
        if (correlationScope != null) {
            Set<String> innerLocalAliasesForTolerant =
                    precomputeInnerLocalAliases(inner);
            // With no precomputed local aliases the undecorated provider
            // is used even in correlation mode.
            effectiveProvider = innerLocalAliasesForTolerant.isEmpty()
                    ? provider
                    : provider.withTolerantOuterBinding(
                            innerLocalAliasesForTolerant);
        } else {
            effectiveProvider = provider;
        }

        // Build the inner SELECT as its own StatementGraph. SAME provider
        // as outer (codex round-1 MUST 3 — outer CTEs remain visible).
        // Slice 118: tolerant-decorated provider when correlationScope
        // != null (MERGE per-WHEN action WHERE only); same provider as
        // before otherwise.
        // hasOuterCteListAlreadyProcessed=false (codex round-2 SHOULD 1 —
        // generic nested-WITH guard remains active as belt-and-braces).
        // allowFromSubqueries=false (no FROM-subqueries in inner body for
        // slice 23). isPredicateBody=true: for constant-only inner emits one
        // synthetic OutputColumn (slice-23 path); for column-ref inner the
        // §4.1.2 short-circuit falls through to the normal column-ref path
        // (slice-24 widening).
        //
        // The statement name embeds stmts.size() as read HERE; the index
        // used for stmts.add / lineage emission is read again further
        // down. The two agree only if nothing appends to stmts in between
        // (the nested build is given stmtsForExtraction=null) — presumably
        // an invariant; verify if a future change passes stmts through.
        String predName = PREDICATE_BODY_PREFIX + stmts.size() + ">";
        StatementGraph innerStmt = buildSelectStatementImpl(inner, effectiveProvider, predName,
                /*hasOuterCteListAlreadyProcessed=*/ false,
                /*allowFromSubqueries=*/ false,
                /*allowScalarProjectionSubqueries=*/ false,
                /*allowWindowProjection=*/ false,
                /*allowJoinOnPredicateSubqueries=*/ false,
                /*stmtsForExtraction=*/ null,
                /*lineageForExtraction=*/ null,
                /*cteMapForExtraction=*/ null,
                /*isPredicateBody=*/ true,
                /*whereClauseContext=*/ PredicateClauseContext.SELECT_WHERE,
                /*allowWherePredicateSubqueries=*/ false);

        // Slice 24 (codex impl-review SHOULD 1): defensive relation-kind
        // walk. The preflight rejects FROM-subqueries; the post-build
        // correlation check below rejects OUTER_REFERENCE relations
        // (synthesised by promoteCorrelatedRefsToOuterReference for
        // outer refs we don't see). Belt-and-braces: the predicate body
        // must contain ONLY TABLE or CTE-bound relations. SUBQUERY /
        // OUTER_REFERENCE / UNION leaking through here would mean the
        // emitLineageForStatement call below routes through code paths
        // (e.g. the SUBQUERY-alias map) that we deliberately pass empty,
        // producing a SemanticIRBuildException about an unregistered
        // alias. Failing fast here surfaces the architectural violation
        // with a slice-24-tuned message instead.
        // This walk runs on the as-built statement, i.e. BEFORE the
        // correlation-mode promotion further down.
        for (RelationSource r : innerStmt.getRelations()) {
            RelationKind kind = r.getBinding().getKind();
            if (kind != RelationKind.TABLE && kind != RelationKind.CTE) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(ctx.existsInnerRelationUnknown,
                                "EXISTS in " + ctx.clauseLabel + ": inner SELECT relation '"
                                        + r.getAlias() + "' has unsupported binding kind "
                                        + kind + "; only TABLE or CTE relations are admitted "
                                        + "(slice 24 boundary)", null));
            }
        }
        // Post-build correlation check (codex round-1 MUST 2 + round-2 SHOULD 2).
        // Use the existing collectAllInnerRefs helper so clause coverage
        // stays in sync with promoteCorrelatedRefsToOuterReference.
        // Slice 24: collectAllInnerRefs includes OutputColumn.sources, so
        // a column-ref projection like `EXISTS (SELECT e.id FROM x)` where
        // `e` is the OUTER's alias trips the same correlation rejection —
        // no extra slice-24 code needed.
        //
        // Slice 118 — when correlationScope != null (MERGE per-WHEN action
        // WHERE only), instead of REJECTING outer-aliased refs we PROMOTE
        // them into synthesised OUTER_REFERENCE relations via
        // promoteCorrelatedRefsToOuterReference. Mirrors the slice-14 /
        // slice-117 pattern. Unknown outer aliases (not in target / USING
        // source / outer CTEs) still throw SCALAR_SUBQUERY_UNKNOWN_RELATION_ALIAS
        // (the promoter's existing boundary; diagnostic message says
        // "scalar subquery" — acceptable cosmetic limitation, slice 117
        // precedent). The slice-118 lift covers refs landing in
        // collectAllInnerRefs clauses (output sources, filter, join,
        // groupBy, having, orderBy, distinctOn); the FILTER and WITHIN
        // GROUP walks BELOW remain active so outer-aliased refs hidden
        // inside FILTER subtrees or PG fn.withinGroup.orderBy still
        // reject with SELECT_WHERE_EXISTS_CORRELATED_UNKNOWN_OUTER_ALIAS
        // (codex round-1 Q2 BLOCKING preserved this boundary).
        //
        // The alias set is built from the as-built statement, before any
        // promotion reassigns innerStmt, and is lower-cased with
        // Locale.ROOT so the membership tests below are case-insensitive
        // and locale-independent.
        Set<String> innerLocalAliases = new HashSet<>();
        for (RelationSource r : innerStmt.getRelations()) {
            innerLocalAliases.add(r.getAlias().toLowerCase(Locale.ROOT));
        }
        if (correlationScope != null) {
            // Pass a descriptive "outerAlias" so the promoter's diagnostic
            // messages identify the MERGE predicate-body host context if
            // promotion fails on an unknown alias.
            innerStmt = promoteCorrelatedRefsToOuterReference(
                    innerStmt,
                    "<merge predicate subquery " + (stmts.size()) + ">",
                    correlationScope);
        } else {
            // Uncorrelated mode: any ref whose alias is not a local
            // relation alias is treated as a correlated outer reference
            // and rejected.
            for (ColumnRef ref : collectAllInnerRefs(innerStmt)) {
                if (!innerLocalAliases.contains(ref.getRelationAlias().toLowerCase(Locale.ROOT))) {
                    throw new SemanticIRBuildException(
                            Diagnostic.error(ctx.existsCorrelatedUnknownOuterAlias,
                                    "EXISTS in " + ctx.clauseLabel + ": correlated reference to outer alias '"
                                            + ref.getRelationAlias()
                                            + "' is not supported yet (slice 23 accepts uncorrelated EXISTS only)", null));
                }
            }
        }
        // Slice 28: projection-only FILTER-aware correlation walk. The
        // slice-28 source-skip in buildOutputColumns removes column refs
        // inside FILTER (WHERE ...) subtrees from OutputColumn.sources, so
        // a correlated FILTER ref in the inner projection (e.g.
        // `EXISTS (SELECT SUM(x.s) FILTER (WHERE e.region='EU') FROM x)`
        // where `e` is the outer alias) would slip past the loop above.
        // Existing collectAllInnerRefs continues to cover correlated
        // FILTER refs landing in inner WHERE / HAVING / GROUP BY / ORDER BY
        // / JOIN-ON because those clauses still collect via plain
        // collectColumnRefs which descends into FILTER subtrees.
        // This walk runs in BOTH modes (with or without correlationScope).
        TResultColumnList rclForFilterWalk = inner.getResultColumnList();
        if (rclForFilterWalk != null) {
            for (int rci = 0; rci < rclForFilterWalk.size(); rci++) {
                TResultColumn rc = rclForFilterWalk.getResultColumn(rci);
                Set<TExpression> filterClauses = collectFilterClauses(rc);
                for (TExpression fclause : filterClauses) {
                    // Slice 118 — use effectiveProvider so under
                    // correlationScope != null, outer-aliased refs come
                    // back as synthetic EXACT_MATCH ColumnRefs (rather
                    // than throwing on tolerant fallback being absent).
                    // The alias-membership rejection below then fires
                    // for outer-aliased FILTER-inner refs, preserving
                    // the slice-118 boundary (codex round-1 Q2 BLOCKING
                    // fix: FILTER-inner correlation still rejects).
                    for (ColumnRef ref : collectColumnRefs(fclause, effectiveProvider)) {
                        if (!innerLocalAliases.contains(
                                ref.getRelationAlias().toLowerCase(Locale.ROOT))) {
                            throw new SemanticIRBuildException(
                                    Diagnostic.error(ctx.existsCorrelatedUnknownOuterAlias,
                                            "EXISTS in " + ctx.clauseLabel + ": correlated reference to outer alias '"
                                                    + ref.getRelationAlias()
                                                    + "' inside FILTER (WHERE ...) is not supported yet", null));
                        }
                    }
                }
            }
        }
        // Slice 30: projection-only direct WITHIN GROUP ORDER BY correlation
        // walk. PostgreSQL attaches WITHIN GROUP to fn.withinGroup directly,
        // and TFunctionCall.acceptChildren does NOT descend into that field —
        // so collectColumnRefs (and therefore collectAllInnerRefs above) is
        // blind to outer references inside `fn.withinGroup.orderBy`. A
        // correlated reference like
        // `mode() WITHIN GROUP (ORDER BY e.region)` (where `e` is the
        // outer alias) would slip past the slice-23 correlation loop above.
        // Catch it explicitly with a per-result-column WG ORDER BY scan.
        // Mirrors the slice-28 FILTER walk pattern. Also closes the same
        // correlation gap retroactively for slice-29-admitted aggregates
        // (LISTAGG / STRING_AGG / GROUP_CONCAT / ARRAY_AGG WITHIN GROUP),
        // see Slice30Test.pgCorrelatedListaggWithinGroupOrderByNowAlsoRejected.
        //
        // IMPORTANT: this walk uses a qualifier-only collector
        // ({@link #collectQualifierAliases}) instead of
        // {@link #collectColumnRefs}. Resolver2 also doesn't attach
        // ResolutionResult to TObjectName nodes inside PG's direct
        // fn.withinGroup field (it shares the AST asymmetry that lets
        // slice 29 admit these without a source-skip). Going through
        // {@code collectColumnRefs} → {@code provider.bindColumn} would
        // throw {@code non-exact column bindings} on legitimate
        // non-correlated refs (status=NOT_FOUND because Resolver2 skipped
        // them). The qualifier-only collector reads the qualifier alias
        // straight off the TObjectName, which matches slice-23's
        // correlation invariant: only qualified refs that name an outer
        // alias are caught — unqualified refs remain a documented
        // schema-less limitation.
        TResultColumnList rclForWgWalk = inner.getResultColumnList();
        if (rclForWgWalk != null) {
            for (int rci = 0; rci < rclForWgWalk.size(); rci++) {
                TResultColumn rc = rclForWgWalk.getResultColumn(rci);
                Set<TOrderBy> wgOrderBys = collectDirectWithinGroupOrderBys(rc);
                for (TOrderBy wgOrderBy : wgOrderBys) {
                    for (String alias : collectQualifierAliases(wgOrderBy)) {
                        if (!innerLocalAliases.contains(
                                alias.toLowerCase(Locale.ROOT))) {
                            throw new SemanticIRBuildException(
                                    Diagnostic.error(ctx.existsCorrelatedUnknownOuterAlias,
                                            "EXISTS in " + ctx.clauseLabel + ": correlated reference to outer alias '"
                                                    + alias
                                                    + "' inside WITHIN GROUP (ORDER BY ...) is not supported yet", null));
                        }
                    }
                }
            }
        }
        // All checks passed: register the statement. idx is the position
        // the statement occupies in stmts after the add below.
        int idx = stmts.size();
        stmts.add(innerStmt);
        // Slice 24: emit lineage edges for the predicate body. For
        // constant-only inner (slice-23 carryover), the synthetic
        // OutputColumn has empty sources and emitLineageForStatement
        // emits zero edges — no shape change for slice 23. For
        // column-ref inner (slice 24), the real OutputColumn carries
        // one ColumnRef source pointing at the inner's local relation;
        // emitLineageForStatement emits a STATEMENT_OUTPUT → TABLE_COLUMN
        // edge (TABLE-bound inner) or STATEMENT_OUTPUT → STATEMENT_OUTPUT
        // edge (CTE-bound inner) that the projector's slice-24 pass uses
        // to resolve the JOIN canonical edge.
        //
        // SUBQUERY map is empty: inner-shape preflight rejects FROM-subqueries.
        // ScalarInfo map is empty: inner-shape preflight rejects scalar
        // projections (and column-ref projection is single-source, not a
        // scalar-subquery extraction).
        // Slice 118 — pass the enclosing scope's flattened SUBQUERY-alias
        // map under correlation mode so OUTER_REFERENCE-of-SUBQUERY refs
        // resolve to the enclosing MERGE's USING-subquery statement index
        // for cross-stmt lineage emission (mirrors slice 117 UPDATE-side
        // emit dispatch).
        Map<String, Integer> subqueryAliasMap = (correlationScope != null)
                ? correlationScope.flattenSubqueryAliasToIndex()
                : Collections.<String, Integer>emptyMap();
        emitLineageForStatement(innerStmt, idx, lineage,
                cteMapForExtraction,
                subqueryAliasMap,
                Collections.<Integer, ScalarInfo>emptyMap());
        // The predicate body remains UNREACHABLE from outer: no relation
        // in outer points at it, and no STATEMENT_OUTPUT lineage edge has
        // it as its `to`. Inner WHERE / inner JOIN refs of the predicate
        // body therefore cannot enter outer's row-influence walker. The
        // slice-24 projector pass iterates predicate bodies directly via
        // `isPredicateSubquerySyntheticName` to emit JOIN canonical edges
        // from their OutputColumn sources only (slice-24 §4.2.1).
    }

    /**
     * Slice 23: inner-shape preflight for an extracted EXISTS body. See
     * roadmap §14.25 (slice-23 plan §4.4) for the full reasoning.
10818 */ 10819 private static void preflightExistsInnerShape(TSelectSqlStatement inner) { 10820 // (a) No set-op 10821 if (inner.getSetOperatorType() != null 10822 && inner.getSetOperatorType() != ESetOperatorType.none) { 10823 throw new SemanticIRBuildException( 10824 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_IS_SET_OP, 10825 "EXISTS in JOIN ON: inner SELECT may not be a set operation", inner)); 10826 } 10827 // (b) No nested CTE list 10828 if (inner.getCteList() != null && inner.getCteList().size() > 0) { 10829 throw new SemanticIRBuildException( 10830 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_WITH, 10831 "EXISTS in JOIN ON: inner SELECT may not have its own WITH clause", inner)); 10832 } 10833 // (c) No row-limit (delegated to rejectUnsupportedShape's row-limit 10834 // guards which fire during buildSelectStatement). For an early, 10835 // specific message we also fast-fail here. 10836 if (inner.getLimitClause() != null 10837 || inner.getTopClause() != null 10838 || inner.getFetchFirstClause() != null 10839 || inner.getOffsetClause() != null) { 10840 throw new SemanticIRBuildException( 10841 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_ROW_LIMIT, 10842 "EXISTS in JOIN ON: inner SELECT may not have a row-limit clause", inner)); 10843 } 10844 // (d) Inner FROM is required (codex round-2 MUST 2). 10845 if (inner.joins == null || inner.joins.size() == 0) { 10846 throw new SemanticIRBuildException( 10847 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_MISSING_FROM, 10848 "EXISTS in JOIN ON: inner SELECT must have a FROM clause " 10849 + "(degenerate EXISTS (SELECT 1) is not in scope)", inner)); 10850 } 10851 // Slice 62 (codex plan-review round 1, P2 #1): predicate bodies 10852 // are built with allowFromSubqueries=false, so the gated reject 10853 // inside buildRelations also fires; we surface a slice-23 10854 // tuned message here so callers see the predicate-body shape 10855 // diagnostic before the generic comma-FROM message. 
10856 if (inner.joins.size() > 1) { 10857 throw new SemanticIRBuildException( 10858 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_COMMA_FROM, 10859 "EXISTS in JOIN ON: comma-separated FROM list " 10860 + "(implicit cross join) in inner SELECT is not supported yet", inner)); 10861 } 10862 // Slice 63: predicate body must not contain explicit CROSS JOIN 10863 // either. Surfaces a predicate-body-tuned diagnostic before the 10864 // gated reject inside buildRelations would fire with the 10865 // generic "scalar / set-op-branch / set-op-CTE / predicate" 10866 // message. The same shared preflight is used by EXISTS / 10867 // IN-SELECT / cmp-subquery / ANY-ALL-SOME wrappers. 10868 // Slice 64: same treatment for JOIN ... USING. 10869 for (TJoin j : inner.joins) { 10870 TJoinItemList items = j.getJoinItems(); 10871 if (items == null) continue; 10872 for (int i = 0; i < items.size(); i++) { 10873 TJoinItem item = items.getJoinItem(i); 10874 if (item == null) continue; 10875 if (item.getJoinType() == EJoinType.cross) { 10876 throw new SemanticIRBuildException( 10877 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_CROSS_JOIN, 10878 "EXISTS in JOIN ON: CROSS JOIN in inner SELECT " 10879 + "is not supported yet", null)); 10880 } 10881 if (item.getUsingColumns() != null 10882 && item.getUsingColumns().size() > 0) { 10883 throw new SemanticIRBuildException( 10884 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_USING, 10885 "EXISTS in JOIN ON: JOIN ... USING (...) in inner " 10886 + "SELECT is not supported yet", null)); 10887 } 10888 // Slice 66: NATURAL JOIN inside an EXISTS-style predicate 10889 // body is rejected with an EXISTS-tuned diagnostic. The 10890 // gated reject inside buildRelations would fire later 10891 // with the generic body-context message; surfacing here 10892 // gives users an EXISTS / IN-SELECT / cmp-subquery 10893 // friendly error. 
10894 if (isNaturalJoinType(item.getJoinType())) { 10895 throw new SemanticIRBuildException( 10896 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_NATURAL, 10897 "EXISTS in JOIN ON: NATURAL JOIN in inner SELECT " 10898 + "is not supported yet", null)); 10899 } 10900 } 10901 } 10902 // (d') No FROM-subquery on inner FROM/JOIN list. The recursive build 10903 // passes allowFromSubqueries=false, so buildRelation would also 10904 // reject; we surface a slice-23 specific message here. 10905 for (TJoin j : inner.joins) { 10906 if (j.getTable() != null 10907 && j.getTable().getTableType() == gudusoft.gsqlparser.ETableSource.subquery) { 10908 throw new SemanticIRBuildException( 10909 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_FROM_SUBQUERY, 10910 "EXISTS in JOIN ON: FROM-clause subquery in inner SELECT is not supported yet", null)); 10911 } 10912 TJoinItemList items = j.getJoinItems(); 10913 if (items == null) continue; 10914 for (int i = 0; i < items.size(); i++) { 10915 TTable r = items.getJoinItem(i).getTable(); 10916 if (r != null && r.getTableType() == gudusoft.gsqlparser.ETableSource.subquery) { 10917 throw new SemanticIRBuildException( 10918 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_FROM_SUBQUERY_ON_JOIN, 10919 "EXISTS in JOIN ON: FROM-clause subquery on JOIN side in inner SELECT is not supported yet", null)); 10920 } 10921 } 10922 } 10923 // (e) Result-column list: exactly one column projecting either a 10924 // constant expression (slice 23), a single column reference 10925 // (slice 24), an expression / function call / CASE / 10926 // aggregate over inner columns (slice 27), an aggregate with 10927 // FILTER (WHERE ...) over inner columns (slice 28), or — on 10928 // PostgreSQL only — a whitelisted WITHIN GROUP aggregate 10929 // (slice 29 admits LISTAGG / STRING_AGG / GROUP_CONCAT / 10930 // ARRAY_AGG / count / sum / avg / min / max / stddev / 10931 // variance family; slice 30 extends with `mode`). 
10932 // Multi-column / star / window function / scalar subquery / 10933 // non-whitelisted WITHIN GROUP aggregate projections are 10934 // rejected with shape-specific tuned messages — see 10935 // {@link #findUnsupportedWithinGroupFunctionName} for the 10936 // vendor + name gate. 10937 TResultColumnList rcl = inner.getResultColumnList(); 10938 if (rcl == null || rcl.size() != 1) { 10939 throw new SemanticIRBuildException( 10940 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_COLUMN_COUNT, 10941 "EXISTS in JOIN ON: inner SELECT must project exactly one column, got " 10942 + (rcl == null ? 0 : rcl.size()), null)); 10943 } 10944 TResultColumn rc0 = rcl.getResultColumn(0); 10945 if ("*".equals(rc0.getColumnNameOnly())) { 10946 throw new SemanticIRBuildException( 10947 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_NON_CONSTANT_PROJECTION, 10948 "EXISTS in JOIN ON: inner SELECT must project a constant expression " 10949 + "or a single column reference, got SELECT *", null)); 10950 } 10951 TExpression projExpr = rc0.getExpr(); 10952 if (projExpr == null || !isAdmittedPredicateProjection(projExpr)) { 10953 throw new SemanticIRBuildException( 10954 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_NON_CONSTANT_PROJECTION, 10955 "EXISTS in JOIN ON: inner SELECT must project a constant expression " 10956 + "(e.g. SELECT 1), a single column reference " 10957 + "(e.g. SELECT x.id), an expression / function call / " 10958 + "CASE / aggregate over inner columns (e.g. SELECT x.id + 1, " 10959 + "UPPER(x.region), MAX(x.id), CASE WHEN ...), an aggregate " 10960 + "with FILTER (WHERE ...) over inner columns " 10961 + "(e.g. SUM(x.id) FILTER (WHERE x.region = 'EU')), or a " 10962 + "WITHIN GROUP (ORDER BY ...) 
aggregate over inner columns " 10963 + "(PostgreSQL admits the direct fn.withinGroup attachment; " 10964 + "Oracle and SQL Server admit the windowDef.withinGroup " 10965 + "attachment via slice 31 when no OVER clause is present " 10966 + "— see TWindowDef.isIncludingOverClause(); slice 44 also " 10967 + "admits Snowflake hypothetical-set ordered-set aggregates " 10968 + "(rank / dense_rank / percent_rank / cume_dist) via direct " 10969 + "fn.withinGroup attachment); DB2 / Snowflake LISTAGG / " 10970 + "STRING_AGG WITHIN GROUP remain rejected pending a probe " 10971 + "of their parser-specific argument storage; window " 10972 + "functions (any OVER-bearing form) and scalar subqueries " 10973 + "are not supported yet (slice 31 boundary)", null)); 10974 } 10975 // Slice 29 / Slice 31: vendor-gated WITHIN GROUP rejecter. 10976 // 10977 // Two attachment styles, gated by vendor: 10978 // * Direct attachment ({@code fn.getWithinGroup()}): PG admits 10979 // because its visitor descent (TFunctionCall.acceptChildren) 10980 // does NOT walk fn.withinGroup, leaving OutputColumn.sources 10981 // populated exactly with the function's column-bearing args. 10982 // Snowflake and DB2 use the same field but their parser- 10983 // specific arg storage (DB2's stringExpr / separatorExpr for 10984 // LISTAGG) may not be visitor-visible — silently-empty 10985 // sources while dlineage walks fdd to the base column = 10986 // manufactured IR_MISSING_DEPENDENCY divergence; rejected. 10987 // * WindowDef attachment ({@code fn.getWindowDef().getWithinGroup()} 10988 // with WITHIN-GROUP-only windowDef): Oracle / MSSQL admit via 10989 // slice 31. The visitor DOES descend through 10990 // {@code windowDef.withinGroup.orderBy}, so the slice-31 10991 // source-skip in 10992 // {@link #collectColumnRefsExcludingFilterAndWithinGroupClauses} 10993 // keeps OutputColumn.sources from leaking the WITHIN GROUP 10994 // ORDER BY column refs (probe Q1 / Q3 / Q4 / Q5 in 10995 // {@code /tmp/probe31}). 
10996 // 10997 // Probed: PG (Q1, Q5, Q6, Q9, Q10), Oracle (Q1-Q5 in 10998 // {@code /tmp/probe31}), MSSQL (Q11-Q12), SparkSQL (parser drops 10999 // WITHIN GROUP attachment, so containsAggregateWithWithinGroup 11000 // returns false and the lift applies). 11001 if (containsAggregateWithWithinGroup(projExpr)) { 11002 EDbVendor v = inner.dbvendor; 11003 // Slice 44 / 45: Snowflake admitted at this gate ONLY when 11004 // every WITHIN GROUP-bearing call in the inner projection 11005 // is an admitted Snowflake direct-attachment shape: 11006 // * hypothetical-set (rank / dense_rank / percent_rank / 11007 // cume_dist with fn.getWithinGroup()!=null and 11008 // fn.getWindowDef()==null) — slice 44; or 11009 // * mode() with the same direct-attachment shape — slice 45. 11010 // Snowflake LISTAGG / STRING_AGG / percentile_cont / 11011 // percentile_disc share the direct-attachment shape but 11012 // their parser-specific argument storage (stringExpr / 11013 // separatorExpr) and / or name-whitelist exclusion keep 11014 // the slice-31/44 rejection (see Slice44Test §C and 11015 // Slice45Test §C boundary tests). 
11016 boolean snowflakeAdmittedShape = (v == EDbVendor.dbvsnowflake) 11017 && allWithinGroupCallsAreAdmittedSnowflakeDirectAttachment(projExpr); 11018 if (v != EDbVendor.dbvpostgresql 11019 && v != EDbVendor.dbvoracle 11020 && v != EDbVendor.dbvmssql 11021 && !snowflakeAdmittedShape) { 11022 throw new SemanticIRBuildException( 11023 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_WITHIN_GROUP_AGGREGATE, 11024 "EXISTS in JOIN ON: WITHIN GROUP aggregate inner projection on " 11025 + "vendor=" + v 11026 + " is not supported yet — slice 31 admits PostgreSQL " 11027 + "(direct fn.withinGroup attachment), Oracle, and " 11028 + "SQL Server (windowDef.withinGroup attachment with " 11029 + "isIncludingOverClause=false); slice 44 additionally " 11030 + "admits Snowflake hypothetical-set ordered-set " 11031 + "aggregates (rank / dense_rank / percent_rank / " 11032 + "cume_dist) via direct fn.withinGroup attachment; " 11033 + "slice 45 additionally admits Snowflake mode() via " 11034 + "the same direct attachment; " 11035 + "DB2 / Snowflake LISTAGG / STRING_AGG / " 11036 + "percentile_cont / other direct-attachment vendors " 11037 + "remain rejected pending a probe of their parser-" 11038 + "specific argument storage", null)); 11039 } 11040 // Codex impl-review round-3 MUST: name-whitelist guard. The PG 11041 // parser attaches WITHIN GROUP to generic `func_application`, 11042 // not only to whitelisted aggregate names. Without this check, 11043 // a non-whitelisted call like `foo(x.id) WITHIN GROUP (...)` 11044 // would slip through. Slice 31: same protection applies on 11045 // Oracle / MSSQL — the windowDef-attachment grammar admits 11046 // any function name (PERCENTILE_CONT, RANK, user-defined 11047 // foo); the name guard rejects them so the IR never sees a 11048 // shape whose canonical model is unverified. 
11049 String unsupportedName = findUnsupportedWithinGroupFunctionName( 11050 projExpr, inner.dbvendor); 11051 if (unsupportedName != null) { 11052 throw new SemanticIRBuildException( 11053 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_WITHIN_GROUP_NON_WHITELISTED, 11054 "EXISTS in JOIN ON: WITHIN GROUP attached to non-whitelisted " 11055 + "function '" + unsupportedName + "' is not supported yet — " 11056 + "slice 31 admits whitelisted aggregates only " 11057 + "(see SemanticIRBuilder.AGGREGATE_FUNCTION_NAMES); " 11058 + "slice 43 additionally admits PostgreSQL hypothetical-set " 11059 + "ordered-set aggregates (rank / dense_rank / percent_rank / " 11060 + "cume_dist) via direct fn.getWithinGroup attachment", null)); 11061 } 11062 } 11063 // (f) No subqueries in inner WHERE / inner JOIN ON / inner GROUP BY / 11064 // inner HAVING / inner ORDER BY. Reuses the slice-11 helper 11065 // style with a slice-23-specific message prefix. 11066 rejectSubqueriesInPredicateBodyClauses(inner); 11067 // (g) Window functions are caught by the rejecters that fire inside 11068 // buildSelectStatement (rejectWindowFunctionInScope on WHERE / 11069 // GROUP BY / HAVING / ORDER BY); the inner projection itself is 11070 // a constant expression and cannot contain a window call. 11071 } 11072 11073 /** 11074 * Slice 27: true iff {@code e} is an admitted predicate-subquery inner 11075 * projection shape. Admits (in priority order): 11076 * <ul> 11077 * <li>{@link EExpressionType#simple_object_name_t} — single column 11078 * ref (slice 24 carryover); one JOIN canonical edge per inner- 11079 * column lineage terminal.</li> 11080 * <li>{@link #isConstantExpression}-shaped constant (slice-23 11081 * carryover); zero canonical contribution.</li> 11082 * <li>Slice 27 widenings via {@link #isAdmittedSlice27ShapeRoot}: 11083 * expression / function call / CASE / aggregate over inner 11084 * columns. 
Probes 27 / 27b confirmed dlineage's 11085 * {@code fdr clause="on"} canonical model walks fdd to the 11086 * underlying base columns identically to the IR's 11087 * slice-24 predicate-body sweep — so canonical equivalence 11088 * holds. Aggregate-over-constants (e.g. {@code COUNT(*)}, 11089 * {@code SUM(1)}) produce empty {@code OutputColumn.sources} 11090 * and zero predicate-body JOIN edges; canonical-equivalent 11091 * to the slice-23 constant projection.</li> 11092 * </ul> 11093 * Hard rejecters fire BEFORE the {@link #isAdmittedSlice27ShapeRoot} 11094 * admit-list to keep the surface tight: 11095 * <ul> 11096 * <li>{@link #containsAnySubqueryExpression} — slice-23 invariant.</li> 11097 * <li>{@link #containsWindowFunction} — slice-13 invariant. 11098 * Slice 31 narrowed the rejecter via {@link #isWindowDefBearingFunction} 11099 * so a WITHIN-GROUP-only windowDef (Oracle / MSSQL plain 11100 * WITHIN GROUP attachment) is NOT classified as a window 11101 * function. Real OVER-bearing windowDef shapes ({@code OVER ()}, 11102 * {@code OVER (PARTITION BY ...)}, KEEP DENSE_RANK) continue to 11103 * fire the rejecter. The complementary slice-31 source-skip in 11104 * {@link #collectColumnRefsExcludingFilterAndWithinGroupClauses} 11105 * removes the WITHIN GROUP ORDER BY column refs from 11106 * {@code OutputColumn.sources} on Oracle / MSSQL so the visitor's 11107 * descent through {@code windowDef.withinGroup.orderBy} doesn't 11108 * leak into projection sources.</li> 11109 * <li>Slice 29 / 31's vendor-gated WITHIN GROUP rejecter at the 11110 * {@link #preflightExistsInnerShape} call site — see that 11111 * method's vendor gate. Slice 31 admits Oracle and MSSQL 11112 * (windowDef.withinGroup attachment with 11113 * {@code !isIncludingOverClause()}) alongside PostgreSQL. 
11114 * Snowflake and DB2 both attach {@code WITHIN GROUP} to the 11115 * direct {@code fn.getWithinGroup()} field (same as PG), but 11116 * their parser-specific argument storage may not be 11117 * visitor-visible (DB2 stores LISTAGG args in 11118 * {@code stringExpr} / {@code separatorExpr}, which 11119 * {@code TFunctionCall.acceptChildren} does NOT walk); they 11120 * remain rejected. SparkSQL silently drops the WITHIN GROUP 11121 * attachment at parse time (both {@code fn.withinGroup} and 11122 * {@code fn.windowDef} are null); after slice 29 SparkSQL 11123 * admits the same shape as PG, parity-friendly per probe Q1 11124 * SparkSQL.</li> 11125 * </ul> 11126 * 11127 * <p>Slice 28 lifted the prior {@code containsAggregateWithFilter} 11128 * rejecter; FILTER aggregates are now admitted, with the FILTER 11129 * predicate column refs excluded from {@code OutputColumn.sources} 11130 * globally via the FILTER-aware variant of {@link #collectColumnRefs} 11131 * used in {@link #buildOutputColumns}. See the slice-28 entry in 11132 * §14.5 of the unified roadmap and §B / §C of the slice history 11133 * archive for the load-bearing decision. 11134 * 11135 * <p>Slice 29 lifted the prior unconditional 11136 * {@code containsAggregateWithWithinGroup} rejecter and replaced it 11137 * with a vendor-gated rejecter at the 11138 * {@link #preflightExistsInnerShape} call site (Snowflake / DB2 / 11139 * other non-PostgreSQL vendors that use the direct 11140 * {@code fn.getWithinGroup()} attachment remain rejected). PG 11141 * attaches {@code WITHIN GROUP} to the direct 11142 * {@code fn.getWithinGroup()} field, and 11143 * {@code TFunctionCall.acceptChildren} does NOT descend into that 11144 * field, so {@link #collectColumnRefs} never picks up the ORDER BY 11145 * column refs — no source-skip is needed. 
dlineage probes Q1–Q10 11146 * confirmed canonical-model JOIN-on edges include only the aggregate's 11147 * primary argument across all four vendors (the WITHIN GROUP ORDER 11148 * BY ref appears as {@code fdr clauseType="orderby"} on PG only, and 11149 * {@code DlineageXmlProjector.projectColumn} follows fdd not fdr). 11150 * Slice 29 is restricted to whitelisted aggregates whose names 11151 * appear in {@link #AGGREGATE_FUNCTION_NAMES}. As of slice 30 the 11152 * whitelist is: {@code count}, {@code sum}, {@code avg}, {@code min}, 11153 * {@code max}, {@code stddev}, {@code variance}, {@code var_samp}, 11154 * {@code var_pop}, {@code stddev_samp}, {@code stddev_pop}, 11155 * {@code listagg}, {@code string_agg}, {@code group_concat}, 11156 * {@code array_agg}, {@code mode} (slice-30 addition — PG 11157 * ordered-set aggregate, gated for the WITHIN GROUP path only; 11158 * see {@code DlineageXmlProjector.ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES}). The predicate-body short-circuit's 11159 * {@code aggregate=true} branch fires for these regardless of 11160 * {@code OutputColumn.sources} content — column-bearing args 11161 * (e.g. {@code LISTAGG(x.id, ',')}) produce a synthesized 11162 * {@code OutputColumn} with {@code sources=[x.id]} that the slice-24 11163 * sweep walks to base-column terminals, while literal-only args 11164 * (e.g. {@code LISTAGG('hello', ',')}) produce {@code sources=[]} 11165 * with zero JOIN canonical edges — canonically equivalent to 11166 * slice-23's constant projection. 11167 * 11168 * <p>Functions NOT in the whitelist (which on PG includes 11169 * {@code percentile_cont}, {@code percentile_disc}, {@code rank}, 11170 * {@code dense_rank}, {@code percent_rank}, {@code cume_dist}, plus 11171 * any user-defined function with a direct {@code fn.withinGroup} 11172 * attachment) remain rejected by the 11173 * {@link #findUnsupportedWithinGroupFunctionName} guard at the 11174 * {@link #preflightExistsInnerShape} call site. 
Slice 30 lifted 11175 * {@code mode} only — the one PG ordered-set aggregate with no 11176 * documented window form in any GSP-supported vendor. Lifting 11177 * {@code percentile_cont} / {@code percentile_disc} requires either 11178 * a vendor-scoped projector OR a structural discriminator strong 11179 * enough to distinguish the cross-vendor windowed forms (Redshift / 11180 * Vertica / BigQuery / Oracle / SQL Server emit 11181 * {@code PERCENTILE_CONT WITHIN GROUP OVER (...)} variants). Lifting 11182 * {@code rank}/{@code dense_rank}/{@code percent_rank}/{@code cume_dist} 11183 * requires distinguishing window form {@code RANK() OVER (ORDER BY)} 11184 * from hypothetical-set form {@code rank(0.5) WITHIN GROUP (ORDER BY)} 11185 * — dlineage XML for the two is structurally identical on PG. See 11186 * §14.6 of the unified roadmap. 11187 */ 11188 private static boolean isAdmittedPredicateProjection(TExpression e) { 11189 if (e == null) return false; 11190 if (e.getExpressionType() == EExpressionType.simple_object_name_t) { 11191 return true; // slice 24 (column ref) 11192 } 11193 if (isConstantExpression(e)) return true; // slice 23 (constant) 11194 // Slice 27: hard rejecters before admit-list. Slice 28 lifted the 11195 // FILTER rejecter; slice 29 replaced the unconditional WITHIN 11196 // GROUP rejecter with a vendor-gated rejecter at the 11197 // preflightExistsInnerShape call site (see slice-29 §3.2). 11198 if (containsAnySubqueryExpression(e)) return false; // slice 23 invariant 11199 if (containsWindowFunction(e)) return false; // slice 13 invariant 11200 return isAdmittedSlice27ShapeRoot(e); 11201 } 11202 11203 /** 11204 * Slice 29: detect a {@code WITHIN GROUP (ORDER BY ...)} attachment 11205 * on the direct {@code fn.getWithinGroup()} field anywhere in the 11206 * subtree. This is the PG / Snowflake / DB2 attachment style. 
Used 11207 * as a vendor-gated rejecter in {@link #preflightExistsInnerShape}: 11208 * non-admitted vendors with this attachment remain rejected because 11209 * their parser-specific argument storage may not be visitor-visible 11210 * (DB2's {@code LISTAGG} stores args in {@code stringExpr} / 11211 * {@code separatorExpr}, which the default 11212 * {@code TFunctionCall.acceptChildren} does NOT walk — 11213 * {@code OutputColumn.sources} would be silently empty while dlineage 11214 * walks fdd to the base column, manufacturing 11215 * {@code IR_MISSING_DEPENDENCY} divergence). 11216 * 11217 * <p>Slice 31: also detects WITHIN GROUP attached to 11218 * {@code fn.getWindowDef().getWithinGroup()} when the windowDef is 11219 * {@link #isWithinGroupOnlyWindowDef WITHIN-GROUP-only} — the 11220 * Oracle / MSSQL attachment style. Both attachments are routed 11221 * through {@link #hasWithinGroupAnyAttachment}. 11222 */ 11223 private static boolean containsAggregateWithWithinGroup(TExpression e) { 11224 if (e == null) return false; 11225 final boolean[] found = {false}; 11226 e.acceptChildren(new TParseTreeVisitor() { 11227 @Override 11228 public void preVisit(TFunctionCall fn) { 11229 if (found[0]) return; 11230 if (hasWithinGroupAnyAttachment(fn)) found[0] = true; 11231 } 11232 }); 11233 if (!found[0] && e.getExpressionType() == EExpressionType.function_t) { 11234 TFunctionCall fn = e.getFunctionCall(); 11235 if (hasWithinGroupAnyAttachment(fn)) found[0] = true; 11236 } 11237 return found[0]; 11238 } 11239 11240 /** 11241 * Slice 31: shared predicate used by {@link #containsAggregateWithWithinGroup} 11242 * and {@link #findUnsupportedWithinGroupFunctionName}. 
Returns 11243 * {@code true} iff {@code fn} carries {@code WITHIN GROUP} via 11244 * either: 11245 * <ul> 11246 * <li>direct {@code fn.getWithinGroup()} field (PG / Snowflake / 11247 * DB2 / SparkSQL parser style);</li> 11248 * <li>{@code fn.getWindowDef().getWithinGroup()} when the 11249 * windowDef is {@link #isWithinGroupOnlyWindowDef WITHIN-GROUP-only} 11250 * (Oracle / MSSQL parser style).</li> 11251 * </ul> 11252 */ 11253 private static boolean hasWithinGroupAnyAttachment(TFunctionCall fn) { 11254 if (fn == null) return false; 11255 if (fn.getWithinGroup() != null) return true; 11256 return isWithinGroupOnlyWindowDef(fn.getWindowDef()); 11257 } 11258 11259 /** 11260 * Slice 29 (codex impl-review round-3 MUST): walk the expression 11261 * subtree and return the (lower-cased) function name of any 11262 * {@code TFunctionCall} that carries WITHIN GROUP — via direct 11263 * {@code fn.getWithinGroup()} (PG style) or via 11264 * {@code fn.getWindowDef().getWithinGroup()} when the windowDef is 11265 * {@link #isWithinGroupOnlyWindowDef WITHIN-GROUP-only} (Oracle / 11266 * MSSQL style; slice 31) — whose name is NOT in 11267 * {@link #AGGREGATE_FUNCTION_NAMES}. Returns {@code null} if every 11268 * WITHIN GROUP-bearing call uses a whitelisted aggregate name. 11269 * Used at the {@code preflightExistsInnerShape} call site to reject 11270 * {@code foo(x.id) WITHIN GROUP (...)}-shaped projections where 11271 * {@code foo} isn't an aggregate the IR knows how to model. 11272 * 11273 * <p>Slice 43: now takes the inner {@link EDbVendor} so the 11274 * {@link #isAdmittedWithinGroupName} delegate can apply the 11275 * PG-only hypothetical-set carve-out 11276 * ({@link #isDirectAttachmentHypotheticalSetCall}; widened to 11277 * Snowflake by slice 44). 
11278 */ 11279 private static String findUnsupportedWithinGroupFunctionName( 11280 TExpression e, final EDbVendor vendor) { 11281 if (e == null) return null; 11282 final String[] firstUnsupported = {null}; 11283 e.acceptChildren(new TParseTreeVisitor() { 11284 @Override 11285 public void preVisit(TFunctionCall fn) { 11286 if (firstUnsupported[0] != null) return; 11287 if (!hasWithinGroupAnyAttachment(fn)) return; 11288 String name = fn.getFunctionName() == null 11289 ? null : fn.getFunctionName().toString(); 11290 if (isAdmittedWithinGroupName(fn, name, vendor)) return; 11291 firstUnsupported[0] = name == null ? "<unnamed>" : name; 11292 } 11293 }); 11294 if (firstUnsupported[0] == null 11295 && e.getExpressionType() == EExpressionType.function_t) { 11296 TFunctionCall fn = e.getFunctionCall(); 11297 if (hasWithinGroupAnyAttachment(fn)) { 11298 String name = fn.getFunctionName() == null 11299 ? null : fn.getFunctionName().toString(); 11300 if (!isAdmittedWithinGroupName(fn, name, vendor)) { 11301 firstUnsupported[0] = name == null ? "<unnamed>" : name; 11302 } 11303 } 11304 } 11305 return firstUnsupported[0]; 11306 } 11307 11308 /** 11309 * Slice 42 helper used by {@link #findUnsupportedWithinGroupFunctionName}. 11310 * Returns {@code true} iff {@code name} is in the regular 11311 * {@link #AGGREGATE_FUNCTION_NAMES} whitelist, OR — under the 11312 * AST-shape constraint 11313 * {@link #isHypotheticalSetWithinGroupCall} — in the slice-42 11314 * {@link #HYPOTHETICAL_SET_AGGREGATE_NAMES} whitelist (Oracle / 11315 * MSSQL windowDef-bearing attachment), OR — under the slice-43 11316 * AST-shape constraint 11317 * {@link #isDirectAttachmentHypotheticalSetCall} — in the same 11318 * hypothetical-set whitelist on PostgreSQL (slice 43) or Snowflake 11319 * (slice 44) via direct {@code fn.getWithinGroup()} attachment. 
11320 * 11321 * <p>The shape constraints pin the carve-outs by parser flavor: 11322 * Oracle / MSSQL produce {@code fn.getWindowDef()!=null} with 11323 * {@code wd.getWithinGroup()!=null} and {@code !wd.isIncludingOverClause()}; 11324 * PG produces {@code fn.getWithinGroup()!=null} with 11325 * {@code fn.getWindowDef()==null}. Slice 43 admits that direct- 11326 * attachment hypothetical-set carve-out for PostgreSQL; slice 44 11327 * widens the same probe-confirmed shape to Snowflake. DB2 and other 11328 * direct-attachment vendors remain outside this helper until their 11329 * AST / dlineage parity is explicitly probed and covered. 11330 */ 11331 private static boolean isAdmittedWithinGroupName( 11332 TFunctionCall fn, String name, EDbVendor vendor) { 11333 if (name == null || name.isEmpty()) return false; 11334 String lower = name.toLowerCase(Locale.ROOT); 11335 if (AGGREGATE_FUNCTION_NAMES.contains(lower)) return true; 11336 if (isHypotheticalSetWithinGroupCall(fn)) return true; 11337 return isDirectAttachmentHypotheticalSetCall(fn, vendor); 11338 } 11339 11340 /** 11341 * Slice 43 / 44: true iff {@code fn} is a direct-attachment 11342 * hypothetical-set ordered-set aggregate call shape — {@code rank} / 11343 * {@code dense_rank} / {@code percent_rank} / {@code cume_dist} with 11344 * {@code fn.getWithinGroup()!=null} AND {@code fn.getWindowDef()==null}, 11345 * AND {@code vendor} is in {PostgreSQL, Snowflake}. 11346 * 11347 * <p>Used as a name-whitelist exception inside 11348 * {@link #isAdmittedWithinGroupName} for predicate-body inner 11349 * projections only. Top-level admission is deliberately not granted: 11350 * top-level lifting requires a vendor-scoped projector override 11351 * (slice 43 introduces the API but defers the override to a future 11352 * slice because PG / Snowflake dlineage XML is structurally 11353 * indistinguishable between the WG and OVER forms — naive override 11354 * breaks {@code rank() OVER (ORDER BY x)} classification). 
11355 * 11356 * <p>Vendor gate: PG (slice 43) and Snowflake (slice 44 — probe- 11357 * confirmed AST + dlineage XML byte-identical to PG for all 11358 * four hypothetical-set names). DB2 / Greenplum / Redshift parse-fail 11359 * on the syntax. Other direct-attachment vendors (e.g. SparkSQL drops 11360 * WITHIN GROUP attachment at parse time) remain rejected pending a 11361 * fresh probe. 11362 * 11363 * <p>Probe: {@code /tmp/probe43/Probe43.java} (slice 43) and 11364 * {@code probe44.Probe44Test} (slice 44, captured during slice-44 11365 * implementation) confirmed the AST predicate matches PG / Snowflake 11366 * hypothetical-set forms (and not the OVER form), and confirmed the 11367 * dlineage XML for {@code EXISTS (SELECT rank(0.5) WITHIN GROUP 11368 * (ORDER BY x.salary) FROM locations x)} contributes zero base-table 11369 * edges from the inner predicate body (literal arg + WG ORDER BY ref 11370 * via {@code clauseType="orderby"} fdr that the projector's 11371 * {@code clauseTypeToRole} does not map to FILTER/JOIN). Both 11372 * projectors therefore agree on zero predicate-body lineage edges 11373 * for the slice-43 / slice-44 shape — no projector change required. 11374 */ 11375 private static boolean isDirectAttachmentHypotheticalSetCall( 11376 TFunctionCall fn, EDbVendor vendor) { 11377 if (fn == null) return false; 11378 if (vendor != EDbVendor.dbvpostgresql 11379 && vendor != EDbVendor.dbvsnowflake) return false; 11380 if (!isDirectAttachmentHypotheticalSetCallShape(fn)) return false; 11381 return true; 11382 } 11383 11384 /** 11385 * Slice 44: vendor-agnostic shape predicate for the direct-attachment 11386 * hypothetical-set call form ({@code fn.getWithinGroup()!=null} AND 11387 * {@code fn.getWindowDef()==null} AND function name in 11388 * {@link #HYPOTHETICAL_SET_AGGREGATE_NAMES}). 
Used together with 11389 * {@link #isDirectAttachmentModeCallShape} (slice 45) by 11390 * {@link #allWithinGroupCallsAreAdmittedSnowflakeDirectAttachment} to 11391 * gate Snowflake admission. Snowflake LISTAGG / STRING_AGG / 11392 * percentile_cont WITHIN GROUP share this attachment style but their 11393 * parser-specific argument storage ({@code stringExpr} / 11394 * {@code separatorExpr}) and dlineage XML parity remain unprobed 11395 * (slice-31 boundary preserved). 11396 */ 11397 private static boolean isDirectAttachmentHypotheticalSetCallShape( 11398 TFunctionCall fn) { 11399 if (fn == null) return false; 11400 if (fn.getWithinGroup() == null) return false; 11401 if (fn.getWindowDef() != null) return false; 11402 if (fn.getFunctionName() == null) return false; 11403 String name = fn.getFunctionName().toString(); 11404 if (name == null || name.isEmpty()) return false; 11405 return HYPOTHETICAL_SET_AGGREGATE_NAMES.contains( 11406 name.toLowerCase(Locale.ROOT)); 11407 } 11408 11409 /** 11410 * Slice 45: vendor-agnostic shape predicate for the direct-attachment 11411 * {@code mode()} ordered-set aggregate call form 11412 * ({@code fn.getWithinGroup()!=null} AND 11413 * {@code fn.getWindowDef()==null} AND function name equals 11414 * {@code mode}). Parallel to 11415 * {@link #isDirectAttachmentHypotheticalSetCallShape}; used by 11416 * {@link #allWithinGroupCallsAreAdmittedSnowflakeDirectAttachment} 11417 * to admit Snowflake {@code mode() WITHIN GROUP (ORDER BY ...)} 11418 * predicate-body inner projections. 11419 * 11420 * <p>Probe-confirmed (see {@code /tmp/Probe45c.java} captured during 11421 * slice-45 implementation): Snowflake parses {@code mode() WITHIN 11422 * GROUP (ORDER BY x.salary)} with {@code fn.getWithinGroup() != null} 11423 * and {@code fn.getWindowDef() == null}, identical to PG. 
The 11424 * Snowflake dlineage XML for the predicate-body wrapper shape is 11425 * byte-equivalent to PG (same {@code resultset name="mode" 11426 * type="function"} wrapper, same {@code orderby} fdr that 11427 * {@code clauseTypeToRole} does not map to FILTER/JOIN); the 11428 * canonical model has zero predicate-body lineage edges, matching 11429 * the IR side (mode has no args, default visitor descent does not 11430 * walk direct {@code fn.withinGroup}). 11431 * 11432 * <p>Why mode is admitted but Snowflake LISTAGG / STRING_AGG / 11433 * percentile_cont aren't (slice-44/45 boundaries): mode has no 11434 * positional argument, so the OutputColumn.sources collection is 11435 * trivially empty and matches the dlineage zero-edge canonical model. 11436 * LISTAGG / STRING_AGG store args in parser-specific 11437 * {@code stringExpr} / {@code separatorExpr} fields whose visitor 11438 * descent has not been probed; admitting them risks 11439 * silently-empty IR sources against a non-empty dlineage column-arg 11440 * fdd. percentile_cont / percentile_disc use a literal arg 11441 * (slice-44 §C boundary preserved) but are not in 11442 * {@link #AGGREGATE_FUNCTION_NAMES}, so the slice-29 name-whitelist 11443 * guard fires inside {@link #findUnsupportedWithinGroupFunctionName} 11444 * and rejects regardless of vendor gate. 
11445 */ 11446 private static boolean isDirectAttachmentModeCallShape( 11447 TFunctionCall fn) { 11448 if (fn == null) return false; 11449 if (fn.getWithinGroup() == null) return false; 11450 if (fn.getWindowDef() != null) return false; 11451 if (fn.getFunctionName() == null) return false; 11452 String name = fn.getFunctionName().toString(); 11453 if (name == null || name.isEmpty()) return false; 11454 return "mode".equals(name.toLowerCase(Locale.ROOT)) 11455 && hasNoFunctionArgs(fn); 11456 } 11457 11458 private static boolean hasNoFunctionArgs(TFunctionCall fn) { 11459 return fn != null && (fn.getArgs() == null || fn.getArgs().size() == 0); 11460 } 11461 11462 /** 11463 * Slice 45 (renamed and widened from the slice-44 helper 11464 * {@code allWithinGroupCallsAreDirectAttachmentHypotheticalSet}): 11465 * returns {@code true} iff {@code e} contains at least one WITHIN 11466 * GROUP-bearing function call AND every such call uses an 11467 * <i>admitted</i> Snowflake direct-attachment shape — either 11468 * hypothetical-set ({@link #isDirectAttachmentHypotheticalSetCallShape}, 11469 * slice 44) or mode ({@link #isDirectAttachmentModeCallShape}, 11470 * slice 45). Used to gate the predicate-body vendor whitelist widen 11471 * at the {@code preflightExistsInnerShape} call site so Snowflake is 11472 * admitted only on these probe-confirmed shapes — Snowflake LISTAGG / 11473 * STRING_AGG / percentile_cont / percentile_disc / other names 11474 * remain rejected (their parser-specific argument storage and 11475 * dlineage XML parity are unprobed; slice-31/44 boundary 11476 * preserved). 11477 * 11478 * <p>Mixed expressions (e.g. {@code mode() WG (...) || rank(0.5) 11479 * WG (...)} in a single predicate-body inner projection) are 11480 * admitted when every WG-bearing call is admitted-shape; 11481 * one non-admitted-shape call blocks the whole expression 11482 * (Slice45Test §D). 
11483 */ 11484 private static boolean allWithinGroupCallsAreAdmittedSnowflakeDirectAttachment( 11485 TExpression e) { 11486 if (e == null) return false; 11487 final boolean[] sawAny = {false}; 11488 final boolean[] sawNonAdmitted = {false}; 11489 e.acceptChildren(new TParseTreeVisitor() { 11490 @Override 11491 public void preVisit(TFunctionCall fn) { 11492 if (!hasWithinGroupAnyAttachment(fn)) return; 11493 sawAny[0] = true; 11494 if (!isDirectAttachmentHypotheticalSetCallShape(fn) 11495 && !isDirectAttachmentModeCallShape(fn)) { 11496 sawNonAdmitted[0] = true; 11497 } 11498 } 11499 }); 11500 if (e.getExpressionType() == EExpressionType.function_t) { 11501 TFunctionCall fn = e.getFunctionCall(); 11502 if (fn != null && hasWithinGroupAnyAttachment(fn)) { 11503 sawAny[0] = true; 11504 if (!isDirectAttachmentHypotheticalSetCallShape(fn) 11505 && !isDirectAttachmentModeCallShape(fn)) { 11506 sawNonAdmitted[0] = true; 11507 } 11508 } 11509 } 11510 return sawAny[0] && !sawNonAdmitted[0]; 11511 } 11512 11513 /** 11514 * Slice 27: fail-closed enumeration of admitted projection root shapes 11515 * after the slice-23/24 fast paths and the hard-rejecter guards have 11516 * been considered by {@link #isAdmittedPredicateProjection}. 11517 * Open-ended type checks are intentionally avoided 11518 * (slice-history §C / codex round-1 SHOULD 5). 11519 * 11520 * <p>Admits: 11521 * <ul> 11522 * <li>{@code function_t} — any function call (aggregate or scalar). 11523 * OVER-bearing window functions are rejected by the caller's 11524 * {@code containsWindowFunction} guard (slice 31 narrowed via 11525 * {@link #isWindowDefBearingFunction} so WITHIN-GROUP-only 11526 * windowDef passes; OVER-bearing forms still rejected). 
11527 * {@code FILTER (WHERE ...)} was admitted in slice 28 (with 11528 * FILTER predicate refs excluded from {@code OutputColumn.sources} 11529 * via {@link #collectColumnRefsExcludingFilterClauses} — 11530 * slice 31 widens to 11531 * {@link #collectColumnRefsExcludingFilterAndWithinGroupClauses}). 11532 * PG-style direct {@code fn.withinGroup} attachment was admitted 11533 * in slice 29 via the vendor-gated rejecter at the 11534 * {@link #preflightExistsInnerShape} call site; slice 31 extends 11535 * admission to Oracle / MSSQL windowDef-bearing WITHIN GROUP 11536 * (Snowflake / DB2 / other direct-attachment vendors remain 11537 * rejected pending probe).</li> 11538 * <li>{@code case_t} — simple or searched CASE.</li> 11539 * <li>Pure binary ({@link TExpression#isPureBinaryForDoParse}) — 11540 * arithmetic, concat, comparison.</li> 11541 * <li>{@code parenthesis_t} — descend.</li> 11542 * <li>{@code typecast_t} — PostgreSQL / Snowflake / Redshift 11543 * {@code expr::TYPE} (slice 37; cross-vendor probe slice 38). 11544 * Admit unconditionally; the slice-13 invariant rejecters 11545 * ({@link #containsAnySubqueryExpression} / 11546 * {@link #containsWindowFunction}) fire BEFORE this admit check 11547 * inside {@link #isAdmittedPredicateProjection}, so 11548 * {@code (SELECT 1)::INT} and {@code (ROW_NUMBER() OVER ())::INT} 11549 * are still rejected. 
The default visitor descent walks 11550 * {@code typecast_t.getLeftOperand()} so 11551 * {@code OutputColumn.sources} populates with the underlying 11552 * column refs (probe-verified for PG — 11553 * {@code /tmp/probe37/Probe37.java}; slice 38 extended the probe 11554 * to Snowflake and Redshift — 11555 * {@code /tmp/probe38/Probe38.java}, {@code CheckCurrent.java} — 11556 * and confirmed byte-identical AST + dlineage XML to PG for both 11557 * {@code x.id::VARCHAR [AS lst]} and {@code LOWER(x.id)::VARCHAR} 11558 * composed forms with zero divergence; slice 39 extended the probe 11559 * to Greenplum, Vertica, GaussDB, Netezza — 11560 * {@code /tmp/probe39/Probe39.java}, {@code Probe39b.java} — and 11561 * confirmed AST + dlineage XML byte-identical to the PG / Snowflake / 11562 * Redshift contract for both aliased and unaliased forms with zero 11563 * divergence; slice 40 extended the probe to BigQuery, Trino, Presto, 11564 * EDB, DuckDB, Databricks — 11565 * {@code /tmp/probe40/Probe40.java}, {@code Probe40b.java}, 11566 * {@code Probe40c.java}, {@code Probe40d.java}, {@code Probe40e.java} 11567 * — and confirmed AST + dlineage XML byte-identical to the 11568 * PG / Snowflake / Redshift contract for aliased, unaliased, and 11569 * {@code LOWER(x.id)::VARCHAR} composed forms with zero divergence; 11570 * slice 41 closes out the residual vendor matrix — 11571 * {@code /tmp/probe41/Probe41.java}, {@code Probe41b.java} — 11572 * confirming Informix native {@code typecast_t} (AST + dlineage XML 11573 * byte-identical to the PG / Snowflake / Redshift contract); 11574 * ClickHouse parser auto-lowers {@code expr::TYPE} to 11575 * {@code function_t} so the slice-27 admission applies; 11576 * Sybase / Flink / Dameng parse-fail on {@code ::TYPE} but accept 11577 * {@code CAST(x AS TYPE)} via {@code function_t} (slice-27 11578 * carryover); Exasol / AzureSQL parse {@code expr::TYPE} as 11579 * {@code simple_object_name_t} (vendor-quirk — the {@code ::} is 11580 * interpreted 
as a qualified-name separator, mirroring T-SQL's 11581 * {@code tablename::method()} schema-qualified syntax) so the 11582 * slice-32 exclusion routes via normal column handling; 11583 * OceanBase / Impala / StarRocks parse-fail boundary locked in). 11584 * Oracle uses {@code CAST(x AS TYPE)} which parses as 11585 * {@code function_t} (already admitted above), so no Oracle-specific 11586 * {@code typecast_t} admission is needed; Hive / SparkSQL parse-fail 11587 * on the {@code ::TYPE} syntax — slice 39 pins this boundary; slice 11588 * 40 extends the parse-fail boundary lock-in to DB2, Teradata, MySQL, 11589 * and HANA so a future grammar lift fires loudly and re-probe is 11590 * required before relying on zero-divergence for those dialects. 11591 * The slice-37 admission remains structural (no vendor gate); future 11592 * vendors that surface {@code typecast_t} will be admitted 11593 * automatically — re-probe before relying on zero-divergence 11594 * guarantees.</li> 11595 * </ul> 11596 * Implicitly rejects {@code list_t}, {@code subquery_t} (caught by the 11597 * caller), and any unknown expression type. 11598 */ 11599 private static boolean isAdmittedSlice27ShapeRoot(TExpression e) { 11600 if (e == null) return false; 11601 EExpressionType t = e.getExpressionType(); 11602 if (t == EExpressionType.function_t) return true; 11603 if (t == EExpressionType.case_t) return true; 11604 if (t == EExpressionType.parenthesis_t) { 11605 return e.getLeftOperand() != null 11606 && isAdmittedSlice27ShapeRoot(e.getLeftOperand()); 11607 } 11608 if (t == EExpressionType.typecast_t) return true; // slice 37 (cross-vendor parity probed in slice 38; widened in slice 39 + slice 40; residual vendors locked in by slice 41 — Informix typecast_t) 11609 if (TExpression.isPureBinaryForDoParse(t)) return true; 11610 return false; 11611 } 11612 11613 /** 11614 * Slice 27: visitor-based deep window-function detector. 
Mirrors 11615 * {@link #rejectWindowFunctions} (line ~4530) but boolean-returning so 11616 * it can be used as a guard inside 11617 * {@link #isAdmittedPredicateProjection}. 11618 * 11619 * <p>Slice 31: discriminates WITHIN-GROUP-only windowDef shapes 11620 * (Oracle / MSSQL plain {@code WITHIN GROUP (ORDER BY ...)} attachment 11621 * without OVER) from OVER-bearing ones via 11622 * {@link #isWindowDefBearingFunction}. Plain WITHIN GROUP no longer 11623 * counts as a window function for predicate-body inner-projection 11624 * admission. NOTE: only this helper and {@link #isAggregateFunction} 11625 * are lifted — every other slice-13 invariant rejecter 11626 * ({@link #rejectHavingWindowFunction}, 11627 * {@link #rejectOrderByWindowFunction}, 11628 * {@link #rejectWindowFunctionInScope}, 11629 * {@link #rejectWindowFunctions}, 11630 * {@link #rejectEmbeddedWindowFunction}, 11631 * {@link #isTopLevelWindowProjection}, and the OVER ORDER BY 11632 * window check inside {@code buildWindowOrderRefs}) keeps the 11633 * strict {@code wd != null} check unchanged so HAVING / ORDER BY / 11634 * WHERE / GROUP BY / JOIN ON / top-level projection contexts still 11635 * reject WITHIN-GROUP-only attachments — slice 31 boundary. 11636 */ 11637 private static boolean containsWindowFunction(TExpression e) { 11638 if (e == null) return false; 11639 final boolean[] found = {false}; 11640 e.acceptChildren(new TParseTreeVisitor() { 11641 @Override 11642 public void preVisit(TFunctionCall fn) { 11643 if (found[0]) return; 11644 if (isWindowDefBearingFunction(fn)) found[0] = true; 11645 } 11646 }); 11647 if (!found[0] && e.getExpressionType() == EExpressionType.function_t) { 11648 TFunctionCall fn = e.getFunctionCall(); 11649 if (isWindowDefBearingFunction(fn)) found[0] = true; 11650 } 11651 return found[0]; 11652 } 11653 11654 /** 11655 * Slice 31: discriminate WITHIN-GROUP-only {@link TWindowDef} shapes 11656 * from OVER-bearing ones. 
Returns {@code true} iff: 11657 * <ul> 11658 * <li>{@code wd.getWithinGroup() != null} — the discriminating 11659 * attachment;</li> 11660 * <li>{@code !wd.isIncludingOverClause()} — no OVER syntax of any 11661 * kind (including empty {@code OVER ()});</li> 11662 * <li>{@code wd.getKeepDenseRankClause() == null} — Oracle KEEP 11663 * DENSE_RANK FIRST/LAST is a slice-22 deferred shape and must 11664 * remain windowed.</li> 11665 * </ul> 11666 * 11667 * <p>Probe-validated: {@code TWindowDef.isIncludingOverClause()} is 11668 * {@code false} for plain {@code WITHIN GROUP} and {@code true} for 11669 * any OVER-bearing form (probe Q8 / Q9 / Q11 in 11670 * {@code /tmp/probe31}). 11671 * 11672 * <p>Used by {@link #isWindowDefBearingFunction} (the slice-13 11673 * invariant lift's discriminator). 11674 */ 11675 private static boolean isWithinGroupOnlyWindowDef(TWindowDef wd) { 11676 if (wd == null) return false; 11677 if (wd.isIncludingOverClause()) return false; 11678 if (wd.getWithinGroup() == null) return false; 11679 if (wd.getKeepDenseRankClause() != null) return false; 11680 return true; 11681 } 11682 11683 /** 11684 * Slice 31: a {@link TFunctionCall} is an OVER-bearing window-def 11685 * function iff its {@code windowDef} is non-null AND not 11686 * {@link #isWithinGroupOnlyWindowDef WITHIN-GROUP-only}. Replaces 11687 * the historical {@code fn.getWindowDef() != null} check inside 11688 * {@link #containsWindowFunction} and {@link #isAggregateFunction}; 11689 * every other rejecter retains the strict {@code wd != null} check 11690 * unchanged (slice 31 narrow lift). 
11691 */ 11692 private static boolean isWindowDefBearingFunction(TFunctionCall fn) { 11693 if (fn == null) return false; 11694 TWindowDef wd = fn.getWindowDef(); 11695 if (wd == null) return false; 11696 return !isWithinGroupOnlyWindowDef(wd); 11697 } 11698 11699 /** 11700 * Slice 33: a {@link TFunctionCall} is an admitted top-level 11701 * WITHIN-GROUP-only aggregate iff: 11702 * 11703 * <ul> 11704 * <li>{@link #isWithinGroupOnlyWindowDef} returns true on its 11705 * windowDef (Oracle / MSSQL plain {@code WITHIN GROUP}, no 11706 * OVER, no KEEP DENSE_RANK);</li> 11707 * <li>{@code vendor} is Oracle or MSSQL — explicit gate mirroring 11708 * the slice-31 predicate-body gate at line ~3860. PG / 11709 * Snowflake / DB2 / SparkSQL produce direct 11710 * {@code fn.withinGroup} attachment with {@code windowDef=null} 11711 * and don't reach this helper today, but the explicit gate 11712 * keeps the contract narrow against future parser changes;</li> 11713 * <li>The function name is in {@link #AGGREGATE_FUNCTION_NAMES} 11714 * (LISTAGG / STRING_AGG / SUM / MIN / MAX / MODE / etc.).</li> 11715 * </ul> 11716 * 11717 * <p>Used only by {@link #buildOutputColumns} to fall through to the 11718 * normal aggregate path. The slice-13 invariant rejecters 11719 * ({@link #isTopLevelWindowProjection}, 11720 * {@link #rejectWindowFunctions}, 11721 * {@link #rejectEmbeddedWindowFunction}, 11722 * {@link #rejectHavingWindowFunction}, 11723 * {@link #rejectOrderByWindowFunction}, 11724 * {@link #rejectWindowFunctionInScope}) 11725 * keep the strict {@code wd != null} check unchanged; slice 33 11726 * admission is gated by a single boolean local to 11727 * {@code buildOutputColumns}. 
11728 * 11729 * <p>Non-whitelisted names (PERCENTILE_CONT / PERCENTILE_DISC / 11730 * RANK / DENSE_RANK / PERCENT_RANK / CUME_DIST / user-defined) 11731 * keep routing to {@link #buildWindowOutputColumn} where the 11732 * {@link #WINDOW_FUNCTION_NAMES} guard rejects them as 11733 * "unsupported window function". 11734 */ 11735 private static boolean isAdmittedTopLevelWithinGroupAggregate( 11736 TFunctionCall fn, EDbVendor vendor) { 11737 if (fn == null) return false; 11738 if (!isWithinGroupOnlyWindowDef(fn.getWindowDef())) return false; 11739 if (vendor != EDbVendor.dbvoracle && vendor != EDbVendor.dbvmssql) { 11740 return false; 11741 } 11742 if (fn.getFunctionName() == null) return false; 11743 String name = fn.getFunctionName().toString(); 11744 if (name == null || name.isEmpty()) return false; 11745 String lower = name.toLowerCase(Locale.ROOT); 11746 if (AGGREGATE_FUNCTION_NAMES.contains(lower)) return true; 11747 // Slice 42: hypothetical-set ordered-set aggregates (RANK / 11748 // DENSE_RANK / PERCENT_RANK / CUME_DIST) admitted on 11749 // Oracle / MSSQL with WITHIN-GROUP-only windowDef shape. The 11750 // surrounding {@code isWithinGroupOnlyWindowDef} guard above 11751 // already enforces the shape; the name-set membership here keeps 11752 // PERCENTILE_CONT / PERCENTILE_DISC / user-defined names rejected. 11753 return HYPOTHETICAL_SET_AGGREGATE_NAMES.contains(lower); 11754 } 11755 11756 /** 11757 * Slice 35 / 36 / 46: top-level direct-attachment WITHIN GROUP 11758 * aggregate. 11759 * 11760 * <p>PG stores {@code LISTAGG(... ) WITHIN GROUP (...)} (slice 35) and 11761 * {@code STRING_AGG(... ) WITHIN GROUP (...)} (slice 36) in 11762 * {@code fn.getWithinGroup()} with {@code windowDef=null}. 
Both bypass 11763 * the slice-33 Oracle/MSSQL helper above, but the plain aggregate path 11764 * is otherwise already correct: the root is not a window function, 11765 * {@link #isAggregateFunction} sees the whitelisted name, and default 11766 * visitor descent does not walk direct {@code fn.withinGroup}, so 11767 * sources contain the function argument but not the WITHIN GROUP ORDER 11768 * BY ref. This helper exists only to unlock the slice-34 expression-text 11769 * fallback for the unaliased top-level form. 11770 * 11771 * <p>Slice 36 widens the PG name whitelist from {@code {listagg}} to 11772 * {@code {listagg, string_agg}}. Snowflake / DB2 / SparkSQL 11773 * {@code LISTAGG} / {@code STRING_AGG} WG remain rejected because 11774 * their argument-storage shape (e.g. DB2's {@code stringExpr}/ 11775 * {@code separatorExpr}) is not yet probed for visitor descent and 11776 * silent empty {@code OutputColumn.sources} would manufacture 11777 * {@code IR_MISSING_DEPENDENCY} divergence. 11778 * 11779 * <p>Slice 46 widens the vendor gate to additionally admit 11780 * Snowflake — but only for {@code mode}, the only Snowflake 11781 * direct-attachment WITHIN GROUP name whose dlineage XML has been 11782 * probe-confirmed byte-equivalent to PG (the projector's slice-30 11783 * vendor-agnostic {@code AGGREGATE_FUNCTION_NAMES} + 11784 * {@code ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES} entries already 11785 * cover {@code mode}, so no projector change is needed). Snowflake 11786 * {@code listagg} / {@code string_agg} / hypothetical-set names 11787 * stay out of slice-46 scope. 11788 * 11789 * <p>Slice 47 widens the PG name whitelist from 11790 * {@code {listagg, string_agg}} to {@code {listagg, string_agg, mode}}, 11791 * the symmetrical lift to slice 46's Snowflake widen. 
The slice-46 11792 * pre-plan probe ({@code /tmp/Probe46Slice30.java}) already 11793 * confirmed PG aliased {@code mode()} WG was zero-divergence and the 11794 * unaliased form was blocked only by {@code effectiveOutputName}; PG 11795 * top-level {@code mode()} dlineage XML is byte-identical to 11796 * Snowflake's. No projector change is needed (slice 30 already 11797 * registered {@code mode}, vendor-agnostic). 11798 */ 11799 private static boolean isAdmittedTopLevelDirectWithinGroupAggregate( 11800 TFunctionCall fn, EDbVendor vendor) { 11801 if (fn == null) return false; 11802 if (fn.getWithinGroup() == null) return false; 11803 if (fn.getWindowDef() != null) return false; 11804 if (fn.getFunctionName() == null) return false; 11805 String name = fn.getFunctionName().toString(); 11806 if (name == null || name.isEmpty()) return false; 11807 String lower = name.toLowerCase(Locale.ROOT); 11808 if (vendor == EDbVendor.dbvpostgresql) { 11809 // Slice 35/36: PG LISTAGG / STRING_AGG WG. 11810 // Slice 47: PG mode() WG (parallels slice-46 Snowflake mode 11811 // lift; probed mode() carries no positional argument so 11812 // OutputColumn.sources is trivially empty matching 11813 // dlineage's zero-edge canonical model). 11814 return "listagg".equals(lower) 11815 || "string_agg".equals(lower) 11816 || ("mode".equals(lower) && hasNoFunctionArgs(fn)); 11817 } 11818 if (vendor == EDbVendor.dbvsnowflake) { 11819 // Slice 46: Snowflake mode() WG only. The admitted shape is 11820 // constrained to the probed no-arg mode() form: OutputColumn. 11821 // sources is trivially empty (matching dlineage's zero-edge 11822 // canonical model), and dlineage XML was probe-confirmed 11823 // byte-identical to PG's mode() WG XML for both aliased and 11824 // unaliased forms. 11825 return "mode".equals(lower) && hasNoFunctionArgs(fn); 11826 } 11827 return false; 11828 } 11829 /** 11830 * Slice 27: detect {@code FILTER (WHERE ...)} on any function call in 11831 * the expression subtree. 
Probe Q5 (PostgreSQL) confirmed dlineage's 11832 * {@code fdr clause="on"} omits FILTER-predicate column refs while 11833 * {@link #collectColumnRefs} would include them — canonical-model 11834 * divergence. Reject as slice-27 boundary. 11835 */ 11836 private static boolean containsAggregateWithFilter(TExpression e) { 11837 if (e == null) return false; 11838 final boolean[] found = {false}; 11839 e.acceptChildren(new TParseTreeVisitor() { 11840 @Override 11841 public void preVisit(TFunctionCall fn) { 11842 if (found[0]) return; 11843 if (fn.getFilterClause() != null) found[0] = true; 11844 } 11845 }); 11846 if (!found[0] && e.getExpressionType() == EExpressionType.function_t) { 11847 TFunctionCall fn = e.getFunctionCall(); 11848 if (fn != null && fn.getFilterClause() != null) found[0] = true; 11849 } 11850 return found[0]; 11851 } 11852 11853 /** 11854 * Slice 23: true iff every leaf of {@code e} is a {@code simple_constant_t}. 11855 * Admits {@code 1}, {@code 1+1}, {@code (1)}, {@code 'a' || 'b'} (vendor- 11856 * dependent). Slice 61 additionally admits unary {@code +}/{@code -} 11857 * wrappers over a constant operand so common signed literals like 11858 * {@code -1} and {@code -1.5} count as constants. Rejects column refs, 11859 * function calls (including {@code COALESCE}), CASE, scalar subqueries, 11860 * etc. The predicate body may STILL have inner WHERE / GROUP BY / 11861 * HAVING / ORDER BY referencing inner columns — only the projection 11862 * must be constant. 11863 */ 11864 private static boolean isConstantExpression(TExpression e) { 11865 if (e == null) return false; 11866 EExpressionType t = e.getExpressionType(); 11867 if (t == EExpressionType.simple_constant_t) return true; 11868 if (t == EExpressionType.parenthesis_t) { 11869 return e.getLeftOperand() != null && isConstantExpression(e.getLeftOperand()); 11870 } 11871 // Slice 61: unary +/- over a constant operand. 
The Oracle parser 11872 // emits `-1` as {@code unary_minus_t} with {@code left=null} and 11873 // {@code right=simple_constant_t(1)}. Pre-slice-61 this fell 11874 // through and was rejected; slice 61 lifts it so signed literals 11875 // like `SELECT -1 FROM t` and `SELECT -1 UNION ALL SELECT -2` 11876 // round-trip through the constant-projection path. 11877 if (t == EExpressionType.unary_minus_t || t == EExpressionType.unary_plus_t) { 11878 TExpression operand = e.getRightOperand() != null 11879 ? e.getRightOperand() 11880 : e.getLeftOperand(); 11881 return operand != null && isConstantExpression(operand); 11882 } 11883 // Pure binary ops (slice-22 isPureBinaryForDoParse helper) — both 11884 // operands must be constant. Concatenation, arithmetic, etc. 11885 if (TExpression.isPureBinaryForDoParse(t)) { 11886 TExpression l = e.getLeftOperand(); 11887 TExpression r = e.getRightOperand(); 11888 return l != null && isConstantExpression(l) 11889 && r != null && isConstantExpression(r); 11890 } 11891 return false; 11892 } 11893 11894 /** 11895 * Slice 23: reject subqueries in an EXISTS body's inner WHERE / JOIN ON / 11896 * GROUP BY / HAVING / ORDER BY (would be predicate subqueries OR scalar 11897 * subqueries). Mirrors the slice-11 11898 * {@link #rejectSubqueriesInScalarBodyClauses} structure but uses a 11899 * slice-23-specific error message. 
11900 */ 11901 private static void rejectSubqueriesInPredicateBodyClauses(TSelectSqlStatement inner) { 11902 TWhereClause where = inner.getWhereClause(); 11903 if (where != null && containsAnySubquery(where)) { 11904 throw new SemanticIRBuildException( 11905 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_SUBQUERY_IN_WHERE, 11906 "EXISTS in JOIN ON: inner SELECT has a subquery in its WHERE clause; " 11907 + "not supported yet", null)); 11908 } 11909 if (inner.joins != null) { 11910 for (TJoin join : inner.joins) { 11911 TJoinItemList items = join.getJoinItems(); 11912 if (items == null) continue; 11913 for (int i = 0; i < items.size(); i++) { 11914 TJoinItem item = items.getJoinItem(i); 11915 TExpression onCond = item == null ? null : item.getOnCondition(); 11916 if (onCond != null && containsAnySubqueryExpression(onCond)) { 11917 throw new SemanticIRBuildException( 11918 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_SUBQUERY_IN_JOIN_ON, 11919 "EXISTS in JOIN ON: inner SELECT has a subquery in a JOIN ON " 11920 + "clause; not supported yet", null)); 11921 } 11922 } 11923 } 11924 } 11925 TGroupBy groupBy = inner.getGroupByClause(); 11926 if (groupBy != null) { 11927 TGroupByItemList items = groupBy.getItems(); 11928 if (items != null && containsAnySubquery(items)) { 11929 throw new SemanticIRBuildException( 11930 Diagnostic.error(DiagnosticCode.JOIN_ON_EXISTS_INNER_SUBQUERY_IN_GROUP_BY, 11931 "EXISTS in JOIN ON: inner SELECT has a subquery in a GROUP BY clause; " 11932 + "not supported yet", null)); 11933 } 11934 } 11935 // HAVING / ORDER BY subqueries are caught by the slice-9 / 10 11936 // deep-scan rejecters that fire during the recursive build. 11937 } 11938 11939 /** 11940 * Slice 25 (impl-review S2-fix): walk {@code onCond} (root + every 11941 * descendant outside {@code extractedRoots}) and return the first 11942 * wrapper expression whose <b>left</b> operand is a 11943 * {@code subquery_t}. Returns null if none. 
11944 */ 11945 private static TExpression findSubqueryOnLeftWrapper(TExpression onCond, 11946 final Set<TExpression> extractedRoots) { 11947 if (onCond == null) return null; 11948 if (isSubqueryOnLeftOfWrapper(onCond)) return onCond; 11949 final TExpression[] found = {null}; 11950 onCond.acceptChildren(new TParseTreeVisitor() { 11951 int skipDepth = 0; 11952 11953 @Override 11954 public void preVisit(TExpression e) { 11955 if (found[0] != null) return; 11956 if (extractedRoots.contains(e)) { 11957 skipDepth++; 11958 return; 11959 } 11960 if (skipDepth > 0) return; 11961 if (isSubqueryOnLeftOfWrapper(e)) { 11962 found[0] = e; 11963 } 11964 } 11965 11966 @Override 11967 public void postVisit(TExpression e) { 11968 if (extractedRoots.contains(e) && skipDepth > 0) { 11969 skipDepth--; 11970 } 11971 } 11972 }); 11973 return found[0]; 11974 } 11975 11976 /** 11977 * Slice 25 / Slice 26: true iff {@code e} is a comparison/IN/ 11978 * quantifier wrapper that the slice-25 / slice-26 surface still 11979 * rejects on subquery-positioning grounds. 11980 * 11981 * <p>Slice 26 narrowing: {@code simple_comparison_t} with subquery 11982 * on the LHS and a non-subquery RHS is now ADMITTED (returns 11983 * false here so the post-extraction rejecter doesn't fire). Other 11984 * shapes still fail: 11985 * <ul> 11986 * <li>{@code in_t} with LHS=subquery: dlineage's 11987 * {@code fdr clause="on"} sources omit the outer column for 11988 * IN-LHS, so admitting on the IR side would manufacture 11989 * canonical-model divergence (still rejected).</li> 11990 * <li>{@code group_comparison_t} with LHS=subquery: borderline 11991 * grammar; defensively rejected (slice-26 boundary).</li> 11992 * <li>{@code simple_comparison_t} with subqueries on BOTH 11993 * sides: would require dual extraction; deferred to a future 11994 * slice (slice-26 boundary). 
Caller emits a tuned message.</li> 11995 * </ul> 11996 */ 11997 private static boolean isSubqueryOnLeftOfWrapper(TExpression e) { 11998 if (e == null) return false; 11999 EExpressionType t = e.getExpressionType(); 12000 TExpression l = e.getLeftOperand(); 12001 TExpression r = e.getRightOperand(); 12002 boolean lhsIsSubq = l != null && l.getExpressionType() == EExpressionType.subquery_t; 12003 boolean rhsIsSubq = r != null && r.getExpressionType() == EExpressionType.subquery_t; 12004 if (t == EExpressionType.in_t) { 12005 return lhsIsSubq; 12006 } 12007 if (t == EExpressionType.group_comparison_t) { 12008 return lhsIsSubq; 12009 } 12010 if (t == EExpressionType.simple_comparison_t) { 12011 // Slice 26: lifted UNLESS both sides are subqueries. 12012 return lhsIsSubq && rhsIsSubq; 12013 } 12014 return false; 12015 } 12016 12017 /** 12018 * Slice 26: true iff {@code e} is a {@code simple_comparison_t} 12019 * whose BOTH operands are {@code subquery_t}. Used by the 12020 * post-extraction rejecter to emit a slice-26-specific tuned 12021 * message distinguishing this case from the slice-25 LHS-subquery 12022 * shapes. 12023 */ 12024 private static boolean isComparisonWithBothSubqueries(TExpression e) { 12025 if (e == null) return false; 12026 if (e.getExpressionType() != EExpressionType.simple_comparison_t) return false; 12027 TExpression l = e.getLeftOperand(); 12028 TExpression r = e.getRightOperand(); 12029 return l != null && l.getExpressionType() == EExpressionType.subquery_t 12030 && r != null && r.getExpressionType() == EExpressionType.subquery_t; 12031 } 12032 12033 /** 12034 * Slice 23: after extraction, any remaining subquery-bearing expression in 12035 * the JOIN-ON tree is an unsupported shape — EXISTS that failed 12036 * extraction (inner-shape rejection), correlated wrappers, subquery on 12037 * left side, etc. 
Catch them here with a tuned message before 12038 * {@link #collectColumnRefsSkipping} would otherwise descend into them 12039 * and bind their inner refs against the outer scope. 12040 * 12041 * <p>Slice 25 (impl-review S2-fix): subquery-on-LEFT cases get a 12042 * tuned message (uses the slice-25 outer-shape prefix 12043 * "predicate subquery in JOIN ON:") via 12044 * {@link #findSubqueryOnLeftWrapper}. 12045 * 12046 * <p>Slice 26: a NEW first pass (before the slice-25 LHS-subquery 12047 * pass) detects {@code simple_comparison_t} wrappers with 12048 * subqueries on BOTH sides via 12049 * {@link #findComparisonWithBothSubqueries} and emits a slice-26 12050 * tuned message. Both-sides shape satisfies 12051 * {@link #isSubqueryOnLeftOfWrapper} (which slice 26 narrowed to 12052 * {@code lhsIsSubq && rhsIsSubq} for {@code simple_comparison_t}), 12053 * so ordering matters — without the both-sides first pass, the 12054 * slice-25 LHS-subquery wording would fire first. 12055 */ 12056 private static void rejectAnyRemainingSubqueriesInJoinOn(TExpression onCond, 12057 final Set<TExpression> extractedRoots) { 12058 rejectAnyRemainingSubqueriesFromClause(onCond, extractedRoots, 12059 PredicateClauseContext.JOIN_ON); 12060 } 12061 12062 /** 12063 * Slice 110 — clause-agnostic remaining-subquery rejecter. Mirrors 12064 * the slice-26 logic in {@link #rejectAnyRemainingSubqueriesInJoinOn} 12065 * but uses {@code ctx.*} codes / labels so the same body powers JOIN-ON 12066 * (slice 26) and UPDATE WHERE (slice 110). 12067 */ 12068 private static void rejectAnyRemainingSubqueriesFromClause(TExpression onCond, 12069 final Set<TExpression> extractedRoots, 12070 final PredicateClauseContext ctx) { 12071 if (onCond == null) return; 12072 // Slice 26: tuned message for a comparison with subqueries on 12073 // BOTH sides. Fires at root or any descent. 
Checked BEFORE the 12074 // slice-25 subquery-on-LEFT pass because both-subqueries 12075 // satisfies isSubqueryOnLeftOfWrapper too — without this 12076 // ordering the slice-25 wording would fire first. 12077 TExpression bothSubqueriesWrapper = findComparisonWithBothSubqueries(onCond, 12078 extractedRoots); 12079 if (bothSubqueriesWrapper != null) { 12080 throw new SemanticIRBuildException( 12081 Diagnostic.error(ctx.scalarComparisonBothSides, 12082 "predicate subquery in " + ctx.clauseLabel + ": scalar comparison with " 12083 + "subqueries on both sides is not supported yet " 12084 + "(slice 26 admits exactly one subquery side, with a " 12085 + "single column reference on the other side; rewrite " 12086 + "as a join across a derived table or a CTE)", null)); 12087 } 12088 // Slice 25 (impl-review S2-fix): tuned message for subquery 12089 // on the LEFT side of a wrapper. Fires at root or any descent. 12090 // Slice 26 narrowed isSubqueryOnLeftOfWrapper: 12091 // simple_comparison_t with LHS=subquery and non-subquery RHS is 12092 // now ADMITTED, so this rejecter only fires for in_t-LHS-subq / 12093 // group_comparison_t-LHS-subq (still rejected as asymmetric / 12094 // borderline shapes). 
12095 TExpression leftSubqueryWrapper = findSubqueryOnLeftWrapper(onCond, extractedRoots); 12096 if (leftSubqueryWrapper != null) { 12097 throw new SemanticIRBuildException( 12098 Diagnostic.error(ctx.predicateSubqueryOnLeft, 12099 "predicate subquery in " + ctx.clauseLabel + ": " 12100 + leftSubqueryWrapper.getExpressionType() 12101 + " wrapper has a subquery on the LEFT side " 12102 + "(only RHS-subquery IN / ANY-ALL-SOME and " 12103 + "either-side scalar comparison are admitted; " 12104 + "rewrite to put the subquery on the right side, " 12105 + "or rewrite as a join across a derived table)", null)); 12106 } 12107 // Root check: if the entire condition IS an EXISTS root and it WASN'T 12108 // extracted (meaning extraction threw an exception, which should not 12109 // reach here, OR some other root subquery shape), reject. The root 12110 // walker would otherwise miss it. 12111 TExpression rootSubject = isExistsRoot(onCond) ? unwrapExistsRoot(onCond) : onCond; 12112 if (rootSubject != null 12113 && (rootSubject.getExpressionType() == EExpressionType.subquery_t 12114 || rootSubject.getSubQuery() != null 12115 || rootSubject.getExpressionType() == EExpressionType.exists_t) 12116 && !extractedRoots.contains(rootSubject)) { 12117 throw new SemanticIRBuildException( 12118 Diagnostic.error(ctx.genericSubqueryNotSupported, 12119 "subquery in " + ctx.clauseLabel + " predicate is not supported yet " 12120 + "(slice 26 accepts only uncorrelated EXISTS / " 12121 + "IN-SELECT / scalar-comparison / ANY-ALL-SOME with " 12122 + "single column-ref or constant-only inner projection " 12123 + "and a single column ref on the non-subquery side)", null)); 12124 } 12125 final boolean[] found = {false}; 12126 onCond.acceptChildren(new TParseTreeVisitor() { 12127 int skipDepth = 0; 12128 12129 @Override 12130 public void preVisit(TExpression e) { 12131 if (found[0]) return; 12132 if (extractedRoots.contains(e)) { 12133 skipDepth++; 12134 return; 12135 } 12136 if (skipDepth > 0) return; 
12137 if (e.getExpressionType() == EExpressionType.subquery_t 12138 || e.getSubQuery() != null 12139 || e.getExpressionType() == EExpressionType.exists_t) { 12140 found[0] = true; 12141 } 12142 } 12143 12144 @Override 12145 public void postVisit(TExpression e) { 12146 if (extractedRoots.contains(e) && skipDepth > 0) { 12147 skipDepth--; 12148 } 12149 } 12150 }); 12151 if (found[0]) { 12152 throw new SemanticIRBuildException( 12153 Diagnostic.error(ctx.genericSubqueryNotSupported, 12154 "subquery in " + ctx.clauseLabel + " predicate is not supported yet " 12155 + "(slice 26 accepts only uncorrelated EXISTS / " 12156 + "IN-SELECT / scalar-comparison / ANY-ALL-SOME with " 12157 + "single column-ref or constant-only inner projection " 12158 + "and a single column ref on the non-subquery side)", null)); 12159 } 12160 } 12161 12162 /** 12163 * Slice 26: walk {@code onCond} (root + every descendant outside 12164 * {@code extractedRoots}) and return the first 12165 * {@code simple_comparison_t} expression whose BOTH operands are 12166 * {@code subquery_t}. Returns null if none. 
12167 */ 12168 private static TExpression findComparisonWithBothSubqueries(TExpression onCond, 12169 final Set<TExpression> extractedRoots) { 12170 if (onCond == null) return null; 12171 if (isComparisonWithBothSubqueries(onCond)) return onCond; 12172 final TExpression[] found = {null}; 12173 onCond.acceptChildren(new TParseTreeVisitor() { 12174 int skipDepth = 0; 12175 12176 @Override 12177 public void preVisit(TExpression e) { 12178 if (found[0] != null) return; 12179 if (extractedRoots.contains(e)) { 12180 skipDepth++; 12181 return; 12182 } 12183 if (skipDepth > 0) return; 12184 if (isComparisonWithBothSubqueries(e)) { 12185 found[0] = e; 12186 } 12187 } 12188 12189 @Override 12190 public void postVisit(TExpression e) { 12191 if (extractedRoots.contains(e) && skipDepth > 0) { 12192 skipDepth--; 12193 } 12194 } 12195 }); 12196 return found[0]; 12197 } 12198 12199 /** 12200 * Slice 23: variant of {@link #rejectWindowFunctionInScope} that skips 12201 * subtrees in {@code skipRoots}. The outer-SELECT JOIN-ON path passes the 12202 * extracted EXISTS roots so a window function inside an extracted body 12203 * is NOT incorrectly rejected as a window in the outer JOIN-ON. (The 12204 * inner statement's own buildSelectStatement does its own 12205 * rejectWindowFunctionInScope sweeps on WHERE / GROUP BY / HAVING / 12206 * ORDER BY, so legitimate inner-window violations still surface.) 12207 */ 12208 private static void rejectWindowFunctionInScopeSkipping( 12209 gudusoft.gsqlparser.nodes.TParseTreeNode root, 12210 String clauseLabel, 12211 final Set<TExpression> skipRoots) { 12212 if (root == null) return; 12213 // Root fast path: if the root itself IS a skipped subtree, nothing to 12214 // check. (acceptChildren wouldn't see the root anyway.) 
12215 if (root instanceof TExpression && skipRoots.contains(root)) { 12216 return; 12217 } 12218 final boolean[] found = {false}; 12219 root.acceptChildren(new TParseTreeVisitor() { 12220 int skipDepth = 0; 12221 12222 @Override 12223 public void preVisit(TExpression e) { 12224 if (skipRoots.contains(e)) { 12225 skipDepth++; 12226 } 12227 } 12228 12229 @Override 12230 public void postVisit(TExpression e) { 12231 if (skipRoots.contains(e) && skipDepth > 0) { 12232 skipDepth--; 12233 } 12234 } 12235 12236 @Override 12237 public void preVisit(TFunctionCall fn) { 12238 if (found[0] || skipDepth > 0) return; 12239 if (fn.getWindowDef() != null) found[0] = true; 12240 } 12241 }); 12242 if (found[0]) { 12243 throw new SemanticIRBuildException( 12244 Diagnostic.error(DiagnosticCode.CLAUSE_WINDOW_FUNCTION_LEAK, 12245 clauseLabel + " contains a window function (OVER (...)); " 12246 + "window functions are not allowed in " + clauseLabel 12247 + " per standard SQL", root)); 12248 } 12249 } 12250 12251 /** 12252 * Slice 23: variant of {@link #collectColumnRefs} that skips subtrees in 12253 * {@code skipRoots}. Used by the outer-SELECT JOIN-ON path so the 12254 * extracted EXISTS bodies' inner refs do not leak into outer 12255 * {@code joinColumnRefs}. 12256 */ 12257 private static List<ColumnRef> collectColumnRefsSkipping( 12258 gudusoft.gsqlparser.nodes.TParseTreeNode root, 12259 final NameBindingProvider provider, 12260 final Set<TExpression> skipRoots) { 12261 // Slice 31 refactor: delegate to the extended variant with no 12262 // TWithinGroup skips. Behavior preserved exactly for the 12263 // existing slice-28 caller (outer JOIN-ON path at line ~3021) 12264 // and for the new slice-31 caller via 12265 // {@link #collectColumnRefsExcludingFilterAndWithinGroupClauses}. 
12266 return collectColumnRefsSkippingExtended(root, provider, 12267 skipRoots, Collections.<TWithinGroup>emptySet()); 12268 } 12269 12270 /** 12271 * Slice 28: collect every non-null {@link TFunctionCall#getFilterClause()} 12272 * subtree reachable from {@code root}. The returned set is identity-keyed 12273 * (uses {@link IdentityHashMap}) — required because the parser may yield 12274 * two structurally-equal FILTER WHERE expressions with different 12275 * identities; a value-keyed set would coalesce them and the 12276 * downstream {@link #collectColumnRefsSkipping} call would skip only one. 12277 * 12278 * <p>Contract: {@code root} is one of {@link TResultColumn} or 12279 * {@link TExpression}. Both call sites 12280 * ({@link #collectColumnRefsExcludingFilterClauses} for projection 12281 * source collection; the slice-28 correlation walk inside 12282 * {@link #extractOnePredicateSubqueryBody}) pass values of those 12283 * two types. Other {@code TParseTreeNode} subclasses are accepted 12284 * defensively (the visitor scan still works) but the top-level 12285 * direct-check fast paths only cover {@code TResultColumn} and 12286 * {@code TExpression}; this is intentional — adding a 12287 * {@code TFunctionCall} fast path would be reachable only if a 12288 * future call site were added with a {@code TFunctionCall} root. 12289 * 12290 * <p>Visitor-driven; descends into all expression subtrees the 12291 * standard {@code TFunctionCall.acceptChildren} path visits — function 12292 * args, {@code OVER} (analyticFunction / windowDef), FILTER, CASE arms, 12293 * parenthesised sub-expressions, AND the 12294 * {@code windowDef.withinGroup.orderBy} path on Oracle / MSSQL / 12295 * SparkSQL parsers. Note: the PostgreSQL parser stores WITHIN GROUP 12296 * on the direct {@code fn.withinGroup} field, which 12297 * {@code TFunctionCall.acceptChildren} does NOT visit, so PG WITHIN 12298 * GROUP ORDER BY refs are invisible to this collector. 
Slice 29 12299 * relies on that asymmetry to admit PG WITHIN GROUP aggregates in 12300 * predicate-subquery inner projections without a source-skip. 12301 * Used by: 12302 * <ul> 12303 * <li>{@link #collectColumnRefsExcludingFilterClauses} (Pass 1) — the 12304 * global source-skip in {@link #buildOutputColumns}.</li> 12305 * <li>{@link #extractOnePredicateSubqueryBody}'s slice-28 correlation 12306 * walk — projection-only correlation check for FILTER predicate 12307 * refs.</li> 12308 * </ul> 12309 */ 12310 private static Set<TExpression> collectFilterClauses( 12311 gudusoft.gsqlparser.nodes.TParseTreeNode root) { 12312 final Set<TExpression> out = 12313 Collections.newSetFromMap(new IdentityHashMap<TExpression, Boolean>()); 12314 if (root == null) return out; 12315 // Visitor descends into all expression subtrees; preVisit on 12316 // TFunctionCall records the filter clause if present. The 12317 // visitor's preVisit(TFunctionCall) does NOT fire for a top-level 12318 // function_t expression's root TFunctionCall (matches the slice-13 / 12319 // slice-27 visitor descent behavior); the defensive direct checks 12320 // below cover the top-level case for the two supported root types. 
12321 root.acceptChildren(new TParseTreeVisitor() { 12322 @Override 12323 public void preVisit(TFunctionCall fn) { 12324 TExpression f = fn.getFilterClause(); 12325 if (f != null) out.add(f); 12326 } 12327 }); 12328 if (root instanceof TExpression) { 12329 TExpression e = (TExpression) root; 12330 if (e.getExpressionType() == EExpressionType.function_t) { 12331 TFunctionCall fn = e.getFunctionCall(); 12332 if (fn != null && fn.getFilterClause() != null) { 12333 out.add(fn.getFilterClause()); 12334 } 12335 } 12336 } else if (root instanceof TResultColumn) { 12337 TResultColumn rc = (TResultColumn) root; 12338 TExpression e = rc.getExpr(); 12339 if (e != null && e.getExpressionType() == EExpressionType.function_t) { 12340 TFunctionCall fn = e.getFunctionCall(); 12341 if (fn != null && fn.getFilterClause() != null) { 12342 out.add(fn.getFilterClause()); 12343 } 12344 } 12345 } 12346 return out; 12347 } 12348 12349 /** 12350 * Slice 30: collect every qualifier alias (the {@code x} of an 12351 * {@code x.region} TObjectName column reference) reachable from 12352 * {@code root}, without going through the resolver. Used by the 12353 * slice-30 WITHIN GROUP ORDER BY correlation walk in 12354 * {@link #extractOnePredicateSubqueryBody}. 12355 * 12356 * <p>Why bypass the resolver: PostgreSQL's parser stores WITHIN GROUP 12357 * on the direct {@code fn.withinGroup} field. {@code TFunctionCall.acceptChildren} 12358 * does NOT descend into that field, AND Resolver2 follows the same 12359 * traversal, so its {@code ResolutionResult} is null on TObjectName 12360 * nodes inside {@code fn.withinGroup.orderBy}. Calling 12361 * {@link #collectColumnRefs} on the WG ORDER BY would route through 12362 * {@code provider.bindColumn} → {@code NOT_FOUND}, and the 12363 * {@code non-exact column bindings} check would throw on legitimate 12364 * non-correlated refs. 
The qualifier-only collector here reads the 12365 * alias straight off the TObjectName via {@link TObjectName#getTableString()}, 12366 * matching the slice-23 correlation invariant: qualified refs only. 12367 * Unqualified refs are out of scope (same schema-less limitation as 12368 * the rest of slice-23). 12369 * 12370 * <p>Returned in iteration order (so error messages identify the 12371 * first offender) using a list rather than a set. 12372 */ 12373 private static List<String> collectQualifierAliases( 12374 gudusoft.gsqlparser.nodes.TParseTreeNode root) { 12375 final List<String> out = new ArrayList<>(); 12376 if (root == null) return out; 12377 root.acceptChildren(new TParseTreeVisitor() { 12378 @Override 12379 public void preVisit(TObjectName node) { 12380 if (node.getDbObjectType() != EDbObjectType.column) return; 12381 String t = node.getTableString(); 12382 if (t != null && !t.isEmpty()) out.add(t); 12383 } 12384 }); 12385 return out; 12386 } 12387 12388 /** 12389 * Slice 30 / Slice 31: collect every WITHIN-GROUP {@code ORDER BY} 12390 * clause anywhere in the subtree, identity-keyed so two 12391 * structurally-equal order-by clauses don't collapse. 12392 * 12393 * <p>Two attachment styles are covered: 12394 * <ul> 12395 * <li><b>Slice 30 — direct attachment</b>: PostgreSQL / 12396 * Snowflake / DB2 / SparkSQL parsers store WITHIN GROUP on 12397 * {@code fn.getWithinGroup()}. The default 12398 * {@code TFunctionCall.acceptChildren} does NOT descend into 12399 * that field, so the slice-23 {@link #collectAllInnerRefs} 12400 * walk is blind to outer-alias references inside the ORDER BY. 12401 * The slice-30 correlation walk needs explicit access, hence 12402 * this helper.</li> 12403 * <li><b>Slice 31 — windowDef attachment</b>: Oracle / MSSQL 12404 * parsers store WITHIN GROUP on 12405 * {@code fn.getWindowDef().getWithinGroup()} when the 12406 * windowDef is {@link #isWithinGroupOnlyWindowDef 12407 * WITHIN-GROUP-only}. 
The default {@code acceptChildren} 12408 * DOES descend through {@code windowDef.acceptChildren} 12409 * which calls {@code withinGroup.acceptChildren} which calls 12410 * {@code orderBy.acceptChildren}, so column refs would 12411 * already appear in {@link #collectAllInnerRefs}-driven 12412 * walks. <b>However</b>, the slice-31 source-skip in 12413 * {@link #collectColumnRefsExcludingFilterAndWithinGroupClauses} 12414 * removes those refs from {@link OutputColumn#getSources()}; 12415 * the slice-23 correlation walk only sees {@code OutputColumn.sources} 12416 * for the projection bucket, so a correlated 12417 * {@code LISTAGG(x.id) WITHIN GROUP (ORDER BY e.region)} on 12418 * Oracle would slip past the slice-23 loop after the source- 12419 * skip. This dual-attachment helper closes that asymmetry. 12420 * (Inner WHERE / JOIN / HAVING / ORDER BY clauses are 12421 * independently rejected by the slice-13 strict 12422 * {@code rejectWindowFunctionInScope} family — Oracle 12423 * LISTAGG WG inside a clause never reaches this helper.)</li> 12424 * </ul> 12425 * 12426 * <p>The visitor's {@code preVisit(TFunctionCall)} does NOT fire for 12427 * a top-level {@code function_t} expression's root TFunctionCall; 12428 * defensive direct checks below cover the top-level case for both 12429 * {@link TExpression} and {@link TResultColumn} roots, mirroring 12430 * {@link #collectFilterClauses}. 12431 * 12432 * <p>Used by {@link #extractOnePredicateSubqueryBody}'s 12433 * projection-only correlation walk (line ~3690) for both 12434 * direct-attachment (slice 30) and windowDef-attachment (slice 31) 12435 * outer-alias references inside WITHIN GROUP ORDER BY. 
12436 */ 12437 private static Set<TOrderBy> collectDirectWithinGroupOrderBys( 12438 gudusoft.gsqlparser.nodes.TParseTreeNode root) { 12439 final Set<TOrderBy> out = 12440 Collections.newSetFromMap(new IdentityHashMap<TOrderBy, Boolean>()); 12441 if (root == null) return out; 12442 root.acceptChildren(new TParseTreeVisitor() { 12443 @Override 12444 public void preVisit(TFunctionCall fn) { 12445 TOrderBy direct = fn.getWithinGroup() == null 12446 ? null : fn.getWithinGroup().getOrderBy(); 12447 if (direct != null) out.add(direct); 12448 TWindowDef wd = fn.getWindowDef(); 12449 if (isWithinGroupOnlyWindowDef(wd)) { 12450 TOrderBy wdOb = wd.getWithinGroup().getOrderBy(); 12451 if (wdOb != null) out.add(wdOb); 12452 } 12453 } 12454 }); 12455 if (root instanceof TExpression) { 12456 TExpression e = (TExpression) root; 12457 if (e.getExpressionType() == EExpressionType.function_t) { 12458 addWithinGroupOrderByIfPresent(e.getFunctionCall(), out); 12459 } 12460 } else if (root instanceof TResultColumn) { 12461 TResultColumn rc = (TResultColumn) root; 12462 TExpression e = rc.getExpr(); 12463 if (e != null && e.getExpressionType() == EExpressionType.function_t) { 12464 addWithinGroupOrderByIfPresent(e.getFunctionCall(), out); 12465 } 12466 } 12467 return out; 12468 } 12469 12470 /** 12471 * Slice 30 / 31: helper for top-level direct check inside 12472 * {@link #collectDirectWithinGroupOrderBys}. Adds the WITHIN GROUP 12473 * ORDER BY clause to {@code out} for whichever attachment style 12474 * the function carries. 
12475 */ 12476 private static void addWithinGroupOrderByIfPresent(TFunctionCall fn, 12477 Set<TOrderBy> out) { 12478 if (fn == null) return; 12479 if (fn.getWithinGroup() != null && fn.getWithinGroup().getOrderBy() != null) { 12480 out.add(fn.getWithinGroup().getOrderBy()); 12481 } 12482 TWindowDef wd = fn.getWindowDef(); 12483 if (isWithinGroupOnlyWindowDef(wd) && wd.getWithinGroup().getOrderBy() != null) { 12484 out.add(wd.getWithinGroup().getOrderBy()); 12485 } 12486 } 12487 12488 /** 12489 * Slice 28: variant of {@link #collectColumnRefs} that excludes column 12490 * refs inside {@code FILTER (WHERE ...)} clauses on any function call 12491 * in the subtree. Used by {@link #buildOutputColumns} for ALL output 12492 * source collection so the IR's per-projection {@code OutputColumn.sources} 12493 * matches dlineage's lineage-relationship view (which omits FILTER 12494 * predicate column refs entirely; see slice-28 probes Q1–Q4). 12495 * 12496 * <p>For projections that contain no FILTER aggregates (the common case), 12497 * Pass 1 yields zero skip-roots and Pass 2 reduces to the plain 12498 * {@link #collectColumnRefs}. The asymmetry between projection sources 12499 * (FILTER-skipped) and clause refs ({@code filterColumnRefs}, 12500 * {@code joinColumnRefs}, {@code groupByColumnRefs}, 12501 * {@code havingColumnRefs}, {@code orderByColumnRefs} — NOT 12502 * FILTER-skipped) is intentional: it keeps the existing 12503 * {@link #collectAllInnerRefs}-driven correlation check at line ~3603 12504 * sufficient for FILTER refs landing in non-projection clauses, while 12505 * the slice-28 correlation walk in {@link #extractOnePredicateSubqueryBody} 12506 * covers projection-FILTER refs. 
12507 */ 12508 private static List<ColumnRef> collectColumnRefsExcludingFilterClauses( 12509 gudusoft.gsqlparser.nodes.TParseTreeNode root, 12510 NameBindingProvider provider) { 12511 Set<TExpression> filterClauses = collectFilterClauses(root); 12512 if (filterClauses.isEmpty()) { 12513 return collectColumnRefs(root, provider); 12514 } 12515 return collectColumnRefsSkipping(root, provider, filterClauses); 12516 } 12517 12518 /** 12519 * Slice 31: identity-keyed set of every {@link TWithinGroup} reachable 12520 * from {@code root} via {@code fn.getWindowDef().getWithinGroup()} — 12521 * the Oracle / MSSQL attachment style for plain {@code WITHIN GROUP 12522 * (ORDER BY ...)} aggregates. Used as additional skip-roots in 12523 * {@link #collectColumnRefsExcludingFilterAndWithinGroupClauses} so 12524 * the column refs inside the WITHIN GROUP ORDER BY do NOT enter 12525 * {@link OutputColumn#getSources()} on Oracle / MSSQL — matching 12526 * dlineage's omission of those refs from {@code fdr clause="on"} 12527 * sources (probe Q1 / Q3 / Q4 / Q5 in {@code /tmp/probe31}). 12528 * 12529 * <p>Discriminator: {@link #isWithinGroupOnlyWindowDef}. OVER-bearing 12530 * windowDefs (real window functions) are NOT collected here — the 12531 * slice-13 invariant rejecters keep them rejected before this 12532 * collector ever fires for projection sources, so they cannot 12533 * reach the source-skip in practice. Defensive: even if they did, 12534 * the discriminator excludes them so PARTITION BY / OVER ORDER BY 12535 * column refs (slice-13 / slice-19 alias-bound contracts) keep 12536 * their existing semantics. 12537 * 12538 * <p>The PostgreSQL direct {@code fn.getWithinGroup()} attachment 12539 * is NOT collected here because PG's 12540 * {@code TFunctionCall.acceptChildren} does not descend into the 12541 * direct field — slice 29 relied on that asymmetry to admit PG 12542 * WITHIN GROUP aggregates without any source-skip; slice 31 12543 * preserves that asymmetry on PG. 
12544 */ 12545 private static Set<TWithinGroup> collectWithinGroupClausesFromWindowDef( 12546 gudusoft.gsqlparser.nodes.TParseTreeNode root) { 12547 final Set<TWithinGroup> out = 12548 Collections.newSetFromMap(new IdentityHashMap<TWithinGroup, Boolean>()); 12549 if (root == null) return out; 12550 root.acceptChildren(new TParseTreeVisitor() { 12551 @Override 12552 public void preVisit(TFunctionCall fn) { 12553 TWindowDef wd = fn.getWindowDef(); 12554 if (isWithinGroupOnlyWindowDef(wd)) { 12555 out.add(wd.getWithinGroup()); 12556 } 12557 } 12558 }); 12559 if (root instanceof TExpression) { 12560 TExpression e = (TExpression) root; 12561 if (e.getExpressionType() == EExpressionType.function_t) { 12562 addWithinGroupFromWindowDefIfPresent(e.getFunctionCall(), out); 12563 } 12564 } else if (root instanceof TResultColumn) { 12565 TResultColumn rc = (TResultColumn) root; 12566 TExpression e = rc.getExpr(); 12567 if (e != null && e.getExpressionType() == EExpressionType.function_t) { 12568 addWithinGroupFromWindowDefIfPresent(e.getFunctionCall(), out); 12569 } 12570 } 12571 return out; 12572 } 12573 12574 /** 12575 * Slice 31: helper for top-level direct check inside 12576 * {@link #collectWithinGroupClausesFromWindowDef}. Mirrors the 12577 * slice-30 {@link #addWithinGroupOrderByIfPresent} helper but adds 12578 * the {@link TWithinGroup} node itself to {@code out} (the entire 12579 * WITHIN GROUP subtree is the skip-root, not just its ORDER BY). 12580 */ 12581 private static void addWithinGroupFromWindowDefIfPresent(TFunctionCall fn, 12582 Set<TWithinGroup> out) { 12583 if (fn == null) return; 12584 TWindowDef wd = fn.getWindowDef(); 12585 if (isWithinGroupOnlyWindowDef(wd)) { 12586 out.add(wd.getWithinGroup()); 12587 } 12588 } 12589 12590 /** 12591 * Slice 31: extends slice-28's filter-skipping projection-source 12592 * collector with an additional skip for Oracle / MSSQL 12593 * {@code fn.windowDef.withinGroup} subtrees. 
Reduces to slice-28 12594 * behavior on PostgreSQL (where windowDef is null) and to plain 12595 * {@link #collectColumnRefs} when neither FILTER nor WITHIN GROUP 12596 * is present. 12597 * 12598 * <p>Used by {@link #buildOutputColumns} for ALL projection source 12599 * collection (predicate-body short-circuit at line ~4952 and 12600 * normal projection loop at line ~5069) so the IR's 12601 * {@link OutputColumn#getSources()} matches dlineage's 12602 * lineage-relationship view across PG / Oracle / MSSQL. The 12603 * non-projection clause-bucket collectors 12604 * ({@code filterColumnRefs}, {@code joinColumnRefs}, 12605 * {@code groupByColumnRefs}, {@code havingColumnRefs}, 12606 * {@code orderByColumnRefs}) intentionally keep using plain 12607 * {@link #collectColumnRefs} — the slice-13 strict 12608 * {@code rejectWindowFunctionInScope} family rejects any 12609 * {@code wd != null} function in those clauses BEFORE collection 12610 * descends into them, so WITHIN GROUP refs cannot leak into 12611 * clause buckets in practice. 12612 */ 12613 private static List<ColumnRef> collectColumnRefsExcludingFilterAndWithinGroupClauses( 12614 gudusoft.gsqlparser.nodes.TParseTreeNode root, 12615 NameBindingProvider provider) { 12616 Set<TExpression> filterClauses = collectFilterClauses(root); 12617 Set<TWithinGroup> withinGroupClauses = collectWithinGroupClausesFromWindowDef(root); 12618 if (filterClauses.isEmpty() && withinGroupClauses.isEmpty()) { 12619 return collectColumnRefs(root, provider); 12620 } 12621 return collectColumnRefsSkippingExtended(root, provider, 12622 filterClauses, withinGroupClauses); 12623 } 12624 12625 /** 12626 * Slice 31: variant of {@link #collectColumnRefsSkipping} that 12627 * additionally skips column refs inside {@link TWithinGroup} 12628 * subtrees in {@code wgSkipRoots} (Oracle / MSSQL 12629 * {@code fn.windowDef.withinGroup} attachment). The existing 12630 * {@code exprSkipRoots} carries the slice-28 FILTER subtrees. 
12631 * Returns column refs in iteration order. 12632 * 12633 * <p>Refactor note: {@link #collectColumnRefsSkipping} now delegates 12634 * to this method with an empty {@code wgSkipRoots} set so its 12635 * behavior is preserved exactly for legacy callers (the outer 12636 * JOIN-ON path at line ~3021). 12637 */ 12638 private static List<ColumnRef> collectColumnRefsSkippingExtended( 12639 gudusoft.gsqlparser.nodes.TParseTreeNode root, 12640 final NameBindingProvider provider, 12641 final Set<TExpression> exprSkipRoots, 12642 final Set<TWithinGroup> wgSkipRoots) { 12643 final LinkedHashSet<ColumnRef> refs = new LinkedHashSet<>(); 12644 final List<String> rejects = new ArrayList<>(); 12645 // Root fast path: if root IS a skipped TExpression subtree, return empty. 12646 if (root instanceof TExpression && exprSkipRoots.contains(root)) { 12647 return new ArrayList<>(refs); 12648 } 12649 root.acceptChildren(new TParseTreeVisitor() { 12650 int skipDepth = 0; 12651 int nestedSelectDepth = 0; 12652 12653 @Override 12654 public void preVisit(TExpression e) { 12655 if (exprSkipRoots.contains(e)) skipDepth++; 12656 } 12657 12658 @Override 12659 public void postVisit(TExpression e) { 12660 if (exprSkipRoots.contains(e) && skipDepth > 0) skipDepth--; 12661 } 12662 12663 @Override 12664 public void preVisit(TWithinGroup wg) { 12665 if (wgSkipRoots.contains(wg)) skipDepth++; 12666 } 12667 12668 @Override 12669 public void postVisit(TWithinGroup wg) { 12670 if (wgSkipRoots.contains(wg) && skipDepth > 0) skipDepth--; 12671 } 12672 12673 @Override 12674 public void preVisit(TSelectSqlStatement nested) { 12675 nestedSelectDepth++; 12676 } 12677 12678 @Override 12679 public void postVisit(TSelectSqlStatement nested) { 12680 nestedSelectDepth--; 12681 } 12682 12683 @Override 12684 public void preVisit(TObjectName node) { 12685 if (skipDepth > 0) return; 12686 if (nestedSelectDepth > 0) return; 12687 appendMergedOrBoundColumnRef(node, provider, refs, rejects); 12688 } 12689 }); 12690 if 
(!rejects.isEmpty()) { 12691 throw new SemanticIRBuildException(Diagnostic.error(DiagnosticCode.COLUMN_BINDING_NON_EXACT, "non-exact column bindings: " + rejects, null)); 12692 } 12693 return new ArrayList<>(refs); 12694 } 12695 12696 /** 12697 * Reject join shapes that would silently drop predicate semantics: 12698 * semi/anti, vendor-specific kinds; predicate-bearing joins with 12699 * no ON and no USING clause; CROSS / NATURAL JOIN with ON or 12700 * USING. Slice 63 admits {@code CROSS JOIN} via 12701 * {@link #ALLOWED_ON_LESS_JOIN_TYPES}. Slice 64 admits 12702 * {@code JOIN ... USING (...)} on predicate join types; the 12703 * per-key {@code joinColumnRefs} emission is handled in 12704 * {@link #buildRelations}. Slice 66 admits {@code NATURAL JOIN} 12705 * via {@link #NATURAL_JOIN_TYPES} when catalog metadata is 12706 * available on both sides; the catalog-required reject fires 12707 * inside {@link #buildRelations}, not here. 12708 */ 12709 private static void rejectUnsupportedJoinShape(TJoinItem item) { 12710 EJoinType jt = item.getJoinType(); 12711 boolean isPredicate = jt != null && ALLOWED_PREDICATE_JOIN_TYPES.contains(jt); 12712 boolean isOnLess = jt != null && ALLOWED_ON_LESS_JOIN_TYPES.contains(jt); 12713 boolean isNatural = isNaturalJoinType(jt); 12714 if (!isPredicate && !isOnLess && !isNatural) { 12715 throw new SemanticIRBuildException( 12716 Diagnostic.error(DiagnosticCode.UNSUPPORTED_JOIN_TYPE, 12717 "join type " + jt + " is not supported yet; " 12718 + "only INNER/LEFT/RIGHT/FULL [OUTER] JOIN ... ON, " 12719 + "JOIN ... 
USING (...), CROSS JOIN, and " 12720 + "NATURAL [INNER/LEFT/RIGHT/FULL [OUTER]] JOIN are accepted", item)); 12721 } 12722 boolean hasUsing = item.getUsingColumns() != null 12723 && item.getUsingColumns().size() > 0; 12724 boolean hasOn = item.getOnCondition() != null; 12725 if (isNatural) { 12726 if (hasOn) { 12727 throw new SemanticIRBuildException( 12728 Diagnostic.error(DiagnosticCode.NATURAL_WITH_ON, 12729 "NATURAL JOIN must not carry an ON condition; rewrite " 12730 + "as JOIN ... ON, or drop the NATURAL keyword", item)); 12731 } 12732 if (hasUsing) { 12733 throw new SemanticIRBuildException( 12734 Diagnostic.error(DiagnosticCode.NATURAL_WITH_USING, 12735 "NATURAL JOIN must not carry a USING clause; choose " 12736 + "either NATURAL or USING, not both", item)); 12737 } 12738 return; 12739 } 12740 if (isOnLess) { 12741 if (hasOn) { 12742 throw new SemanticIRBuildException( 12743 Diagnostic.error(DiagnosticCode.CROSS_WITH_ON, 12744 "CROSS JOIN must not carry an ON condition; rewrite " 12745 + "as INNER JOIN ... ON, or drop the ON clause", item)); 12746 } 12747 if (hasUsing) { 12748 throw new SemanticIRBuildException( 12749 Diagnostic.error(DiagnosticCode.CROSS_WITH_USING, 12750 "CROSS JOIN must not carry a USING clause; rewrite " 12751 + "as INNER JOIN ... USING (...) or drop USING", item)); 12752 } 12753 return; 12754 } 12755 // Predicate-bearing path. 12756 if (hasUsing && hasOn) { 12757 throw new SemanticIRBuildException( 12758 Diagnostic.error(DiagnosticCode.JOIN_WITH_BOTH_ON_AND_USING, 12759 "JOIN cannot carry both ON and USING; choose one", item)); 12760 } 12761 if (!hasUsing && !hasOn) { 12762 throw new SemanticIRBuildException( 12763 Diagnostic.error(DiagnosticCode.JOIN_MISSING_ON_OR_USING, 12764 "JOIN with no ON or USING condition is not supported yet " 12765 + "(implicit joins must be explicit and supported)", item)); 12766 } 12767 } 12768 12769 /** 12770 * Slice 64 — populate {@code joinColumnRefs} for a USING-shaped 12771 * join item. 
Emits refs in <b>left-then-right</b> order per key.
     *
     * <p>Column-source resolution looks at two sources (the in-scope
     * map is consulted first and shadows the catalog when it knows
     * the relation — see {@link #lookupRelationColumnNames}):
     * <ol>
     *   <li>The catalog (via
     *       {@link NameBindingProvider#getRelationColumnNames(TTable)})
     *       for base tables;</li>
     *   <li>The slice-60 in-scope-relation-columns map (via
     *       {@link NameBindingProvider#getInScopeRelationColumns()})
     *       for CTE and FROM-subquery relations, keyed by effective
     *       alias.</li>
     * </ol>
     *
     * <p>Left side uses these two sources to narrow to the prior
     * relations that actually declare the USING key, walking
     * {@code topJoin.getTable()} then
     * {@code items[0..itemIndex-1].getTable()} in FROM order.
     *
     * <p>Right side is always {@code item.getTable()}. When either
     * source declares the right relation's columns and the USING key
     * is absent there, the build is failed-fast with a
     * non-exact-binding-style reject — matching what the resolver
     * does for plain {@code SELECT k} where {@code k} doesn't exist.
     *
     * <p>When neither source has any column info for a prior
     * relation (no catalog and no in-scope map), fall back to
     * emitting a ref for that relation — every metadata-unknown
     * prior relation gets one — so the slice-64 admission still
     * works without a catalog. Same fallback applies to the right
     * side (emit unconditionally).
     *
     * <p>This matches resolver2's all-chain-tables linkage
     * ({@code ScopeBuilder.preVisit(TJoinItem)}) for the cases where
     * catalog/in-scope info is missing, without adopting its
     * over-approximation when info IS available.
12805 */ 12806 private static void populateUsingJoinRefs(TJoin topJoin, 12807 TJoinItemList items, 12808 int itemIndex, 12809 TTable rightTable, 12810 TObjectNameList usingCols, 12811 NameBindingProvider provider, 12812 List<ColumnRef> joinRefsOut) { 12813 // Slice 66: collect the SQL-written USING key spellings and 12814 // delegate to the shared {@link #emitMergedJoinRefs} helper 12815 // which serves both USING (this path) and NATURAL. 12816 List<String> keyNames = new ArrayList<>(usingCols.size()); 12817 for (int k = 0; k < usingCols.size(); k++) { 12818 TObjectName usingKey = usingCols.getObjectName(k); 12819 if (usingKey == null) continue; 12820 String keyName = usingKey.getColumnNameOnly(); 12821 if (keyName == null || keyName.isEmpty()) continue; 12822 keyNames.add(keyName); 12823 } 12824 emitMergedJoinRefs(JoinKind.USING, keyNames, topJoin, items, 12825 itemIndex, rightTable, provider, joinRefsOut); 12826 } 12827 12828 /** 12829 * Slice 66 — discriminator for {@link #emitMergedJoinRefs}. USING 12830 * comes from a syntactic clause and enforces left-and-right 12831 * "key-must-exist" rejects; NATURAL comes from catalog inference 12832 * and never has a missing key by construction. 12833 */ 12834 private enum JoinKind { USING, NATURAL } 12835 12836 /** 12837 * Slice 66 — shared emit-refs helper used by USING and NATURAL. 12838 * Emits per-key {@code joinColumnRefs} in <b>left-then-right</b> 12839 * order, walking every prior FROM relation for the left side. 
The 12840 * {@code kind} discriminator controls: 12841 * 12842 * <ul> 12843 * <li><b>CTE-explicit-column-list deferral</b>: applies to BOTH 12844 * (the diagnostic wording mentions JOIN kind);</li> 12845 * <li><b>Right-side "missing key" reject</b>: USING-only — 12846 * NATURAL keys come from catalog intersection so the key 12847 * must be present on the right by construction;</li> 12848 * <li><b>Left-side "missing key" reject</b>: USING-only — same 12849 * rationale.</li> 12850 * </ul> 12851 * 12852 * <p>Spelling: the caller supplies the emitted spelling (USING 12853 * passes the SQL-written spelling; NATURAL passes the catalog- 12854 * declared spelling of the first contributor — see 12855 * {@link #naturalSharedKeys}). 12856 */ 12857 private static void emitMergedJoinRefs(JoinKind kind, 12858 List<String> keyNames, 12859 TJoin topJoin, 12860 TJoinItemList items, 12861 int itemIndex, 12862 TTable rightTable, 12863 NameBindingProvider provider, 12864 List<ColumnRef> joinRefsOut) { 12865 String rightAlias = effectiveAliasOf(rightTable); 12866 List<TTable> priorRelations = new ArrayList<>(); 12867 if (topJoin.getTable() != null) { 12868 priorRelations.add(topJoin.getTable()); 12869 } 12870 for (int j = 0; j < itemIndex; j++) { 12871 TJoinItem prevItem = items.getJoinItem(j); 12872 if (prevItem != null && prevItem.getTable() != null) { 12873 priorRelations.add(prevItem.getTable()); 12874 } 12875 } 12876 // Slice 60 / 64 / 66 originally rejected USING / NATURAL joins 12877 // against a CTE with an explicit column list because the CTE 12878 // body's StatementGraph published inner-projection names rather 12879 // than the renamed list. Slice 103 lifts that rejection by 12880 // wiring the slice-102 rename helper into the SELECT-side CTE 12881 // walker; the published column list now matches the explicit 12882 // list, so lookupRelationColumnNames returns the renamed names 12883 // from the in-scope map and the merged-key emit below works. 
12884 // `MERGED_JOIN_AGAINST_CTE_WITH_EXPLICIT_COLUMN_LIST` stays 12885 // declared-but-unreached (slice 71/72/82/86/95/96/97/98/99/100/ 12886 // 101/102 precedent). 12887 for (String keyName : keyNames) { 12888 if (keyName == null || keyName.isEmpty()) continue; 12889 12890 // Left side FIRST (matches ON-clause natural reading order). 12891 // For USING: priorRelations with metadata-unknown or 12892 // declared-key emit a ref; missing-key skips. If no ref 12893 // emitted and all priors had known info → USING-only 12894 // reject (NATURAL never reaches this because the catalog 12895 // intersection guarantees at least one contributor). 12896 boolean emittedAnyLeft = false; 12897 boolean allPriorsHadColumnInfo = true; 12898 for (TTable prior : priorRelations) { 12899 List<String> cols = lookupRelationColumnNames(prior, provider); 12900 if (cols == null) { 12901 allPriorsHadColumnInfo = false; 12902 joinRefsOut.add(new ColumnRef( 12903 effectiveAliasOf(prior), keyName)); 12904 emittedAnyLeft = true; 12905 continue; 12906 } 12907 for (String c : cols) { 12908 if (c != null && c.equalsIgnoreCase(keyName)) { 12909 joinRefsOut.add(new ColumnRef( 12910 effectiveAliasOf(prior), keyName)); 12911 emittedAnyLeft = true; 12912 break; 12913 } 12914 } 12915 } 12916 if (kind == JoinKind.USING 12917 && !emittedAnyLeft && allPriorsHadColumnInfo 12918 && !priorRelations.isEmpty()) { 12919 throw new SemanticIRBuildException( 12920 Diagnostic.error(DiagnosticCode.USING_KEY_NOT_DECLARED, 12921 "USING key '" + keyName + "' is not declared on " 12922 + "any left-side relation; check that the " 12923 + "key exists on at least one of the " 12924 + "joined-in relations", rightTable)); 12925 } 12926 12927 // Right side. USING: must exist OR catalog unknown. 12928 // NATURAL: by construction the key is in right's catalog. 12929 // For the unknown-catalog case we still emit (over-approximate). 
12930 List<String> rightCols = lookupRelationColumnNames(rightTable, provider); 12931 if (rightCols == null) { 12932 joinRefsOut.add(new ColumnRef(rightAlias, keyName)); 12933 } else { 12934 boolean rightHasKey = false; 12935 for (String c : rightCols) { 12936 if (c != null && c.equalsIgnoreCase(keyName)) { 12937 rightHasKey = true; 12938 break; 12939 } 12940 } 12941 if (!rightHasKey) { 12942 if (kind == JoinKind.USING) { 12943 throw new SemanticIRBuildException( 12944 Diagnostic.error(DiagnosticCode.USING_KEY_NOT_DECLARED, 12945 "USING key '" + keyName + "' is not declared on " 12946 + "right-side relation '" + rightAlias 12947 + "'; USING requires the key to exist on " 12948 + "both sides", rightTable)); 12949 } 12950 // NATURAL: silently skip — should be unreachable 12951 // because keys come from the intersection. 12952 continue; 12953 } 12954 joinRefsOut.add(new ColumnRef(rightAlias, keyName)); 12955 } 12956 } 12957 } 12958 12959 /** 12960 * Slice 64 — true iff the given table reference is a CTE with an 12961 * explicit column list (e.g. {@code WITH x(a, b) AS ...}). Slice 12962 * 64 originally used this to defer USING joins against such CTEs; 12963 * slice 103 lifted that deferral by wiring the slice-102 rename 12964 * helper into the SELECT-side CTE walker. The helper is retained 12965 * for {@link #buildUsingScope}'s ambiguity check (defense in depth) 12966 * and may be reused by future call sites that need to discriminate 12967 * the shape. 12968 */ 12969 private static boolean hasExplicitCteColumnList(TTable table) { 12970 if (table == null) return false; 12971 TCTE cte = table.getCTE(); 12972 return cte != null && cte.getColumnList() != null 12973 && cte.getColumnList().size() > 0; 12974 } 12975 12976 /** 12977 * Slice 65 — read the renamed column names from a CTE's explicit 12978 * column list ({@code WITH x(a, b) AS ...}). 
Used by 12979 * {@link #buildUsingScope}'s ambiguity check as a defense-in-depth 12980 * complement to {@link #lookupRelationColumnNames}. Slice 65 12981 * originally needed this because the CTE body's StatementGraph 12982 * published inner-projection names; slice 103 lifted that gap by 12983 * applying the slice-102 rename helper on the SELECT side, so the 12984 * in-scope-map path now returns the renamed list too. 12985 */ 12986 private static java.util.List<String> explicitCteColumnNames(TTable table) { 12987 if (table == null) return null; 12988 TCTE cte = table.getCTE(); 12989 if (cte == null) return null; 12990 if (cte.getColumnList() == null || cte.getColumnList().size() == 0) { 12991 return null; 12992 } 12993 java.util.List<String> names = new java.util.ArrayList<>(cte.getColumnList().size()); 12994 for (int i = 0; i < cte.getColumnList().size(); i++) { 12995 TObjectName col = cte.getColumnList().getObjectName(i); 12996 if (col == null) continue; 12997 String n = col.getColumnNameOnly(); 12998 if (n == null || n.isEmpty()) continue; 12999 names.add(n); 13000 } 13001 return names.isEmpty() ? null : names; 13002 } 13003 13004 /** 13005 * Slice 64 — look up column names for a FROM-clause relation 13006 * combining the slice-58 base-table catalog and the slice-60 13007 * in-scope CTE/subquery map. Returns {@code null} when neither 13008 * source has column info for the table. 13009 * 13010 * <p>The in-scope map is consulted <b>first</b>: when a CTE or 13011 * FROM-subquery has the same name as a base table in the catalog, 13012 * the scoped definition shadows the catalog (codex diff-review 13013 * round-2 P2 #1 — without this precedence, USING against the CTE 13014 * would see the catalog table's columns and reject a valid join). 13015 * 13016 * <p>Slice 103 — CTEs with an explicit column list are no longer 13017 * rejected upstream. 
The SELECT-side CTE walker now invokes the 13018 * slice-102 rename helper, so {@code ctePublishedColumns} carries 13019 * the renamed names; {@code addRelationToInScopeMap} reads from 13020 * that map, and this lookup returns the renamed list. (Slice 64's 13021 * older comment said the rename was deferred — that deferral was 13022 * lifted by slice 103.) 13023 */ 13024 private static List<String> lookupRelationColumnNames(TTable table, 13025 NameBindingProvider provider) { 13026 String key = effectiveAliasLowerCaseOrNull(table); 13027 if (key != null) { 13028 java.util.Map<String, List<String>> inScope = provider.getInScopeRelationColumns(); 13029 if (inScope != null) { 13030 List<String> scoped = inScope.get(key); 13031 if (scoped != null) return scoped; 13032 } 13033 } 13034 return provider.getRelationColumnNames(table); 13035 } 13036 13037 /** 13038 * Slice 66 — accumulated row type of the LEFT side of a top-level 13039 * {@code TJoin}. Maintained per top-level TJoin so that mixed 13040 * ON/CROSS/USING/NATURAL chains can be reasoned about against the 13041 * full visible row type (NATURAL JOIN's right operand sees every 13042 * column visible in the accumulated left, not just the immediate 13043 * prior table). 13044 * 13045 * <p>{@link #complete} flips to {@code false} when any contributor 13046 * along the chain has no resolvable catalog. A {@code false} 13047 * {@code complete} blocks subsequent NATURAL JoinItems from 13048 * inferring their shared-key list — they reject with a tuned 13049 * catalog-required diagnostic naming whichever side(s) lack 13050 * catalog metadata. 
13051 */ 13052 private static final class LeftOutputState { 13053 final java.util.LinkedHashMap<String, List<TTable>> columns = new java.util.LinkedHashMap<>(); 13054 boolean complete = true; 13055 final List<String> missingAliases = new ArrayList<>(); 13056 13057 void markMissing(TTable t) { 13058 complete = false; 13059 String alias = effectiveAliasOf(t); 13060 if (alias != null && !alias.isEmpty()) { 13061 if (!missingAliases.contains(alias)) { 13062 missingAliases.add(alias); 13063 } 13064 } 13065 } 13066 } 13067 13068 /** 13069 * Slice 66 — result of {@link #naturalSharedKeys}. Either a SUCCESS 13070 * (with the inferred key list in left-output insertion order) or 13071 * one of three failure kinds: 13072 * 13073 * <ul> 13074 * <li>{@code INCOMPLETE_LEFT}: at least one prior contributor on 13075 * the accumulated left side had null/empty catalog;</li> 13076 * <li>{@code MISSING_RIGHT}: right table has null/empty catalog;</li> 13077 * <li>{@code BOTH_MISSING}: both above conditions hold.</li> 13078 * </ul> 13079 * 13080 * <p>Failures carry diagnostic aliases so the caller can produce 13081 * a side-specific reject message. 
13082 */ 13083 private static final class NaturalKeyResult { 13084 enum Kind { SUCCESS, INCOMPLETE_LEFT, MISSING_RIGHT, BOTH_MISSING } 13085 final Kind kind; 13086 final List<String> keys; 13087 final List<String> leftMissingAliases; 13088 final String rightAlias; 13089 13090 private NaturalKeyResult(Kind kind, List<String> keys, 13091 List<String> leftMissingAliases, 13092 String rightAlias) { 13093 this.kind = kind; 13094 this.keys = keys; 13095 this.leftMissingAliases = leftMissingAliases; 13096 this.rightAlias = rightAlias; 13097 } 13098 static NaturalKeyResult success(List<String> keys) { 13099 return new NaturalKeyResult(Kind.SUCCESS, keys, null, null); 13100 } 13101 static NaturalKeyResult incompleteLeft(List<String> missing) { 13102 return new NaturalKeyResult(Kind.INCOMPLETE_LEFT, null, missing, null); 13103 } 13104 static NaturalKeyResult missingRight(String alias) { 13105 return new NaturalKeyResult(Kind.MISSING_RIGHT, null, null, alias); 13106 } 13107 static NaturalKeyResult bothMissing(List<String> missing, String alias) { 13108 return new NaturalKeyResult(Kind.BOTH_MISSING, null, missing, alias); 13109 } 13110 } 13111 13112 /** 13113 * Slice 66 — seed the {@link LeftOutputState} with the top-left 13114 * table of a top-level TJoin. Used at the start of each TJoin walk. 
13115 */ 13116 private static void seedLeftOutput(LeftOutputState state, TTable t, 13117 NameBindingProvider provider) { 13118 if (t == null) return; 13119 List<String> cols = lookupRelationColumnNames(t, provider); 13120 if (cols == null || cols.isEmpty()) { 13121 state.markMissing(t); 13122 return; 13123 } 13124 for (String c : cols) { 13125 if (c == null || c.isEmpty()) continue; 13126 String colLC = c.toLowerCase(Locale.ROOT); 13127 List<TTable> contributors = state.columns.get(colLC); 13128 if (contributors == null) { 13129 contributors = new ArrayList<>(); 13130 state.columns.put(colLC, contributors); 13131 } 13132 contributors.add(t); 13133 } 13134 } 13135 13136 /** 13137 * Slice 66 — append the right table of an ON-shaped or CROSS 13138 * JoinItem into the running {@link LeftOutputState}. Each catalog 13139 * column is added as a new entry (or extends the contributor list 13140 * for an existing same-named entry). Mirrors 13141 * {@link #seedLeftOutput} but additive. 13142 */ 13143 private static void appendRightToLeftOutput(LeftOutputState state, TTable right, 13144 NameBindingProvider provider) { 13145 if (right == null) return; 13146 List<String> cols = lookupRelationColumnNames(right, provider); 13147 if (cols == null || cols.isEmpty()) { 13148 state.markMissing(right); 13149 return; 13150 } 13151 for (String c : cols) { 13152 if (c == null || c.isEmpty()) continue; 13153 String colLC = c.toLowerCase(Locale.ROOT); 13154 List<TTable> contributors = state.columns.get(colLC); 13155 if (contributors == null) { 13156 contributors = new ArrayList<>(); 13157 state.columns.put(colLC, contributors); 13158 } 13159 contributors.add(right); 13160 } 13161 } 13162 13163 /** 13164 * Slice 66 — merge the right table of a USING-shaped or 13165 * NATURAL-shaped JoinItem. 
Columns in {@code mergedKeys} are 13166 * appended to the existing same-named contributor list at their 13167 * original output position (no new slot); other columns are 13168 * appended as new entries (or contributed to an existing same-named 13169 * entry — slice-59 plain-vs-plain duplicate admit). 13170 */ 13171 private static void mergeRightIntoLeftOutput(LeftOutputState state, TTable right, 13172 NameBindingProvider provider, 13173 List<String> mergedKeys) { 13174 if (right == null) return; 13175 java.util.Set<String> mergedKeysLC = new HashSet<>(); 13176 if (mergedKeys != null) { 13177 for (String k : mergedKeys) { 13178 if (k != null && !k.isEmpty()) { 13179 mergedKeysLC.add(k.toLowerCase(Locale.ROOT)); 13180 } 13181 } 13182 } 13183 List<String> cols = lookupRelationColumnNames(right, provider); 13184 if (cols == null || cols.isEmpty()) { 13185 state.markMissing(right); 13186 return; 13187 } 13188 for (String c : cols) { 13189 if (c == null || c.isEmpty()) continue; 13190 String colLC = c.toLowerCase(Locale.ROOT); 13191 // mergedKeysLC.contains(colLC) — append to existing entry; 13192 // !mergedKeysLC.contains(colLC) && state.columns.containsKey(colLC) 13193 // — append to existing entry (plain-vs-plain duplicate); 13194 // !state.columns.containsKey(colLC) — new entry. 13195 List<TTable> contributors = state.columns.get(colLC); 13196 if (contributors == null) { 13197 contributors = new ArrayList<>(); 13198 state.columns.put(colLC, contributors); 13199 } 13200 contributors.add(right); 13201 } 13202 } 13203 13204 /** 13205 * Slice 66 — infer the NATURAL JOIN shared-column list for the 13206 * current JoinItem. Returns one of four results per §6.1 of the 13207 * slice-66 plan. The shared list uses catalog-declared spelling 13208 * from the FIRST contributor that publishes each key (NATURAL has 13209 * no SQL-written key token, so the catalog form is the only 13210 * source of truth). 
13211 */ 13212 private static NaturalKeyResult naturalSharedKeys(LeftOutputState leftState, 13213 TTable right, 13214 NameBindingProvider provider) { 13215 List<String> rightCols = lookupRelationColumnNames(right, provider); 13216 boolean rightMissing = (rightCols == null || rightCols.isEmpty()); 13217 if (!leftState.complete && rightMissing) { 13218 return NaturalKeyResult.bothMissing(leftState.missingAliases, 13219 effectiveAliasOf(right)); 13220 } 13221 if (!leftState.complete) { 13222 return NaturalKeyResult.incompleteLeft(leftState.missingAliases); 13223 } 13224 if (rightMissing) { 13225 return NaturalKeyResult.missingRight(effectiveAliasOf(right)); 13226 } 13227 java.util.Set<String> rightLC = new HashSet<>(); 13228 for (String c : rightCols) { 13229 if (c != null && !c.isEmpty()) { 13230 rightLC.add(c.toLowerCase(Locale.ROOT)); 13231 } 13232 } 13233 List<String> shared = new ArrayList<>(); 13234 for (java.util.Map.Entry<String, List<TTable>> e 13235 : leftState.columns.entrySet()) { 13236 String keyLC = e.getKey(); 13237 if (rightLC.contains(keyLC)) { 13238 shared.add(firstCatalogSpelling(e.getValue(), keyLC, provider)); 13239 } 13240 } 13241 return NaturalKeyResult.success(shared); 13242 } 13243 13244 /** 13245 * Slice 66 — return the catalog-declared spelling of {@code keyLC} 13246 * from the first contributor in insertion order that publishes the 13247 * key with a non-null spelling. Defensive fallback to {@code keyLC} 13248 * if no contributor exposes the spelling (unreachable in practice 13249 * because contributors are catalogued by construction). 
13250 */ 13251 private static String firstCatalogSpelling(List<TTable> contributors, 13252 String keyLC, 13253 NameBindingProvider provider) { 13254 if (contributors != null) { 13255 for (TTable t : contributors) { 13256 List<String> cols = lookupRelationColumnNames(t, provider); 13257 if (cols == null) continue; 13258 for (String c : cols) { 13259 if (c != null && c.equalsIgnoreCase(keyLC)) { 13260 return c; 13261 } 13262 } 13263 } 13264 } 13265 return keyLC; 13266 } 13267 13268 /** 13269 * Slice 66 — diagnostic helper. Joins a list of aliases for the 13270 * NATURAL-required catalog reject message. 13271 */ 13272 private static String formatAliasList(List<String> aliases) { 13273 if (aliases == null || aliases.isEmpty()) return "<none>"; 13274 StringBuilder sb = new StringBuilder(); 13275 for (int i = 0; i < aliases.size(); i++) { 13276 if (i > 0) sb.append(", "); 13277 sb.append("'").append(aliases.get(i)).append("'"); 13278 } 13279 return sb.toString(); 13280 } 13281 13282 /** 13283 * Slice 66 — turn a {@link NaturalKeyResult} failure into a 13284 * structured diagnostic for the gated reject inside 13285 * {@link #buildRelations}. 13286 */ 13287 private static String formatNaturalCatalogReject(NaturalKeyResult r) { 13288 switch (r.kind) { 13289 case INCOMPLETE_LEFT: 13290 return "NATURAL JOIN requires catalog metadata for both sides; " 13291 + "left-side row type is incomplete due to uncatalogued " 13292 + "relation(s) " + formatAliasList(r.leftMissingAliases) 13293 + "; supply a TSQLEnv (or in-scope CTE / FROM-subquery " 13294 + "body) for the missing relation(s), or rewrite as " 13295 + "JOIN ... ON"; 13296 case MISSING_RIGHT: 13297 return "NATURAL JOIN requires catalog metadata for both sides; " 13298 + "right-side relation '" + r.rightAlias 13299 + "' has no resolvable column list; supply a TSQLEnv " 13300 + "(or in-scope CTE / FROM-subquery body) for this " 13301 + "relation, or rewrite as JOIN ... 
ON"; 13302 case BOTH_MISSING: 13303 return "NATURAL JOIN requires catalog metadata for both sides; " 13304 + "left-side row type is incomplete due to uncatalogued " 13305 + "relation(s) " + formatAliasList(r.leftMissingAliases) 13306 + " and right-side relation '" + r.rightAlias 13307 + "' also has no resolvable column list; supply a " 13308 + "TSQLEnv for the missing relation(s), or rewrite " 13309 + "as JOIN ... ON"; 13310 default: 13311 return "NATURAL JOIN: unexpected result kind " + r.kind; 13312 } 13313 } 13314 13315 /** 13316 * Slice 65 — fail fast when a JOIN ON clause references a USING 13317 * merged key by its bare (unqualified) name. JOIN ON requires 13318 * per-position scope (only relations BEFORE that JoinItem are 13319 * visible), which slice 65 does not yet model; the merged-key 13320 * collector applied by other clauses would over-include later 13321 * relations. Reject the shape so the slice-66+ slice can lift 13322 * with proper per-position scope. 13323 * 13324 * <p>This is the narrowed replacement for slice-64's 13325 * {@code rejectUnqualifiedUsingKeyReferences}, which scanned the 13326 * entire SELECT body. Slice 65 admits unqualified USING-key refs 13327 * in every other clause via the merged-key collector. 13328 * 13329 * <p>Qualified references (e.g. {@code a.k}, {@code b.k}) and 13330 * column references whose names don't match a USING key are 13331 * unaffected. 13332 */ 13333 private static void rejectUnqualifiedMergedKeyInJoinOn(TSelectSqlStatement select, 13334 NameBindingProvider provider) { 13335 if (select.joins == null) return; 13336 // Walk each TOP-LEVEL TJoin independently — each comma-FROM 13337 // group has its own scope for JOIN ON purposes (codex slice-65 13338 // diff-review round-4 P2 #1). Within one TJoin, walk JoinItems 13339 // in FROM order and track which merged keys (USING-declared OR 13340 // NATURAL-inferred) have been established. 
An ON clause is only 13341 // checked against keys that are ALREADY merged at that position; 13342 // a bare `k` in an ON before any USING(k) / NATURAL is just 13343 // resolver2's unqualified-binding case. 13344 // 13345 // Slice 66: NATURAL JoinItems contribute their catalog-inferred 13346 // key list to declaredKeysSoFar. When NATURAL would fail the 13347 // catalog requirement (INCOMPLETE_LEFT / MISSING_RIGHT / 13348 // BOTH_MISSING), the preflight silently skips recording this 13349 // JoinItem's keys — the gated reject in buildRelations will 13350 // fire with a catalog-required diagnostic and the user sees 13351 // that error first. 13352 // 13353 // Identity skip set: USING-clause own TObjectNames are 13354 // declarations not references; never matched against the 13355 // declaredKeysSoFar set since the preflight only walks ON 13356 // conditions. Kept as a defensive no-op. 13357 final java.util.Set<TObjectName> skip = 13358 java.util.Collections.newSetFromMap( 13359 new java.util.IdentityHashMap<TObjectName, Boolean>()); 13360 for (int j = 0; j < select.joins.size(); j++) { 13361 TJoin top = select.joins.getJoin(j); 13362 if (top == null) continue; 13363 TJoinItemList items = top.getJoinItems(); 13364 if (items == null) continue; 13365 // Reset per top-level TJoin so independent comma-FROM 13366 // groups don't poison each other's ON clauses. 13367 final java.util.Set<String> declaredKeysSoFar = new java.util.HashSet<>(); 13368 LeftOutputState leftState = new LeftOutputState(); 13369 seedLeftOutput(leftState, top.getTable(), provider); 13370 for (int i = 0; i < items.size(); i++) { 13371 TJoinItem item = items.getJoinItem(i); 13372 if (item == null) continue; 13373 // Check ON FIRST (uses scope BEFORE this JoinItem), then 13374 // record this JoinItem's USING/NATURAL declarations so 13375 // future siblings see them. 
13376 TExpression onCond = item.getOnCondition(); 13377 if (onCond != null && !declaredKeysSoFar.isEmpty()) { 13378 final java.util.Set<String> alreadyDeclared = 13379 new java.util.HashSet<>(declaredKeysSoFar); 13380 onCond.acceptChildren(new TParseTreeVisitor() { 13381 int nestedSelectDepth = 0; 13382 13383 @Override 13384 public void preVisit(TSelectSqlStatement nested) { 13385 nestedSelectDepth++; 13386 } 13387 13388 @Override 13389 public void postVisit(TSelectSqlStatement nested) { 13390 nestedSelectDepth--; 13391 } 13392 13393 @Override 13394 public void preVisit(TObjectName node) { 13395 if (nestedSelectDepth > 0) return; 13396 if (skip.contains(node)) return; 13397 if (node.getDbObjectType() != EDbObjectType.column) return; 13398 String name = node.getColumnNameOnly(); 13399 if (name == null || name.isEmpty() || "*".equals(name)) return; 13400 if (!alreadyDeclared.contains(name.toLowerCase(Locale.ROOT))) return; 13401 String qualifier = node.getTableString(); 13402 if (qualifier == null || qualifier.isEmpty()) { 13403 throw new SemanticIRBuildException( 13404 Diagnostic.error(DiagnosticCode.UNQUALIFIED_MERGED_KEY_IN_JOIN_ON, 13405 "unqualified reference to merged key '" 13406 + name + "' inside a JOIN ON condition " 13407 + "is deferred to a future slice " 13408 + "(per-position scope semantics needed); " 13409 + "qualify with a table alias " 13410 + "(e.g. a." + name + ") to disambiguate", null)); 13411 } 13412 } 13413 }); 13414 } 13415 // Record this JoinItem's contribution to declaredKeysSoFar 13416 // and update leftState for NATURAL's accumulated-left 13417 // semantics. 
13418 TTable rightTable = item.getTable(); 13419 TObjectNameList usingCols = item.getUsingColumns(); 13420 if (usingCols != null && usingCols.size() > 0) { 13421 List<String> usingKeyNames = new ArrayList<>(usingCols.size()); 13422 for (int k = 0; k < usingCols.size(); k++) { 13423 TObjectName n = usingCols.getObjectName(k); 13424 if (n == null) continue; 13425 skip.add(n); 13426 String name = n.getColumnNameOnly(); 13427 if (name != null && !name.isEmpty()) { 13428 declaredKeysSoFar.add(name.toLowerCase(Locale.ROOT)); 13429 usingKeyNames.add(name); 13430 } 13431 } 13432 if (rightTable != null) { 13433 mergeRightIntoLeftOutput(leftState, rightTable, provider, usingKeyNames); 13434 } 13435 } else if (isNaturalJoinType(item.getJoinType()) && rightTable != null) { 13436 NaturalKeyResult r = naturalSharedKeys(leftState, rightTable, provider); 13437 if (r.kind == NaturalKeyResult.Kind.SUCCESS) { 13438 for (String s : r.keys) { 13439 if (s != null && !s.isEmpty()) { 13440 declaredKeysSoFar.add(s.toLowerCase(Locale.ROOT)); 13441 } 13442 } 13443 mergeRightIntoLeftOutput(leftState, rightTable, provider, r.keys); 13444 } else { 13445 // Catalog-required reject fires upstream in 13446 // buildRelations. Defensively append for state 13447 // consistency. 13448 appendRightToLeftOutput(leftState, rightTable, provider); 13449 } 13450 } else if (rightTable != null) { 13451 appendRightToLeftOutput(leftState, rightTable, provider); 13452 } 13453 } 13454 } 13455 } 13456 13457 /** 13458 * Slice 65 — compute the {@link UsingScope} for the current SELECT 13459 * body from its FROM-clause USING joins. Walks every {@link TJoin} 13460 * in {@code select.joins} and for each USING(k) JoinItem, builds 13461 * the per-key equivalence class via DSU-like union over prior 13462 * relations + the right-side relation. 
Then materializes each 13463 * class by a separate FROM-order pass with identity dedup so 13464 * chained USING joins (`a JOIN b USING(k) JOIN c USING(k)`) 13465 * produce {@code [a, b, c]}, never duplicates. 13466 * 13467 * <p>For each class, builds a {@link UsingScope.MergedKeyEntry} 13468 * with FROM-ordered merged source refs (one per relation that 13469 * publishes the key per catalog / in-scope map; unknown-metadata 13470 * priors emit refs unconditionally, matching slice-64's over- 13471 * approximation policy in {@link #populateUsingJoinRefs}). 13472 * 13473 * <p>Ambiguity is precomputed: 13474 * <ul> 13475 * <li>{@code entries.size() > 1}: two disconnected USING classes 13476 * share the same key name.</li> 13477 * <li>{@code entries.size() == 1} AND a FROM relation outside 13478 * the class has catalog metadata that declares the key: 13479 * out-of-class same-named column.</li> 13480 * </ul> 13481 * 13482 * <p>Returns {@link UsingScope#EMPTY} when no USING clauses are 13483 * present in {@code select.joins}. 13484 */ 13485 private static UsingScope buildUsingScope(TSelectSqlStatement select, 13486 NameBindingProvider provider) { 13487 if (select.joins == null) return UsingScope.EMPTY; 13488 // Slice 86 — delegate to the shared TJoinList-taking helper so 13489 // joined UPDATE (slice 86 buildUpdateUsingScope) can reuse the 13490 // identical scope-build pipeline. 13491 return buildUsingScopeFromJoinList(select.joins, provider); 13492 } 13493 13494 /** 13495 * Slice 86 — compute the {@link UsingScope} for a joined UPDATE's 13496 * FROM clause via {@code update.getJoins()}. Mirrors slice-65 13497 * {@link #buildUsingScope}: USING / NATURAL JoinItems contribute 13498 * merged-key equivalence classes; unqualified merged-key references 13499 * in SET RHS / WHERE / RETURNING resolve to the merged source list. 13500 * 13501 * <p>Returns {@link UsingScope#EMPTY} when no USING/NATURAL JoinItems 13502 * appear in the FROM clause. 
13503 */ 13504 private static UsingScope buildUpdateUsingScope(TUpdateSqlStatement update, 13505 NameBindingProvider provider) { 13506 if (update == null) return UsingScope.EMPTY; 13507 return buildUsingScopeFromJoinList(update.getJoins(), provider); 13508 } 13509 13510 /** 13511 * Slice 86 — shared {@link UsingScope} computation extracted from 13512 * slice-65 {@link #buildUsingScope}. Takes the {@link TJoinList} 13513 * directly so it can be invoked from both SELECT 13514 * ({@link #buildUsingScope}) and joined UPDATE 13515 * ({@link #buildUpdateUsingScope}). 13516 * 13517 * <p>Behavior identical to slice 65/66: per-key DSU union over prior 13518 * relations + right-side relation per top-level {@link TJoin}; 13519 * disconnected comma-FROM groups keep their own per-key components; 13520 * NATURAL JoinItems infer shared keys against accumulated left row 13521 * type via {@link LeftOutputState}; ambiguity detection walks all 13522 * FROM relations for out-of-class same-named columns. 13523 */ 13524 private static UsingScope buildUsingScopeFromJoinList(TJoinList joins, 13525 NameBindingProvider provider) { 13526 if (joins == null) return UsingScope.EMPTY; 13527 // Pass 1: per-key DSU. For each USING(k) or NATURAL JoinItem, 13528 // union the prior relations PUBLISHING the key (catalog-narrowed 13529 // per codex slice-66 round-1 P1 #1) with the right-side relation, 13530 // scoped to the enclosing top-level TJoin (chained merges within 13531 // one TJoin transitively connect through DSU). Disconnected 13532 // top-level TJoins (comma-FROM) keep their own per-key components. 13533 // Slice 66 maintains a LeftOutputState alongside the loop so 13534 // NATURAL JoinItems can infer their shared-key list against the 13535 // accumulated left row type. 
13536 java.util.Map<String, java.util.List<java.util.List<TTable>>> perKeyComponents = 13537 new java.util.LinkedHashMap<>(); 13538 // Track the SQL-written spelling of each merged key (the first 13539 // occurrence in FROM order). For USING keys this is the 13540 // SQL-written USING-clause case (slice-64 contract); for 13541 // NATURAL keys this is the catalog-declared spelling from the 13542 // first contributor. 13543 java.util.Map<String, String> originalSpellingByKey = new java.util.HashMap<>(); 13544 for (int jx = 0; jx < joins.size(); jx++) { 13545 TJoin top = joins.getJoin(jx); 13546 if (top == null) continue; 13547 TJoinItemList items = top.getJoinItems(); 13548 if (items == null) continue; 13549 TTable topTable = top.getTable(); 13550 // Slice 66: per-TJoin LeftOutputState for NATURAL inference. 13551 LeftOutputState leftState = new LeftOutputState(); 13552 seedLeftOutput(leftState, topTable, provider); 13553 // Per-key in-progress chain for THIS top-level TJoin. 13554 java.util.Map<String, java.util.List<TTable>> inProgressByKey = 13555 new java.util.HashMap<>(); 13556 for (int i = 0; i < items.size(); i++) { 13557 TJoinItem item = items.getJoinItem(i); 13558 if (item == null) continue; 13559 TTable rightTable = item.getTable(); 13560 if (rightTable == null) continue; 13561 13562 // Determine merged keys for this JoinItem and the 13563 // emitted spelling per key. Three cases: 13564 // USING: keys = syntactic usingCols; spelling = USING-clause text. 13565 // NATURAL: keys = catalog intersection (when SUCCESS); 13566 // spelling = catalog spelling. 13567 // ON/CROSS/other: skip — append to leftState only. 
13568 List<String> keyNames; 13569 java.util.Map<String, String> spellingByKeyLC = new java.util.HashMap<>(); 13570 TObjectNameList usingCols = item.getUsingColumns(); 13571 if (usingCols != null && usingCols.size() > 0) { 13572 keyNames = new java.util.ArrayList<>(usingCols.size()); 13573 for (int k = 0; k < usingCols.size(); k++) { 13574 TObjectName keyNode = usingCols.getObjectName(k); 13575 if (keyNode == null) continue; 13576 String keyName = keyNode.getColumnNameOnly(); 13577 if (keyName == null || keyName.isEmpty()) continue; 13578 keyNames.add(keyName); 13579 spellingByKeyLC.put(keyName.toLowerCase(Locale.ROOT), keyName); 13580 } 13581 } else if (isNaturalJoinType(item.getJoinType())) { 13582 NaturalKeyResult r = naturalSharedKeys(leftState, rightTable, provider); 13583 if (r.kind != NaturalKeyResult.Kind.SUCCESS) { 13584 // Catalog-required reject already fired (or will 13585 // fire) inside buildRelations. Defensively skip 13586 // this JoinItem in the scope build; it does NOT 13587 // contribute to the merged-key scope. 13588 appendRightToLeftOutput(leftState, rightTable, provider); 13589 continue; 13590 } 13591 keyNames = r.keys; 13592 for (String s : keyNames) { 13593 if (s != null && !s.isEmpty()) { 13594 spellingByKeyLC.put(s.toLowerCase(Locale.ROOT), s); 13595 } 13596 } 13597 } else { 13598 // ON / CROSS / other — no merged-key contribution. 13599 appendRightToLeftOutput(leftState, rightTable, provider); 13600 continue; 13601 } 13602 13603 // Prior relations for this JoinItem in FROM order: 13604 // topTable + items[0..i-1].getTable(). 
13605 java.util.List<TTable> priorRelations = new java.util.ArrayList<>(); 13606 if (topTable != null) priorRelations.add(topTable); 13607 for (int p = 0; p < i; p++) { 13608 TJoinItem prev = items.getJoinItem(p); 13609 if (prev != null && prev.getTable() != null) { 13610 priorRelations.add(prev.getTable()); 13611 } 13612 } 13613 for (String keyName : keyNames) { 13614 if (keyName == null || keyName.isEmpty()) continue; 13615 String keyLC = keyName.toLowerCase(Locale.ROOT); 13616 // Record the first emitted spelling we see for this 13617 // key. USING uses SQL-written spelling; NATURAL uses 13618 // catalog-declared spelling. 13619 if (!originalSpellingByKey.containsKey(keyLC)) { 13620 originalSpellingByKey.put(keyLC, spellingByKeyLC.get(keyLC)); 13621 } 13622 java.util.List<TTable> chain = inProgressByKey.get(keyLC); 13623 if (chain == null) { 13624 chain = new java.util.ArrayList<>(); 13625 inProgressByKey.put(keyLC, chain); 13626 } 13627 // Slice 66 catalog-narrowed union (codex round-1 P1 #1): 13628 // for each prior relation, include in this key's 13629 // equivalence class only if (a) catalog is unknown 13630 // (over-approximate; slice-64 fallback), or (b) 13631 // catalog declares the key. Skip if catalog is 13632 // known and the key is proven absent. 
13633 for (TTable prior : priorRelations) { 13634 if (containsByIdentity(chain, prior)) continue; 13635 List<String> priorCols = lookupRelationColumnNames(prior, provider); 13636 if (priorCols == null) { 13637 chain.add(prior); 13638 continue; 13639 } 13640 boolean priorPublishes = false; 13641 for (String pc : priorCols) { 13642 if (pc != null && pc.equalsIgnoreCase(keyLC)) { 13643 priorPublishes = true; 13644 break; 13645 } 13646 } 13647 if (priorPublishes) { 13648 chain.add(prior); 13649 } 13650 } 13651 if (!containsByIdentity(chain, rightTable)) { 13652 chain.add(rightTable); 13653 } 13654 } 13655 // After the merged-key bookkeeping, merge right into 13656 // the leftState so subsequent NATURAL JoinItems see 13657 // the accumulated row type. 13658 mergeRightIntoLeftOutput(leftState, rightTable, provider, keyNames); 13659 } 13660 // Flush this TJoin's in-progress chains as one component 13661 // per key. 13662 for (java.util.Map.Entry<String, java.util.List<TTable>> e : 13663 inProgressByKey.entrySet()) { 13664 java.util.List<java.util.List<TTable>> bucket = perKeyComponents.get(e.getKey()); 13665 if (bucket == null) { 13666 bucket = new java.util.ArrayList<>(); 13667 perKeyComponents.put(e.getKey(), bucket); 13668 } 13669 bucket.add(e.getValue()); 13670 } 13671 } 13672 if (perKeyComponents.isEmpty()) return UsingScope.EMPTY; 13673 // Pass 2: materialize EquivalenceClass + MergedKeyEntry per 13674 // component. FROM-order is already preserved by Pass 1's 13675 // accumulation order (priorRelations + rightTable). 13676 java.util.Map<String, java.util.List<UsingScope.MergedKeyEntry>> entriesByName = 13677 new java.util.LinkedHashMap<>(); 13678 for (java.util.Map.Entry<String, java.util.List<java.util.List<TTable>>> e : 13679 perKeyComponents.entrySet()) { 13680 String keyLC = e.getKey(); 13681 // ColumnRef-emit spelling: SQL-written USING-clause spelling 13682 // (matches slice-64 populateUsingJoinRefs). 
Falls back to 13683 // keyLC if no spelling was recorded (defensive). 13684 String emitKeyName = originalSpellingByKey.containsKey(keyLC) 13685 ? originalSpellingByKey.get(keyLC) 13686 : keyLC; 13687 java.util.List<UsingScope.MergedKeyEntry> entries = new java.util.ArrayList<>(); 13688 for (java.util.List<TTable> componentMembers : e.getValue()) { 13689 if (componentMembers.isEmpty()) continue; 13690 UsingScope.EquivalenceClass cls = new UsingScope.EquivalenceClass( 13691 keyLC, componentMembers); 13692 java.util.List<ColumnRef> sources = new java.util.ArrayList<>(); 13693 java.util.Set<String> seenAliases = new java.util.HashSet<>(); 13694 for (TTable t : componentMembers) { 13695 String effAlias = effectiveAliasOf(t); 13696 if (effAlias == null || effAlias.isEmpty()) continue; 13697 String aliasKey = effAlias.toLowerCase(Locale.ROOT); 13698 if (seenAliases.contains(aliasKey)) continue; 13699 java.util.List<String> cols = lookupRelationColumnNames(t, provider); 13700 if (cols == null) { 13701 // Metadata-unknown: emit ref (over-approximate). 13702 sources.add(new ColumnRef(effAlias, emitKeyName)); 13703 seenAliases.add(aliasKey); 13704 continue; 13705 } 13706 for (String c : cols) { 13707 if (c != null && c.equalsIgnoreCase(keyLC)) { 13708 sources.add(new ColumnRef(effAlias, emitKeyName)); 13709 seenAliases.add(aliasKey); 13710 break; 13711 } 13712 } 13713 } 13714 if (!sources.isEmpty()) { 13715 entries.add(new UsingScope.MergedKeyEntry(cls, sources)); 13716 } 13717 } 13718 if (!entries.isEmpty()) { 13719 entriesByName.put(keyLC, entries); 13720 } 13721 } 13722 if (entriesByName.isEmpty()) return UsingScope.EMPTY; 13723 // Pass 3: precompute ambiguity per key. 
13724 java.util.Map<String, String> ambiguityByName = new java.util.HashMap<>(); 13725 java.util.List<TTable> allFromRelations = walkAllFromRelationsFromJoinList(joins); 13726 for (java.util.Map.Entry<String, java.util.List<UsingScope.MergedKeyEntry>> e : 13727 entriesByName.entrySet()) { 13728 String keyLC = e.getKey(); 13729 java.util.List<UsingScope.MergedKeyEntry> entries = e.getValue(); 13730 if (entries.size() > 1) { 13731 ambiguityByName.put(keyLC, 13732 "multiple disconnected USING(" + keyLC + ") equivalence " 13733 + "classes appear in this FROM (their merged " 13734 + "columns share the same key name)"); 13735 continue; 13736 } 13737 // Single class. Walk all FROM relations; if any out-of-class 13738 // relation is catalog-known to publish the key, mark ambiguous. 13739 UsingScope.EquivalenceClass cls = entries.get(0).getEquivClass(); 13740 java.util.IdentityHashMap<TTable, Boolean> inClass = new java.util.IdentityHashMap<>(); 13741 for (TTable m : cls.getMembers()) inClass.put(m, Boolean.TRUE); 13742 for (TTable r : allFromRelations) { 13743 if (inClass.containsKey(r)) continue; 13744 // Slice 65 diff-review round-3 P2 #2 (slice 103 update): 13745 // post-slice-103 both branches return the same data — 13746 // `lookupRelationColumnNames` consults the in-scope map 13747 // populated from `ctePublishedColumns`, which now holds 13748 // the renamed names. The discriminator is retained as a 13749 // defense-in-depth path for any call site that bypasses 13750 // the SELECT-side CTE walker but still wants to detect 13751 // renamed-key collisions; the slice-103 path falls into 13752 // the `lookupRelationColumnNames` branch and gets the 13753 // same renamed list. 
13754 java.util.List<String> cols; 13755 if (hasExplicitCteColumnList(r)) { 13756 cols = explicitCteColumnNames(r); 13757 } else { 13758 cols = lookupRelationColumnNames(r, provider); 13759 } 13760 if (cols == null) continue; // unknown → trust writer 13761 for (String c : cols) { 13762 if (c != null && c.equalsIgnoreCase(keyLC)) { 13763 String outAlias = effectiveAliasOf(r); 13764 ambiguityByName.put(keyLC, 13765 "the USING(" + keyLC + ") merged column collides " 13766 + "with column '" + keyLC + "' on relation '" 13767 + (outAlias != null ? outAlias : "<unnamed>") 13768 + "' which is not part of the USING equivalence class"); 13769 break; 13770 } 13771 } 13772 if (ambiguityByName.containsKey(keyLC)) break; 13773 } 13774 } 13775 return new UsingScope(entriesByName, ambiguityByName); 13776 } 13777 13778 private static boolean containsByIdentity(java.util.List<TTable> list, TTable t) { 13779 for (TTable x : list) { 13780 if (x == t) return true; 13781 } 13782 return false; 13783 } 13784 13785 /** 13786 * Slice 65 — every FROM-clause relation reachable directly from 13787 * {@code select.joins} (every {@code top.getTable()} + every 13788 * {@code joinItem.getTable()}). Used by {@link #buildUsingScope} 13789 * to detect out-of-equivalence-class same-named columns. 13790 */ 13791 private static java.util.List<TTable> walkAllFromRelations(TSelectSqlStatement select) { 13792 if (select == null) return new java.util.ArrayList<>(); 13793 return walkAllFromRelationsFromJoinList(select.joins); 13794 } 13795 13796 /** 13797 * Slice 86 — shared {@link TJoinList}-taking walker for ambiguity 13798 * detection inside {@link #buildUsingScopeFromJoinList}. Used by both 13799 * SELECT ({@link #walkAllFromRelations}) and joined UPDATE 13800 * ({@link #buildUpdateUsingScope}). 
13801 */ 13802 private static java.util.List<TTable> walkAllFromRelationsFromJoinList(TJoinList joins) { 13803 java.util.List<TTable> out = new java.util.ArrayList<>(); 13804 if (joins == null) return out; 13805 for (int j = 0; j < joins.size(); j++) { 13806 TJoin top = joins.getJoin(j); 13807 if (top == null) continue; 13808 if (top.getTable() != null) out.add(top.getTable()); 13809 TJoinItemList items = top.getJoinItems(); 13810 if (items == null) continue; 13811 for (int i = 0; i < items.size(); i++) { 13812 TJoinItem item = items.getJoinItem(i); 13813 if (item != null && item.getTable() != null) { 13814 out.add(item.getTable()); 13815 } 13816 } 13817 } 13818 return out; 13819 } 13820 13821 /** 13822 * Slice 64 — true iff any TJoinItem in {@code select.joins} 13823 * carries a non-empty USING list. Used by {@link #tryExpandStar} 13824 * to defer bare {@code *} over USING JOIN to S65 (merged-key 13825 * output naming). 13826 */ 13827 private static boolean hasUsingInFromClause(TSelectSqlStatement select) { 13828 if (select.joins == null) return false; 13829 for (int j = 0; j < select.joins.size(); j++) { 13830 TJoin top = select.joins.getJoin(j); 13831 if (top == null) continue; 13832 TJoinItemList items = top.getJoinItems(); 13833 if (items == null) continue; 13834 for (int i = 0; i < items.size(); i++) { 13835 TJoinItem item = items.getJoinItem(i); 13836 if (item != null 13837 && item.getUsingColumns() != null 13838 && item.getUsingColumns().size() > 0) { 13839 return true; 13840 } 13841 } 13842 } 13843 return false; 13844 } 13845 13846 /** 13847 * Slice 66 — true iff any JoinItem is NATURAL AND its inferred 13848 * shared-column list is non-empty (catalog-resolved on both sides 13849 * and the intersection contains at least one column). 13850 * 13851 * <p>Routes bare {@code *} expansion through 13852 * {@link #expandBareStarOverUsing} when NATURAL contributes merged 13853 * keys. 
NATURAL with empty intersection or with INCOMPLETE_LEFT / 13854 * MISSING_RIGHT / BOTH_MISSING returns false for THIS JoinItem 13855 * (codex slice-66 round-4 P2 #3) — the bare-* path then falls 13856 * through to per-relation expansion, which is correct for an 13857 * empty-intersection NATURAL (Cartesian, no dedup needed). The 13858 * catalog-required reject for NATURAL fires upstream inside 13859 * {@link #buildRelations} before bare-* runs. 13860 * 13861 * <p>The walk maintains its own per-top-level-TJoin 13862 * {@link LeftOutputState} (so NATURAL inference against accumulated 13863 * left works the same way as in {@link #buildUsingScope} and 13864 * {@link #buildRelations}). 13865 */ 13866 private static boolean hasNaturalJoinMergedKeysInFromClause( 13867 TSelectSqlStatement select, NameBindingProvider provider) { 13868 if (select.joins == null) return false; 13869 for (int j = 0; j < select.joins.size(); j++) { 13870 TJoin top = select.joins.getJoin(j); 13871 if (top == null) continue; 13872 TJoinItemList items = top.getJoinItems(); 13873 if (items == null) continue; 13874 LeftOutputState leftState = new LeftOutputState(); 13875 seedLeftOutput(leftState, top.getTable(), provider); 13876 for (int i = 0; i < items.size(); i++) { 13877 TJoinItem item = items.getJoinItem(i); 13878 if (item == null) continue; 13879 TTable rightTable = item.getTable(); 13880 if (rightTable == null) continue; 13881 if (isNaturalJoinType(item.getJoinType())) { 13882 NaturalKeyResult r = naturalSharedKeys(leftState, rightTable, provider); 13883 if (r.kind == NaturalKeyResult.Kind.SUCCESS 13884 && r.keys != null && !r.keys.isEmpty()) { 13885 return true; 13886 } 13887 appendRightToLeftOutput(leftState, rightTable, provider); 13888 continue; 13889 } 13890 TObjectNameList usingCols = item.getUsingColumns(); 13891 if (usingCols != null && usingCols.size() > 0) { 13892 List<String> usingKeyNames = new ArrayList<>(usingCols.size()); 13893 for (int k = 0; k < usingCols.size(); k++) { 13894 
TObjectName usingKey = usingCols.getObjectName(k); 13895 if (usingKey == null) continue; 13896 String keyName = usingKey.getColumnNameOnly(); 13897 if (keyName != null && !keyName.isEmpty()) { 13898 usingKeyNames.add(keyName); 13899 } 13900 } 13901 mergeRightIntoLeftOutput(leftState, rightTable, provider, usingKeyNames); 13902 } else { 13903 appendRightToLeftOutput(leftState, rightTable, provider); 13904 } 13905 } 13906 } 13907 return false; 13908 } 13909 13910 /** 13911 * Two relations sharing the same effective alias would make 13912 * {@link ColumnRef#getRelationAlias()} ambiguous in the IR. Resolver2 13913 * may already flag column references in this case, but the IR-level 13914 * invariant still needs to hold. 13915 */ 13916 private static void rejectDuplicateAliases(List<RelationSource> relations) { 13917 Set<String> seen = new HashSet<>(); 13918 for (RelationSource r : relations) { 13919 if (!seen.add(r.getAlias())) { 13920 throw new SemanticIRBuildException( 13921 Diagnostic.error(DiagnosticCode.DUPLICATE_RELATION_ALIAS, 13922 "duplicate relation alias '" + r.getAlias() 13923 + "' is not supported (would make ColumnRef ambiguous)", null)); 13924 } 13925 } 13926 } 13927 13928 private static RelationSource buildRelation(TTable table, NameBindingProvider provider, 13929 boolean allowFromSubqueries) { 13930 // Reject FROM-subqueries when the caller did not extract them as 13931 // separate statements. After slice 18 the still-uncovered scopes 13932 // are scalar bodies (slice-11 boundary), set-op branches (slice-16 13933 // boundary), and set-op CTE bodies (build()'s set-op CTE dispatch 13934 // passes allowFromSubqueries=false to each branch). 
13935 if (table.getTableType() == gudusoft.gsqlparser.ETableSource.subquery 13936 && !allowFromSubqueries) { 13937 // Slice 74: use effectiveAliasOf so anonymous subqueries 13938 // surface their synth name in the diagnostic instead of the 13939 // empty-string the prior `getAliasName() == null` ternary 13940 // produced. 13941 String bodyAlias = effectiveAliasOf(table); 13942 throw new SemanticIRBuildException( 13943 Diagnostic.error(DiagnosticCode.FROM_SUBQUERY_IN_BODY_CONTEXT_NOT_SUPPORTED, 13944 "FROM-clause subquery '" + (bodyAlias == null || bodyAlias.isEmpty() ? "<anonymous>" : bodyAlias) 13945 + "' inside a scalar body, set-op branch, or set-op CTE body is not supported yet", table)); 13946 } 13947 RelationBinding binding = provider.bindRelation(table); 13948 if (binding == null) { 13949 throw new SemanticIRBuildException( 13950 Diagnostic.error(DiagnosticCode.TABLE_BINDING_UNRESOLVED, 13951 "could not bind table " + safeName(table) + " (only base tables and in-scope CTEs are supported)", table)); 13952 } 13953 // Effective alias: prefer the SQL-written alias, then the slice-74 13954 // synthetic alias for anonymous FROM-subqueries, then the table 13955 // name (mirrors effectiveAliasOf so RelationSource.alias and 13956 // ColumnRef.relationAlias stay aligned). 13957 String alias = effectiveAliasOf(table); 13958 return new RelationSource(alias, binding); 13959 } 13960 13961 private static String safeName(TTable t) { 13962 try { 13963 return t.getName(); 13964 } catch (RuntimeException e) { 13965 return "<unnamed>"; 13966 } 13967 } 13968 13969 // ----------------------------------------------------------------- 13970 // Slice 58 / 59 — catalog-backed SELECT * expansion. 13971 // 13972 // The hook in buildOutputColumns calls tryExpandStar(rc, select, 13973 // provider, isPredicateBody, stmtName) for any result column whose 13974 // columnNameOnly is "*" (and as defense in depth for any 13975 // EExpressionType.list_t expression). 
tryExpandStar returns a 13976 // StarExpansionResult that is either EXPANDED (with a list of 13977 // OutputColumns) or one of several reasoned rejection kinds. The 13978 // hook then either appends the expanded columns or throws a 13979 // structured SemanticIRBuildException whose message is unique per 13980 // kind so external callers can pattern-match without parsing a 13981 // generic "not supported yet" string. 13982 // 13983 // Scope (slice 58): 13984 // - single base-table FROM (1 join, no join items) 13985 // - bare `*` or qualified `t.*` 13986 // - catalog provided via NameBindingProvider#getRelationColumnNames 13987 // 13988 // Slice 59 lift: 13989 // - multi-relation FROM is now supported when a single top-level 13990 // TJoin carries one or more explicit JOIN clauses (joinItems). 13991 // Each FROM relation must individually satisfy slice-58 rules 13992 // (binding kind TABLE, catalog declares columns). Bare `*` 13993 // concatenates per-relation expansions in FROM order; qualified 13994 // `t.*` selects the one relation whose effective alias matches. 13995 // - qualifier matching is now effective-alias only 13996 // (alias if present, else table name) — case-insensitive. This 13997 // unifies the rule across single- and multi-relation paths; 13998 // `SELECT employees.* FROM employees e` rejects because the 13999 // effective alias is `e`, not `employees`. 14000 // - star expansion is rejected inside synthetic body contexts 14001 // (scalar-subquery / set-op-branch / predicate-subquery) via 14002 // SYNTHETIC_BODY_CONTEXT; the slice-58 path silently allowed 14003 // this for catalog-equipped builds even though a multi-column 14004 // expansion would corrupt scalar-body shape downstream. 14005 // 14006 // Slice 60 lift: 14007 // - CTE star and FROM-subquery star (`a.*` and bare `*` over a CTE 14008 // or FROM-clause subquery alias) are now supported via the 14009 // in-scope-relation-columns map carried on the provider. 
The 14010 // map is populated at each consuming-SELECT call site in build() 14011 // and extractFromSubqueriesAsStatements before the consumer's 14012 // buildOutputColumns runs; tryExpandStar reads it for CTE / 14013 // SUBQUERY bindings. Explicit CTE column lists 14014 // (`WITH a(x, y) AS ...`) stay rejected because the CTE body's 14015 // StatementGraph publishes inner-projection names, not the 14016 // explicit list, and emitLineageForStatement would point at 14017 // non-existent body outputs. Lifting that path needs either 14018 // body-output renaming or a published-name → body-name lineage 14019 // map; deferred to a future slice. 14020 // 14021 // Slice 62 lift: 14022 // - Comma-FROM (multiple top-level TJoin elements parsed from 14023 // `FROM a, b, c`) is now admitted at the outer / CTE-body / 14024 // FROM-subquery-body call sites. {@link #tryExpandStar} walks 14025 // every top-level TJoin and accumulates relations in FROM 14026 // order; bare `*` concatenates per-relation expansions and 14027 // qualified `t.*` selects the matching effective-alias. 14028 // Synthetic body contexts (scalar / set-op-branch / set-op-CTE 14029 // / predicate) still reject comma-FROM via the gated reject 14030 // in buildRelations and the slice-62 reject inside 14031 // preflightExistsInnerShape. 14032 // 14033 // Out of scope (slice 60+): 14034 // - SELECT * EXCEPT/REPLACE (BigQuery extensions; no slice scheduled) 14035 // - Explicit CTE column list star expansion (slice 61+) 14036 // ----------------------------------------------------------------- 14037 14038 enum StarExpansionKind { 14039 EXPANDED, 14040 PREDICATE_BODY_GUARD, 14041 // Defensive catch-all for malformed FROM lists: missing top-level 14042 // TJoin, null table on a top-level TJoin or a join item, or 14043 // empty {@code select.joins}. 
Slice 62 made comma-FROM admit 14044 // here (the walk iterates every top-level TJoin), so reaching 14045 // this kind indicates a parse-tree anomaly rather than a comma- 14046 // FROM rejection. 14047 MULTI_RELATION_FROM, 14048 NON_BASE_TABLE_RELATION, 14049 QUALIFIER_NOT_FOUND, 14050 // Slice 59: a qualifier matches 2+ relations (case-insensitive 14051 // effective-alias collision). Real SQL never reaches this case 14052 // unless `rejectDuplicateAliases` permitted a case-only collision 14053 // (it is case-sensitive at SemanticIRBuilder.java:5621). 14054 QUALIFIER_AMBIGUOUS, 14055 NO_CATALOG_OR_UNKNOWN_TABLE, 14056 // Slice 59: star expansion in synthetic body contexts 14057 // (scalar-subquery, set-op-branch, predicate-subquery) is rejected 14058 // because multi-column expansion would violate the body's shape 14059 // contract (e.g. scalar bodies must project exactly one column). 14060 SYNTHETIC_BODY_CONTEXT, 14061 // Slice 60: CTE has an explicit column list 14062 // (`WITH a(x, y) AS ...`). Deferred to a future slice because 14063 // the CTE body's StatementGraph publishes inner-projection 14064 // names, not the explicit list, and lineage emission cannot 14065 // bridge that without either body-output renaming or a 14066 // published-name → body-name map. 14067 EXPLICIT_CTE_COLUMN_LIST_DEFERRED, 14068 // Slice 60: CTE / SUBQUERY binding's published-column map 14069 // lookup returned null or empty. This indicates a builder 14070 // invariant failure (the body should have been built and 14071 // registered before the consumer's buildOutputColumns runs); 14072 // user SQL cannot reach this kind under normal builds — only 14073 // fabricated providers or a missed plumbing path would. The 14074 // diagnostic names the binding kind and qualified name so 14075 // regressions are loud, not silent. 
14076 NO_INSCOPE_RELATION_COLUMNS 14077 } 14078 14079 static final class StarExpansionResult { 14080 final StarExpansionKind kind; 14081 final List<OutputColumn> columns; 14082 final String qualifier; 14083 final String detail; 14084 14085 private StarExpansionResult(StarExpansionKind kind, 14086 List<OutputColumn> columns, 14087 String qualifier, 14088 String detail) { 14089 this.kind = kind; 14090 this.columns = columns; 14091 this.qualifier = qualifier; 14092 this.detail = detail; 14093 } 14094 14095 static StarExpansionResult expanded(List<OutputColumn> cols) { 14096 return new StarExpansionResult(StarExpansionKind.EXPANDED, cols, null, null); 14097 } 14098 14099 static StarExpansionResult reject(StarExpansionKind kind) { 14100 return new StarExpansionResult(kind, null, null, null); 14101 } 14102 14103 static StarExpansionResult reject(StarExpansionKind kind, String qualifier, String detail) { 14104 return new StarExpansionResult(kind, null, qualifier, detail); 14105 } 14106 } 14107 14108 /** 14109 * Effective alias for a FROM-clause {@link TTable}: the SQL-written 14110 * alias if present, else the slice-74 synthetic alias for unaliased 14111 * FROM-subquery TTables (position-keyed via 14112 * {@link FromSubqueryNaming#synthAliasFor}), else the table name. 14113 * Mirrors the rule used by {@link #buildRelation} so 14114 * {@link ColumnRef#getRelationAlias()} stays aligned with what the 14115 * lineage emitter expects. 
14116 */ 14117 private static String effectiveAliasOf(TTable t) { 14118 if (t == null) return null; 14119 String alias = t.getAliasName(); 14120 if (alias != null && !alias.isEmpty()) return alias; 14121 if (t.getTableType() == gudusoft.gsqlparser.ETableSource.subquery) { 14122 return FromSubqueryNaming.synthAliasFor(t); 14123 } 14124 return t.getName(); 14125 } 14126 14127 /** 14128 * Slice 58 / 59 — attempt to expand a {@code SELECT *} or 14129 * {@code SELECT alias.*} result column using the catalog exposed via 14130 * {@link NameBindingProvider#getRelationColumnNames(TTable)}. 14131 * 14132 * <p>Slice 58 supported only single-base-table FROM. Slice 59 lifts 14133 * the multi-relation case to JOIN forms (single top-level TJoin with 14134 * explicit JOIN clauses). Comma-FROM stays rejected by 14135 * {@code buildRelations}. 14136 * 14137 * <p>Returns {@link StarExpansionKind#EXPANDED} with one 14138 * {@link OutputColumn} per catalog-declared column on success. 14139 * Otherwise returns a reasoned rejection so the caller can throw a 14140 * shape-specific {@link SemanticIRBuildException}. 14141 */ 14142 private static StarExpansionResult tryExpandStar(TResultColumn rc, 14143 TSelectSqlStatement select, 14144 NameBindingProvider provider, 14145 boolean isPredicateBody, 14146 String stmtName) { 14147 if (isPredicateBody) { 14148 // Defensive: the active rejection lives in 14149 // preflightExistsInnerShape (~line 3880) and fires before 14150 // this code runs. Slice-24 EXISTS-with-* tests pin that path. 14151 return StarExpansionResult.reject(StarExpansionKind.PREDICATE_BODY_GUARD); 14152 } 14153 // Slice 59: reject star expansion in synthetic body contexts. 14154 // Scalar-subquery bodies must project exactly one column; 14155 // set-op-branch bodies must keep per-branch column-count parity; 14156 // predicate-subquery bodies are constant or column-ref shapes 14157 // (slice 23/24/27). Multi-column expansion would corrupt all 14158 // three. 
The preflight at SemanticIRBuilder.java:1007 only 14159 // checks AST result-column count/name, not "*", so without this 14160 // guard a catalog-equipped scalar body `SELECT * FROM small` 14161 // would silently emit multiple OutputColumns. 14162 if (stmtName != null 14163 && (isScalarSyntheticName(stmtName) 14164 || isSetOpBranchSyntheticName(stmtName) 14165 || isPredicateSubquerySyntheticName(stmtName))) { 14166 return StarExpansionResult.reject( 14167 StarExpansionKind.SYNTHETIC_BODY_CONTEXT, null, 14168 "star expansion is not supported inside synthetic body '" 14169 + stmtName + "' (scalar, set-op branch, or predicate body)"); 14170 } 14171 // Extract qualifier (empty string for bare `*`, alias/name for `t.*`). 14172 String qualifier = ""; 14173 TExpression expr = rc.getExpr(); 14174 if (expr != null && expr.getObjectOperand() != null) { 14175 String q = expr.getObjectOperand().getTableString(); 14176 if (q != null && !q.isEmpty()) { 14177 qualifier = q; 14178 } 14179 } 14180 // FROM-clause shape gate. Slice 59 supported a single top-level 14181 // TJoin with zero or more explicit JOIN clauses. Slice 62 lifts 14182 // comma-FROM to multi-TJoin: walk every top-level TJoin in 14183 // {@code select.joins} (a comma-FROM list parses as multiple 14184 // top-level TJoins) and accumulate every relation in FROM order. 
14185 if (select.joins == null || select.joins.size() == 0) { 14186 return StarExpansionResult.reject(StarExpansionKind.MULTI_RELATION_FROM); 14187 } 14188 List<TTable> fromRelations = new ArrayList<>(); 14189 for (int j = 0; j < select.joins.size(); j++) { 14190 TJoin topJoin = select.joins.getJoin(j); 14191 if (topJoin == null) { 14192 return StarExpansionResult.reject(StarExpansionKind.MULTI_RELATION_FROM); 14193 } 14194 TTable leftTable = topJoin.getTable(); 14195 if (leftTable == null) { 14196 return StarExpansionResult.reject(StarExpansionKind.MULTI_RELATION_FROM); 14197 } 14198 fromRelations.add(leftTable); 14199 TJoinItemList items = topJoin.getJoinItems(); 14200 if (items == null) continue; 14201 for (int i = 0; i < items.size(); i++) { 14202 TJoinItem item = items.getJoinItem(i); 14203 TTable rightTable = item.getTable(); 14204 if (rightTable == null) { 14205 return StarExpansionResult.reject(StarExpansionKind.MULTI_RELATION_FROM); 14206 } 14207 fromRelations.add(rightTable); 14208 } 14209 } 14210 // Slice 65 / 66: bare `*` over a USING / NATURAL JOIN collapses 14211 // merged keys. For each FROM relation in order, walk catalog/ 14212 // in-scope columns; emit one OutputColumn per merged key (sources 14213 // = merged ref list) and one per non-merged column. Qualified 14214 // `t.*` is unaffected (single-relation path, no merged-key dedup). 14215 if (qualifier.isEmpty() 14216 && (hasUsingInFromClause(select) 14217 || hasNaturalJoinMergedKeysInFromClause(select, provider))) { 14218 return expandBareStarOverUsing(select, provider, fromRelations); 14219 } 14220 // Qualified `t.*`: pick the (unique) FROM relation whose 14221 // effective alias matches the qualifier (case-insensitive). 14222 // Effective alias = `alias != null && !alias.isEmpty() ? alias : 14223 // tableName`, matching buildRelation at line 5649. 
Slice 58's 14224 // alias-OR-name match (line 5785 before slice 59) is replaced 14225 // here so `SELECT employees.* FROM employees e` rejects 14226 // (qualifier=`employees` ≠ effective alias `e`), consistent 14227 // with standard SQL correlation-name semantics. 14228 if (!qualifier.isEmpty()) { 14229 List<TTable> matches = new ArrayList<>(); 14230 for (TTable t : fromRelations) { 14231 String ea = effectiveAliasOf(t); 14232 if (ea != null && ea.equalsIgnoreCase(qualifier)) { 14233 matches.add(t); 14234 } 14235 } 14236 if (matches.isEmpty()) { 14237 return StarExpansionResult.reject( 14238 StarExpansionKind.QUALIFIER_NOT_FOUND, qualifier, null); 14239 } 14240 if (matches.size() > 1) { 14241 StringBuilder names = new StringBuilder(); 14242 for (int i = 0; i < matches.size(); i++) { 14243 if (i > 0) names.append(", "); 14244 names.append(effectiveAliasOf(matches.get(i))); 14245 } 14246 return StarExpansionResult.reject( 14247 StarExpansionKind.QUALIFIER_AMBIGUOUS, qualifier, 14248 "matches " + matches.size() + " FROM-clause relations: " 14249 + names); 14250 } 14251 return expandSingleRelation(matches.get(0), provider, qualifier); 14252 } 14253 // Bare `*`: expand every FROM relation in order. Fail fast on 14254 // the first relation that does not satisfy the slice-58 rules 14255 // (binding kind TABLE, catalog declares columns); the caller 14256 // sees the per-relation rejection kind and detail. No partial 14257 // outputs are returned. 14258 List<OutputColumn> all = new ArrayList<>(); 14259 for (TTable t : fromRelations) { 14260 StarExpansionResult one = expandSingleRelation(t, provider, ""); 14261 if (one.kind != StarExpansionKind.EXPANDED) { 14262 return one; 14263 } 14264 all.addAll(one.columns); 14265 } 14266 return StarExpansionResult.expanded(all); 14267 } 14268 14269 /** 14270 * Slice 58 / 59 — pure per-relation star expander. 
Applies the 14271 * base-table-only + catalog rules and builds one 14272 * {@link OutputColumn} per catalog-declared column with a 14273 * {@link ColumnRef} whose {@code relationAlias} is the effective 14274 * alias of {@code target}. Returns {@link StarExpansionKind#EXPANDED} 14275 * on success, otherwise a tuned rejection. 14276 * 14277 * <p>The {@code qualifier} parameter is the SQL-written qualifier 14278 * for qualified `t.*` (empty for bare `*`); it is plumbed back into 14279 * the rejection result so the caller can include it in 14280 * user-visible diagnostics. 14281 */ 14282 private static StarExpansionResult expandSingleRelation(TTable target, 14283 NameBindingProvider provider, 14284 String qualifier) { 14285 // The TTable's tableType cannot distinguish CTE from base table — 14286 // CTE references arrive as ETableSource.objectname. Use the 14287 // provider's bindRelation to get the resolved RelationKind. 14288 RelationBinding binding = provider.bindRelation(target); 14289 if (binding == null) { 14290 String diagAlias = effectiveAliasOf(target); 14291 return StarExpansionResult.reject( 14292 StarExpansionKind.NON_BASE_TABLE_RELATION, qualifier, 14293 "FROM source '" 14294 + (diagAlias != null ? diagAlias : "<unnamed>") 14295 + "' could not be bound (only base tables, in-scope CTEs, and FROM-subqueries are supported)"); 14296 } 14297 RelationKind kind = binding.getKind(); 14298 if (kind == RelationKind.TABLE) { 14299 // Slice 58 catalog-backed path. Returns null when no 14300 // catalog, when the catalog doesn't declare this table, 14301 // or when the table has no columns. 
14302 List<String> columnNames = provider.getRelationColumnNames(target); 14303 if (columnNames == null || columnNames.isEmpty()) { 14304 String diagAlias = effectiveAliasOf(target); 14305 return StarExpansionResult.reject( 14306 StarExpansionKind.NO_CATALOG_OR_UNKNOWN_TABLE, qualifier, 14307 diagAlias); 14308 } 14309 return buildExpansionFromColumnNames(target, columnNames); 14310 } 14311 if (kind == RelationKind.CTE) { 14312 // Slice 60 + Slice 103: key by EFFECTIVE ALIAS in the consuming 14313 // SELECT, not by CTE name. This avoids a collision when a 14314 // FROM-subquery alias equals a visible CTE name and the 14315 // CTE is referenced under a different alias (codex 14316 // diff-review): `WITH a AS (...) SELECT c.*, a.* FROM a c 14317 // JOIN (SELECT ...) a ON ...` — both 'a' (CTE) and 'a' 14318 // (subquery alias) live in the FROM clause; effective 14319 // aliases are 'c' and 'a' respectively, so per-relation 14320 // entries cannot overwrite each other. 14321 // 14322 // Slice 103 — explicit CTE column lists (WITH a(x, y) AS ...) 14323 // are no longer rejected here. The slice-102 rename helper now 14324 // runs on the SELECT-side CTE walker too; the in-scope map 14325 // populated by addRelationToInScopeMap reads from 14326 // ctePublishedColumns, which the helper has populated with the 14327 // renamed names. Star expansion just falls through to the 14328 // in-scope lookup below; the renamed list comes back. 14329 String lookupKey = effectiveAliasLowerCaseOrNull(target); 14330 List<String> cteColumns = (lookupKey == null) ? 
null 14331 : provider.getInScopeRelationColumns().get(lookupKey); 14332 if (cteColumns == null || cteColumns.isEmpty()) { 14333 String diagAlias = effectiveAliasOf(target); 14334 return StarExpansionResult.reject( 14335 StarExpansionKind.NO_INSCOPE_RELATION_COLUMNS, 14336 qualifier, 14337 "CTE '" + diagAlias + "' has no published columns in " 14338 + "the in-scope map (builder invariant: the " 14339 + "CTE body should have been built and " 14340 + "registered before this consumer ran)"); 14341 } 14342 return buildExpansionFromColumnNames(target, cteColumns); 14343 } 14344 if (kind == RelationKind.SUBQUERY) { 14345 // Slice 60: same effective-alias keying as the CTE branch 14346 // above (codex diff-review). For a subquery the effective 14347 // alias IS the alias the SQL writer wrote (preflight 14348 // rejects anonymous subqueries), so this branch is also 14349 // unambiguous under the alias-collision example. 14350 String lookupKey = effectiveAliasLowerCaseOrNull(target); 14351 List<String> subColumns = (lookupKey == null) ? null 14352 : provider.getInScopeRelationColumns().get(lookupKey); 14353 if (subColumns == null || subColumns.isEmpty()) { 14354 String diagAlias = effectiveAliasOf(target); 14355 return StarExpansionResult.reject( 14356 StarExpansionKind.NO_INSCOPE_RELATION_COLUMNS, 14357 qualifier, 14358 "FROM-clause subquery '" 14359 + (diagAlias != null ? diagAlias : "<unnamed>") 14360 + "' has no published columns in the in-scope " 14361 + "map (builder invariant: the subquery body " 14362 + "should have been extracted and registered " 14363 + "before this consumer ran)"); 14364 } 14365 return buildExpansionFromColumnNames(target, subColumns); 14366 } 14367 // OUTER_REFERENCE / UNION / UNKNOWN: keep the slice-58 / 59 14368 // rejection contract. 
None of these arrive on a slice-60 14369 // FROM-clause relation via the current builder paths 14370 // (OUTER_REFERENCE bindings live only on RelationSource for 14371 // correlated scalar lookup; UNION is a set-op branch concept, 14372 // not a FROM-clause relation). Defensive catch-all. 14373 String diagAlias = effectiveAliasOf(target); 14374 String detail; 14375 switch (kind) { 14376 case OUTER_REFERENCE: 14377 detail = "OUTER_REFERENCE star expansion is not supported (relation '" 14378 + diagAlias + "')"; 14379 break; 14380 case UNION: 14381 case UNKNOWN: 14382 default: 14383 detail = "FROM source '" + diagAlias 14384 + "' must be a base table, CTE, or FROM-subquery (got kind=" 14385 + kind + ")"; 14386 break; 14387 } 14388 return StarExpansionResult.reject( 14389 StarExpansionKind.NON_BASE_TABLE_RELATION, qualifier, detail); 14390 } 14391 14392 /** 14393 * Slice 60 — shared helper that turns a column-name list into the 14394 * star-expansion OutputColumn list with one {@link ColumnRef} per 14395 * column whose {@code relationAlias} is the effective alias of the 14396 * target table (alias if present, else the table name). Used by all 14397 * three slice-58 / 59 / 60 paths. 14398 */ 14399 private static StarExpansionResult buildExpansionFromColumnNames( 14400 TTable target, List<String> columnNames) { 14401 String alias = effectiveAliasOf(target); 14402 List<OutputColumn> outputs = new ArrayList<>(columnNames.size()); 14403 for (String colName : columnNames) { 14404 ColumnRef ref = new ColumnRef(alias, colName); 14405 outputs.add(new OutputColumn( 14406 colName, 14407 /*derived=*/ false, 14408 /*aggregate=*/ false, 14409 Collections.singletonList(ref), 14410 /*windowSpec=*/ null)); 14411 } 14412 return StarExpansionResult.expanded(outputs); 14413 } 14414 14415 /** 14416 * Slice 65 — bare {@code *} over a USING JOIN: deduplicate the 14417 * merged key within each equivalence class. 
For each FROM relation 14418 * in order: 14419 * <ul> 14420 * <li>look up columns via the existing 14421 * {@link #lookupRelationColumnNames} (catalog + in-scope map); 14422 * reject with {@link StarExpansionKind#NO_CATALOG_OR_UNKNOWN_TABLE} 14423 * when null (we can't dedup without knowing what's there);</li> 14424 * <li>for each column, check 14425 * {@link UsingScope#entryContaining(String, TTable)}; 14426 * if a class contains this relation, emit a single 14427 * merged-source {@link OutputColumn} the first time the class 14428 * is seen and skip duplicates from later class members;</li> 14429 * <li>otherwise emit a plain single-source OutputColumn.</li> 14430 * </ul> 14431 * 14432 * <p>Duplicate-output guard fires ONLY when the conflicting names 14433 * involve a USING-merged entry (merged-vs-plain or two disconnected 14434 * merged classes for the same key). Plain duplicates from 14435 * non-USING multi-relation expansion remain admitted (slice-59 14436 * behavior). 14437 * 14438 * <p>Output column order is <b>left-table order with USING-key 14439 * dedup within each equivalence class</b>: the merged column 14440 * appears at the position of its first member in FROM order. E.g. 14441 * {@code a(id, k), b(k, name)} → {@code [id, k, name]}. 14442 * 14443 * <p>This is INTENTIONALLY DIFFERENT from the ANSI/PostgreSQL 14444 * physical column order (which puts USING columns first, then 14445 * remaining left, then remaining right — would yield 14446 * {@code [k, id, name]}). The Semantic IR is not a query 14447 * executor; the order it surfaces is a lineage-tracking 14448 * presentation choice. 
Left-table order: 14449 * <ol> 14450 * <li>matches the slice-65 roadmap resume protocol 14451 * ({@code docs/designs/sql-semantic-governance-unified-roadmap.md} 14452 * §13.1) which fixed this order before implementation;</li> 14453 * <li>keeps the merged column physically adjacent to its 14454 * left-side neighbors, matching how lineage tooling 14455 * traditionally renders combined JOIN output;</li> 14456 * <li>does not depend on USING-clause ordering (which is a 14457 * syntactic choice, not a semantic one).</li> 14458 * </ol> 14459 * Codex diff-review round 5 flagged this as P2 (non-ANSI). The 14460 * choice was confirmed in plan-review and is locked by 14461 * {@code bareStarOverUsingLeftPositionPreserved} so any future 14462 * change to ANSI order is a deliberate observable contract 14463 * change, not a silent fix. 14464 */ 14465 private static StarExpansionResult expandBareStarOverUsing( 14466 TSelectSqlStatement select, 14467 NameBindingProvider provider, 14468 List<TTable> fromRelations) { 14469 UsingScope scope = provider.getUsingScope(); 14470 if (scope.isEmpty()) { 14471 // The slice-65 caller checks hasUsingInFromClause before 14472 // routing here, so an empty scope here means buildUsingScope 14473 // computed empty entries (unreachable in practice). Fall 14474 // back to a generic reject so the caller surfaces a 14475 // structured diagnostic. 14476 return StarExpansionResult.reject( 14477 StarExpansionKind.SYNTHETIC_BODY_CONTEXT, null, 14478 "bare * over JOIN ... 
USING reached the merged-key expander " 14479 + "with an empty UsingScope (builder invariant failure)"); 14480 } 14481 LinkedHashSet<String> emittedNamesLC = new LinkedHashSet<>(); 14482 Set<String> mergedNamesLC = new HashSet<>(); 14483 java.util.IdentityHashMap<UsingScope.EquivalenceClass, Boolean> emittedClasses = 14484 new java.util.IdentityHashMap<>(); 14485 List<OutputColumn> outputs = new ArrayList<>(); 14486 for (TTable t : fromRelations) { 14487 // Slice 103 lifted the explicit-CTE-column-list deferral: 14488 // populateUsingJoinRefs no longer rejects, and 14489 // lookupRelationColumnNames returns the renamed names from 14490 // the in-scope map (populated by the slice-102 rename 14491 // helper that the SELECT-side CTE walker now invokes). 14492 List<String> cols = lookupRelationColumnNames(t, provider); 14493 if (cols == null) { 14494 return StarExpansionResult.reject( 14495 StarExpansionKind.NO_CATALOG_OR_UNKNOWN_TABLE, null, 14496 effectiveAliasOf(t)); 14497 } 14498 for (String c : cols) { 14499 if (c == null) continue; 14500 String keyLC = c.toLowerCase(Locale.ROOT); 14501 UsingScope.MergedKeyEntry entry = scope.entryContaining(keyLC, t); 14502 if (entry != null) { 14503 // USING-merged column. Dedup per class. 14504 if (emittedClasses.containsKey(entry.getEquivClass())) { 14505 continue; 14506 } 14507 emittedClasses.put(entry.getEquivClass(), Boolean.TRUE); 14508 OutputColumn cand = new OutputColumn( 14509 c, /*derived=*/ false, /*aggregate=*/ false, 14510 entry.getSources(), /*windowSpec=*/ null); 14511 StarExpansionResult dup = appendMergedAwareOrReject( 14512 outputs, emittedNamesLC, mergedNamesLC, cand, /*isMerged=*/ true); 14513 if (dup != null) return dup; 14514 } else { 14515 // Plain column. Slice-59 behavior: duplicate plain 14516 // names are admitted. Codex round-5: the merged-aware 14517 // guard fires only when ONE side is merged. 
14518 OutputColumn cand = new OutputColumn( 14519 c, /*derived=*/ false, /*aggregate=*/ false, 14520 Collections.singletonList(new ColumnRef(effectiveAliasOf(t), c)), 14521 /*windowSpec=*/ null); 14522 StarExpansionResult dup = appendMergedAwareOrReject( 14523 outputs, emittedNamesLC, mergedNamesLC, cand, /*isMerged=*/ false); 14524 if (dup != null) return dup; 14525 } 14526 } 14527 } 14528 return StarExpansionResult.expanded(outputs); 14529 } 14530 14531 /** 14532 * Slice 65 — duplicate-output helper for 14533 * {@link #expandBareStarOverUsing}. Fires the merged-vs-non-merged 14534 * collision guard. Returns a {@link StarExpansionResult} when the 14535 * caller should reject; returns {@code null} when the candidate is 14536 * appended successfully. 14537 */ 14538 private static StarExpansionResult appendMergedAwareOrReject( 14539 List<OutputColumn> outputs, 14540 LinkedHashSet<String> emittedNamesLC, 14541 Set<String> mergedNamesLC, 14542 OutputColumn cand, 14543 boolean isMerged) { 14544 String nameLC = cand.getName().toLowerCase(Locale.ROOT); 14545 boolean alreadyEmitted = emittedNamesLC.contains(nameLC); 14546 boolean alreadyMerged = mergedNamesLC.contains(nameLC); 14547 // Reject only when at least one side is a USING-merged entry. 14548 // Plain-vs-plain duplicates remain admitted (slice-59 behavior). 14549 if (alreadyEmitted && (isMerged || alreadyMerged)) { 14550 return StarExpansionResult.reject( 14551 StarExpansionKind.SYNTHETIC_BODY_CONTEXT, null, 14552 "bare * over JOIN ... 
USING produces ambiguous output " 14553 + "column '" + cand.getName() + "': a USING-merged " 14554 + "entry and a same-named column from outside the " 14555 + "USING equivalence class collide (or two disconnected " 14556 + "USING classes share the same key name); qualify " 14557 + "with t.* per relation or rename a column to " 14558 + "disambiguate"); 14559 } 14560 emittedNamesLC.add(nameLC); 14561 if (isMerged) mergedNamesLC.add(nameLC); 14562 outputs.add(cand); 14563 return null; 14564 } 14565 14566 /** 14567 * Slice 60 — read the published column names for an already-built 14568 * statement (CTE body or FROM-subquery body) from its 14569 * {@link StatementGraph#getOutputColumns()}. Used by {@code build()} 14570 * and {@code extractFromSubqueriesAsStatements} to populate the 14571 * in-scope map before each consuming SELECT's 14572 * {@code buildOutputColumns} runs. 14573 */ 14574 private static List<String> outputColumnNames(StatementGraph body) { 14575 List<OutputColumn> cols = body.getOutputColumns(); 14576 List<String> names = new ArrayList<>(cols.size()); 14577 for (OutputColumn c : cols) names.add(c.getName()); 14578 return Collections.unmodifiableList(names); 14579 } 14580 14581 /** 14582 * Slice 60 — effective alias of a TTable lower-cased, or null when 14583 * the table has neither an alias nor a name. The alias-collision 14584 * fix (codex diff-review) replaced CTE-name / subquery-alias 14585 * keying with effective-alias keying; this helper centralises the 14586 * lookup-key computation. 
14587 */ 14588 private static String effectiveAliasLowerCaseOrNull(TTable t) { 14589 String alias = effectiveAliasOf(t); 14590 if (alias == null || alias.isEmpty()) return null; 14591 return alias.toLowerCase(Locale.ROOT); 14592 } 14593 14594 /** 14595 * Slice 60 — build a per-consumer effective-alias-keyed map of 14596 * "FROM-clause relation alias → published column names" by walking 14597 * the consumer's direct FROM/JOIN list (single top-level TJoin, 14598 * left table + each joinItem.getTable()). 14599 * 14600 * <p>Each CTE-bound relation contributes its effective alias → 14601 * {@code ctePublishedColumns.get(cteName.toLowerCase())}. Each 14602 * FROM-subquery contributes its alias → {@code 14603 * outputColumnNames(stmts.get(subqueryAliasToIndex.get(alias)))}. 14604 * Base-table relations are skipped because their star expansion 14605 * uses the catalog path (TSQLEnv); adding them here would force 14606 * dialect-specific catalog walks before catalog access is required. 14607 * 14608 * <p>The codex diff-review found that a single name-keyed map 14609 * collides when a FROM-subquery alias equals a visible CTE name 14610 * (`WITH a AS (...) ... FROM a c JOIN (SELECT ...) a ...`). Keying 14611 * by effective alias (which is unique per FROM clause — 14612 * {@link #preflightDirectFromList} rejects duplicates) closes the 14613 * collision class. 
14614 * 14615 * @param consumer the SELECT whose FROM list to walk 14616 * @param consumerProvider provider used only for bindRelation 14617 * (CTE vs TABLE discrimination) 14618 * @param ctePublishedColumns CTE-name → columns lookup 14619 * populated as CTE bodies are built 14620 * @param subqueryAliasToIndex this consumer's own subquery alias 14621 * → stmts index lookup 14622 * @param stmts already-built statement list 14623 * @return mutable effective-alias-keyed in-scope map for this 14624 * consumer; callers wrap it via 14625 * {@code provider.withInScopeRelationColumns(map)} 14626 */ 14627 private static Map<String, List<String>> buildEffectiveAliasInScopeMap( 14628 TSelectSqlStatement consumer, 14629 NameBindingProvider consumerProvider, 14630 Map<String, List<String>> ctePublishedColumns, 14631 Map<String, Integer> subqueryAliasToIndex, 14632 List<StatementGraph> stmts) { 14633 Map<String, List<String>> result = new HashMap<>(); 14634 if (consumer.joins == null) return result; 14635 for (TJoin join : consumer.joins) { 14636 addRelationToInScopeMap(join.getTable(), consumerProvider, 14637 ctePublishedColumns, subqueryAliasToIndex, stmts, result); 14638 TJoinItemList items = join.getJoinItems(); 14639 if (items == null) continue; 14640 for (int i = 0; i < items.size(); i++) { 14641 TJoinItem item = items.getJoinItem(i); 14642 if (item == null) continue; 14643 addRelationToInScopeMap(item.getTable(), consumerProvider, 14644 ctePublishedColumns, subqueryAliasToIndex, stmts, result); 14645 } 14646 } 14647 return result; 14648 } 14649 14650 private static void addRelationToInScopeMap( 14651 TTable t, 14652 NameBindingProvider consumerProvider, 14653 Map<String, List<String>> ctePublishedColumns, 14654 Map<String, Integer> subqueryAliasToIndex, 14655 List<StatementGraph> stmts, 14656 Map<String, List<String>> result) { 14657 if (t == null) return; 14658 String key = effectiveAliasLowerCaseOrNull(t); 14659 if (key == null) return; 14660 if (t.getTableType() == 
gudusoft.gsqlparser.ETableSource.subquery) { 14661 Integer idx = subqueryAliasToIndex.get(key); 14662 if (idx != null) { 14663 result.put(key, outputColumnNames(stmts.get(idx))); 14664 } 14665 return; 14666 } 14667 // objectname (base-table OR CTE reference). Use bindRelation 14668 // to discriminate; base tables don't need an in-scope entry 14669 // (slice 58 catalog path handles them via getRelationColumnNames). 14670 RelationBinding b = consumerProvider.bindRelation(t); 14671 if (b == null) return; 14672 if (b.getKind() == RelationKind.CTE) { 14673 String cteName = t.getName(); 14674 if (cteName == null) return; 14675 List<String> cols = ctePublishedColumns.get(cteName.toLowerCase(Locale.ROOT)); 14676 if (cols != null && !cols.isEmpty()) { 14677 result.put(key, cols); 14678 } 14679 } 14680 // For TABLE, OUTER_REFERENCE, UNION, UNKNOWN bindings the 14681 // in-scope map is intentionally not populated; the 14682 // base-table catalog path or rejection path applies. 14683 } 14684 14685 /** 14686 * Build the {@link OutputColumn} list. Slice 4 lifts the 14687 * simple-object-name / single-source restriction: any expression with at 14688 * least one column reference is accepted, and the column is marked 14689 * {@link OutputColumn#isDerived()} when the expression is anything 14690 * other than a direct column reference. Slice 61 also admits 14691 * canonical constant-only projections (zero column refs) outside 14692 * scalar-subquery bodies, using alias-or-expression text naming. 
*
     * <p>When {@code isPredicateBody} is set, result column 0 is examined
     * before the per-column loop: a constant expression, or any
     * expression other than a plain {@code simple_object_name_t} that is
     * an aggregate or has at least one collected source, is returned
     * immediately as a single-element list.
     *
     * @param select the SELECT whose result-column list is projected; its
     *        {@code dbvendor} is handed to the vendor-gated admission helpers
     * @param provider name-binding provider passed on to star expansion,
     *        window-column building and column-ref collection
     * @param allowScalarProjectionSubqueries when {@code false}, a top-level
     *        {@code subquery_t} projection is rejected
     * @param allowWindowProjection when {@code false}, a top-level window
     *        function projection is rejected
     * @param isPredicateBody enables the single-column predicate-body
     *        short-circuit and is forwarded to {@code tryExpandStar}
     * @param stmtName statement name used as the prefix of synthetic column
     *        names (a {@code null} name falls back to
     *        {@code <predicate_subquery_?>}) and passed to
     *        {@code isScalarSyntheticName}
     * @return the output columns in result-column order; a star projection
     *         contributes every column of its expansion
     * @throws SemanticIRBuildException when the SELECT has no result
     *         columns, a result column has a null expression, a star cannot
     *         be expanded, or a projection shape is not admitted
     */
    private static List<OutputColumn> buildOutputColumns(TSelectSqlStatement select,
                                                         NameBindingProvider provider,
                                                         boolean allowScalarProjectionSubqueries,
                                                         boolean allowWindowProjection,
                                                         boolean isPredicateBody,
                                                         String stmtName) {
        TResultColumnList rcl = select.getResultColumnList();
        if (rcl == null || rcl.size() == 0) {
            throw new SemanticIRBuildException(Diagnostic.error(DiagnosticCode.SELECT_NO_PROJECTED_COLUMNS, "SELECT has no projected columns", select));
        }
        // Slice 23/24/27: predicate-body short-circuit. The preflight
        // (§4.4 / slice-24 §4.1.1 / slice-27 §4.1) already validated that
        // the inner SELECT projects exactly one column, of an admitted
        // shape — constant (slice 23), simple column ref (slice 24), or
        // expression / function call / CASE / aggregate over inner
        // columns (slice 27). Discriminate on the shape:
        //
        // - Constant: bypass the regular result-column loop (whose
        //   empty-source handling would either name the column from its
        //   alias / expression text via the slice-61 constant lift, or
        //   reject it with RESULT_COLUMN_NO_COLUMN_REFS) and emit one
        //   synthetic OutputColumn with empty sources. The synthesised name
        //   `<predicate_subquery_<i>>_const_0` guarantees no collision
        //   with real column names.
        //
        // - Slice-24 column ref (simple_object_name_t with name): fall
        //   through to the normal loop; effectiveOutputName(rc) returns
        //   the column name. OutputColumn carries name, derived=false,
        //   aggregate=false, sources=[ColumnRef(...)].
        //
        // - Slice-27 expression / function / CASE / aggregate without
        //   alias: synthesise the OutputColumn here. The normal loop's
        //   {@link #effectiveOutputName} would throw on rc with neither
        //   alias nor column name (a slice-6 invariant for OUTER
        //   projections); for predicate bodies the OutputColumn name is
        //   internal scaffolding only — no consumer references it
        //   externally — so a synthetic name is sound. For aggregate-over-
        //   constants (COUNT(*), SUM(1)) sources is empty and aggregate=true
        //   matches the intent of the normal loop's
        //   RESULT_COLUMN_NO_COLUMN_REFS guard. The slice-24 projector
        //   pass walks OutputColumn.sources to base-column terminals and
        //   emits JOIN canonical edges (zero terminals → zero edges,
        //   multi-source → multiple edges).
        if (isPredicateBody) {
            TResultColumn rc0 = rcl.getResultColumn(0);
            if (rc0.getExpr() != null && isConstantExpression(rc0.getExpr())) {
                String synthName = (stmtName != null ? stmtName : "<predicate_subquery_?>")
                        + "_const_0";
                return Collections.singletonList(new OutputColumn(
                        synthName, /*derived=*/ true, /*aggregate=*/ false,
                        Collections.<ColumnRef>emptyList(), /*windowSpec=*/ null));
            }
            // Slice 27 + Slice 32: synthesise the OutputColumn for any
            // slice-27/31-admitted predicate-body projection EXCEPT the
            // slice-24 simple_object_name_t shape. Slice 27 fired this
            // branch only when both alias AND columnNameOnly were absent
            // (missingName=true); slice 32 widens it to also fire when
            // alias is present, so aliased Oracle / MSSQL plain
            // {@code LISTAGG(x.id, ',') WITHIN GROUP (ORDER BY ...) AS lst}
            // is admitted (the slice-31 boundary lifted by slice 32).
            //
            // The simple_object_name_t exclusion is intentional. That
            // shape MUST keep falling through to the normal loop, where
            // {@link #effectiveOutputName} returns the column name (or
            // alias if present), {@code derived=false}, and
            // {@code sources=[ColumnRef(...)]} — the slice-24 baseline.
            // Per {@code TResultColumn.getColumnNameOnly()}, only
            // {@code simple_object_name_t}, {@code typecast_t}, and
            // {@code sqlserver_proprietary_column_alias_t} populate
            // columnNameOnly; function_t / case_t / pure-binary all
            // return empty, so the cascade below is alias > _proj_0
            // (no columnNameOnly intermediate).
            if (rc0.getExpr() != null
                    && rc0.getExpr().getExpressionType() != EExpressionType.simple_object_name_t) {
                String alias = rc0.getColumnAlias();
                String name;
                if (alias != null && !alias.isEmpty()) {
                    // Slice 32 widening: aliased projection. Use the
                    // alias as the OutputColumn name.
                    name = alias;
                } else {
                    // Slice 27 carryover: unaliased non-column-ref
                    // projection. Synthesise a stable name. The synth
                    // name is used internally by
                    // {@link gudusoft.gsqlparser.ir.semantic.diff.SemanticIRProjector}
                    // (line ~161 — BFS start key keyed by
                    // {@code stmtOutputKey(idx, out.getName())}) to walk
                    // predicate-body lineage to base columns; uniqueness
                    // within the single-column predicate body is
                    // sufficient for that walk. {@code _proj_0} is also
                    // exposed by the JSON exporter but is not externally
                    // referenced by callers — only the inner JOIN
                    // canonical edges (target.column omitted; role=JOIN)
                    // are visible to consumers.
                    name = (stmtName != null ? stmtName : "<predicate_subquery_?>")
                            + "_proj_0";
                }
                boolean aggregate = isAggregateFunction(rc0.getExpr());
                // Slice 43 / 44: PG (slice 43) and Snowflake (slice 44)
                // hypothetical-set ordered-set aggregates ({@code rank} /
                // {@code dense_rank} / {@code percent_rank} /
                // {@code cume_dist}) via direct {@code fn.getWithinGroup()}
                // attachment do not satisfy
                // {@link #isHypotheticalSetWithinGroupCall} (which requires
                // a non-null windowDef) and are not in the regular
                // {@link #AGGREGATE_FUNCTION_NAMES} whitelist. Inside the
                // predicate-body branch they are admitted as aggregates
                // when the slice-43 / 44 vendor-gated shape predicate
                // fires — contained here (NOT folded into
                // {@code isAggregateFunction}) so the carve-out cannot
                // accidentally lift the top-level PG / Snowflake case
                // (whose dlineage XML is structurally identical to the
                // OVER form — see Slice43Test / Slice44Test javadoc).
                if (!aggregate
                        && rc0.getExpr().getExpressionType() == EExpressionType.function_t
                        && isDirectAttachmentHypotheticalSetCall(
                                rc0.getExpr().getFunctionCall(), select.dbvendor)) {
                    aggregate = true;
                }
                // Slice 28: FILTER-aware collector excludes column refs inside
                // FILTER (WHERE ...) subtrees so OutputColumn.sources matches
                // dlineage's lineage-relationship view (FILTER predicate refs
                // absent from fdd / fdr).
                // Slice 31: also excludes column refs inside Oracle / MSSQL
                // {@code fn.windowDef.withinGroup} (the WITHIN GROUP ORDER BY)
                // so plain {@code LISTAGG(x.id, ',') WITHIN GROUP (ORDER BY x.region)}
                // emits sources=[x.id] only — matching dlineage's omission of the
                // WITHIN GROUP ORDER BY ref from {@code fdr clause="on"} sources
                // (probe Q1 in {@code /tmp/probe31}). Slice 32 reuses the
                // same collector unchanged.
                List<ColumnRef> sources = collectColumnRefsExcludingFilterAndWithinGroupClauses(rc0, provider);
                if (sources.isEmpty() && !aggregate) {
                    // Deliberately empty branch. Non-aggregate with no
                    // inner column refs: should be covered by the constant
                    // short-circuit above. If we reach here, fall through
                    // to the normal loop, whose empty-source handling
                    // (RESULT_COLUMN_NO_COLUMN_REFS guard) decides the
                    // outcome with a conservative tuned message.
                } else {
                    return Collections.singletonList(new OutputColumn(
                            name, /*derived=*/ true, aggregate,
                            sources, /*windowSpec=*/ null));
                }
            }
            // simple_object_name_t falls through (slice-24 carryover):
            // the normal loop produces derived=false /
            // sources=[ColumnRef(...)] using effectiveOutputName.
        }
        // Slice 19 (alias-bound PARTITION BY discriminator): the resolver
        // synthesises EXACT_MATCH bindings for PARTITION BY <name> when no
        // schema metadata is available (TableNamespace.resolveColumn
        // inferred_from_usage fallback), even when <name> is a SELECT-list
        // alias on a calculated expression. The discriminator is exposed
        // by NameBindingProvider#isCalculatedProjectionAliasFallback and
        // consulted in buildWindowPartitionRefs / buildWindowOrderRefs;
        // see Slice13Test#partitionByExpressionAliasIsRejectedAsAliasBound
        // and the shadowing-with-metadata companion. Slice 19 prefers
        // conservative rejection in the no-metadata case; with TSQLEnv
        // declaring the shadowed column, ColumnSource#hasDefiniteEvidence
        // returns true and the discriminator falls through.
        List<OutputColumn> out = new ArrayList<>(rcl.size());
        for (int i = 0; i < rcl.size(); i++) {
            TResultColumn rc = rcl.getResultColumn(i);
            if (rc.getExpr() == null) {
                throw new SemanticIRBuildException(Diagnostic.error(DiagnosticCode.RESULT_COLUMN_NULL_EXPRESSION, "result column " + rc + " has null expression", rc));
            }
            EExpressionType type = rc.getExpr().getExpressionType();
            // Slice 58: catalog-backed star expansion for a single base
            // table. Star projections were rejected by slices 1-57 with
            // "SELECT * / list expansions are deferred"; slice 58 lifts
            // the single-base-table case when a catalog is available via
            // NameBindingProvider#getRelationColumnNames(TTable). Bare
            // `*` and qualified `t.*` both arrive here as
            // simple_object_name_t with rc.getColumnNameOnly() == "*"
            // (probed; see slice-58 plan); the prior EExpressionType.list_t
            // branch is dead defense for stars in practice but stays in
            // case a future grammar variant routes them differently.
            String colNameOnly = rc.getColumnNameOnly();
            if ("*".equals(colNameOnly) || type == EExpressionType.list_t) {
                StarExpansionResult exp = tryExpandStar(rc, select, provider,
                        isPredicateBody, stmtName);
                // Every arm except EXPANDED throws; EXPANDED appends the
                // expansion and moves on to the next result column.
                switch (exp.kind) {
                    case EXPANDED:
                        out.addAll(exp.columns);
                        continue;
                    case PREDICATE_BODY_GUARD:
                        // Defensive; preflightExistsInnerShape at line ~3880
                        // rejects SELECT * in EXISTS earlier with a tuned
                        // message. This branch only fires if a future call
                        // site enters buildOutputColumns with isPredicateBody
                        // and a star still present.
                        throw new SemanticIRBuildException(
                                Diagnostic.error(DiagnosticCode.STAR_EXPANSION_PREDICATE_BODY,
                                        "result column " + rc + " is a star expansion (SELECT *) "
                                                + "inside a predicate body; not supported yet", rc));
                    case SYNTHETIC_BODY_CONTEXT:
                        // Slice 59: star expansion is rejected inside a
                        // synthetic body (scalar-subquery / set-op-branch /
                        // predicate-subquery). Multi-column expansion would
                        // violate the body's shape contract; the slice-58
                        // path could silently produce this for
                        // catalog-equipped builds.
                        throw new SemanticIRBuildException(
                                Diagnostic.error(DiagnosticCode.STAR_EXPANSION_SYNTHETIC_BODY,
                                        "result column " + rc + " is a star expansion (SELECT *); "
                                                + (exp.detail != null ? exp.detail
                                                : "star expansion is not supported inside a synthetic body"), rc));
                    case MULTI_RELATION_FROM:
                        // Slice 59: defensive catch-all. Normal comma-FROM
                        // is rejected earlier by buildRelations:~3042 with
                        // a clearer message; reaching this kind indicates
                        // a missing top-level TJoin or null FROM table.
                        throw new SemanticIRBuildException(
                                Diagnostic.error(DiagnosticCode.STAR_EXPANSION_MULTI_RELATION_FROM,
                                        "result column " + rc + " is a star expansion (SELECT *); "
                                                + "FROM source could not be determined "
                                                + "(comma-FROM is rejected earlier with a clearer message)", rc));
                    case NON_BASE_TABLE_RELATION:
                        throw new SemanticIRBuildException(
                                Diagnostic.error(DiagnosticCode.STAR_EXPANSION_NON_BASE_TABLE,
                                        "result column " + rc + " is a star expansion (SELECT *); "
                                                + (exp.detail != null ? exp.detail
                                                : "FROM source must be a base table"), rc));
                    case QUALIFIER_NOT_FOUND:
                        throw new SemanticIRBuildException(
                                Diagnostic.error(DiagnosticCode.STAR_EXPANSION_QUALIFIER_NOT_FOUND,
                                        "result column " + rc + " (qualified star "
                                                + (exp.qualifier == null ? "?" : exp.qualifier)
                                                + ".*) does not match any FROM-clause relation", rc));
                    case QUALIFIER_AMBIGUOUS:
                        // Slice 59: 2+ FROM relations have the same
                        // effective alias. Real SQL never reaches this
                        // unless rejectDuplicateAliases:~5621 (case-
                        // sensitive) allowed a case-only collision.
                        throw new SemanticIRBuildException(
                                Diagnostic.error(DiagnosticCode.STAR_EXPANSION_QUALIFIER_AMBIGUOUS,
                                        "result column " + rc + " (qualified star "
                                                + (exp.qualifier == null ? "?" : exp.qualifier)
                                                + ".*) is ambiguous: "
                                                + (exp.detail != null ? exp.detail
                                                : "multiple FROM-clause relations match"), rc));
                    case NO_CATALOG_OR_UNKNOWN_TABLE:
                        // Slice 58 single-FROM message kept stable; slice 59
                        // names the failing relation when known
                        // (exp.detail carries the relation's effective alias).
                        // Label preference: qualifier, then detail, then a
                        // generic placeholder.
                        String relationLabel;
                        if (exp.qualifier != null && !exp.qualifier.isEmpty()) {
                            relationLabel = exp.qualifier;
                        } else if (exp.detail != null && !exp.detail.isEmpty()) {
                            relationLabel = exp.detail;
                        } else {
                            relationLabel = "the FROM relation";
                        }
                        throw new SemanticIRBuildException(
                                Diagnostic.error(DiagnosticCode.STAR_EXPANSION_NO_CATALOG,
                                        "result column " + rc + " is a star expansion (SELECT *); "
                                                + "requires catalog with column declarations for "
                                                + relationLabel, rc));
                    case EXPLICIT_CTE_COLUMN_LIST_DEFERRED:
                        // Slice 103 lifted the explicit-CTE-column-list
                        // deferral: the SELECT-side CTE walker now runs
                        // the slice-102 rename helper, so the in-scope
                        // map publishes the renamed columns and
                        // expandSingleRelation returns EXPANDED instead
                        // of falling into this arm. The case is kept
                        // declared-but-unreached for API stability and
                        // exhaustive-switch coverage (slice 71/72/82/86
                        // /95/96/97/98/99/100/101/102 precedent). If a
                        // future call path re-introduces the kind, the
                        // throw still fires with a faithful diagnostic.
                        throw new SemanticIRBuildException(
                                Diagnostic.error(DiagnosticCode.STAR_EXPANSION_EXPLICIT_CTE_COLUMN_LIST,
                                        "result column " + rc + " is a star expansion (SELECT *); "
                                                + (exp.detail != null ? exp.detail
                                                : "star expansion through an explicit CTE column list is deferred to a future slice"), rc));
                    case NO_INSCOPE_RELATION_COLUMNS:
                        // Slice 60: builder invariant failure — a CTE
                        // or FROM-subquery body was not registered in
                        // the provider's in-scope-relation-columns map
                        // before this consumer ran. User SQL cannot
                        // reach this kind under normal build()
                        // execution; reaching it indicates that a call
                        // site is not narrowing the provider before
                        // invoking buildOutputColumns.
                        throw new SemanticIRBuildException(
                                Diagnostic.error(DiagnosticCode.STAR_EXPANSION_NO_INSCOPE_RELATION_COLUMNS,
                                        "result column " + rc + " is a star expansion (SELECT *); "
                                                + (exp.detail != null ? exp.detail
                                                : "in-scope CTE/subquery column map is empty for this relation (builder invariant failure)"), rc));
                    // No `default`: switch is intentionally exhaustive
                    // over StarExpansionKind. The post-switch throw
                    // below is the actual runtime guard if a future
                    // enum value is added without updating this
                    // switch.
                }
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.STAR_EXPANSION_UNHANDLED_KIND,
                                "result column " + rc + " is a star expansion (SELECT *); "
                                        + "unhandled StarExpansionKind=" + exp.kind, rc));
            }
            // Top-level scalar subquery in projection (slice 11). When the
            // caller permits it (allowScalarProjectionSubqueries=true), the
            // outer caller has already extracted the inner SELECT as its
            // own statement via extractScalarSubqueriesAsStatements; here
            // we just construct the OutputColumn shell with empty sources
            // and let emitLineageForStatement wire the
            // STATEMENT_OUTPUT → STATEMENT_OUTPUT edge.
            if (type == EExpressionType.subquery_t) {
                if (!allowScalarProjectionSubqueries) {
                    throw new SemanticIRBuildException(
                            Diagnostic.error(DiagnosticCode.NESTED_SCALAR_SUBQUERY_IN_PROJECTION,
                                    "nested scalar subquery in projection (inside another "
                                            + "scalar subquery body or FROM-clause subquery body) "
                                            + "is not supported yet", rc));
                }
                String alias = rc.getColumnAlias();
                if (alias == null || alias.isEmpty()) {
                    throw new SemanticIRBuildException(
                            Diagnostic.error(DiagnosticCode.SCALAR_SUBQUERY_ALIAS_REQUIRED,
                                    "scalar subquery projection must have an alias", rc));
                }
                out.add(new OutputColumn(alias, /*derived=*/ true,
                        /*aggregate=*/ false,
                        Collections.<ColumnRef>emptyList(),
                        /*windowSpec=*/ null));
                continue;
            }
            // Slice 13: detect top-level window function before deep scans
            // so the embedded-window rejecter can identity-skip the
            // legitimate top-level window function call.
            boolean topLevelWindow = isTopLevelWindowProjection(rc.getExpr());
            // Slice 33: detect Oracle / MSSQL plain WITHIN-GROUP-only
            // aggregate at the projection root. When admitted, the root
            // function carries fn.windowDef!=null but is the legitimate
            // top-level form — the slice-13 invariant rejecters
            // (isTopLevelWindowProjection / rejectWindowFunctions /
            // rejectEmbeddedWindowFunction) keep their strict wd!=null
            // check unchanged; this local boolean is what discriminates
            // them. The admission helper combines:
            //   - isWithinGroupOnlyWindowDef (no OVER, no KEEP DENSE_RANK)
            //   - explicit EDbVendor gate (Oracle / MSSQL only — mirrors
            //     the slice-31 predicate-body gate at line ~3860)
            //   - function name in AGGREGATE_FUNCTION_NAMES whitelist
            // PG / Snowflake / DB2 / SparkSQL produce direct fn.withinGroup
            // (windowDef=null) and never reach this admission helper; their
            // top-level WG already builds today via the normal aggregate
            // path (with pre-existing AGGREGATION_MISMATCH divergence on
            // the dlineage projector side that slice 33 deliberately does
            // not address — see the slice-30 rationale on
            // ORDER_BY_WITHIN_GROUP_AGGREGATE_NAMES for why a name-only
            // projector override is unsafe across the dual-form aggregates
            // SUM / MIN / MAX / LISTAGG that have OVER (PARTITION BY)
            // forms on Oracle).
            TFunctionCall slice33RootFn = rc.getExpr().getExpressionType() == EExpressionType.function_t
                    ? rc.getExpr().getFunctionCall()
                    : null;
            boolean slice33TopLevelWG = isAdmittedTopLevelWithinGroupAggregate(
                    slice33RootFn, select.dbvendor);
            // Slice 35: companion flag computed on the same root function by
            // the direct-attachment admission helper; it takes part in the
            // fast path and naming fallback below but not in the
            // embedded-window identity-skip.
            boolean slice35TopLevelDirectWG = isAdmittedTopLevelDirectWithinGroupAggregate(
                    slice33RootFn, select.dbvendor);
            // Reject scalar subqueries embedded inside larger projection
            // expressions (slice 11 + codex round-2 MUST 7). Catches both
            // top-level subquery_t hidden under a wrapping expression
            // (e.g. UPPER((SELECT ...)) — though the parser sometimes
            // strips the wrap) AND predicate subqueries that don't surface
            // as subquery_t (EXISTS in projection, IN-projection).
            // Slice 9/10 deep-scan pattern.
            rejectEmbeddedSubqueryInProjection(rc.getExpr(), rc);
            // Slice 13: reject window functions embedded inside larger
            // projection expressions (e.g. `ROW_NUMBER() OVER (...) + 1`,
            // `UPPER(LAG(...) OVER (...))`). The helper identity-skips
            // the legitimate top-level window function call when
            // `topLevelWindow=true`.
            //
            // Slice 33: also identity-skip the top-level WITHIN-GROUP-only
            // aggregate root. TFunctionCall.acceptChildren preVisits the
            // root function (TFunctionCall.java:1528), so without
            // skipTopLevel=true the visitor would catch the slice-33-
            // admitted root (fn.windowDef!=null). Embedded WG inside
            // UPPER / CASE still rejects because the visitor finds a
            // non-root function whose windowDef!=null — the inner
            // function is not == identity to the root, so the skip
            // doesn't apply.
            rejectEmbeddedWindowFunction(rc.getExpr(), rc, topLevelWindow || slice33TopLevelWG);
            // Slice 33/35 fast path: WITHIN-GROUP-only aggregate — fall
            // through to the normal aggregate path. Oracle / MSSQL use the
            // windowDef attachment (slice 33); PostgreSQL direct attachment
            // is already on the normal aggregate path but shares the
            // unaliased expression-text fallback below (slice 35).
            // The admitted-WG test comes first so that an admitted root is
            // never routed into the window branch.
            if (slice33TopLevelWG || slice35TopLevelDirectWG) {
                // No special branch — fall through to the plain aggregate
                // / expression / column path below.
            } else if (topLevelWindow) {
                if (!allowWindowProjection) {
                    throw new SemanticIRBuildException(
                            Diagnostic.error(DiagnosticCode.WINDOW_FUNCTION_AS_PROJECTION_NOT_SUPPORTED,
                                    "result column " + rc + " is a window function; not supported "
                                            + "inside this body (e.g. scalar-subquery body)", rc));
                }
                out.add(buildWindowOutputColumn(rc, select, provider));
                continue;
            }
            // Plain aggregate / expression / column path. The
            // rejectWindowFunctions call below is now defensive — the
            // top-level-window fast path above intercepts legitimate
            // windows, and rejectEmbeddedWindowFunction caught any
            // descendant window functions.
            //
            // Slice 33: skip rejectWindowFunctions for the slice-33-
            // admitted shape. The root function has windowDef!=null but
            // is the legitimate top-level form; calling
            // rejectWindowFunctions here would reject it via the
            // strict-wd!=null check (kept unchanged per slice-31
            // invariant). The slice-35 direct-attachment shape is skipped
            // by the same condition.
            if (!slice33TopLevelWG && !slice35TopLevelDirectWG) {
                rejectWindowFunctions(rc.getExpr(), rc);
            }
            boolean derived = (type != EExpressionType.simple_object_name_t);
            boolean aggregate = isAggregateFunction(rc.getExpr());
            // Slice 28: FILTER-aware collector excludes column refs inside
            // FILTER (WHERE ...) subtrees so OutputColumn.sources matches
            // dlineage's lineage-relationship view (FILTER predicate refs
            // absent from fdd / fdr).
            // Slice 31: also excludes column refs inside Oracle / MSSQL
            // {@code fn.windowDef.withinGroup} so plain WITHIN GROUP
            // aggregates emit sources from function args only. Defense-
            // in-depth here: the slice-31 lift only admits Oracle / MSSQL
            // plain WITHIN GROUP at the unaliased predicate-body
            // short-circuit (line ~5216) — the strict
            // {@link #rejectWindowFunctions} call above keeps top-level
            // windowDef-bearing projections rejected outside the
            // predicate-body context, so this collector reduces to the
            // slice-28 FILTER-only variant in practice today.
            // NOTE(review): the slice-33 skip above means an admitted
            // top-level WITHIN-GROUP root is no longer rejected before
            // this point, so the WITHIN GROUP exclusion looks live for
            // that shape — confirm and refresh the slice-31 wording.
            List<ColumnRef> sources = collectColumnRefsExcludingFilterAndWithinGroupClauses(rc, provider);
            if (sources.isEmpty() && !aggregate) {
                boolean canonicalConstant = isConstantExpression(rc.getExpr());
                boolean inScalarBody = isScalarSyntheticName(stmtName);
                if (canonicalConstant && !inScalarBody) {
                    // Slice 61: constant-only projection lift. Predicate
                    // bodies still use the earlier slice-23 short-circuit,
                    // while scalar-subquery bodies intentionally keep the
                    // slice-11/20 invariant that scalar body projections
                    // must have a column source.
                    String alias = rc.getColumnAlias();
                    String name = (alias != null && !alias.isEmpty())
                            ? alias
                            : rc.getExpr().toString();
                    out.add(new OutputColumn(name, /*derived=*/ true,
                            /*aggregate=*/ false,
                            Collections.<ColumnRef>emptyList(),
                            /*windowSpec=*/ null));
                    continue;
                }
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.RESULT_COLUMN_NO_COLUMN_REFS,
                                "result column " + rc + " has no column references "
                                        + "and is not a constant or aggregate expression "
                                        + "(e.g. UPPER('literal') / CAST / current_date - not supported yet)", rc));
            }
            // Slice 34: when the slice-33-admitted top-level Oracle / MSSQL
            // WITHIN-GROUP-only aggregate has no alias, fall back to the
            // parser's expression text. {@code effectiveOutputName} would
            // throw "neither alias nor column name" because
            // {@code function_t} returns "" from getColumnNameOnly().
            // Probe-verified that {@code rc.getExpr().toString()} byte-
            // matches dlineage's <select_list> column name attribute on
            // Oracle / MSSQL for this shape, so canonical SELECT-edge
            // outputName remains in parity with no projector change.
            // Gated tightly on slice33TopLevelWG (and, since slice 35, on
            // slice35TopLevelDirectWG) so unrelated unaliased
            // shapes (function calls / CASE / expressions outside the
            // admit set) keep failing loudly via
            // effectiveOutputName until each is probed and admitted
            // explicitly. See Slice34Test.
            String name;
            if (slice33TopLevelWG || slice35TopLevelDirectWG) {
                String alias = rc.getColumnAlias();
                name = (alias != null && !alias.isEmpty())
                        ? alias
                        : rc.getExpr().toString();
            } else {
                name = effectiveOutputName(rc);
            }
            out.add(new OutputColumn(name, derived, aggregate, sources, /*windowSpec=*/ null));
        }
        return out;
    }

    /**
     * Reject scalar subqueries embedded inside larger projection
     * expressions (slice 11). Catches:
     *
     * <ul>
     *   <li>{@code SELECT UPPER((SELECT MAX(salary) AS m FROM employees))
     *       AS x FROM ...} — scalar nested inside a function call.</li>
     *   <li>{@code SELECT EXISTS (SELECT 1 FROM employees) AS has_emp
     *       FROM ...} — EXISTS doesn't surface as
     *       {@link EExpressionType#subquery_t} but carries
     *       {@code getSubQuery() != null} (slice-9 round-3 lesson).</li>
     *   <li>Other in-expression subqueries that
     *       {@link #collectColumnRefs} would otherwise descend into.</li>
     * </ul>
     *
     * <p>Only top-level {@code subquery_t} projections are extracted as
     * separate statements (handled in
     * {@link #extractScalarSubqueriesAsStatements}); embedded subqueries
     * remain rejected because the IR doesn't yet model the "expression
     * over subquery result" shape.
     *
     * @param expr the projection expression to scan; a {@code null}
     *        expression is accepted and ignored
     * @param rc the owning result column, used in the diagnostic message
     *        and as the diagnostic's node
     * @throws SemanticIRBuildException with
     *         {@code RESULT_COLUMN_SCALAR_SUBQUERY_EMBEDDED} when a
     *         subquery is found
     */
    private static void rejectEmbeddedSubqueryInProjection(TExpression expr, TResultColumn rc) {
        if (expr == null) return;
        // Single-element array: lets the anonymous visitor record a hit in
        // a variable that must be final to be captured.
        final boolean[] found = {false};
        expr.acceptChildren(new TParseTreeVisitor() {
            @Override
            public void preVisit(TExpression e) {
                // Once a subquery has been seen, later visits are no-ops.
                if (found[0]) return;
                if (e.getExpressionType() == EExpressionType.subquery_t
                        || e.getSubQuery() != null) {
                    found[0] = true;
                }
            }
        });
        if (!found[0]) {
            // Top-level expression itself may carry a subquery (e.g. EXISTS
            // at the projection root, where rc.getExpr() is exists_t with
            // non-null getSubQuery() but is NOT subquery_t — so the
            // top-level subquery_t branch in buildOutputColumns didn't
            // extract it).
            if (expr.getExpressionType() != EExpressionType.subquery_t
                    && expr.getSubQuery() != null) {
                found[0] = true;
            }
        }
        if (found[0]) {
            throw new SemanticIRBuildException(
                    Diagnostic.error(DiagnosticCode.RESULT_COLUMN_SCALAR_SUBQUERY_EMBEDDED,
                            "result column " + rc + " contains a scalar subquery embedded "
                                    + "in a larger projection expression; not supported yet "
                                    + "(only top-level scalar subquery projections are extracted)", rc));
        }
    }

    /**
     * Detect whether an expression contains an aggregate function call
     * anywhere in its subtree. Slice 6 uses a name whitelist via
     * {@link #AGGREGATE_FUNCTION_NAMES}. Walking recursively means
     * {@code SUM(salary) + 1} and {@code COUNT(*) + 1} are both classified
     * as aggregate (and thus permitted with empty sources via
     * {@link #buildOutputColumns}). Wrapped in a helper so slice 7+ can
     * swap in deeper detection (e.g. vendor-specific function classification
     * on TFunctionCall) without touching call sites.
     *
     * <p>Note on aggregate literals like {@code COUNT(1)} or {@code SUM(1)}:
     * the visitor finds no column refs, so {@code sources=[]}. Slice 6
     * permits these as aggregates with no lineage edges; consumers must
     * read {@link OutputColumn#isAggregate()} to know the value is
     * row-collapsing without column lineage.
     */
    private static boolean isAggregateFunction(TExpression expr) {
        if (expr == null) return false;
        // Slice 13: short-circuit for top-level window function. The
        // upstream `rejectEmbeddedWindowFunction` has already rejected any
        // embedded window functions, but this short-circuit ensures
        // `AVG(salary) OVER (...)` is never classified as an aggregate
        // even if it somehow slips past the upstream guard.
15253 // 15254 // Slice 31: discriminate WITHIN-GROUP-only windowDef (Oracle / 15255 // MSSQL plain WITHIN GROUP attachment without OVER) so 15256 // `LISTAGG(x.id, ',') WITHIN GROUP (ORDER BY x.region)` stays 15257 // classified as an aggregate. Uses {@link #isWindowDefBearingFunction} 15258 // — only this check and {@link #containsWindowFunction} are 15259 // lifted; every other slice-13 invariant rejecter is unchanged. 15260 if (expr.getExpressionType() == EExpressionType.function_t) { 15261 TFunctionCall rootFn = expr.getFunctionCall(); 15262 // Slice 42: hypothetical-set ordered-set aggregate root 15263 // (Oracle / MSSQL {@code RANK(100) WITHIN GROUP (ORDER BY x)}) 15264 // — short-circuit aggregate=true. The shape predicate 15265 // {@link #isHypotheticalSetWithinGroupCall} requires WITHIN- 15266 // GROUP-only windowDef AND a name in 15267 // {@link #HYPOTHETICAL_SET_AGGREGATE_NAMES}, so PG direct 15268 // attachment ({@code fn.getWindowDef()==null}) and OVER- 15269 // bearing forms cannot fire it. 15270 if (isHypotheticalSetWithinGroupCall(rootFn)) { 15271 return true; 15272 } 15273 if (isWindowDefBearingFunction(rootFn)) { 15274 return false; 15275 } 15276 } 15277 final boolean[] found = {false}; 15278 expr.acceptChildren(new TParseTreeVisitor() { 15279 @Override 15280 public void preVisit(TFunctionCall fn) { 15281 if (found[0]) return; 15282 // Slice 13 codex round-2 SHOULD 3: skip windowed function 15283 // calls inside the visitor too, defensively. Upstream 15284 // rejection should already have fired, but this removes 15285 // overlap risk for `sum/count/avg`. 15286 // 15287 // Slice 31: same WITHIN-GROUP-only carve-out as the root 15288 // short-circuit above so an Oracle / MSSQL plain WITHIN 15289 // GROUP aggregate nested inside CASE/UPPER (slice-27 15290 // admit) is still picked up as aggregate. 
15291 // 15292 // Slice 42: hypothetical-set ordered-set aggregate carve- 15293 // out — descendants matching the shape predicate count 15294 // as aggregate (defense-in-depth; the slice-13 embedded- 15295 // window rejecter already fires on inner WG-bearing 15296 // calls, so this branch is mostly unreachable today). 15297 if (isHypotheticalSetWithinGroupCall(fn)) { 15298 found[0] = true; 15299 return; 15300 } 15301 if (isWindowDefBearingFunction(fn)) return; 15302 if (fn.getFunctionName() == null) return; 15303 String name = fn.getFunctionName().toString(); 15304 if (name == null || name.isEmpty()) return; 15305 if (AGGREGATE_FUNCTION_NAMES.contains(name.toLowerCase(Locale.ROOT))) { 15306 found[0] = true; 15307 } 15308 } 15309 }); 15310 // The root expression itself is not visited by acceptChildren — only 15311 // its children. If the root is the function call (the common case 15312 // for `SUM(salary)` with no enclosing arithmetic), check it too. 15313 if (!found[0] && expr.getExpressionType() == EExpressionType.function_t) { 15314 TFunctionCall fn = expr.getFunctionCall(); 15315 if (fn != null && fn.getFunctionName() != null) { 15316 String name = fn.getFunctionName().toString(); 15317 if (name != null && !name.isEmpty() 15318 && AGGREGATE_FUNCTION_NAMES.contains(name.toLowerCase(Locale.ROOT))) { 15319 found[0] = true; 15320 } 15321 } 15322 } 15323 return found[0]; 15324 } 15325 15326 /** 15327 * Reject window-function projections like {@code AVG(salary) OVER (...)}. 15328 * In the GSP AST these still parse as {@code function_t} with a 15329 * non-null {@code TFunctionCall.getWindowDef()}, but their semantics 15330 * are row-preserving (analytic), not row-collapsing (aggregate). Slice 15331 * 6 owns plain GROUP BY aggregation only; window functions deserve 15332 * their own slice. 
15333 */ 15334 private static void rejectWindowFunctions(TExpression expr, TResultColumn rc) { 15335 if (expr == null) return; 15336 final boolean[] found = {false}; 15337 expr.acceptChildren(new TParseTreeVisitor() { 15338 @Override 15339 public void preVisit(TFunctionCall fn) { 15340 if (found[0]) return; 15341 if (fn.getWindowDef() != null) found[0] = true; 15342 } 15343 }); 15344 if (!found[0] && expr.getExpressionType() == EExpressionType.function_t) { 15345 TFunctionCall fn = expr.getFunctionCall(); 15346 if (fn != null && fn.getWindowDef() != null) found[0] = true; 15347 } 15348 if (found[0]) { 15349 throw new SemanticIRBuildException( 15350 Diagnostic.error(DiagnosticCode.WINDOW_FUNCTION_USED_NOT_SUPPORTED, 15351 "result column " + rc + " uses a window function (OVER (...)); not supported yet", rc)); 15352 } 15353 } 15354 15355 /** 15356 * Slice 13: detect whether the projection root is a top-level 15357 * window-function call. Returns {@code true} iff 15358 * {@code expr.getExpressionType() == function_t} AND the function 15359 * call carries a non-null {@code TWindowDef}. 
The result drives 15360 * three things in {@link #buildOutputColumns}: 15361 * 15362 * <ul> 15363 * <li>The {@code skipTopLevel} arg to 15364 * {@link #rejectEmbeddedWindowFunction} so the legitimate 15365 * top-level window call is identity-skipped during embedded 15366 * detection.</li> 15367 * <li>The fast-path dispatch into 15368 * {@link #buildWindowOutputColumn} when window projections 15369 * are allowed.</li> 15370 * <li>The scalar-body / future-context rejection when window 15371 * projections are forbidden in the surrounding context 15372 * (slice-13 {@code allowWindowProjection=false}).</li> 15373 * </ul> 15374 */ 15375 private static boolean isTopLevelWindowProjection(TExpression expr) { 15376 if (expr == null) return false; 15377 if (expr.getExpressionType() != EExpressionType.function_t) return false; 15378 TFunctionCall fn = expr.getFunctionCall(); 15379 return fn != null && fn.getWindowDef() != null; 15380 } 15381 15382 /** 15383 * Slice 13: reject window functions embedded inside a larger 15384 * projection expression (mirrors slice 11's 15385 * {@link #rejectEmbeddedSubqueryInProjection}). The {@code skipTopLevel} 15386 * flag is set when the caller has identified 15387 * {@code expr} as a legitimate top-level window-function projection 15388 * — without identity-skipping that exact {@code TFunctionCall} the 15389 * visitor would reject every valid top-level window. 15390 * 15391 * <p>Visitor-only (no post-visitor fallback): unlike 15392 * {@code subquery_t} which can be wrapped in expression types that 15393 * do not surface as {@code subquery_t}, a window function is always 15394 * reachable through {@code TExpression.acceptChildren} → 15395 * {@code TFunctionCall.preVisit}. Codex round-3 MUST 1. 
15396 */ 15397 private static void rejectEmbeddedWindowFunction(TExpression expr, 15398 TResultColumn rc, 15399 boolean skipTopLevel) { 15400 if (expr == null) return; 15401 final TFunctionCall topLevelFn = 15402 skipTopLevel 15403 && expr.getExpressionType() == EExpressionType.function_t 15404 ? expr.getFunctionCall() 15405 : null; 15406 final boolean[] found = {false}; 15407 expr.acceptChildren(new TParseTreeVisitor() { 15408 @Override 15409 public void preVisit(TFunctionCall fn) { 15410 if (found[0]) return; 15411 if (fn == topLevelFn) return; // identity-skip 15412 if (fn.getWindowDef() != null) found[0] = true; 15413 } 15414 }); 15415 if (found[0]) { 15416 throw new SemanticIRBuildException( 15417 Diagnostic.error(DiagnosticCode.WINDOW_FUNCTION_EMBEDDED_NOT_SUPPORTED, 15418 "result column " + rc + " contains a window function embedded " 15419 + "in a larger projection expression; not supported yet " 15420 + "(only top-level window-function projections are supported)", rc)); 15421 } 15422 } 15423 15424 /** 15425 * Lower-cased function names accepted as window functions in slice 13. 15426 * Includes every name in {@link #AGGREGATE_FUNCTION_NAMES} (aggregates 15427 * can be windowed: {@code SUM(...) OVER (...)}, {@code AVG(...) OVER (...)}, 15428 * etc.) plus the analytic-only names. New analytic functions must be 15429 * added here explicitly to avoid silent acceptance of an unfamiliar 15430 * window function whose semantics the slice does not yet model. 15431 * 15432 * <p>Slice 30 exception: {@code mode} is added to 15433 * {@link #AGGREGATE_FUNCTION_NAMES} for the WITHIN GROUP path but 15434 * REMOVED from this allowlist via {@code s.remove("mode")} below — 15435 * {@code mode()} has no documented window form in any GSP-supported 15436 * vendor and the explicit removal keeps {@code mode() OVER (...)} 15437 * (which the PostgreSQL parser accepts) rejected by 15438 * {@code buildWindowOutputColumn}. 
15439 */ 15440 private static final Set<String> WINDOW_FUNCTION_NAMES; 15441 static { 15442 Set<String> s = new HashSet<>(); 15443 // Aggregate names that can be windowed. 15444 s.addAll(AGGREGATE_FUNCTION_NAMES); 15445 // Slice 30: mode is an ordered-set-only aggregate; remove from the 15446 // window allowlist (it was added to AGGREGATE_FUNCTION_NAMES for the 15447 // WITHIN GROUP path but never appears as a real window function in 15448 // any GSP-supported vendor — see Slice30Test.pgModeOverStillRejected 15449 // AtOuterProjection for the lock-in). 15450 s.remove("mode"); 15451 // Analytic-only window functions. 15452 s.add("row_number"); 15453 s.add("rank"); 15454 s.add("dense_rank"); 15455 s.add("lag"); 15456 s.add("lead"); 15457 s.add("ntile"); 15458 s.add("first_value"); 15459 s.add("last_value"); 15460 s.add("percent_rank"); 15461 s.add("cume_dist"); 15462 s.add("nth_value"); 15463 WINDOW_FUNCTION_NAMES = Collections.unmodifiableSet(s); 15464 } 15465 15466 /** 15467 * Slice 13: build an {@link OutputColumn} for a top-level 15468 * window-function projection. Caller must have already verified 15469 * {@link #isTopLevelWindowProjection(TExpression)}, run the 15470 * {@link #rejectEmbeddedSubqueryInProjection} and 15471 * {@link #rejectEmbeddedWindowFunction} guards, and confirmed the 15472 * surrounding body permits window projections (i.e., the 15473 * {@code !allowWindowProjection} fast-path in 15474 * {@link #buildOutputColumns} did not fire). 
15475 * 15476 * <p>The constructed {@link OutputColumn} carries: 15477 * <ul> 15478 * <li>{@code derived = true} (window functions are computed)</li> 15479 * <li>{@code aggregate = false} (window functions are 15480 * row-preserving — see slice-13 §14)</li> 15481 * <li>{@code sources} = column refs from the function args only 15482 * (PARTITION BY / OVER ORDER BY refs are excluded so that 15483 * canonical SELECT lineage matches dlineage's 15484 * function-arg-only SELECT BFS)</li> 15485 * <li>{@code windowSpec = WindowSpec(partitionRefs, orderRefs, frame)} 15486 * (slice 22 — frame may be null when the SQL has no 15487 * {@code ROWS}/{@code RANGE}/{@code GROUPS BETWEEN ...} clause)</li> 15488 * </ul> 15489 */ 15490 private static OutputColumn buildWindowOutputColumn(TResultColumn rc, 15491 TSelectSqlStatement enclosingSelect, 15492 NameBindingProvider provider) { 15493 TFunctionCall fn = rc.getExpr().getFunctionCall(); 15494 TWindowDef wd = fn.getWindowDef(); 15495 15496 // 1. Function-name allowlist (codex round-1 MUST 3). 15497 String fnName = fn.getFunctionName() == null ? null : fn.getFunctionName().toString(); 15498 if (fnName == null || !WINDOW_FUNCTION_NAMES.contains(fnName.toLowerCase(Locale.ROOT))) { 15499 throw new SemanticIRBuildException( 15500 Diagnostic.error(DiagnosticCode.WINDOW_FUNCTION_UNSUPPORTED, 15501 "result column " + rc + " uses unsupported window function '" 15502 + fnName + "'; supported names are " + WINDOW_FUNCTION_NAMES, rc)); 15503 } 15504 15505 // 2. Reject vendor-specific function-level surfaces (codex round-1 MUST 4). 15506 if (fn.getFilterClause() != null) { 15507 throw new SemanticIRBuildException( 15508 Diagnostic.error(DiagnosticCode.WINDOW_FILTER_NOT_SUPPORTED, 15509 "result column " + rc + " uses FILTER (WHERE ...) 
on a " 15510 + "window function; not supported yet", rc)); 15511 } 15512 if (fn.getWithinGroup() != null) { 15513 throw new SemanticIRBuildException( 15514 Diagnostic.error(DiagnosticCode.WINDOW_WITHIN_GROUP_NOT_SUPPORTED, 15515 "result column " + rc + " uses WITHIN GROUP on a " 15516 + "window function; not supported yet", rc)); 15517 } 15518 if (fn.getOrderByList() != null && fn.getOrderByList().size() > 0) { 15519 throw new SemanticIRBuildException( 15520 Diagnostic.error(DiagnosticCode.WINDOW_FUNCTION_LEVEL_ORDER_BY_NOT_SUPPORTED, 15521 "result column " + rc + " uses function-level ORDER BY " 15522 + "(LISTAGG-style); not supported yet", rc)); 15523 } 15524 if (fn.getSortClause() != null) { 15525 throw new SemanticIRBuildException( 15526 Diagnostic.error(DiagnosticCode.WINDOW_FUNCTION_LEVEL_SORT_NOT_SUPPORTED, 15527 "result column " + rc + " uses function-level SORT clause; " 15528 + "not supported yet", rc)); 15529 } 15530 15531 // 3. Reject vendor-specific window-def surfaces (codex round-1 MUSTs 5, 7). 
15532 if (wd.getName() != null) { 15533 throw new SemanticIRBuildException( 15534 Diagnostic.error(DiagnosticCode.WINDOW_NAMED_WINDOW_DECLARATION_NOT_SUPPORTED, 15535 "result column " + rc + " declares a named window " 15536 + "(WINDOW name AS); not supported yet", rc)); 15537 } 15538 if (wd.getReferenceName() != null) { 15539 throw new SemanticIRBuildException( 15540 Diagnostic.error(DiagnosticCode.WINDOW_NAMED_WINDOW_REFERENCE_NOT_SUPPORTED, 15541 "result column " + rc + " references a named window via " 15542 + "OVER name; not supported yet", rc)); 15543 } 15544 if (wd.getWithinGroup() != null) { 15545 throw new SemanticIRBuildException( 15546 Diagnostic.error(DiagnosticCode.WINDOW_WITHIN_GROUP_INSIDE_PROJECTION_NOT_SUPPORTED, 15547 "result column " + rc + " uses WITHIN GROUP inside the " 15548 + "OVER clause; not supported yet", rc)); 15549 } 15550 if (wd.getKeepDenseRankClause() != null) { 15551 throw new SemanticIRBuildException( 15552 Diagnostic.error(DiagnosticCode.WINDOW_KEEP_DENSE_RANK_NOT_SUPPORTED, 15553 "result column " + rc + " uses KEEP DENSE_RANK FIRST/LAST; " 15554 + "not supported yet", rc)); 15555 } 15556 if (wd.getDistributeBy() != null) { 15557 throw new SemanticIRBuildException( 15558 Diagnostic.error(DiagnosticCode.WINDOW_DISTRIBUTE_BY_NOT_SUPPORTED, 15559 "result column " + rc + " uses Hive DISTRIBUTE BY in window; " 15560 + "not supported yet", rc)); 15561 } 15562 if (wd.getClusterBy() != null) { 15563 throw new SemanticIRBuildException( 15564 Diagnostic.error(DiagnosticCode.WINDOW_CLUSTER_BY_NOT_SUPPORTED, 15565 "result column " + rc + " uses Hive CLUSTER BY in window; " 15566 + "not supported yet", rc)); 15567 } 15568 if (wd.getSortBy() != null) { 15569 throw new SemanticIRBuildException( 15570 Diagnostic.error(DiagnosticCode.WINDOW_SORT_BY_NOT_SUPPORTED, 15571 "result column " + rc + " uses Hive SORT BY in window; " 15572 + "not supported yet", rc)); 15573 } 15574 // Slice 22: frame clauses are now built into WindowSpec.frame; the 15575 
// slice-13 wholesale rejection is gone. Frame build happens AFTER 15576 // empty-OVER reject below so a frame-only OVER (...) fails on 15577 // empty-OVER first (the more user-tuned error message). 15578 15579 // 4. Reject empty OVER () (slice-13 boundary; dlineage parity — 15580 // empty OVER () is byte-identical to a plain aggregate in the XML). 15581 TPartitionClause pc = wd.getPartitionClause(); 15582 TOrderBy ob = wd.getOrderBy(); 15583 boolean hasPartitionBy = pc != null 15584 && pc.getExpressionList() != null 15585 && pc.getExpressionList().size() > 0; 15586 boolean hasOverOrderBy = ob != null 15587 && ob.getItems() != null 15588 && ob.getItems().size() > 0; 15589 if (!hasPartitionBy && !hasOverOrderBy) { 15590 throw new SemanticIRBuildException( 15591 Diagnostic.error(DiagnosticCode.WINDOW_EMPTY_OVER_NOT_SUPPORTED, 15592 "result column " + rc + " uses empty OVER (); not supported yet " 15593 + "(dlineage XML cannot discriminate from a plain aggregate)", rc)); 15594 } 15595 15596 // 5. Reject Hive PARTITION BY ... SORT (...). 15597 if (pc != null && pc.getSortedColumns() != null && pc.getSortedColumns().size() > 0) { 15598 throw new SemanticIRBuildException( 15599 Diagnostic.error(DiagnosticCode.WINDOW_PARTITION_BY_SORT_NOT_SUPPORTED, 15600 "result column " + rc + " uses Hive PARTITION BY ... SORT (...); " 15601 + "not supported yet", rc)); 15602 } 15603 15604 // 6. Build PARTITION BY refs. 15605 List<ColumnRef> partitionRefs = hasPartitionBy 15606 ? buildWindowPartitionRefs(pc, rc, enclosingSelect, provider) 15607 : new ArrayList<ColumnRef>(); 15608 15609 // 7. Build OVER ORDER BY refs. 15610 List<ColumnRef> orderRefs = hasOverOrderBy 15611 ? buildWindowOrderRefs(ob, rc, enclosingSelect, provider) 15612 : new ArrayList<ColumnRef>(); 15613 15614 // 8. Build frame (slice 22). Null when the SQL has no ROWS/RANGE/ 15615 // GROUPS clause inside OVER (...). 15616 WindowFrame frame = wd.getWindowFrame() == null 15617 ? 
null 15618 : buildWindowFrame(wd.getWindowFrame(), rc); 15619 15620 // 9. Build sources from args only — PARTITION BY / OVER ORDER BY 15621 // refs must NOT leak into OutputColumn.sources because canonical 15622 // SELECT lineage on the dlineage side only walks fdd edges 15623 // (function args), not fdr edges (PARTITION BY / OVER ORDER BY). 15624 List<ColumnRef> sources = (fn.getArgs() == null || fn.getArgs().size() == 0) 15625 ? new ArrayList<ColumnRef>() 15626 : collectColumnRefs(fn.getArgs(), provider); 15627 15628 // 10. Construct OutputColumn. aggregate=false ALWAYS for window 15629 // functions (row-preserving). The OutputColumn ctor enforces 15630 // the windowSpec!=null AND aggregate=false invariant. 15631 String name = effectiveOutputName(rc); 15632 return new OutputColumn(name, /*derived=*/ true, /*aggregate=*/ false, 15633 sources, new WindowSpec(partitionRefs, orderRefs, frame)); 15634 } 15635 15636 /** 15637 * Slice 22: build a {@link WindowFrame} from a parser 15638 * {@link TWindowFrame}. Frame information is presentation-only 15639 * (dlineage XML harvests no frame data — see 15640 * {@code DataFlowAnalyzer.java:20558-20575}); this helper captures 15641 * the surface shape into the IR for governance consumers without 15642 * touching the canonical lineage model. 15643 * 15644 * <p>Direct field access via {@link TWindowFrame#getStartBoundary()} / 15645 * {@link TWindowFrame#getEndBoundary()}; visitors are NOT used because 15646 * {@code TWindowFrame.acceptChildren()} doesn't recurse into the 15647 * boundaries (codex round-1 SHOULD 3). 
15648 * 15649 * <p>Order of guards (codex round-2 SHOULD 2): EXCLUDE first so the 15650 * error message is tuned to the actual surface; then null-guard the 15651 * boundary type (defensive — current parsers always pass it); then 15652 * map the {@code EBoundaryType} via an exhaustive switch 15653 * (slice-14 process lesson #17 — no catch-all); then check the 15654 * {@code boundaryNumber} expression type and reject non-constant 15655 * offsets (codex round-1 SHOULD 1 — PG {@code simple_object_name_t} 15656 * and ANSI {@code parenthesis_t} are reachable). 15657 * 15658 * <p>Null guards on the frame's {@link ELimitRowType} and 15659 * {@code startBoundary} fields are defensive / forward-compat: every 15660 * vendor grammar surveyed (codex round-4 NOTE 1) passes these 15661 * arguments together when constructing a {@code TWindowFrame}, so the 15662 * guards are unexercised by current parsers but protect against 15663 * future parser drift. 15664 */ 15665 private static WindowFrame buildWindowFrame(TWindowFrame wf, TResultColumn rc) { 15666 // Defensive null guards (codex round-2 MUST 2; codex round-4 15667 // SHOULD 1 — labelled DEFENSIVE / FORWARD-COMPAT). 
15668 if (wf.getLimitRowType() == null) { 15669 throw new SemanticIRBuildException( 15670 Diagnostic.error(DiagnosticCode.WINDOW_FRAME_NULL_LIMIT_ROW_TYPE, 15671 "result column " + rc + " has a frame with null limitRowType " 15672 + "(forward-compat / unexpected parser shape); not supported", rc)); 15673 } 15674 if (wf.getStartBoundary() == null) { 15675 throw new SemanticIRBuildException( 15676 Diagnostic.error(DiagnosticCode.WINDOW_FRAME_NULL_START_BOUNDARY, 15677 "result column " + rc + " has a frame with null start boundary " 15678 + "(forward-compat / unexpected parser shape); not supported", rc)); 15679 } 15680 WindowFrame.Unit unit = mapFrameUnit(wf.getLimitRowType()); 15681 FrameBound start = buildFrameBound(wf.getStartBoundary(), rc, /*end=*/ false); 15682 FrameBound end = wf.getEndBoundary() == null 15683 ? null 15684 : buildFrameBound(wf.getEndBoundary(), rc, /*end=*/ true); 15685 return new WindowFrame(unit, start, end); 15686 } 15687 15688 /** 15689 * Slice 22: map the parser's {@link ELimitRowType} to the IR's 15690 * {@link WindowFrame.Unit}. Exhaustive switch (slice-14 process 15691 * lesson #17 — no catch-all); a future enum addition fails closed. 15692 */ 15693 private static WindowFrame.Unit mapFrameUnit(ELimitRowType type) { 15694 switch (type) { 15695 case Rows: 15696 return WindowFrame.Unit.ROWS; 15697 case Range: 15698 return WindowFrame.Unit.RANGE; 15699 case Groups: 15700 return WindowFrame.Unit.GROUPS; 15701 default: 15702 throw new SemanticIRBuildException( 15703 Diagnostic.error(DiagnosticCode.WINDOW_FRAME_UNSUPPORTED_LIMIT_ROW_TYPE, 15704 "unsupported window frame limitRowType: " + type, null)); 15705 } 15706 } 15707 15708 /** 15709 * Slice 22: build a {@link FrameBound} from a parser 15710 * {@link TWindowFrameBoundary}. The {@code end} parameter is for 15711 * error messages only (start vs end disambiguation). 
15712 * 15713 * <p>Per-bound check order: EXCLUDE → boundaryType-null → kind switch 15714 * → boundaryNumber shape (codex round-2 SHOULD 2 + slice-22 invariant). 15715 */ 15716 private static FrameBound buildFrameBound(TWindowFrameBoundary boundary, 15717 TResultColumn rc, 15718 boolean end) { 15719 String which = end ? "end" : "start"; 15720 15721 // (a) EXCLUDE first (codex round-1 MUST 2 + Netezza probe). 15722 // Netezza populates getExclusionClause() on the END boundary for 15723 // EXCLUDE CURRENT ROW / GROUP / TIES / NO OTHERS; rejecting here 15724 // surfaces the unsupported clause with a tuned message rather 15725 // than letting the offset-shape check fire on an unrelated 15726 // surface. 15727 if (boundary.getExclusionClause() != null) { 15728 throw new SemanticIRBuildException( 15729 Diagnostic.error(DiagnosticCode.WINDOW_FRAME_EXCLUDE_NOT_SUPPORTED, 15730 "result column " + rc + " has a frame " + which 15731 + " boundary with EXCLUDE clause " 15732 + "(EXCLUDE CURRENT ROW / GROUP / TIES / NO OTHERS); " 15733 + "not supported yet", rc)); 15734 } 15735 15736 // (b) Null-guard the boundary type (defensive). 15737 if (boundary.getBoundaryType() == null) { 15738 throw new SemanticIRBuildException( 15739 Diagnostic.error(DiagnosticCode.WINDOW_FRAME_NULL_BOUNDARY_TYPE, 15740 "result column " + rc + " has a frame " + which 15741 + " boundary with null boundaryType " 15742 + "(forward-compat / unexpected parser shape); not supported", rc)); 15743 } 15744 15745 // (c) Map the kind via exhaustive switch. 15746 FrameBound.Kind kind = mapBoundaryKind(boundary.getBoundaryType()); 15747 15748 // (d) Capture the optional offset literal. Reject non-constant 15749 // offsets (codex round-1 SHOULD 1 + slice-22 PG/ANSI probe — PG 15750 // accepts simple_object_name_t (column ROWS BETWEEN x PRECEDING ...), 15751 // ANSI accepts parenthesis_t ((x+1))). 
15752 String offsetLiteral = null; 15753 TExpression offsetExpr = boundary.getBoundaryNumber(); 15754 if (offsetExpr != null) { 15755 // Slice-22 codex impl-review SHOULD 1: when the kind forbids 15756 // an offset (UNBOUNDED_*/CURRENT_ROW), reject with 15757 // SemanticIRBuildException so the failure stays inside the 15758 // builder's error contract — without this guard, a parser 15759 // surfacing a stray boundary number on CURRENT_ROW would 15760 // escape as IllegalArgumentException from 15761 // FrameBound's ctor. 15762 boolean offsetAllowed = (kind == FrameBound.Kind.PRECEDING 15763 || kind == FrameBound.Kind.FOLLOWING); 15764 if (!offsetAllowed) { 15765 throw new SemanticIRBuildException( 15766 Diagnostic.error(DiagnosticCode.WINDOW_FRAME_UNEXPECTED_OFFSET, 15767 "result column " + rc + " has a frame " + which 15768 + " boundary of kind " + kind 15769 + " carrying an unexpected offset '" 15770 + offsetExpr + "' (forward-compat / " 15771 + "unexpected parser shape); not supported", rc)); 15772 } 15773 EExpressionType offsetType = offsetExpr.getExpressionType(); 15774 if (offsetType != EExpressionType.simple_constant_t) { 15775 throw new SemanticIRBuildException( 15776 Diagnostic.error(DiagnosticCode.WINDOW_FRAME_OFFSET_NON_CONSTANT, 15777 "result column " + rc + " has a frame " + which 15778 + " offset that is not a simple constant " 15779 + "(got " + offsetType + " '" + offsetExpr + "'); " 15780 + "not supported yet", rc)); 15781 } 15782 offsetLiteral = offsetExpr.toString(); 15783 } 15784 return new FrameBound(kind, offsetLiteral); 15785 } 15786 15787 /** 15788 * Slice 22: map the parser's {@link EBoundaryType} to the IR's 15789 * {@link FrameBound.Kind}. Exhaustive switch (slice-14 process 15790 * lesson #17). 
15791 */ 15792 private static FrameBound.Kind mapBoundaryKind(EBoundaryType type) { 15793 switch (type) { 15794 case ebtUnboundedPreceding: 15795 return FrameBound.Kind.UNBOUNDED_PRECEDING; 15796 case ebtUnboundedFollowing: 15797 return FrameBound.Kind.UNBOUNDED_FOLLOWING; 15798 case ebtCurrentRow: 15799 return FrameBound.Kind.CURRENT_ROW; 15800 case ebtPreceding: 15801 return FrameBound.Kind.PRECEDING; 15802 case ebtFollowing: 15803 return FrameBound.Kind.FOLLOWING; 15804 default: 15805 throw new SemanticIRBuildException( 15806 Diagnostic.error(DiagnosticCode.WINDOW_FRAME_UNSUPPORTED_BOUNDARY_TYPE, 15807 "unsupported frame boundary type: " + type, null)); 15808 } 15809 } 15810 15811 /** 15812 * Slice 13: build the PARTITION BY ref list. Every item must be a 15813 * physical column reference ({@code simple_object_name_t} resolving 15814 * via the provider to {@code EXACT_MATCH}). Other shapes are 15815 * rejected with a tuned message — slice-9 / slice-13 15816 * rejection-over-silent-loss. 
     */
    private static List<ColumnRef> buildWindowPartitionRefs(TPartitionClause pc,
                                                             TResultColumn rc,
                                                             TSelectSqlStatement enclosingSelect,
                                                             NameBindingProvider provider) {
        // LinkedHashSet: de-duplicates refs while keeping first-seen order.
        LinkedHashSet<ColumnRef> refs = new LinkedHashSet<>();
        TExpressionList list = pc.getExpressionList();
        // The checks below run in a fixed order; for an item that violates
        // several rules, the first matching check decides the diagnostic.
        for (int i = 0; i < list.size(); i++) {
            TExpression item = list.getExpression(i);
            EExpressionType t = item.getExpressionType();
            if (t == EExpressionType.simple_constant_t) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_PARTITION_BY_LITERAL,
                                "result column " + rc + " has PARTITION BY literal '"
                                        + item + "'; not supported yet", rc));
            }
            if (t == EExpressionType.subquery_t || item.getSubQuery() != null) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_PARTITION_BY_SUBQUERY,
                                "result column " + rc + " has PARTITION BY containing "
                                        + "a subquery; not supported yet", rc));
            }
            // Any function call (not only aggregates) is reported under the
            // WINDOW_PARTITION_BY_AGGREGATE code.
            if (t == EExpressionType.function_t) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_PARTITION_BY_AGGREGATE,
                                "result column " + rc + " has PARTITION BY containing "
                                        + "a function call '" + item + "'; not supported yet", rc));
            }
            // Catch-all: from here on only bare object names are admitted.
            if (t != EExpressionType.simple_object_name_t) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_PARTITION_BY_UNKNOWN_REFERENCE,
                                "result column " + rc + " has PARTITION BY using an "
                                        + "unsupported expression shape (" + t + "): " + item, rc));
            }
            // Defensive: reject if the parser/resolver has retyped this
            // item as a projection alias. Current Oracle parsers leave
            // PARTITION BY <alias> as dbType=column even for projection
            // aliases; this guard fires for vendors that may behave
            // differently. The slice-19 discriminator below catches the
            // Oracle case where dbType stays "column" but the binding
            // came from the schema-less inferred-from-usage fallback.
            TObjectName on = item.getObjectOperand();
            if (on != null && on.getDbObjectType() == EDbObjectType.column_alias) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_PARTITION_BY_PROJECTION_ALIAS,
                                "result column " + rc + " has PARTITION BY referencing "
                                        + "a projection alias '" + item + "'; not supported yet", rc));
            }
            // Slice 19: alias-bound discriminator. Reject when the
            // resolver's binding lacks definite FROM-scope evidence and
            // the name matches a calculated SELECT-list alias of the
            // enclosing SELECT. Without schema metadata the resolver
            // cannot tell alias from real column; rejection-over-silent-
            // guess matches the slice-9/-10/-13 invariant.
            if (on != null && provider.isCalculatedProjectionAliasFallback(on, enclosingSelect)) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_PARTITION_BY_CALCULATED_ALIAS,
                                "result column " + rc + " has PARTITION BY referencing a "
                                        + "SELECT-list alias on a calculated expression ('" + item
                                        + "'); not supported yet — requires schema metadata to "
                                        + "discriminate alias from base column", rc));
            }
            // Resolve the column ref through the provider. EXACT_MATCH is
            // required (slice-1 fail-fast invariant); collectColumnRefs
            // does the heavy lifting and rejects anything else.
            List<ColumnRef> built = collectColumnRefs(item, provider);
            if (built.isEmpty()) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_PARTITION_BY_ITEM_UNUSABLE,
                                "result column " + rc + " has PARTITION BY item '"
                                        + item + "' with no resolvable column refs", rc));
            }
            refs.addAll(built);
        }
        // Hand back an independent list copy of the de-duplicated refs.
        return new ArrayList<>(refs);
    }

    /**
     * Slice 13: build the OVER ORDER BY ref list. Every sort key must
     * be a physical column reference (mirrors slice-9 outer ORDER BY
     * rejection set). Ordinals, projection aliases, expressions,
     * subqueries, window functions, and SIBLINGS / RESET WHEN are
     * rejected with tuned messages.
     *
     * @param ob              the ORDER BY node of the window's OVER clause
     * @param rc              the result column that owns the window function;
     *                        rendered into every message and used as the
     *                        diagnostic anchor
     * @param enclosingSelect the SELECT handed to
     *                        {@code provider.isCalculatedProjectionAliasFallback}
     * @param provider        name-binding provider used for the alias
     *                        discriminator and for column-ref collection
     * @return the distinct column refs of all sort keys, in first-seen
     *         order, as a new list
     * @throws SemanticIRBuildException on the first sort key (or clause
     *         modifier) that is not admitted
     */
    private static List<ColumnRef> buildWindowOrderRefs(TOrderBy ob,
                                                         TResultColumn rc,
                                                         TSelectSqlStatement enclosingSelect,
                                                         NameBindingProvider provider) {
        // Slice-13 codex impl-review MUST 2: defense in depth, mirror outer
        // ORDER BY's slice-9 SIBLINGS / RESET WHEN guards.
        if (ob.isSiblings()) {
            throw new SemanticIRBuildException(
                    Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_SIBLINGS_NOT_SUPPORTED,
                            "result column " + rc + " has OVER ORDER BY SIBLINGS; not supported yet "
                                    + "(Oracle hierarchical-query syntax in window OVER clause)", rc));
        }
        if (ob.getResetWhenCondition() != null) {
            throw new SemanticIRBuildException(
                    Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_RESET_WHEN_NOT_SUPPORTED,
                            "result column " + rc + " has OVER ORDER BY ... RESET WHEN; not supported yet "
                                    + "(Teradata window-style restart)", rc));
        }
        // LinkedHashSet: de-duplicates refs while keeping first-seen order.
        LinkedHashSet<ColumnRef> refs = new LinkedHashSet<>();
        TOrderByItemList items = ob.getItems();
        // Per-item checks run in a fixed order; the first matching check
        // decides the diagnostic.
        for (int i = 0; i < items.size(); i++) {
            TOrderByItem item = items.getOrderByItem(i);
            TExpression key = item.getSortKey();
            if (key == null) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_NULL_SORT_KEY,
                                "result column " + rc + " has OVER ORDER BY item with "
                                        + "null sort key", rc));
            }
            EExpressionType t = key.getExpressionType();
            if (t == EExpressionType.simple_constant_t) {
                // Catches both ordinal sort keys and string-literal sort keys.
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_LITERAL,
                                "result column " + rc + " has OVER ORDER BY literal/ordinal '"
                                        + key + "'; not supported yet", rc));
            }
            if (t == EExpressionType.subquery_t || key.getSubQuery() != null) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_SUBQUERY,
                                "result column " + rc + " has OVER ORDER BY containing a "
                                        + "subquery; not supported yet", rc));
            }
            if (t == EExpressionType.function_t) {
                // A function carrying its own window definition gets the more
                // specific WINDOW_FUNCTION code; every other function call
                // falls through to the AGGREGATE code.
                TFunctionCall innerFn = key.getFunctionCall();
                if (innerFn != null && innerFn.getWindowDef() != null) {
                    throw new SemanticIRBuildException(
                            Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_WINDOW_FUNCTION,
                                    "result column " + rc + " has OVER ORDER BY containing a "
                                            + "window function; not supported yet", rc));
                }
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_AGGREGATE,
                                "result column " + rc + " has OVER ORDER BY containing a "
                                        + "function call '" + key + "'; not supported yet", rc));
            }
            // Catch-all: from here on only bare object names are admitted.
            if (t != EExpressionType.simple_object_name_t) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_UNKNOWN_REFERENCE,
                                "result column " + rc + " has OVER ORDER BY using an "
                                        + "unsupported expression shape (" + t + "): " + key, rc));
            }
            // NOTE: Oracle's parser DOES retype OVER ORDER BY refs to
            // column_alias when they match a SELECT alias (mirrors
            // slice-9 outer ORDER BY behaviour). The defensive
            // column_alias guard from PARTITION BY is intentionally
            // omitted here — `collectColumnRefs` already skips
            // column_alias-typed nodes, and the empty-refs guard below
            // catches the resulting unresolvable item with a clear
            // message. Outer ORDER BY aliases use the same path.
            //
            // Slice 19: defensive symmetry with PARTITION BY. A future
            // vendor whose parser does NOT retype OVER ORDER BY refs to
            // column_alias would land here as `simple_object_name_t`
            // with an inferred-from-usage resolution; the discriminator
            // catches that case before collectColumnRefs descends. As of
            // slice 19, every supported vendor retypes (probe in
            // §14.21), so this branch is unreachable in current tests
            // — kept for forward-compat.
            TObjectName on = key.getObjectOperand();
            if (on != null && provider.isCalculatedProjectionAliasFallback(on, enclosingSelect)) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_CALCULATED_ALIAS,
                                "result column " + rc + " has OVER ORDER BY referencing a "
                                        + "SELECT-list alias on a calculated expression ('" + key
                                        + "'); not supported yet — requires schema metadata to "
                                        + "discriminate alias from base column", rc));
            }
            List<ColumnRef> built = collectColumnRefs(key, provider);
            if (built.isEmpty()) {
                throw new SemanticIRBuildException(
                        Diagnostic.error(DiagnosticCode.WINDOW_OVER_ORDER_BY_ITEM_UNUSABLE,
                                "result column " + rc + " has OVER ORDER BY item '"
                                        + key + "' with no resolvable column refs", rc));
            }
            refs.addAll(built);
        }
        // Hand back an independent list copy of the de-duplicated refs.
        return new ArrayList<>(refs);
    }

    /*
     * NOTE(review): the text below is not attached to any declaration at
     * this position — it is immediately followed by another Javadoc block,
     * so the Javadoc tool ignores it. It presumably documents
     * rejectWindowFunctionInScope; confirm and relocate next to that method.
     *
     * Slice 13: reject any window function ({@code FUNC(...) OVER (...)})
     * appearing in a {@link TParseTreeNode} subtree. Used by the
     * WHERE / GROUP BY / JOIN ON guards before the visitor would
     * otherwise descend into the OVER clause and leak PARTITION BY /
     * OVER ORDER BY refs into the wrong column-ref bucket. Mirrors
     * {@link #rejectHavingWindowFunction} (slice 10) and
     * {@link #rejectOrderByWindowFunction} (slice 9).
     */
    /**
     * Slice 85 — admit RETURNING (PG / Oracle) and OUTPUT (SQL Server)
     * projections on INSERT / UPDATE / DELETE statements.
Returns the 16013 * list of {@link OutputColumn}s for the {@code returningColumns} 16014 * slot on the DML's {@link StatementGraph}, and appends one 16015 * {@link LineageEdge} per source column ref to {@code lineage}: 16016 * <pre> 16017 * from = LineageRef.statementOutput(dmlIdx, returningColumns[i].name) 16018 * to = LineageRef.tableColumn(targetQName, sourceColumnName) 16019 * </pre> 16020 * (consumer ← producer direction; mirrors slice-78 INSERT's 16021 * {@code target ← source} convention but with the DML's own output 16022 * as the consumer and the target table's column as the producer.) 16023 * 16024 * <p>At most one of {@code ret} and {@code out} is non-null. When 16025 * both are null (no RETURNING / OUTPUT clause), returns an empty 16026 * list and emits no edges. 16027 * 16028 * <p>Reject ordering (codex round-3 Q2 BLOCKING fix — two-pass): 16029 * <ol> 16030 * <li>Pass 1, statement-level: empty projection list → 16031 * {@link DiagnosticCode#RETURNING_EMPTY_PROJECTION}.</li> 16032 * <li>Pass 1.5, OUTPUT-only DML-kind / pseudo-table mismatch scan: 16033 * INSERT with any {@code DELETED.col} → 16034 * {@link DiagnosticCode#OUTPUT_DELETED_ON_INSERT_NOT_SUPPORTED}; 16035 * DELETE with any {@code INSERTED.col} → 16036 * {@link DiagnosticCode#OUTPUT_INSERTED_ON_DELETE_NOT_SUPPORTED}. 
16037 * Fires on the first matching column regardless of position.</li> 16038 * <li>Pass 2, per-column (in SQL declaration order): 16039 * <ul> 16040 * <li>{@code *} → {@link DiagnosticCode#RETURNING_STAR_NOT_SUPPORTED}</li> 16041 * <li>any subquery → 16042 * {@link DiagnosticCode#RETURNING_HAS_SUBQUERY_NOT_SUPPORTED}</li> 16043 * <li>any window function over a base ref → reuses 16044 * {@link DiagnosticCode#CLAUSE_WINDOW_FUNCTION_LEAK} via 16045 * {@link #rejectWindowFunctionInScope}</li> 16046 * <li>any aggregate function over a base ref → 16047 * {@link DiagnosticCode#RETURNING_HAS_AGGREGATE_NOT_SUPPORTED} 16048 * (aggregates are not legal in DML RETURNING / OUTPUT per 16049 * spec — fires defensively when parser admits them)</li> 16050 * </ul> 16051 * </li> 16052 * </ol> 16053 * 16054 * <p>OUTPUT_INTO_NOT_SUPPORTED is rejected at the caller (before 16055 * any FROM walk / SET / WHERE processing) so multi-violation shapes 16056 * route through the cheaper structural code first. 16057 * 16058 * @param ret RETURNING clause; null when this DML uses OUTPUT or 16059 * no projection at all 16060 * @param out OUTPUT clause; null when this DML uses RETURNING or 16061 * no projection at all 16062 * @param dmlKind "INSERT" / "UPDATE" / "DELETE" — only relevant for 16063 * the pseudo-table mismatch scan (UPDATE admits both 16064 * INSERTED and DELETED; INSERT admits only INSERTED; 16065 * DELETE admits only DELETED) 16066 * @param targetQName the target table's qualified name; used as the 16067 * {@code to} endpoint of every emitted LineageEdge 16068 * @param provider name-binding provider; same instance used for 16069 * SET RHS / WHERE / JOIN ON ref collection so 16070 * FROM-side relation refs (slice-82 joined UPDATE, 16071 * slice-84 joined DELETE) resolve correctly 16072 * @param dmlIdx the DML statement's position in 16073 * {@link SemanticProgram#getStatements()}; used as the 16074 * {@code statementIndex} on the {@code from} endpoint 16075 * @param lineage in/out: 
collected edges are appended here 16076 * @param anchor parse-tree anchor for diagnostics 16077 */ 16078 private static List<OutputColumn> buildReturningColumns( 16079 TReturningClause ret, 16080 TOutputClause out, 16081 String dmlKind, 16082 String targetQName, 16083 String targetAlias, 16084 TTable targetTable, 16085 List<RelationSource> fromSideRelations, 16086 NameBindingProvider provider, 16087 int dmlIdx, 16088 List<LineageEdge> lineage, 16089 TParseTreeNode anchor) { 16090 if (ret == null && out == null) { 16091 return Collections.emptyList(); 16092 } 16093 // Oracle host-variable form: `RETURNING col INTO :v` — AST shape 16094 // is columnValueList + variableList populated, resultExprList null. 16095 // Slice 88 admits it: extract column exprs from columnValueList, 16096 // discard variableList (bind sinks have no semantic IR relevance). 16097 // The still-unsupported degenerate case (resultExprList=null AND 16098 // columnValueList=null) keeps RETURNING_INTO_NOT_SUPPORTED so the 16099 // code stays declared-not-unreachable per the slice-71/72/82 precedent. 16100 boolean isOracleInto = (ret != null && ret.getResultExprList() == null 16101 && ret.getColumnValueList() != null); 16102 if (ret != null && ret.getResultExprList() == null && !isOracleInto) { 16103 throw new SemanticIRBuildException(Diagnostic.error( 16104 DiagnosticCode.RETURNING_INTO_NOT_SUPPORTED, 16105 "Oracle `RETURNING col INTO :host_var` with no column list " 16106 + "is not supported; admits the standard INTO form only", 16107 anchor)); 16108 } 16109 // Extract the source column list. 16110 TResultColumnList items = null; 16111 TExpressionList intoExprs = null; 16112 if (isOracleInto) { 16113 intoExprs = ret.getColumnValueList(); 16114 } else if (ret != null) { 16115 items = ret.getResultExprList(); 16116 } else { 16117 items = out.getSelectItemList(); 16118 } 16119 int colCount = isOracleInto 16120 ? (intoExprs == null ? 0 : intoExprs.size()) 16121 : (items == null ? 
0 : items.size()); 16122 // Pass 1: empty projection list (defensive — the parser usually 16123 // refuses to produce an empty list, but a malformed AST should 16124 // surface a clean diagnostic). 16125 if (colCount == 0) { 16126 throw new SemanticIRBuildException(Diagnostic.error( 16127 DiagnosticCode.RETURNING_EMPTY_PROJECTION, 16128 dmlKind + (ret != null ? " RETURNING" : " OUTPUT") 16129 + " clause has no projection columns", 16130 anchor)); 16131 } 16132 // Pass 1.5: OUTPUT-only DML-kind / pseudo-table mismatch scan 16133 // (codex round-1 Q4 BLOCKING — deep-walk all TObjectName leaves 16134 // so compound exprs like `OUTPUT INSERTED.a + DELETED.b` also 16135 // reject deterministically). The parser sets pseudoTableType 16136 // on the fieldAttr for SIMPLE column references but leaves 16137 // it null on the leaf TObjectNames inside compound expressions; 16138 // we detect those by checking the objectToken spelling against 16139 // "INSERTED" / "DELETED". 16140 // The Oracle INTO path skips this scan (no INSERTED/DELETED pseudo-tables). 16141 if (out != null && !isOracleInto) { 16142 final String targetAliasFinal = targetAlias; 16143 final String targetQNameFinal = targetQName; 16144 final List<RelationSource> relsFinal = fromSideRelations; 16145 for (int i = 0; i < items.size(); i++) { 16146 TResultColumn rc = items.getResultColumn(i); 16147 final String dmlKindFinal = dmlKind; 16148 final TResultColumn rcFinal = rc; 16149 scanOutputPseudoTableLeaves(rc.getExpr(), 16150 new TParseTreeVisitor() { 16151 @Override 16152 public void preVisit(TObjectName n) { 16153 EPseudoTableType pt = detectPseudoTable( 16154 n, rcFinal, targetAliasFinal, 16155 targetQNameFinal, relsFinal); 16156 if (pt == EPseudoTableType.deleted 16157 && "INSERT".equals(dmlKindFinal)) { 16158 throw new SemanticIRBuildException(Diagnostic.error( 16159 DiagnosticCode.OUTPUT_DELETED_ON_INSERT_NOT_SUPPORTED, 16160 "INSERT OUTPUT references DELETED." 
16161 + bareColumnNameOf(n) 16162 + " but there is no deleted-row " 16163 + "image on INSERT; use INSERTED.* instead", 16164 rcFinal)); 16165 } 16166 if (pt == EPseudoTableType.inserted 16167 && "DELETE".equals(dmlKindFinal)) { 16168 throw new SemanticIRBuildException(Diagnostic.error( 16169 DiagnosticCode.OUTPUT_INSERTED_ON_DELETE_NOT_SUPPORTED, 16170 "DELETE OUTPUT references INSERTED." 16171 + bareColumnNameOf(n) 16172 + " but there is no inserted-row " 16173 + "image on DELETE; use DELETED.* instead", 16174 rcFinal)); 16175 } 16176 } 16177 }); 16178 } 16179 } 16180 // Pass 2: per-column. Build OutputColumns, emit edges. 16181 List<OutputColumn> outputs = new ArrayList<>(colCount); 16182 for (int i = 0; i < colCount; i++) { 16183 // For the Oracle INTO path rc is null — the INTO column list 16184 // carries bare expressions, not TResultColumn wrappers. 16185 TResultColumn rc = isOracleInto ? null 16186 : items.getResultColumn(i); 16187 TExpression expr = isOracleInto 16188 ? intoExprs.getExpression(i) 16189 : (rc == null ? null : rc.getExpr()); 16190 if (expr == null) { 16191 throw new SemanticIRBuildException(Diagnostic.error( 16192 DiagnosticCode.RESULT_COLUMN_NULL_EXPRESSION, 16193 dmlKind + (ret != null ? " RETURNING" : " OUTPUT") 16194 + " column #" + (i + 1) + " has no expression", 16195 rc != null ? rc : anchor)); 16196 } 16197 // Slice 98 — MSSQL MERGE OUTPUT `$action` pseudo-column. 16198 // Returns the merge action string per output row ('INSERT' / 16199 // 'UPDATE' / 'DELETE') — it has no underlying base column. 16200 // Detected case-insensitively because parser tokens come out 16201 // as `$action` regardless of how the user wrote it; bracketed 16202 // `[$action]` is a delimited identifier and is NOT treated 16203 // as the pseudo-column (codex Q1 confirmed YES — slice-98 16204 // detection is literal text equality on the un-bracketed 16205 // spelling). 
The check is gated on dmlKind="MERGE" so 16206 // INSERT/UPDATE/DELETE OUTPUT (slice 85) are unaffected. 16207 if ("MERGE".equals(dmlKind) 16208 && isMergeActionPseudoColumn(expr)) { 16209 String actionName = (rc != null && rc.getColumnAlias() != null 16210 && !rc.getColumnAlias().toString().isEmpty()) 16211 ? rc.getColumnAlias().toString() 16212 : expr.toString(); 16213 outputs.add(new OutputColumn(actionName, 16214 /*derived=*/ true, 16215 /*aggregate=*/ false, 16216 Collections.<ColumnRef>emptyList())); 16217 // No LineageEdge — $action has no producer column. 16218 continue; 16219 } 16220 // STAR check — bare `RETURNING *` parses as 16221 // simple_object_name_t with toString="*"; qualified star 16222 // forms like `RETURNING t.*` / `OUTPUT inserted.*` / 16223 // `OUTPUT deleted.*` parse as simple_object_name_t with 16224 // partToken (and getColumnNameOnly()) equal to "*" 16225 // (codex round-4 BLOCKING fix). 16226 // Slice 90: standard RETURNING star attempts catalog-backed expansion. 16227 // Slice 99: MSSQL MERGE OUTPUT INSERTED.* / DELETED.* 16228 // attempts catalog-backed expansion against the target table. 16229 // Oracle INTO star and non-MERGE OUTPUT star (and bare / 16230 // target-alias / source-alias MERGE OUTPUT star) remain rejected. 16231 if (isStarReference(expr)) { 16232 if (isOracleInto) { 16233 // Oracle INTO star: keep existing reject. 16234 throw new SemanticIRBuildException(Diagnostic.error( 16235 DiagnosticCode.RETURNING_STAR_NOT_SUPPORTED, 16236 dmlKind + " RETURNING INTO * star expansion " 16237 + "is not yet supported; use explicit column names", 16238 rc != null ? rc : expr)); 16239 } 16240 if (out != null) { 16241 // Slice 99 / Slice 100 — MSSQL pseudo-table 16242 // OUTPUT INSERTED.* / DELETED.* routes to catalog- 16243 // backed expansion against the target table. 
The 16244 // pseudo-table discriminator is the parser-set 16245 // EPseudoTableType.inserted / .deleted flag on the 16246 // star qualifier (slice-85 primary discriminator). 16247 // Slice 99 lifted the reject for dmlKind="MERGE"; 16248 // slice 100 generalises to all DML kinds (INSERT / 16249 // UPDATE / DELETE) — the parser sets pseudoTableType 16250 // identically on non-MERGE OUTPUT stars, and Pass 16251 // 1.5 has already rejected cross-direction 16252 // mismatches (INSERT OUTPUT DELETED.* / 16253 // DELETE OUTPUT INSERTED.*) before this branch. 16254 // OUTPUT *, t.*, s.* (no pseudo-table marker) still 16255 // reject — they're either ambiguous (bare *) or 16256 // refer to non-pseudo relations. 16257 EPseudoTableType pseudo = EPseudoTableType.none; 16258 TObjectName starObj = expr.getObjectOperand(); 16259 if (starObj != null 16260 && starObj.getPseudoTableType() != null) { 16261 pseudo = starObj.getPseudoTableType(); 16262 } 16263 if (pseudo == EPseudoTableType.inserted 16264 || pseudo == EPseudoTableType.deleted) { 16265 expandOutputPseudoTableStarColumns( 16266 expr, rc, pseudo, dmlKind, 16267 targetTable, targetQName, 16268 provider, dmlIdx, lineage, anchor, outputs); 16269 continue; 16270 } 16271 // Non-pseudo OUTPUT star (bare *, target-alias *, 16272 // source-alias *): keep existing reject. 16273 throw new SemanticIRBuildException(Diagnostic.error( 16274 DiagnosticCode.RETURNING_STAR_NOT_SUPPORTED, 16275 dmlKind + " OUTPUT * star expansion is not " 16276 + "yet supported; use explicit column names", 16277 rc != null ? rc : expr)); 16278 } 16279 // Standard RETURNING star: attempt catalog-backed expansion. 16280 // On success, the helper adds to `outputs` and `lineage` in place 16281 // and we `continue` past the normal single-column build below. 
16282 expandReturningStarColumns( 16283 expr, rc, dmlKind, targetTable, targetAlias, targetQName, 16284 fromSideRelations, provider, dmlIdx, lineage, anchor, outputs); 16285 continue; 16286 } 16287 // Subquery / aggregate / window — guarded by !isOracleInto 16288 // because Oracle's INTO column list forbids nested queries and 16289 // aggregates at the grammar level; skip the checks to avoid 16290 // false rejects on unusual AST shapes. 16291 if (!isOracleInto) { 16292 if (containsAnySubqueryExpression(expr)) { 16293 throw new SemanticIRBuildException(Diagnostic.error( 16294 DiagnosticCode.RETURNING_HAS_SUBQUERY_NOT_SUPPORTED, 16295 dmlKind + " " + (ret != null ? "RETURNING" : "OUTPUT") 16296 + " column #" + (i + 1) + " contains a subquery; " 16297 + "slice 85 admits scalar expressions over base columns only", 16298 rc)); 16299 } 16300 rejectWindowFunctionInScope(expr, 16301 dmlKind + " " + (ret != null ? "RETURNING" : "OUTPUT")); 16302 if (isAggregateFunction(expr)) { 16303 throw new SemanticIRBuildException(Diagnostic.error( 16304 DiagnosticCode.RETURNING_HAS_AGGREGATE_NOT_SUPPORTED, 16305 dmlKind + " " + (ret != null ? "RETURNING" : "OUTPUT") 16306 + " column #" + (i + 1) + " contains an aggregate " 16307 + "function; aggregates are not legal in DML " 16308 + "RETURNING / OUTPUT projection per SQL spec", 16309 rc)); 16310 } 16311 } 16312 // Name extraction. 16313 // INTO path: no alias possible, use expr.toString() directly. 16314 // Normal path: use the projection-side helper which strips 16315 // INSERTED./DELETED. qualifiers on OUTPUT pseudo-table refs. 16316 String outName = isOracleInto 16317 ? 
expr.toString() 16318 : returningOutputName(rc, expr, dmlKind, ret != null); 16319 if (outName == null || outName.isEmpty()) { 16320 throw new SemanticIRBuildException(Diagnostic.error( 16321 DiagnosticCode.RESULT_COLUMN_NO_NAME, 16322 dmlKind + " RETURNING INTO column #" + (i + 1) 16323 + " has no resolvable name", 16324 anchor)); 16325 } 16326 // Source collection via manual walker (slice-89 fix registers 16327 // RETURNING refs in Resolver2 allColumnReferences for DELETE/UPDATE; 16328 // INSERT RETURNING lacks an InsertScope so Resolver2 path is partial). 16329 // rc=null is safe for the INTO path: synthRefForReturningLeaf 16330 // only dereferences rc inside the `if (isOutput)` guard, 16331 // which is false for all RETURNING (non-OUTPUT) paths. 16332 List<ColumnRef> sources = collectReturningSourceRefs( 16333 expr, rc, out != null && !isOracleInto, 16334 targetAlias, targetQName, fromSideRelations); 16335 boolean derived = expr.getExpressionType() 16336 != EExpressionType.simple_object_name_t; 16337 outputs.add(new OutputColumn(outName, derived, 16338 /*aggregate=*/ false, sources)); 16339 // Emit one LineageEdge per source column ref. 
16340 // Edge direction (consumer ← producer; slice-85 convention 16341 // documented on getReturningColumns()): 16342 // from = STATEMENT_OUTPUT(dmlIdx, returningName) 16343 // to = TABLE_COLUMN(<producer-qualified-name>, <colName>) 16344 // The producer qualified-name is: 16345 // - target table qname when the source ref's relationAlias 16346 // is INSERTED / DELETED (MSSQL OUTPUT pseudo-tables both 16347 // ultimately reference the physical target row) 16348 // - target table qname when the source ref's relationAlias 16349 // matches the target alias 16350 // - FROM-side relation's binding qualifiedName when the 16351 // ref's relationAlias matches a FROM-side relation 16352 // - the relationAlias verbatim otherwise (defensive) 16353 for (ColumnRef src : sources) { 16354 String srcCol = src.getColumnName(); 16355 if (srcCol == null || srcCol.isEmpty()) continue; 16356 String alias = src.getRelationAlias(); 16357 String to = resolveReturningEdgeTarget(alias, 16358 targetAlias, targetQName, fromSideRelations); 16359 lineage.add(new LineageEdge( 16360 LineageRef.statementOutput(dmlIdx, outName), 16361 LineageRef.tableColumn(to, srcCol))); 16362 } 16363 } 16364 return outputs; 16365 } 16366 16367 /** 16368 * Slice 98 helper — true when an expression is the MSSQL MERGE 16369 * OUTPUT {@code $action} pseudo-column. 16370 * 16371 * <p>Detection rule: the expression is a 16372 * {@code simple_object_name_t} and its {@code toString()} matches 16373 * {@code "$action"} case-insensitively. Bracketed delimited 16374 * identifiers like {@code [$action]} parse with the brackets in 16375 * the token string, so they are NOT matched here — a column 16376 * actually named {@code $action} (delimited) is treated as a 16377 * normal target column, not the pseudo-column (codex Q1 confirmed). 16378 * 16379 * <p>Caller gates the check on {@code dmlKind == "MERGE"} so the 16380 * slice-85 INSERT/UPDATE/DELETE OUTPUT path is unaffected. 
16381 */ 16382 private static boolean isMergeActionPseudoColumn(TExpression expr) { 16383 if (expr == null) return false; 16384 if (expr.getExpressionType() != EExpressionType.simple_object_name_t) { 16385 return false; 16386 } 16387 String text = expr.toString(); 16388 return text != null && "$action".equalsIgnoreCase(text); 16389 } 16390 16391 /** 16392 * Slice 85 helper — true when an expression is a star reference 16393 * (bare {@code *} or qualified {@code t.*} / {@code INSERTED.*} / 16394 * {@code DELETED.*}). Covers both forms: bare star has 16395 * {@code expr.toString()=="*"}; qualified star is a 16396 * simple_object_name_t whose leaf TObjectName has 16397 * {@code partToken=="*"} (codex round-4 BLOCKING fix — 16398 * qualified stars were previously slipping past the bare-only 16399 * check and producing bogus ColumnRefs). 16400 */ 16401 private static boolean isStarReference(TExpression expr) { 16402 if (expr == null) return false; 16403 if (expr.getExpressionType() != EExpressionType.simple_object_name_t) { 16404 return false; 16405 } 16406 if ("*".equals(expr.toString())) return true; 16407 TObjectName n = expr.getObjectOperand(); 16408 if (n == null) return false; 16409 if (n.getPartToken() != null && "*".equals(n.getPartToken().toString())) { 16410 return true; 16411 } 16412 String colOnly = n.getColumnNameOnly(); 16413 return colOnly != null && "*".equals(colOnly); 16414 } 16415 16416 /** 16417 * Slice 90 helper — expand a standard {@code RETURNING *} / 16418 * {@code RETURNING t.*} star into per-column 16419 * {@link OutputColumn} entries using catalog metadata from 16420 * {@link #lookupRelationColumnNames(TTable, NameBindingProvider)}. 16421 * 16422 * <p>Called only for standard PG/Oracle RETURNING (not MSSQL OUTPUT, 16423 * not Oracle INTO). Adds expanded columns to {@code outputs} and 16424 * matching {@link LineageEdge}s to {@code lineage} in place. 
16425 * 16426 * <p>Qualifier matching mirrors Slice 59 SELECT star semantics: 16427 * alias-only — the qualifier must equal the target's effective alias, 16428 * not the schema-qualified name. For INSERT without alias the effective 16429 * alias is the bare table name, so {@code RETURNING employees.*} matches 16430 * {@code INSERT INTO schema.employees}. 16431 * 16432 * <p>Throws {@link SemanticIRBuildException} on any failure: 16433 * <ul> 16434 * <li>{@link DiagnosticCode#RETURNING_STAR_CATALOG_REQUIRED} — no 16435 * catalog metadata available for the target relation;</li> 16436 * <li>{@link DiagnosticCode#RETURNING_STAR_NOT_SUPPORTED} — the 16437 * qualifier matches a FROM-side relation alias but FROM-side 16438 * star expansion is deferred to a future slice;</li> 16439 * <li>{@link DiagnosticCode#RETURNING_STAR_QUALIFIER_UNKNOWN} — the 16440 * qualifier does not match the target alias or any FROM-side 16441 * relation alias.</li> 16442 * </ul> 16443 */ 16444 private static void expandReturningStarColumns( 16445 TExpression expr, 16446 TResultColumn rc, 16447 String dmlKind, 16448 TTable targetTable, 16449 String targetAlias, 16450 String targetQName, 16451 List<RelationSource> fromSideRelations, 16452 NameBindingProvider provider, 16453 int dmlIdx, 16454 List<LineageEdge> lineage, 16455 TParseTreeNode anchor, 16456 List<OutputColumn> outputs) { 16457 // Extract qualifier: empty for bare `*`, table/alias name for `t.*`. 16458 String qualifier = ""; 16459 TObjectName n = (expr != null) ? expr.getObjectOperand() : null; 16460 if (n != null) { 16461 String q = n.getTableString(); 16462 if (q != null && !q.isEmpty()) qualifier = q; 16463 } 16464 // Rendered star form for use in diagnostic messages. 16465 String starForm = qualifier.isEmpty() ? "*" : qualifier + ".*"; 16466 // Determine which relation to expand. 16467 // Rule (Slice 90, mirrors Slice 59 correlation-name semantics): 16468 // effective alias only — bare name without alias counts as alias. 
16469 boolean matchesTarget = qualifier.isEmpty() 16470 || qualifier.equalsIgnoreCase(targetAlias); 16471 if (matchesTarget) { 16472 // Attempt catalog-backed expansion of the target table. 16473 List<String> cols = lookupRelationColumnNames(targetTable, provider); 16474 if (cols == null || cols.isEmpty()) { 16475 throw new SemanticIRBuildException(Diagnostic.error( 16476 DiagnosticCode.RETURNING_STAR_CATALOG_REQUIRED, 16477 dmlKind + " RETURNING " + starForm 16478 + " requires catalog metadata for target '" 16479 + targetQName + "' to expand; supply a Catalog via " 16480 + "SqlSemanticAnalyzer.analyze(sql, vendor, catalog)", 16481 rc != null ? rc : anchor)); 16482 } 16483 for (String colName : cols) { 16484 ColumnRef ref = new ColumnRef(targetAlias, colName); 16485 outputs.add(new OutputColumn(colName, /*derived=*/ false, 16486 /*aggregate=*/ false, Collections.singletonList(ref))); 16487 lineage.add(new LineageEdge( 16488 LineageRef.statementOutput(dmlIdx, colName), 16489 LineageRef.tableColumn(targetQName, colName))); 16490 } 16491 return; 16492 } 16493 // Qualifier doesn't match target. Check FROM-side relations. 16494 for (RelationSource rs : fromSideRelations) { 16495 if (qualifier.equalsIgnoreCase(rs.getAlias())) { 16496 // Known FROM-side relation, but expansion is deferred. 16497 throw new SemanticIRBuildException(Diagnostic.error( 16498 DiagnosticCode.RETURNING_STAR_NOT_SUPPORTED, 16499 dmlKind + " RETURNING " + starForm + " — " 16500 + "star expansion for FROM-side/USING relations " 16501 + "is deferred to a future slice; use explicit " 16502 + "column names for FROM-side RETURNING refs", 16503 rc != null ? rc : anchor)); 16504 } 16505 } 16506 // Qualifier is truly unknown (doesn't match target or any FROM-side relation). 
16507 throw new SemanticIRBuildException(Diagnostic.error( 16508 DiagnosticCode.RETURNING_STAR_QUALIFIER_UNKNOWN, 16509 dmlKind + " RETURNING " + starForm + " — qualifier '" 16510 + qualifier + "' does not match the DML target alias '" 16511 + targetAlias + "' or any FROM-side relation; " 16512 + "use the target's effective alias for RETURNING star expansion", 16513 rc != null ? rc : anchor)); 16514 } 16515 16516 /** 16517 * Slice 99 / Slice 100 helper — expand MSSQL pseudo-table 16518 * {@code OUTPUT INSERTED.*} / {@code OUTPUT DELETED.*} into 16519 * per-column {@link OutputColumn} entries using catalog metadata 16520 * from {@link #lookupRelationColumnNames(TTable, NameBindingProvider)}. 16521 * 16522 * <p>Slice 99 originally introduced this helper for MSSQL MERGE 16523 * OUTPUT. Slice 100 generalised it to all DML kinds (INSERT / UPDATE 16524 * / DELETE / MERGE): the parser sets {@code pseudoTableType=inserted/deleted} 16525 * on the star qualifier identically for non-MERGE DML, so the 16526 * expansion is mechanically the same — only the catalog-missing 16527 * message text varies by {@code dmlKind} per the slice-80 16528 * message-text-discrimination contract. 
16529 * 16530 * <p>Mirrors the slice-90 standard-RETURNING star design with two 16531 * differences: 16532 * <ul> 16533 * <li>The pseudo-table qualifier ({@code INSERTED} / {@code DELETED}) 16534 * is normalized to UPPERCASE on 16535 * {@link ColumnRef#getRelationAlias()} regardless of the SQL 16536 * case, matching slice-85 16537 * {@code synthRefForReturningLeaf}'s 16538 * {@code new ColumnRef("INSERTED", ...)} convention.</li> 16539 * <li>The catalog lookup target is the DML target table (the 16540 * pseudo-table rows physically reference target rows), not a 16541 * FROM-side relation.</li> 16542 * </ul> 16543 * 16544 * <p>Catalog miss reuses {@link DiagnosticCode#RETURNING_STAR_CATALOG_REQUIRED} 16545 * (slice-90 code) with a discriminating message text formatted as 16546 * {@code "<dmlKind> OUTPUT <pseudoLabel>.* requires catalog metadata 16547 * for target '<qname>' ..."}. 16548 * 16549 * <p>Adds expanded columns to {@code outputs} and matching 16550 * {@link LineageEdge}s to {@code lineage} in place. 16551 * 16552 * @param dmlKind one of {@code "INSERT"}, {@code "UPDATE"}, 16553 * {@code "DELETE"}, {@code "MERGE"} — feeds the 16554 * catalog-missing message text only; expansion is 16555 * identical regardless of kind because the parser 16556 * sets {@code pseudoTableType} on the star qualifier 16557 * uniformly. 16558 */ 16559 private static void expandOutputPseudoTableStarColumns( 16560 TExpression expr, 16561 TResultColumn rc, 16562 EPseudoTableType pseudoTable, 16563 String dmlKind, 16564 TTable targetTable, 16565 String targetQName, 16566 NameBindingProvider provider, 16567 int dmlIdx, 16568 List<LineageEdge> lineage, 16569 TParseTreeNode anchor, 16570 List<OutputColumn> outputs) { 16571 // Slice-85 convention: relationAlias is normalized to UPPERCASE. 16572 String pseudoLabel = (pseudoTable == EPseudoTableType.inserted) 16573 ? 
"INSERTED" 16574 : "DELETED"; 16575 List<String> cols = lookupRelationColumnNames(targetTable, provider); 16576 if (cols == null || cols.isEmpty()) { 16577 throw new SemanticIRBuildException(Diagnostic.error( 16578 DiagnosticCode.RETURNING_STAR_CATALOG_REQUIRED, 16579 dmlKind + " OUTPUT " + pseudoLabel + ".* requires catalog " 16580 + "metadata for target '" + targetQName 16581 + "' to expand; supply a Catalog via " 16582 + "SqlSemanticAnalyzer.analyze(sql, vendor, catalog)", 16583 rc != null ? rc : anchor)); 16584 } 16585 for (String colName : cols) { 16586 ColumnRef ref = new ColumnRef(pseudoLabel, colName); 16587 outputs.add(new OutputColumn(colName, /*derived=*/ false, 16588 /*aggregate=*/ false, Collections.singletonList(ref))); 16589 // INSERTED / DELETED both physically reference the target 16590 // row image; lineage edge target is the target qname. 16591 lineage.add(new LineageEdge( 16592 LineageRef.statementOutput(dmlIdx, colName), 16593 LineageRef.tableColumn(targetQName, colName))); 16594 } 16595 } 16596 16597 /** 16598 * Slice 85 helper — pull the pseudo-table type for an OUTPUT 16599 * column (SQL Server). Returns {@link EPseudoTableType#none} for 16600 * RETURNING (PG / Oracle) columns and for OUTPUT columns whose 16601 * top-level expression doesn't carry an INSERTED / DELETED 16602 * qualifier. For compound expressions, the parser sets the 16603 * pseudo-table type on the fieldAttr only for SIMPLE column 16604 * references; compound shapes must be deep-walked separately 16605 * via {@link #scanOutputPseudoTableLeaves}. 16606 */ 16607 private static EPseudoTableType pseudoTableOf(TResultColumn rc) { 16608 if (rc == null) return EPseudoTableType.none; 16609 TObjectName fa = rc.getFieldAttr(); 16610 if (fa == null) return EPseudoTableType.none; 16611 EPseudoTableType pt = fa.getPseudoTableType(); 16612 return (pt == null) ? 
EPseudoTableType.none : pt; 16613 } 16614 16615 /** 16616 * Slice 85 helper — walk an OUTPUT projection expression and 16617 * invoke {@code visitor} on every TObjectName leaf for the 16618 * pseudo-table mismatch scan. Skips function-name TObjectNames 16619 * via the dbObjectType filter so {@code OUTPUT FUNC(inserted.x)} 16620 * still surfaces the leaf inserted.x ref. 16621 */ 16622 private static void scanOutputPseudoTableLeaves( 16623 TExpression expr, final TParseTreeVisitor visitor) { 16624 if (expr == null) return; 16625 // Collect function-name identities first (codex round-2 Q1 16626 // BLOCKING — dialect-portable structural filter). 16627 final java.util.Set<TObjectName> fnLeaves = 16628 collectFunctionNameLeaves(expr); 16629 // Fast path: leaf simple_object_name_t. 16630 if (expr.getExpressionType() == EExpressionType.simple_object_name_t) { 16631 TObjectName n = expr.getObjectOperand(); 16632 if (n != null && !isFunctionNameObjectName(n, fnLeaves)) { 16633 visitor.preVisit(n); 16634 } 16635 return; 16636 } 16637 // Compound expression — walk for TObjectName leaves. 
16638 expr.acceptChildren(new TParseTreeVisitor() { 16639 int nestedSelectDepth = 0; 16640 @Override 16641 public void preVisit(TSelectSqlStatement s) { nestedSelectDepth++; } 16642 @Override 16643 public void postVisit(TSelectSqlStatement s) { nestedSelectDepth--; } 16644 @Override 16645 public void preVisit(TObjectName node) { 16646 if (nestedSelectDepth > 0) return; 16647 if (isFunctionNameObjectName(node, fnLeaves)) return; 16648 visitor.preVisit(node); 16649 } 16650 }); 16651 } 16652 16653 /** 16654 * Slice 85 helper — detect the pseudo-table type (INSERTED / 16655 * DELETED) for a TObjectName leaf in an OUTPUT projection, 16656 * honouring both the parser-set fieldAttr.pseudoTableType (for 16657 * simple leaf refs where the parser ran its qualifier swap) AND 16658 * the objectToken spelling (for compound expressions where the 16659 * parser left pseudoTableType=none on leaf TObjectNames). 16660 * 16661 * <p>Codex round-2 Q3 BLOCKING fix — MSSQL permits "INSERTED" / 16662 * "DELETED" as real identifiers via ColId, so the text-match 16663 * fallback fires ONLY when no FROM-side relation alias / 16664 * qualifiedName / bare-component matches the qualifier. With a 16665 * real table named "INSERTED" in scope, the text-match is 16666 * suppressed and the leaf surfaces as a normal column ref. 16667 */ 16668 private static EPseudoTableType detectPseudoTable(TObjectName n, 16669 TResultColumn rc, 16670 String targetAlias, 16671 String targetQName, 16672 List<RelationSource> fromSideRelations) { 16673 if (n == null) return EPseudoTableType.none; 16674 // Direct: parser-set pseudoTableType. 16675 if (n.getPseudoTableType() != null 16676 && n.getPseudoTableType() != EPseudoTableType.none) { 16677 return n.getPseudoTableType(); 16678 } 16679 // Indirect: if this leaf is the result column's fieldAttr, 16680 // read from the fieldAttr's pseudoTableType (slice-78 16681 // TOutputClause.doParse sets it there for simple refs). 
16682 if (rc != null && rc.getFieldAttr() == n) { 16683 EPseudoTableType pt = pseudoTableOf(rc); 16684 if (pt != null && pt != EPseudoTableType.none) return pt; 16685 } 16686 // Compound expression leaf — the parser leaves 16687 // pseudoTableType=none on the leaf TObjectNames but the 16688 // objectToken spelling is preserved. Text-match 16689 // INSERTED / DELETED case-insensitively, but only when no 16690 // real FROM-side relation shadows the pseudo name (codex 16691 // round-2 Q3 BLOCKING). 16692 if (n.getObjectToken() != null && n.getPartToken() != null) { 16693 String obj = n.getObjectToken().toString(); 16694 if (obj == null) return EPseudoTableType.none; 16695 boolean shadowedByRealRelation = 16696 qualifierMatchesAnyRelation(obj, targetAlias, 16697 targetQName, fromSideRelations); 16698 if (shadowedByRealRelation) return EPseudoTableType.none; 16699 if ("INSERTED".equalsIgnoreCase(obj)) return EPseudoTableType.inserted; 16700 if ("DELETED".equalsIgnoreCase(obj)) return EPseudoTableType.deleted; 16701 } 16702 return EPseudoTableType.none; 16703 } 16704 16705 /** 16706 * Slice 85 helper — true when {@code qualifier} matches some 16707 * real relation in scope (target alias / qualified name / bare 16708 * component, or any FROM-side relation alias / qualified name / 16709 * bare component). Used by {@link #detectPseudoTable} to 16710 * suppress the INSERTED / DELETED text-match when a real table 16711 * by that name is in scope. 
16712 */ 16713 private static boolean qualifierMatchesAnyRelation(String qualifier, 16714 String targetAlias, String targetQName, 16715 List<RelationSource> fromSideRelations) { 16716 if (qualifier == null || qualifier.isEmpty()) return false; 16717 if (targetAlias != null && targetAlias.equalsIgnoreCase(qualifier)) { 16718 return true; 16719 } 16720 if (targetQName != null 16721 && (targetQName.equalsIgnoreCase(qualifier) 16722 || bareLastDotComponent(targetQName) 16723 .equalsIgnoreCase(qualifier))) { 16724 return true; 16725 } 16726 if (fromSideRelations != null) { 16727 for (RelationSource rs : fromSideRelations) { 16728 String a = rs.getAlias(); 16729 if (a != null && a.equalsIgnoreCase(qualifier)) return true; 16730 String qn = rs.getBinding() == null ? null 16731 : rs.getBinding().getQualifiedName(); 16732 if (qn != null 16733 && (qn.equalsIgnoreCase(qualifier) 16734 || bareLastDotComponent(qn) 16735 .equalsIgnoreCase(qualifier))) { 16736 return true; 16737 } 16738 } 16739 } 16740 return false; 16741 } 16742 16743 /** 16744 * Slice 85 helper — collect identities of every TObjectName that 16745 * is a function-name in the given expression tree (codex round-2 16746 * Q1 BLOCKING — {@code EDbObjectType.function} is unreliable 16747 * across dialects: Oracle builtins surface as {@code constant}, 16748 * MSSQL XML methods as {@code method}). The walker uses 16749 * {@link IdentityHashMap}-style reference equality so the 16750 * column-ref walker can structurally skip function-name leaves 16751 * regardless of dbType. 
16752 */ 16753 private static java.util.Set<TObjectName> collectFunctionNameLeaves( 16754 TExpression expr) { 16755 final java.util.Set<TObjectName> set = java.util.Collections.newSetFromMap( 16756 new IdentityHashMap<TObjectName, Boolean>()); 16757 if (expr == null) return set; 16758 expr.acceptChildren(new TParseTreeVisitor() { 16759 @Override 16760 public void preVisit(TFunctionCall fn) { 16761 TObjectName name = fn.getFunctionName(); 16762 if (name != null) set.add(name); 16763 } 16764 }); 16765 return set; 16766 } 16767 16768 /** 16769 * Slice 85 helper — true when this TObjectName is in the 16770 * function-name set collected for the current expression 16771 * (codex round-2 Q1 BLOCKING fix — structural identity rather 16772 * than dbType). {@code functionNameLeaves} may be null (empty 16773 * set semantics) when the caller doesn't have the set in hand. 16774 */ 16775 private static boolean isFunctionNameObjectName(TObjectName n, 16776 java.util.Set<TObjectName> functionNameLeaves) { 16777 if (n == null) return false; 16778 if (functionNameLeaves != null && functionNameLeaves.contains(n)) { 16779 return true; 16780 } 16781 // Best-effort fallback: dbType check catches the 16782 // single-leaf simple_object_name_t function case (rare — 16783 // those normally arrive as TFunctionCall). Kept defensively. 16784 EDbObjectType t = n.getDbObjectType(); 16785 return t == EDbObjectType.function; 16786 } 16787 16788 /** 16789 * Slice 85 helper — best-effort spelling of the bare column name 16790 * for an OUTPUT pseudo-table ref like INSERTED.foo. Used only in 16791 * diagnostic message text. 
16792 */ 16793 private static String safePseudoColumn(TResultColumn rc) { 16794 if (rc == null) return "<unknown>"; 16795 TObjectName fa = rc.getFieldAttr(); 16796 if (fa == null) return "<unknown>"; 16797 if (fa.getPartToken() != null) return fa.getPartToken().toString(); 16798 if (fa.getPropertyToken() != null) return fa.getPropertyToken().toString(); 16799 return rc.toString(); 16800 } 16801 16802 /** 16803 * Slice 85 helper — derive the OutputColumn name for a RETURNING / 16804 * OUTPUT projection column. Uses the explicit alias when present, 16805 * else the bare column name (for OUTPUT INSERTED.col / DELETED.col 16806 * the bare partToken spelling, stripping the pseudo-table 16807 * qualifier). Falls back to {@code expr.toString()} for derived 16808 * expressions without alias (e.g. {@code RETURNING a + 1} → name 16809 * = "a + 1"). 16810 */ 16811 private static String returningOutputName(TResultColumn rc, 16812 TExpression expr, 16813 String dmlKind, 16814 boolean isReturning) { 16815 String alias = rc.getColumnAlias(); 16816 if (alias != null && !alias.isEmpty()) { 16817 return alias; 16818 } 16819 // OUTPUT pseudo-table ref without alias — strip the qualifier 16820 // so the OutputColumn.name carries the bare column spelling. 16821 if (!isReturning) { 16822 EPseudoTableType pt = pseudoTableOf(rc); 16823 if (pt != EPseudoTableType.none) { 16824 TObjectName fa = rc.getFieldAttr(); 16825 if (fa != null && fa.getPartToken() != null) { 16826 return fa.getPartToken().toString(); 16827 } 16828 } 16829 } 16830 // RETURNING bare column or derived expression — fall back to 16831 // the expression's toString(). For simple_object_name_t this is 16832 // the verbatim bare or qualified column spelling; for 16833 // arithmetic / function expressions it is the rendered text. 
16834 String s = expr.toString(); 16835 if (s == null || s.isEmpty()) { 16836 throw new SemanticIRBuildException(Diagnostic.error( 16837 DiagnosticCode.RESULT_COLUMN_NO_NAME, 16838 dmlKind + (isReturning ? " RETURNING" : " OUTPUT") 16839 + " column has no resolvable name", 16840 rc)); 16841 } 16842 return s; 16843 } 16844 16845 /** 16846 * Slice 85 helper — collect ColumnRefs from a RETURNING / OUTPUT 16847 * projection expression. Slice 89 fixed TReturningClause.acceptChildren() 16848 * to descend into children so Resolver2 now registers RETURNING refs in 16849 * allColumnReferences for DELETE/UPDATE (INSERT RETURNING lacks InsertScope 16850 * so Resolver2 coverage there is partial). This walker remains the 16851 * authoritative source for Semantic IR because it maps qualifier tokens 16852 * directly onto the DML's known relation set: 16853 * 16854 * <ul> 16855 * <li>OUTPUT pseudo-table ref ({@code INSERTED.col} / 16856 * {@code DELETED.col}, detected via 16857 * {@code fieldAttr.pseudoTableType}) → ColumnRef with 16858 * uppercase {@code "INSERTED"} / {@code "DELETED"} as the 16859 * relationAlias, preserving temporal phase (codex round-2 16860 * Q2 BLOCKING).</li> 16861 * <li>Qualified ref matching a FROM-side relation's alias 16862 * (slice-82 joined UPDATE / slice-84 joined DELETE) 16863 * → ColumnRef with that alias.</li> 16864 * <li>Qualified ref matching the target table's effective alias 16865 * or its qualified name → ColumnRef with the target alias.</li> 16866 * <li>Unqualified ref → ColumnRef with the target alias 16867 * (default scope).</li> 16868 * <li>Qualified ref matching nothing known → ColumnRef with the 16869 * parser's qualifier verbatim. Lineage consumers can spot 16870 * the unresolved relation via the relationAlias they see.</li> 16871 * </ul> 16872 * 16873 * <p>Match policy: case-insensitive on alias / qualified name, 16874 * to match the slice-83 codex Q3 advisory and stay forgiving of 16875 * dialect-specific identifier casing. 
16876 */ 16877 private static List<ColumnRef> collectReturningSourceRefs( 16878 TExpression expr, 16879 TResultColumn rc, 16880 boolean isOutput, 16881 String targetAlias, 16882 String targetQName, 16883 List<RelationSource> fromSideRelations) { 16884 final LinkedHashSet<ColumnRef> refs = new LinkedHashSet<>(); 16885 // Collect function-name identities first (codex round-2 Q1 16886 // BLOCKING — dialect-portable structural filter; see 16887 // {@link #collectFunctionNameLeaves} javadoc for the 16888 // dbObjectType unreliability rationale). 16889 final java.util.Set<TObjectName> fnLeaves = collectFunctionNameLeaves(expr); 16890 // Fast path: simple_object_name_t leaf — handle directly so 16891 // OUTPUT INSERTED.col / DELETED.col resolves through fieldAttr. 16892 if (expr.getExpressionType() == EExpressionType.simple_object_name_t) { 16893 ColumnRef r = synthRefForReturningLeaf( 16894 rc, expr.getObjectOperand(), isOutput, 16895 targetAlias, targetQName, fromSideRelations, fnLeaves); 16896 if (r != null) refs.add(r); 16897 return new ArrayList<>(refs); 16898 } 16899 // Compound expression — walk for TObjectName leaves. We don't 16900 // descend into nested subqueries (already rejected upstream) 16901 // and don't try to resolve fieldAttr for compound expressions 16902 // (pseudo-table qualifier inside arithmetic / function args is 16903 // a rare shape; slice 85 surfaces the bare column ref against 16904 // the target alias as a best-effort). 16905 expr.acceptChildren(new TParseTreeVisitor() { 16906 int nestedSelectDepth = 0; 16907 @Override 16908 public void preVisit(TSelectSqlStatement nested) { 16909 nestedSelectDepth++; 16910 } 16911 @Override 16912 public void postVisit(TSelectSqlStatement nested) { 16913 nestedSelectDepth--; 16914 } 16915 @Override 16916 public void preVisit(TObjectName node) { 16917 if (nestedSelectDepth > 0) return; 16918 // Codex round-1 Q1 / round-2 Q1 BLOCKING — skip 16919 // function-name leaves via structural identity. 
16920 if (isFunctionNameObjectName(node, fnLeaves)) return; 16921 ColumnRef r = synthRefForReturningLeaf( 16922 rc, node, isOutput, 16923 targetAlias, targetQName, fromSideRelations, fnLeaves); 16924 if (r != null) refs.add(r); 16925 } 16926 }); 16927 return new ArrayList<>(refs); 16928 } 16929 16930 /** 16931 * Slice 85 helper — build one ColumnRef for a TObjectName leaf in 16932 * a RETURNING / OUTPUT projection. Returns null when the node is 16933 * not a column-name reference (e.g. a function name token). 16934 */ 16935 private static ColumnRef synthRefForReturningLeaf( 16936 TResultColumn rc, 16937 TObjectName node, 16938 boolean isOutput, 16939 String targetAlias, 16940 String targetQName, 16941 List<RelationSource> fromSideRelations, 16942 java.util.Set<TObjectName> functionNameLeaves) { 16943 if (node == null) return null; 16944 // Codex round-1 Q1 / round-2 Q1 BLOCKING — skip function-name 16945 // TObjectNames via structural identity (the "UPPER" leaf in 16946 // `RETURNING UPPER(name)`). 16947 if (isFunctionNameObjectName(node, functionNameLeaves)) return null; 16948 String colName = bareColumnNameOf(node); 16949 if (colName == null || colName.isEmpty()) return null; 16950 // OUTPUT pseudo-table ref (INSERTED / DELETED). Detect via 16951 // fieldAttr on the result column (parser surfaces it there 16952 // for simple refs) OR objectToken spelling (compound exprs; 16953 // codex round-1 Q4 BLOCKING — same deep detection as 16954 // detectPseudoTable). Both INSERTED and DELETED ultimately 16955 // reference the target table's row; only the temporal phase 16956 // surfaces on ColumnRef.relationAlias. 
16957 if (isOutput) { 16958 EPseudoTableType pt = detectPseudoTable(node, rc, 16959 targetAlias, targetQName, fromSideRelations); 16960 if (pt == EPseudoTableType.inserted) { 16961 return new ColumnRef("INSERTED", partColumnNameOf(node, colName)); 16962 } 16963 if (pt == EPseudoTableType.deleted) { 16964 return new ColumnRef("DELETED", partColumnNameOf(node, colName)); 16965 } 16966 } 16967 // Qualifier resolution: bare or qualified. 16968 String qualifier = qualifierOf(node); 16969 if (qualifier == null || qualifier.isEmpty()) { 16970 return new ColumnRef(targetAlias, colName); 16971 } 16972 // Codex round-2 Q2 + round-3 BLOCKING fix — single-pass 16973 // count-all-candidates matcher. A relation "matches" the 16974 // qualifier if any of (alias, qualifiedName, bare-last-dot 16975 // component of qualifiedName) compares case-insensitive- 16976 // equal. Multiple matches (e.g. unaliased `FROM s1.t s2.t` 16977 // both have effectiveAlias "t" via TTable.getName() fallback, 16978 // both have bareComponent "t") are ambiguous and fall through 16979 // to the verbatim qualifier path so consumers see the 16980 // ambiguity rather than a silent order-dependent pick. 16981 int totalMatches = 0; 16982 // Single-match accumulators (one each — only valid when 16983 // totalMatches == 1): 16984 RelationSource matchedRelation = null; 16985 boolean matchedIsTarget = false; 16986 if (fromSideRelations != null) { 16987 for (RelationSource rs : fromSideRelations) { 16988 if (relationCandidateMatch(rs, qualifier)) { 16989 totalMatches++; 16990 matchedRelation = rs; 16991 } 16992 } 16993 } 16994 if (targetCandidateMatch(targetAlias, targetQName, qualifier)) { 16995 totalMatches++; 16996 matchedIsTarget = true; 16997 } 16998 if (totalMatches == 1) { 16999 if (matchedIsTarget) { 17000 return new ColumnRef( 17001 targetAlias != null ? targetAlias : targetQName, colName); 17002 } 17003 String a = matchedRelation.getAlias(); 17004 String qn = matchedRelation.getBinding() == null ? 
null 17005 : matchedRelation.getBinding().getQualifiedName(); 17006 return new ColumnRef((a != null && !a.isEmpty()) ? a : qn, colName); 17007 } 17008 // Zero matches or ambiguous — pass through verbatim. Lineage 17009 // consumers can spot the unresolved / ambiguous relation. 17010 return new ColumnRef(qualifier, colName); 17011 } 17012 17013 /** 17014 * Slice 85 helper — true when a FROM-side relation is a match 17015 * candidate for the qualifier under any of: alias, full 17016 * qualifiedName, or bare last-dot component of qualifiedName 17017 * (case-insensitive). Caller uses {@link #synthRefForReturningLeaf}'s 17018 * single-pass count-then-pick policy to disambiguate. 17019 */ 17020 private static boolean relationCandidateMatch(RelationSource rs, 17021 String qualifier) { 17022 if (rs == null || qualifier == null || qualifier.isEmpty()) return false; 17023 String a = rs.getAlias(); 17024 if (a != null && a.equalsIgnoreCase(qualifier)) return true; 17025 String qn = rs.getBinding() == null ? null 17026 : rs.getBinding().getQualifiedName(); 17027 if (qn == null) return false; 17028 return qn.equalsIgnoreCase(qualifier) 17029 || bareLastDotComponent(qn).equalsIgnoreCase(qualifier); 17030 } 17031 17032 /** 17033 * Slice 85 helper — true when the target table is a candidate 17034 * match for the qualifier (alias / qualifiedName / bare 17035 * component, case-insensitive). 
17036 */ 17037 private static boolean targetCandidateMatch(String targetAlias, 17038 String targetQName, 17039 String qualifier) { 17040 if (qualifier == null || qualifier.isEmpty()) return false; 17041 if (targetAlias != null && targetAlias.equalsIgnoreCase(qualifier)) { 17042 return true; 17043 } 17044 if (targetQName != null 17045 && (targetQName.equalsIgnoreCase(qualifier) 17046 || bareLastDotComponent(targetQName) 17047 .equalsIgnoreCase(qualifier))) { 17048 return true; 17049 } 17050 return false; 17051 } 17052 17053 /** 17054 * Slice 85 helper — strip everything up to and including the 17055 * last dot in a qualified name. Returns the input unchanged when 17056 * no dot is present. 17057 */ 17058 private static String bareLastDotComponent(String qname) { 17059 if (qname == null) return ""; 17060 int dot = qname.lastIndexOf('.'); 17061 return (dot < 0) ? qname : qname.substring(dot + 1); 17062 } 17063 17064 /** 17065 * Slice 85 helper — map a ColumnRef.relationAlias to the qualified 17066 * table name used on the LineageEdge {@code to} endpoint. INSERTED 17067 * / DELETED both collapse to the target's qualified name; FROM-side 17068 * aliases route to their bound qualifiedName; target alias / qname 17069 * stays on the target; unknown aliases pass through verbatim. 17070 */ 17071 private static String resolveReturningEdgeTarget( 17072 String alias, String targetAlias, String targetQName, 17073 List<RelationSource> fromSideRelations) { 17074 if ("INSERTED".equals(alias) || "DELETED".equals(alias)) { 17075 return targetQName; 17076 } 17077 if (targetAlias != null && targetAlias.equalsIgnoreCase(alias)) { 17078 return targetQName; 17079 } 17080 if (targetQName != null && targetQName.equalsIgnoreCase(alias)) { 17081 return targetQName; 17082 } 17083 if (fromSideRelations != null) { 17084 for (RelationSource rs : fromSideRelations) { 17085 if (rs.getAlias() != null 17086 && rs.getAlias().equalsIgnoreCase(alias)) { 17087 String qn = rs.getBinding() == null ? 
null 17088 : rs.getBinding().getQualifiedName(); 17089 return (qn != null && !qn.isEmpty()) ? qn : alias; 17090 } 17091 } 17092 } 17093 return alias != null ? alias : targetQName; 17094 } 17095 17096 /** 17097 * Slice 85 helper — extract the bare column name from a TObjectName 17098 * leaf, honouring partToken / propertyToken / objectToken in the 17099 * order set by the parser. Returns null when no column-name token 17100 * is present. 17101 */ 17102 private static String bareColumnNameOf(TObjectName node) { 17103 // partToken is the column name in `qualifier.col` form; 17104 // for bare `col`, the parser may put it on objectToken. 17105 if (node.getPartToken() != null) { 17106 return node.getPartToken().toString(); 17107 } 17108 if (node.getColumnNameOnly() != null 17109 && !node.getColumnNameOnly().isEmpty()) { 17110 return node.getColumnNameOnly(); 17111 } 17112 if (node.getObjectToken() != null) { 17113 return node.getObjectToken().toString(); 17114 } 17115 return null; 17116 } 17117 17118 /** 17119 * Slice 85 helper — qualifier (table alias or schema-table) of a 17120 * column-name reference. Returns null for bare references. 17121 */ 17122 private static String qualifierOf(TObjectName node) { 17123 // For `t.col`, parser populates objectToken=t, partToken=col. 17124 if (node.getPartToken() != null && node.getObjectToken() != null) { 17125 return node.getObjectToken().toString(); 17126 } 17127 return null; 17128 } 17129 17130 /** 17131 * Slice 85 helper — pseudo-table partToken column name for OUTPUT 17132 * INSERTED.col / DELETED.col. Falls back to the bare column name 17133 * when partToken is null (e.g. raw bare reference). 
17134 */ 17135 private static String partColumnNameOf(TObjectName node, String fallbackColName) { 17136 if (node.getPartToken() != null) { 17137 return node.getPartToken().toString(); 17138 } 17139 return fallbackColName; 17140 } 17141 17142 private static void rejectWindowFunctionInScope(gudusoft.gsqlparser.nodes.TParseTreeNode root, 17143 String clauseLabel) { 17144 if (root == null) return; 17145 final boolean[] found = {false}; 17146 root.acceptChildren(new TParseTreeVisitor() { 17147 @Override 17148 public void preVisit(TFunctionCall fn) { 17149 if (found[0]) return; 17150 if (fn.getWindowDef() != null) found[0] = true; 17151 } 17152 }); 17153 if (found[0]) { 17154 throw new SemanticIRBuildException( 17155 Diagnostic.error(DiagnosticCode.CLAUSE_WINDOW_FUNCTION_LEAK, 17156 clauseLabel + " contains a window function (OVER (...)); " 17157 + "window functions are not allowed in " + clauseLabel 17158 + " per standard SQL", root)); 17159 } 17160 } 17161 17162 private static String effectiveOutputName(TResultColumn rc) { 17163 String alias = rc.getColumnAlias(); 17164 if (alias != null && !alias.isEmpty()) { 17165 return alias; 17166 } 17167 String colName = rc.getColumnNameOnly(); 17168 if (colName != null && !colName.isEmpty()) { 17169 return colName; 17170 } 17171 throw new SemanticIRBuildException( 17172 Diagnostic.error(DiagnosticCode.RESULT_COLUMN_NO_NAME, 17173 "result column " + rc + " has neither alias nor column name", rc)); 17174 } 17175 17176 private static List<ColumnRef> buildFilterColumnRefs(TSelectSqlStatement select, 17177 NameBindingProvider provider, 17178 boolean allowPredicateSubqueries, 17179 List<StatementGraph> stmtsForExtraction, 17180 List<LineageEdge> lineageForExtraction, 17181 Map<String, Integer> cteMapForExtraction, 17182 PredicateClauseContext whereClauseContext) { 17183 TWhereClause where = select.getWhereClause(); 17184 if (where == null || where.getCondition() == null) { 17185 return new ArrayList<>(); 17186 } 17187 Set<TExpression> 
extractedWhereRoots = 17188 Collections.<TExpression>emptySet(); 17189 if (containsAnySubquery(where)) { 17190 if (!allowPredicateSubqueries) { 17191 // Slice 112 — non-outer SELECTs (FROM-subquery, scalar 17192 // projection subquery body, predicate body) keep the 17193 // slice-80 blanket reject. The outermost SELECT path 17194 // (slice 112) and set-op branch path (slice 113) thread 17195 // {@code allowPredicateSubqueries=true} plus the live 17196 // extraction context so the slice-23+ walker can lift 17197 // uncorrelated predicate-subquery wrappers. Inner 17198 // contexts also have earlier preflight rejecters 17199 // ({@code rejectSubqueriesInFromSubqueryBodyClauses} for 17200 // FROM-subquery bodies, {@code rejectSubqueriesInPredicateBodyClauses} 17201 // for slice-23 predicate bodies); this remains the 17202 // fallback path for any unanticipated nested SELECTs 17203 // that bypass those preflights. 17204 throw new SemanticIRBuildException( 17205 Diagnostic.error(DiagnosticCode.WHERE_HAS_SUBQUERY_NOT_SUPPORTED, 17206 "WHERE clause contains a subquery; subqueries in WHERE " 17207 + "are not supported yet in nested SELECTs", 17208 select)); 17209 } 17210 // Slice 112 / 113 — outer SELECT WHERE and set-op branch 17211 // WHERE lift the slice-80 blanket subquery reject by 17212 // routing uncorrelated predicate-subquery wrappers 17213 // (IN-SELECT / EXISTS / NOT EXISTS / scalar comparison / 17214 // ANY-ALL-SOME) through the slice-23+ JOIN-ON extraction 17215 // pipeline refactored by slice 110 to take a 17216 // PredicateClauseContext. Slice 112 added the SELECT_WHERE 17217 // constant for outer SELECT WHERE; slice 113 adds the 17218 // SET_OP_BRANCH_WHERE constant for nested set-op branch 17219 // WHERE — both reuse the same SELECT_WHERE_* DiagnosticCode 17220 // family (a branch IS a SELECT, only nested) and differ 17221 // only in the {@code clauseLabel} for diagnostic messages. 
17222 // Each extracted wrapper lands as its own 17223 // <predicate_subquery_<i>> StatementGraph BEFORE the host 17224 // outer SELECT or set-op branch in {@code stmts} (selectIdx 17225 // = stmts.size() naturally accounts for them — slice-83 17226 // dynamic-index pattern, slice 110/111 precedent). 17227 // 17228 // Remaining non-subquery refs flow into filterColumnRefs 17229 // via collectColumnRefsSkipping. Window functions in 17230 // non-subquery subtrees still reject via 17231 // rejectWindowFunctionInScopeSkipping. The {@code provider} 17232 // already carries withCteContext / withInScopeRelationColumns 17233 // from the outer build chain (slice-65 withUsingScope 17234 // preserves both facets), so the predicate body's inner 17235 // FROM cte routes through RelationKind.CTE and the body's 17236 // own lineage edge becomes STATEMENT_OUTPUT(predicateIdx, 17237 // col) -> STATEMENT_OUTPUT(cteIdx, col) instead of 17238 // TABLE_COLUMN (slice 110/111 precedent). 17239 extractedWhereRoots = 17240 extractUncorrelatedPredicateSubqueriesFromClause( 17241 where.getCondition(), provider, 17242 stmtsForExtraction, lineageForExtraction, 17243 cteMapForExtraction, 17244 whereClauseContext); 17245 rejectAnyRemainingSubqueriesFromClause( 17246 where.getCondition(), extractedWhereRoots, 17247 whereClauseContext); 17248 } 17249 // Slice 13: reject window functions in WHERE before 17250 // collectColumnRefs descends into OVER (...) and leaks 17251 // PARTITION BY / OVER ORDER BY refs into filterColumnRefs. 17252 // Slice 112 — skip extracted predicate-subquery subtrees so 17253 // inner window functions do not leak into the outer reject 17254 // (mirrors the slice-110/111 UPDATE/DELETE WHERE behaviour). 
17255 rejectWindowFunctionInScopeSkipping(where, "WHERE clause", 17256 extractedWhereRoots); 17257 return collectColumnRefsSkipping(where, provider, extractedWhereRoots); 17258 } 17259 17260 /** 17261 * Slice 65 — shared visitor body that emits either the merged-key 17262 * source list (when {@code node} is an unqualified reference to a 17263 * USING merged key in the current SELECT's 17264 * {@link UsingScope}) or the resolver2-bound {@link ColumnRef}. 17265 * 17266 * <p>Used by every visitor that walks expression subtrees and 17267 * collects column refs ({@link #collectColumnRefs}, 17268 * {@link #collectColumnRefsSkippingExtended}, the derived FILTER / 17269 * WITHIN-GROUP-excluding variants). Each visitor remains 17270 * responsible for its own skip-depth and nested-SELECT-depth 17271 * tracking; only the column-emit body is shared. 17272 * 17273 * <p>Behavior: 17274 * <ul> 17275 * <li>If the node is not a column, name is null/empty/star → no-op.</li> 17276 * <li>If the node is unqualified AND its name matches a USING 17277 * key in {@code provider.getUsingScope()} AND that scope 17278 * reports the reference as ambiguous (two disconnected 17279 * classes, or a catalog-proven out-of-class same-named 17280 * relation) → throw {@link SemanticIRBuildException}.</li> 17281 * <li>Otherwise if the unqualified name matches a USING key 17282 * unambiguously → emit each {@link ColumnRef} from the 17283 * merged source list (FROM-ordered, deduped per relation).</li> 17284 * <li>Otherwise → delegate to 17285 * {@link NameBindingProvider#bindColumn} and emit the bound 17286 * {@link ColumnRef}; any non-EXACT_MATCH binding records a 17287 * reject (caller throws after collecting all rejects).</li> 17288 * </ul> 17289 * 17290 * <p>The qualifier check is the SQL-written prefix 17291 * ({@link TObjectName#getTableString()}); when present the 17292 * merged-key path is skipped so {@code a.k} continues to resolve 17293 * to {@code (a, k)} regardless of {@code k}'s USING-key 
status. 17294 */ 17295 private static void appendMergedOrBoundColumnRef( 17296 TObjectName node, 17297 NameBindingProvider provider, 17298 LinkedHashSet<ColumnRef> refsOut, 17299 List<String> rejectsOut) { 17300 if (node.getDbObjectType() != EDbObjectType.column) return; 17301 String name = node.getColumnNameOnly(); 17302 if (name == null || "*".equals(name)) return; 17303 UsingScope scope = provider.getUsingScope(); 17304 String qualifier = node.getTableString(); 17305 if ((qualifier == null || qualifier.isEmpty()) && scope.has(name)) { 17306 if (scope.isAmbiguous(name)) { 17307 throw new SemanticIRBuildException( 17308 Diagnostic.error(DiagnosticCode.UNQUALIFIED_COLUMN_AMBIGUOUS, 17309 "unqualified reference to '" + name + "' is ambiguous: " 17310 + scope.ambiguityReason(name) 17311 + "; qualify with a table alias", null)); 17312 } 17313 for (ColumnRef ref : scope.mergedSourcesFor(name)) { 17314 refsOut.add(ref); 17315 } 17316 return; 17317 } 17318 ColumnBinding binding = provider.bindColumn(node); 17319 if (binding == null) { 17320 rejectsOut.add(node + "[no binding]"); 17321 return; 17322 } 17323 if (binding.getStatus() != ResolutionStatus.EXACT_MATCH) { 17324 rejectsOut.add(node + "[" + binding.getStatus() + "]"); 17325 return; 17326 } 17327 refsOut.add(new ColumnRef(binding.getRelationAlias(), binding.getColumnName())); 17328 } 17329 17330 /** 17331 * Visit every column-typed {@link TObjectName} reachable from the given 17332 * subtree, ask the provider to bind it, and return de-duplicated 17333 * {@link ColumnRef}s. Any non-EXACT_MATCH binding aborts the build. 17334 * 17335 * <p>Slice 65: when the provider carries a non-empty 17336 * {@link UsingScope}, unqualified references that match a USING 17337 * merged key are expanded to the merged source list (one ref per 17338 * relation in the equivalence class) before delegating to 17339 * {@link NameBindingProvider#bindColumn}. See 17340 * {@link #appendMergedOrBoundColumnRef}. 
17341 */ 17342 private static List<ColumnRef> collectColumnRefs(gudusoft.gsqlparser.nodes.TParseTreeNode root, 17343 final NameBindingProvider provider) { 17344 final LinkedHashSet<ColumnRef> refs = new LinkedHashSet<>(); 17345 final List<String> rejects = new ArrayList<>(); 17346 root.acceptChildren(new TParseTreeVisitor() { 17347 int nestedSelectDepth = 0; 17348 17349 @Override 17350 public void preVisit(TSelectSqlStatement nested) { 17351 nestedSelectDepth++; 17352 } 17353 17354 @Override 17355 public void postVisit(TSelectSqlStatement nested) { 17356 nestedSelectDepth--; 17357 } 17358 17359 @Override 17360 public void preVisit(TObjectName node) { 17361 if (nestedSelectDepth > 0) return; 17362 appendMergedOrBoundColumnRef(node, provider, refs, rejects); 17363 } 17364 }); 17365 if (!rejects.isEmpty()) { 17366 throw new SemanticIRBuildException(Diagnostic.error(DiagnosticCode.COLUMN_BINDING_NON_EXACT, "non-exact column bindings: " + rejects, null)); 17367 } 17368 return new ArrayList<>(refs); 17369 } 17370 17371 /** 17372 * Tolerant variant of {@link #collectColumnRefs} for the MySQL 17373 * self-reference DELETE path (slice 92 Codex P1 fix). 17374 * 17375 * <p>The MySQL parser populates {@code stmt.tables} with 3 entries for 17376 * {@code DELETE T1 FROM T1 WHERE id = 1} (target + {@code joins[0]} + 17377 * {@code referenceJoins[0]}). Resolver2's {@code inferredCandidates} 17378 * then sees 3 candidates for any unqualified column ref and marks the 17379 * binding as NOT_FOUND, which {@link #collectColumnRefs} rejects as 17380 * {@code COLUMN_BINDING_NON_EXACT}. 17381 * 17382 * <p>This variant emits EXACT_MATCH bindings verbatim and falls back 17383 * to the SQL-written qualifier (or {@code null} for unqualified refs) 17384 * for non-exact bindings instead of throwing. Subquery children are 17385 * not descended into (matches the strict collector's behaviour). 
17386 */ 17387 /** 17388 * @param fallbackRelationAlias used when the binding is non-exact and the 17389 * SQL-written qualifier is absent; for the MySQL self-reference path 17390 * this is {@code targetQName} so unqualified refs like 17391 * {@code WHERE id = 1} emit {@code ColumnRef(targetName, "id")} 17392 * instead of crashing on the non-null constraint on 17393 * {@link ColumnRef#ColumnRef(String, String)}. 17394 */ 17395 private static List<ColumnRef> collectColumnRefsTolerant( 17396 gudusoft.gsqlparser.nodes.TParseTreeNode root, 17397 final NameBindingProvider provider, 17398 final String fallbackRelationAlias) { 17399 return collectColumnRefsTolerant(root, provider, fallbackRelationAlias, 17400 Collections.<TExpression>emptySet()); 17401 } 17402 17403 /** 17404 * Slice 111 — variant of the slice-92 tolerant collector that also 17405 * skips any descendants of {@code skipRoots} (extracted predicate 17406 * subquery wrappers). Mirrors the 17407 * {@link #collectColumnRefsSkipping} skipping behavior so DELETE 17408 * WHERE-side IN-SELECT / EXISTS / scalar-comparison wrappers 17409 * extracted by 17410 * {@link #extractUncorrelatedPredicateSubqueriesFromClause} are 17411 * not double-collected as outer filter refs on the MySQL self-ref 17412 * DELETE path. For the non-self-ref DELETE path the 17413 * {@link #collectColumnRefsSkipping} helper handles the same job; 17414 * this helper exists only for the slice-92 path which needs the 17415 * tolerant binding behavior to survive Resolver2's 17416 * NOT_FOUND / NON_EXACT bindings on unqualified self-ref refs. 17417 */ 17418 private static List<ColumnRef> collectColumnRefsTolerant( 17419 gudusoft.gsqlparser.nodes.TParseTreeNode root, 17420 final NameBindingProvider provider, 17421 final String fallbackRelationAlias, 17422 final Set<TExpression> skipRoots) { 17423 final LinkedHashSet<ColumnRef> refs = new LinkedHashSet<>(); 17424 // Root fast path: if root IS a skipped TExpression subtree, return empty. 
17425 if (root instanceof TExpression && skipRoots.contains(root)) { 17426 return new ArrayList<>(refs); 17427 } 17428 root.acceptChildren(new TParseTreeVisitor() { 17429 int nestedSelectDepth = 0; 17430 int skipDepth = 0; 17431 17432 @Override 17433 public void preVisit(TExpression e) { 17434 if (skipRoots.contains(e)) skipDepth++; 17435 } 17436 17437 @Override 17438 public void postVisit(TExpression e) { 17439 if (skipRoots.contains(e) && skipDepth > 0) skipDepth--; 17440 } 17441 17442 @Override 17443 public void preVisit(TSelectSqlStatement nested) { 17444 nestedSelectDepth++; 17445 } 17446 17447 @Override 17448 public void postVisit(TSelectSqlStatement nested) { 17449 nestedSelectDepth--; 17450 } 17451 17452 @Override 17453 public void preVisit(TObjectName node) { 17454 if (skipDepth > 0) return; 17455 if (nestedSelectDepth > 0) return; 17456 if (node.getDbObjectType() != EDbObjectType.column) return; 17457 String name = node.getColumnNameOnly(); 17458 if (name == null || "*".equals(name)) return; 17459 ColumnBinding binding = provider.bindColumn(node); 17460 if (binding != null 17461 && binding.getStatus() == ResolutionStatus.EXACT_MATCH) { 17462 refs.add(new ColumnRef( 17463 binding.getRelationAlias(), binding.getColumnName())); 17464 } else { 17465 // Non-exact or null binding: prefer SQL-written qualifier; 17466 // fall back to the single delete-target name so the 17467 // ColumnRef non-null constraint is satisfied. 17468 String qualifier = node.getTableString(); 17469 String alias = (qualifier != null && !qualifier.isEmpty()) 17470 ? qualifier : fallbackRelationAlias; 17471 refs.add(new ColumnRef(alias, name)); 17472 } 17473 } 17474 }); 17475 return new ArrayList<>(refs); 17476 } 17477 17478 /** 17479 * Thrown when the input falls outside current builder scope or a 17480 * binding fails. 
Slice 67 attached a {@link Diagnostic} to every 17481 * throw site so external callers can pattern-match on 17482 * {@link DiagnosticCode} rather than parsing message text. The 17483 * legacy {@code (String)} constructor was removed in slice 67; 17484 * use one of the {@link Diagnostic#error} factories and the 17485 * {@link #SemanticIRBuildException(Diagnostic)} constructor. 17486 */ 17487 public static final class SemanticIRBuildException extends RuntimeException { 17488 private final Diagnostic diagnostic; 17489 17490 public SemanticIRBuildException(Diagnostic diagnostic) { 17491 super(java.util.Objects.requireNonNull(diagnostic, "diagnostic").getMessage()); 17492 this.diagnostic = diagnostic; 17493 } 17494 17495 /** 17496 * @return the structured diagnostic for this rejection. Always 17497 * non-null after slice 67. 17498 */ 17499 public Diagnostic getDiagnostic() { 17500 return diagnostic; 17501 } 17502 } 17503}