001package gudusoft.gsqlparser.ir.semantic; 002 003import java.util.ArrayList; 004import java.util.Collections; 005import java.util.List; 006 007/** 008 * Semantic shape of one SQL statement. Currently covers SELECT. 009 * 010 * <p>{@link #name} is non-null when this statement is the body of a named 011 * CTE or a FROM-clause subquery. For top-level outer SELECTs it is null. 012 * 013 * <p>{@link #filterColumnRefs}, {@link #joinColumnRefs}, 014 * {@link #groupByColumnRefs}, {@link #havingColumnRefs}, and 015 * {@link #orderByColumnRefs} are flat lists of column references that 016 * appear in the WHERE, JOIN predicate (ON / USING), GROUP BY, HAVING, 017 * and ORDER BY clauses respectively. For {@code JOIN ... USING (k)} 018 * (slice 64) {@code joinColumnRefs} contains one ref per 019 * (relation, key) pair on both sides — left side first via 020 * catalog-aware narrowing, then the right side. The IR deliberately 021 * does <i>not</i> model structured 022 * {@code Filter}, {@code Join}, or {@code GroupBy} nodes with predicate 023 * trees yet; later slices will add them. Listing the affected columns 024 * is enough to answer the roadmap's questions about 025 * filter/join/grouping/having/ordering influence. 026 * 027 * <p>{@link #orderByColumnRefs} only ever contains references to physical 028 * (base or in-statement) columns. Ordinal references ({@code ORDER BY 1}) 029 * and bare-constant sort keys are rejected by the builder — emitting 030 * {@code []} for them would lose the dependency information silently. 031 * 032 * <p>Slice 9 (single-SELECT) rejects projection-alias references like 033 * {@code SELECT id AS x ... ORDER BY x}. Slice 21 (set-op outer) 034 * <i>accepts</i> alias references positionally against branch[0]'s 035 * outputs — the alias IS the set-op output schema. The two paths 036 * diverge intentionally; see 037 * {@code SemanticIRBuilder.buildOrderByColumnRefs} (slice 9) versus 038 * {@code SemanticIRBuilder.buildSetOpOuterOrderByColumnRefs} (slice 21). 039 */ 040public final class StatementGraph { 041 042 private final String name; 043 private final String kind; 044 private final List<RelationSource> relations; 045 private final List<OutputColumn> outputColumns; 046 private final List<OutputColumn> returningColumns; 047 private final List<ColumnRef> filterColumnRefs; 048 private final List<ColumnRef> joinColumnRefs; 049 private final List<ColumnRef> groupByColumnRefs; 050 private final List<ColumnRef> havingColumnRefs; 051 private final List<ColumnRef> orderByColumnRefs; 052 private final List<ColumnRef> distinctOnColumnRefs; 053 private final boolean distinct; 054 private final SetOperator setOperator; 055 private final RowLimit rowLimit; 056 private final TargetRelation target; 057 058 /** 059 * Slice 85 primary constructor — adds the optional 060 * {@code returningColumns} slot for INSERT / UPDATE / DELETE RETURNING 061 * (PG / Oracle) and OUTPUT (SQL Server) projections. The slot is 062 * always non-null (use {@link Collections#emptyList()} when absent); 063 * non-empty only on DML statements that supplied a RETURNING or 064 * OUTPUT projection. 065 */ 066 public StatementGraph(String name, 067 String kind, 068 List<RelationSource> relations, 069 List<OutputColumn> outputColumns, 070 List<OutputColumn> returningColumns, 071 List<ColumnRef> filterColumnRefs, 072 List<ColumnRef> joinColumnRefs, 073 List<ColumnRef> groupByColumnRefs, 074 List<ColumnRef> havingColumnRefs, 075 List<ColumnRef> orderByColumnRefs, 076 List<ColumnRef> distinctOnColumnRefs, 077 boolean distinct, 078 SetOperator setOperator, 079 RowLimit rowLimit, 080 TargetRelation target) { 081 if (kind == null || kind.isEmpty()) { 082 throw new IllegalArgumentException("kind must be non-empty"); 083 } 084 if (relations == null || outputColumns == null 085 || returningColumns == null 086 || filterColumnRefs == null || joinColumnRefs == null 087 || groupByColumnRefs == null || havingColumnRefs == null 088 || orderByColumnRefs == null || distinctOnColumnRefs == null) { 089 throw new IllegalArgumentException( 090 "relations/outputColumns/returningColumns/filterColumnRefs/joinColumnRefs/" 091 + "groupByColumnRefs/havingColumnRefs/orderByColumnRefs/" 092 + "distinctOnColumnRefs must not be null"); 093 } 094 this.name = (name != null && name.isEmpty()) ? null : name; 095 this.kind = kind; 096 this.relations = Collections.unmodifiableList(new ArrayList<>(relations)); 097 this.outputColumns = Collections.unmodifiableList(new ArrayList<>(outputColumns)); 098 this.returningColumns = Collections.unmodifiableList(new ArrayList<>(returningColumns)); 099 this.filterColumnRefs = Collections.unmodifiableList(new ArrayList<>(filterColumnRefs)); 100 this.joinColumnRefs = Collections.unmodifiableList(new ArrayList<>(joinColumnRefs)); 101 this.groupByColumnRefs = Collections.unmodifiableList(new ArrayList<>(groupByColumnRefs)); 102 this.havingColumnRefs = Collections.unmodifiableList(new ArrayList<>(havingColumnRefs)); 103 this.orderByColumnRefs = Collections.unmodifiableList(new ArrayList<>(orderByColumnRefs)); 104 this.distinctOnColumnRefs = Collections.unmodifiableList(new ArrayList<>(distinctOnColumnRefs)); 105 this.distinct = distinct; 106 this.setOperator = setOperator; 107 this.rowLimit = rowLimit; 108 this.target = target; 109 } 110 111 /** 112 * Slice 78 constructor preserved so production code that predates 113 * slice 85 keeps compiling unchanged. Delegates to the slice-85 114 * primary constructor with empty {@code returningColumns}. 115 */ 116 public StatementGraph(String name, 117 String kind, 118 List<RelationSource> relations, 119 List<OutputColumn> outputColumns, 120 List<ColumnRef> filterColumnRefs, 121 List<ColumnRef> joinColumnRefs, 122 List<ColumnRef> groupByColumnRefs, 123 List<ColumnRef> havingColumnRefs, 124 List<ColumnRef> orderByColumnRefs, 125 List<ColumnRef> distinctOnColumnRefs, 126 boolean distinct, 127 SetOperator setOperator, 128 RowLimit rowLimit, 129 TargetRelation target) { 130 this(name, kind, relations, outputColumns, 131 Collections.<OutputColumn>emptyList(), 132 filterColumnRefs, joinColumnRefs, groupByColumnRefs, 133 havingColumnRefs, orderByColumnRefs, 134 distinctOnColumnRefs, 135 distinct, setOperator, rowLimit, target); 136 } 137 138 /** 139 * Slice 73 constructor preserved so SELECT-kind production code that 140 * predates slice 78 keeps compiling unchanged. Delegates to the 141 * slice-78 constructor with {@code target=null}. 142 */ 143 public StatementGraph(String name, 144 String kind, 145 List<RelationSource> relations, 146 List<OutputColumn> outputColumns, 147 List<ColumnRef> filterColumnRefs, 148 List<ColumnRef> joinColumnRefs, 149 List<ColumnRef> groupByColumnRefs, 150 List<ColumnRef> havingColumnRefs, 151 List<ColumnRef> orderByColumnRefs, 152 List<ColumnRef> distinctOnColumnRefs, 153 boolean distinct, 154 SetOperator setOperator, 155 RowLimit rowLimit) { 156 this(name, kind, relations, outputColumns, 157 filterColumnRefs, joinColumnRefs, groupByColumnRefs, 158 havingColumnRefs, orderByColumnRefs, 159 distinctOnColumnRefs, 160 distinct, setOperator, rowLimit, /*target=*/ null); 161 } 162 163 /** 164 * Pre-slice-73 constructor preserved so hand-built test fixtures 165 * (e.g. {@code SemanticIRProjectorBodyIndexesTest}) continue to 166 * compile without touching every call site. Delegates to the 167 * slice-73 constructor with an empty {@code distinctOnColumnRefs} 168 * list. New production code should call the slice-78 primary 169 * constructor directly. 170 */ 171 public StatementGraph(String name, 172 String kind, 173 List<RelationSource> relations, 174 List<OutputColumn> outputColumns, 175 List<ColumnRef> filterColumnRefs, 176 List<ColumnRef> joinColumnRefs, 177 List<ColumnRef> groupByColumnRefs, 178 List<ColumnRef> havingColumnRefs, 179 List<ColumnRef> orderByColumnRefs, 180 boolean distinct, 181 SetOperator setOperator, 182 RowLimit rowLimit) { 183 this(name, kind, relations, outputColumns, 184 filterColumnRefs, joinColumnRefs, groupByColumnRefs, 185 havingColumnRefs, orderByColumnRefs, 186 Collections.<ColumnRef>emptyList(), 187 distinct, setOperator, rowLimit); 188 } 189 190 /** Nullable: name for a CTE body or FROM-subquery alias, else null. */ 191 public String getName() { 192 return name; 193 } 194 195 public String getKind() { 196 return kind; 197 } 198 199 public List<RelationSource> getRelations() { 200 return relations; 201 } 202 203 public List<OutputColumn> getOutputColumns() { 204 return outputColumns; 205 } 206 207 /** 208 * Slice 85 — RETURNING / OUTPUT projection columns for INSERT / UPDATE / 209 * DELETE statements. Empty list on every SELECT-kind statement (CTE 210 * body / FROM-subquery / scalar / set-op branch / outer), on every 211 * DML statement that did not supply a RETURNING (PG / Oracle) or 212 * OUTPUT (SQL Server) clause, and on CTAS / CREATE VIEW statements. 213 * 214 * <p>For PG / Oracle RETURNING, each entry's 215 * {@link OutputColumn#getName()} is the explicit alias when present, 216 * else the verbatim bare column spelling. 217 * {@link OutputColumn#getSources()} lists the underlying column refs; 218 * the {@code relationAlias} resolves through the same provider used 219 * for SET RHS / WHERE / JOIN ON, so a joined-UPDATE with 220 * {@code RETURNING t.a, s.x} produces refs against both target and 221 * FROM-side relations. 222 * 223 * <p>For SQL Server OUTPUT pseudo-table refs (INSERTED.col, 224 * DELETED.col), the {@code relationAlias} is preserved as the 225 * uppercase pseudo-table name ({@code "INSERTED"} or 226 * {@code "DELETED"}) so consumers can distinguish post-write from 227 * pre-write row state. Lineage edges still flow to 228 * {@link LineageRef#tableColumn(String, String)} pointing at the 229 * physical target table column — both INSERTED and DELETED ultimately 230 * reference the same physical column; only the temporal phase differs. 231 */ 232 public List<OutputColumn> getReturningColumns() { 233 return returningColumns; 234 } 235 236 public List<ColumnRef> getFilterColumnRefs() { 237 return filterColumnRefs; 238 } 239 240 public List<ColumnRef> getJoinColumnRefs() { 241 return joinColumnRefs; 242 } 243 244 public List<ColumnRef> getGroupByColumnRefs() { 245 return groupByColumnRefs; 246 } 247 248 /** 249 * Column references that appear in the {@code HAVING} clause's 250 * predicate. The list is per-statement and per-clause: a HAVING 251 * predicate that names {@code d.id} contributes one entry; a HAVING 252 * predicate inside an aggregate ({@code HAVING SUM(salary) > 1000}) 253 * contributes the underlying column ({@code salary}) — the same 254 * convention used for projection-side aggregate arguments 255 * (slice 6 OutputColumn.sources). 256 * 257 * <p>Subqueries in HAVING (scalar, EXISTS, IN-SELECT, ANY/ALL/SOME) 258 * and window functions in HAVING are rejected by the builder rather 259 * than silently captured, because the visitor would descend into 260 * inner scopes and leak refs (mirrors the slice-9 ORDER BY guards). 261 * 262 * <p>HAVING is row-influence semantically (it filters out groups), 263 * but it deliberately does <i>not</i> contribute to the canonical 264 * lineage model (slice 7 / {@code CanonicalLineageEdge}). The 265 * canonical model is a parity contract between IR and dlineage, and 266 * dlineage exposes no per-clause HAVING field — it folds HAVING refs 267 * into aggregate-function fdr/fdd edges. Including HAVING-derived 268 * canonical edges only on the IR side would manufacture 269 * divergence-by-design. The {@code havingColumnRefs} field remains 270 * useful for downstream consumers (SQL Guard, lineage explainers) 271 * that don't depend on the dlineage parity contract. 272 */ 273 public List<ColumnRef> getHavingColumnRefs() { 274 return havingColumnRefs; 275 } 276 277 /** 278 * Column references that appear in the {@code ORDER BY} clause's sort 279 * keys. Only physical column references are recorded — ordinal 280 * ({@code ORDER BY 1}) and projection-alias ({@code ORDER BY x}) 281 * forms are rejected by the builder, not silently emitted as 282 * {@code []}. Sort direction ({@code ASC}/{@code DESC}) and null 283 * placement ({@code NULLS FIRST}/{@code NULLS LAST}) are presentation 284 * metadata and are not modelled. 285 * 286 * <p>The flag is per-statement: in 287 * {@code WITH x AS (... ORDER BY id) SELECT id FROM x} the inner 288 * statement's {@code orderByColumnRefs} contains {@code id} while the 289 * outer's is empty. 290 */ 291 public List<ColumnRef> getOrderByColumnRefs() { 292 return orderByColumnRefs; 293 } 294 295 /** 296 * Whether the statement applies row-deduplication. True for 297 * {@code SELECT DISTINCT}, Oracle's deprecated synonym 298 * {@code SELECT UNIQUE}, AND PostgreSQL / Greenplum 299 * {@code SELECT DISTINCT ON (cols)}; false for {@code SELECT}, 300 * {@code SELECT ALL}, and the absence of any row-filter clause. 301 * The flag is per-statement, never per-output. 302 * 303 * <p>For {@code DISTINCT ON (cols)} the partition keys live on 304 * {@link #getDistinctOnColumnRefs()}; the boolean here pins the 305 * semantic invariant that the statement deduplicates rows 306 * regardless of which key shape is used. 307 */ 308 public boolean isDistinct() { 309 return distinct; 310 } 311 312 /** 313 * Column references in the {@code DISTINCT ON (cols)} partition list 314 * (PostgreSQL / Greenplum). Empty for plain {@code SELECT DISTINCT}, 315 * {@code SELECT UNIQUE}, {@code SELECT ALL}, and the absence of any 316 * row-filter clause. 317 * 318 * <p>Invariant: {@code !distinctOnColumnRefs.isEmpty()} implies 319 * {@link #isDistinct()} == {@code true}. The reverse does not hold 320 * (plain {@code DISTINCT} also returns {@code true}). 321 * 322 * <p>The list collects physical column refs the same way 323 * {@code groupByColumnRefs} does: column refs inside compound 324 * expressions ({@code a + b}, {@code CASE WHEN ...}) and aggregate 325 * arguments ({@code COUNT(x)}) are descended into; subqueries and 326 * window functions in {@code DISTINCT ON} are rejected by the 327 * builder so they cannot leak inner-scope refs. 328 * 329 * <p>Oracle, MySQL, Redshift and other non-PG vendors silently 330 * accept {@code DISTINCT ON (...)} as plain {@code DISTINCT} — 331 * their parser drops the ON expression list, so this slot stays 332 * empty for those vendors regardless of the surface SQL. 333 */ 334 public List<ColumnRef> getDistinctOnColumnRefs() { 335 return distinctOnColumnRefs; 336 } 337 338 /** 339 * Set-operation kind for the outer statement of a set-op program 340 * (slice 12). Returns null on every regular SELECT statement and on 341 * every CTE / FROM-subquery / scalar / set-op-branch body. The 342 * {@code _ALL} variants encode {@code TSelectSqlStatement#isAll()}; 343 * {@code MINUS} (Oracle / Spark / Hive) and {@code EXCEPT} 344 * (PostgreSQL / SQL Server / standard) are kept distinct because the 345 * parser exposes them as separate 346 * {@link gudusoft.gsqlparser.ESetOperatorType} values, even though 347 * they are semantically equivalent. 348 */ 349 public SetOperator getSetOperator() { 350 return setOperator; 351 } 352 353 /** 354 * Per-statement row-limit metadata (slice 70). Returns null when no 355 * row-limit clause was present, or when the row-limit clause is in 356 * slice-71 / 72 territory (TOP, standalone OFFSET, PG inline 357 * {@code LIMIT N OFFSET M}, MySQL inline {@code LIMIT M, N}, 358 * set-op outer row-limit) — those surfaces continue to be rejected 359 * by the builder with their existing diagnostic codes. 360 * 361 * <p>When non-null, the {@link RowLimit#getKind()} captures which 362 * surface SQL form was used ({@code LIMIT} vs {@code FETCH FIRST}) 363 * and {@link RowLimit#getCount()} captures the verbatim count text. 364 * 365 * <p>Row-limit metadata does <i>not</i> change column lineage. The 366 * canonical lineage model (slice 7 / {@code CanonicalLineageEdge}) 367 * deliberately ignores it: row-limit is presentation-time pruning, 368 * not a column-flow influence. ORDER BY refs, output sources, 369 * filter / join / group-by / having refs are all unaffected. 370 */ 371 public RowLimit getRowLimit() { 372 return rowLimit; 373 } 374 375 /** 376 * Slice 78 — write-side target for INSERT statements. Non-null only on 377 * {@code "INSERT"}-kind statements; null on every {@code "SELECT"}-kind 378 * statement (whether the SELECT is an outer, CTE body, FROM-subquery 379 * body, scalar-subquery body, or set-op branch). 380 * 381 * <p>When non-null, {@link TargetRelation#getBinding()} is the target 382 * table (kind = {@link RelationKind#TABLE}) and 383 * {@link TargetRelation#getColumns()} holds the verbatim SQL column-list 384 * spellings (empty list when the SQL author omitted the column list). 385 * 386 * <p>Cross-statement {@link LineageEdge}s for INSERT use 387 * {@link LineageRef#tableColumn(String, String)} as the {@code from} 388 * endpoint (target_table, target_col) and 389 * {@link LineageRef#statementOutput(int, String)} as the {@code to} 390 * endpoint (source SELECT body statement index + output name). 391 */ 392 public TargetRelation getTarget() { 393 return target; 394 } 395}