001package gudusoft.gsqlparser.ir.semantic;
002
003import java.util.ArrayList;
004import java.util.Collections;
005import java.util.List;
006
007/**
008 * Semantic shape of one SQL statement. Currently covers SELECT.
009 *
010 * <p>{@link #name} is non-null when this statement is the body of a named
011 * CTE or a FROM-clause subquery. For top-level outer SELECTs it is null.
012 *
013 * <p>{@link #filterColumnRefs}, {@link #joinColumnRefs},
014 * {@link #groupByColumnRefs}, {@link #havingColumnRefs}, and
015 * {@link #orderByColumnRefs} are flat lists of column references that
016 * appear in the WHERE, JOIN predicate (ON / USING), GROUP BY, HAVING,
017 * and ORDER BY clauses respectively. For {@code JOIN ... USING (k)}
018 * (slice 64) {@code joinColumnRefs} contains one ref per
019 * (relation, key) pair on both sides — left side first via
020 * catalog-aware narrowing, then the right side. The IR deliberately
021 * does <i>not</i> model structured
022 * {@code Filter}, {@code Join}, or {@code GroupBy} nodes with predicate
023 * trees yet; later slices will add them. Listing the affected columns
024 * is enough to answer the roadmap's questions about
025 * filter/join/grouping/having/ordering influence.
026 *
027 * <p>{@link #orderByColumnRefs} only ever contains references to physical
028 * (base or in-statement) columns. Ordinal references ({@code ORDER BY 1})
029 * and bare-constant sort keys are rejected by the builder — emitting
030 * {@code []} for them would lose the dependency information silently.
031 *
032 * <p>Slice 9 (single-SELECT) rejects projection-alias references like
033 * {@code SELECT id AS x ... ORDER BY x}. Slice 21 (set-op outer)
034 * <i>accepts</i> alias references positionally against branch[0]'s
035 * outputs — the alias IS the set-op output schema. The two paths
036 * diverge intentionally; see
037 * {@code SemanticIRBuilder.buildOrderByColumnRefs} (slice 9) versus
038 * {@code SemanticIRBuilder.buildSetOpOuterOrderByColumnRefs} (slice 21).
039 */
040public final class StatementGraph {
041
042    private final String name;
043    private final String kind;
044    private final List<RelationSource> relations;
045    private final List<OutputColumn> outputColumns;
046    private final List<OutputColumn> returningColumns;
047    private final List<ColumnRef> filterColumnRefs;
048    private final List<ColumnRef> joinColumnRefs;
049    private final List<ColumnRef> groupByColumnRefs;
050    private final List<ColumnRef> havingColumnRefs;
051    private final List<ColumnRef> orderByColumnRefs;
052    private final List<ColumnRef> distinctOnColumnRefs;
053    private final boolean distinct;
054    private final SetOperator setOperator;
055    private final RowLimit rowLimit;
056    private final TargetRelation target;
057
058    /**
059     * Slice 85 primary constructor — adds the optional
060     * {@code returningColumns} slot for INSERT / UPDATE / DELETE RETURNING
061     * (PG / Oracle) and OUTPUT (SQL Server) projections. The slot is
062     * always non-null (use {@link Collections#emptyList()} when absent);
063     * non-empty only on DML statements that supplied a RETURNING or
064     * OUTPUT projection.
065     */
066    public StatementGraph(String name,
067                          String kind,
068                          List<RelationSource> relations,
069                          List<OutputColumn> outputColumns,
070                          List<OutputColumn> returningColumns,
071                          List<ColumnRef> filterColumnRefs,
072                          List<ColumnRef> joinColumnRefs,
073                          List<ColumnRef> groupByColumnRefs,
074                          List<ColumnRef> havingColumnRefs,
075                          List<ColumnRef> orderByColumnRefs,
076                          List<ColumnRef> distinctOnColumnRefs,
077                          boolean distinct,
078                          SetOperator setOperator,
079                          RowLimit rowLimit,
080                          TargetRelation target) {
081        if (kind == null || kind.isEmpty()) {
082            throw new IllegalArgumentException("kind must be non-empty");
083        }
084        if (relations == null || outputColumns == null
085                || returningColumns == null
086                || filterColumnRefs == null || joinColumnRefs == null
087                || groupByColumnRefs == null || havingColumnRefs == null
088                || orderByColumnRefs == null || distinctOnColumnRefs == null) {
089            throw new IllegalArgumentException(
090                    "relations/outputColumns/returningColumns/filterColumnRefs/joinColumnRefs/"
091                            + "groupByColumnRefs/havingColumnRefs/orderByColumnRefs/"
092                            + "distinctOnColumnRefs must not be null");
093        }
094        this.name = (name != null && name.isEmpty()) ? null : name;
095        this.kind = kind;
096        this.relations = Collections.unmodifiableList(new ArrayList<>(relations));
097        this.outputColumns = Collections.unmodifiableList(new ArrayList<>(outputColumns));
098        this.returningColumns = Collections.unmodifiableList(new ArrayList<>(returningColumns));
099        this.filterColumnRefs = Collections.unmodifiableList(new ArrayList<>(filterColumnRefs));
100        this.joinColumnRefs = Collections.unmodifiableList(new ArrayList<>(joinColumnRefs));
101        this.groupByColumnRefs = Collections.unmodifiableList(new ArrayList<>(groupByColumnRefs));
102        this.havingColumnRefs = Collections.unmodifiableList(new ArrayList<>(havingColumnRefs));
103        this.orderByColumnRefs = Collections.unmodifiableList(new ArrayList<>(orderByColumnRefs));
104        this.distinctOnColumnRefs = Collections.unmodifiableList(new ArrayList<>(distinctOnColumnRefs));
105        this.distinct = distinct;
106        this.setOperator = setOperator;
107        this.rowLimit = rowLimit;
108        this.target = target;
109    }
110
111    /**
112     * Slice 78 constructor preserved so production code that predates
113     * slice 85 keeps compiling unchanged. Delegates to the slice-85
114     * primary constructor with empty {@code returningColumns}.
115     */
116    public StatementGraph(String name,
117                          String kind,
118                          List<RelationSource> relations,
119                          List<OutputColumn> outputColumns,
120                          List<ColumnRef> filterColumnRefs,
121                          List<ColumnRef> joinColumnRefs,
122                          List<ColumnRef> groupByColumnRefs,
123                          List<ColumnRef> havingColumnRefs,
124                          List<ColumnRef> orderByColumnRefs,
125                          List<ColumnRef> distinctOnColumnRefs,
126                          boolean distinct,
127                          SetOperator setOperator,
128                          RowLimit rowLimit,
129                          TargetRelation target) {
130        this(name, kind, relations, outputColumns,
131                Collections.<OutputColumn>emptyList(),
132                filterColumnRefs, joinColumnRefs, groupByColumnRefs,
133                havingColumnRefs, orderByColumnRefs,
134                distinctOnColumnRefs,
135                distinct, setOperator, rowLimit, target);
136    }
137
138    /**
139     * Slice 73 constructor preserved so SELECT-kind production code that
140     * predates slice 78 keeps compiling unchanged. Delegates to the
141     * slice-78 constructor with {@code target=null}.
142     */
143    public StatementGraph(String name,
144                          String kind,
145                          List<RelationSource> relations,
146                          List<OutputColumn> outputColumns,
147                          List<ColumnRef> filterColumnRefs,
148                          List<ColumnRef> joinColumnRefs,
149                          List<ColumnRef> groupByColumnRefs,
150                          List<ColumnRef> havingColumnRefs,
151                          List<ColumnRef> orderByColumnRefs,
152                          List<ColumnRef> distinctOnColumnRefs,
153                          boolean distinct,
154                          SetOperator setOperator,
155                          RowLimit rowLimit) {
156        this(name, kind, relations, outputColumns,
157                filterColumnRefs, joinColumnRefs, groupByColumnRefs,
158                havingColumnRefs, orderByColumnRefs,
159                distinctOnColumnRefs,
160                distinct, setOperator, rowLimit, /*target=*/ null);
161    }
162
163    /**
164     * Pre-slice-73 constructor preserved so hand-built test fixtures
165     * (e.g. {@code SemanticIRProjectorBodyIndexesTest}) continue to
166     * compile without touching every call site. Delegates to the
167     * slice-73 constructor with an empty {@code distinctOnColumnRefs}
168     * list. New production code should call the slice-78 primary
169     * constructor directly.
170     */
171    public StatementGraph(String name,
172                          String kind,
173                          List<RelationSource> relations,
174                          List<OutputColumn> outputColumns,
175                          List<ColumnRef> filterColumnRefs,
176                          List<ColumnRef> joinColumnRefs,
177                          List<ColumnRef> groupByColumnRefs,
178                          List<ColumnRef> havingColumnRefs,
179                          List<ColumnRef> orderByColumnRefs,
180                          boolean distinct,
181                          SetOperator setOperator,
182                          RowLimit rowLimit) {
183        this(name, kind, relations, outputColumns,
184                filterColumnRefs, joinColumnRefs, groupByColumnRefs,
185                havingColumnRefs, orderByColumnRefs,
186                Collections.<ColumnRef>emptyList(),
187                distinct, setOperator, rowLimit);
188    }
189
190    /** Nullable: name for a CTE body or FROM-subquery alias, else null. */
191    public String getName() {
192        return name;
193    }
194
195    public String getKind() {
196        return kind;
197    }
198
199    public List<RelationSource> getRelations() {
200        return relations;
201    }
202
203    public List<OutputColumn> getOutputColumns() {
204        return outputColumns;
205    }
206
207    /**
208     * Slice 85 — RETURNING / OUTPUT projection columns for INSERT / UPDATE /
209     * DELETE statements. Empty list on every SELECT-kind statement (CTE
210     * body / FROM-subquery / scalar / set-op branch / outer), on every
211     * DML statement that did not supply a RETURNING (PG / Oracle) or
212     * OUTPUT (SQL Server) clause, and on CTAS / CREATE VIEW statements.
213     *
214     * <p>For PG / Oracle RETURNING, each entry's
215     * {@link OutputColumn#getName()} is the explicit alias when present,
216     * else the verbatim bare column spelling.
217     * {@link OutputColumn#getSources()} lists the underlying column refs;
218     * the {@code relationAlias} resolves through the same provider used
219     * for SET RHS / WHERE / JOIN ON, so a joined-UPDATE with
220     * {@code RETURNING t.a, s.x} produces refs against both target and
221     * FROM-side relations.
222     *
223     * <p>For SQL Server OUTPUT pseudo-table refs (INSERTED.col,
224     * DELETED.col), the {@code relationAlias} is preserved as the
225     * uppercase pseudo-table name ({@code "INSERTED"} or
226     * {@code "DELETED"}) so consumers can distinguish post-write from
227     * pre-write row state. Lineage edges still flow to
228     * {@link LineageRef#tableColumn(String, String)} pointing at the
229     * physical target table column — both INSERTED and DELETED ultimately
230     * reference the same physical column; only the temporal phase differs.
231     */
232    public List<OutputColumn> getReturningColumns() {
233        return returningColumns;
234    }
235
236    public List<ColumnRef> getFilterColumnRefs() {
237        return filterColumnRefs;
238    }
239
240    public List<ColumnRef> getJoinColumnRefs() {
241        return joinColumnRefs;
242    }
243
244    public List<ColumnRef> getGroupByColumnRefs() {
245        return groupByColumnRefs;
246    }
247
248    /**
249     * Column references that appear in the {@code HAVING} clause's
250     * predicate. The list is per-statement and per-clause: a HAVING
251     * predicate that names {@code d.id} contributes one entry; a HAVING
252     * predicate inside an aggregate ({@code HAVING SUM(salary) > 1000})
253     * contributes the underlying column ({@code salary}) — the same
254     * convention used for projection-side aggregate arguments
255     * (slice 6 OutputColumn.sources).
256     *
257     * <p>Subqueries in HAVING (scalar, EXISTS, IN-SELECT, ANY/ALL/SOME)
258     * and window functions in HAVING are rejected by the builder rather
259     * than silently captured, because the visitor would descend into
260     * inner scopes and leak refs (mirrors the slice-9 ORDER BY guards).
261     *
262     * <p>HAVING is row-influence semantically (it filters out groups),
263     * but it deliberately does <i>not</i> contribute to the canonical
264     * lineage model (slice 7 / {@code CanonicalLineageEdge}). The
265     * canonical model is a parity contract between IR and dlineage, and
266     * dlineage exposes no per-clause HAVING field — it folds HAVING refs
267     * into aggregate-function fdr/fdd edges. Including HAVING-derived
268     * canonical edges only on the IR side would manufacture
269     * divergence-by-design. The {@code havingColumnRefs} field remains
270     * useful for downstream consumers (SQL Guard, lineage explainers)
271     * that don't depend on the dlineage parity contract.
272     */
273    public List<ColumnRef> getHavingColumnRefs() {
274        return havingColumnRefs;
275    }
276
277    /**
278     * Column references that appear in the {@code ORDER BY} clause's sort
279     * keys. Only physical column references are recorded — ordinal
280     * ({@code ORDER BY 1}) and projection-alias ({@code ORDER BY x})
281     * forms are rejected by the builder, not silently emitted as
282     * {@code []}. Sort direction ({@code ASC}/{@code DESC}) and null
283     * placement ({@code NULLS FIRST}/{@code NULLS LAST}) are presentation
284     * metadata and are not modelled.
285     *
286     * <p>The flag is per-statement: in
287     * {@code WITH x AS (... ORDER BY id) SELECT id FROM x} the inner
288     * statement's {@code orderByColumnRefs} contains {@code id} while the
289     * outer's is empty.
290     */
291    public List<ColumnRef> getOrderByColumnRefs() {
292        return orderByColumnRefs;
293    }
294
295    /**
296     * Whether the statement applies row-deduplication. True for
297     * {@code SELECT DISTINCT}, Oracle's deprecated synonym
298     * {@code SELECT UNIQUE}, AND PostgreSQL / Greenplum
299     * {@code SELECT DISTINCT ON (cols)}; false for {@code SELECT},
300     * {@code SELECT ALL}, and the absence of any row-filter clause.
301     * The flag is per-statement, never per-output.
302     *
303     * <p>For {@code DISTINCT ON (cols)} the partition keys live on
304     * {@link #getDistinctOnColumnRefs()}; the boolean here pins the
305     * semantic invariant that the statement deduplicates rows
306     * regardless of which key shape is used.
307     */
308    public boolean isDistinct() {
309        return distinct;
310    }
311
312    /**
313     * Column references in the {@code DISTINCT ON (cols)} partition list
314     * (PostgreSQL / Greenplum). Empty for plain {@code SELECT DISTINCT},
315     * {@code SELECT UNIQUE}, {@code SELECT ALL}, and the absence of any
316     * row-filter clause.
317     *
318     * <p>Invariant: {@code !distinctOnColumnRefs.isEmpty()} implies
319     * {@link #isDistinct()} == {@code true}. The reverse does not hold
320     * (plain {@code DISTINCT} also returns {@code true}).
321     *
322     * <p>The list collects physical column refs the same way
323     * {@code groupByColumnRefs} does: column refs inside compound
324     * expressions ({@code a + b}, {@code CASE WHEN ...}) and aggregate
325     * arguments ({@code COUNT(x)}) are descended into; subqueries and
326     * window functions in {@code DISTINCT ON} are rejected by the
327     * builder so they cannot leak inner-scope refs.
328     *
329     * <p>Oracle, MySQL, Redshift and other non-PG vendors silently
330     * accept {@code DISTINCT ON (...)} as plain {@code DISTINCT} —
331     * their parser drops the ON expression list, so this slot stays
332     * empty for those vendors regardless of the surface SQL.
333     */
334    public List<ColumnRef> getDistinctOnColumnRefs() {
335        return distinctOnColumnRefs;
336    }
337
338    /**
339     * Set-operation kind for the outer statement of a set-op program
340     * (slice 12). Returns null on every regular SELECT statement and on
341     * every CTE / FROM-subquery / scalar / set-op-branch body. The
342     * {@code _ALL} variants encode {@code TSelectSqlStatement#isAll()};
343     * {@code MINUS} (Oracle / Spark / Hive) and {@code EXCEPT}
344     * (PostgreSQL / SQL Server / standard) are kept distinct because the
345     * parser exposes them as separate
346     * {@link gudusoft.gsqlparser.ESetOperatorType} values, even though
347     * they are semantically equivalent.
348     */
349    public SetOperator getSetOperator() {
350        return setOperator;
351    }
352
353    /**
354     * Per-statement row-limit metadata (slice 70). Returns null when no
355     * row-limit clause was present, or when the row-limit clause is in
356     * slice-71 / 72 territory (TOP, standalone OFFSET, PG inline
357     * {@code LIMIT N OFFSET M}, MySQL inline {@code LIMIT M, N},
358     * set-op outer row-limit) — those surfaces continue to be rejected
359     * by the builder with their existing diagnostic codes.
360     *
361     * <p>When non-null, the {@link RowLimit#getKind()} captures which
362     * surface SQL form was used ({@code LIMIT} vs {@code FETCH FIRST})
363     * and {@link RowLimit#getCount()} captures the verbatim count text.
364     *
365     * <p>Row-limit metadata does <i>not</i> change column lineage. The
366     * canonical lineage model (slice 7 / {@code CanonicalLineageEdge})
367     * deliberately ignores it: row-limit is presentation-time pruning,
368     * not a column-flow influence. ORDER BY refs, output sources,
369     * filter / join / group-by / having refs are all unaffected.
370     */
371    public RowLimit getRowLimit() {
372        return rowLimit;
373    }
374
375    /**
376     * Slice 78 — write-side target for INSERT statements. Non-null only on
377     * {@code "INSERT"}-kind statements; null on every {@code "SELECT"}-kind
378     * statement (whether the SELECT is an outer, CTE body, FROM-subquery
379     * body, scalar-subquery body, or set-op branch).
380     *
381     * <p>When non-null, {@link TargetRelation#getBinding()} is the target
382     * table (kind = {@link RelationKind#TABLE}) and
383     * {@link TargetRelation#getColumns()} holds the verbatim SQL column-list
384     * spellings (empty list when the SQL author omitted the column list).
385     *
386     * <p>Cross-statement {@link LineageEdge}s for INSERT use
387     * {@link LineageRef#tableColumn(String, String)} as the {@code from}
388     * endpoint (target_table, target_col) and
389     * {@link LineageRef#statementOutput(int, String)} as the {@code to}
390     * endpoint (source SELECT body statement index + output name).
391     */
392    public TargetRelation getTarget() {
393        return target;
394    }
395}