001package gudusoft.gsqlparser.pp2.island;
002
003import gudusoft.gsqlparser.ETokenType;
004import gudusoft.gsqlparser.pp2.island.SqlScopeDetector.SqlScopeResult;
005import gudusoft.gsqlparser.pp2.token.Pp2Token;
006import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;
007import gudusoft.gsqlparser.pp2.token.TokenRole;
008
009import java.util.ArrayList;
010import java.util.Collections;
011import java.util.HashMap;
012import java.util.HashSet;
013import java.util.List;
014import java.util.Locale;
015import java.util.Map;
016import java.util.Set;
017
018/**
019 * Annotates each token with the clause it belongs to ({@link ClausePart}) and
020 * marks master/clause keyword roles, by walking the token stream left-to-right
021 * and tracking the current clause per SQL scope level.
022 *
023 * <h2>SQL-scope aware</h2>
024 *
025 * <p>The annotator consumes {@link SqlScopeResult} (S20) so a subquery's
026 * clauses do not leak into the enclosing query: the "current clause" is tracked
027 * independently per scope level. When the walk descends into a deeper scope,
028 * that level's clause state starts fresh; when it returns, the enclosing level's
029 * clause (e.g. {@code FROM} around a derived-table subquery) is intact.
030 *
031 * <h2>Expression-depth gating</h2>
032 *
033 * <p>A clause keyword only changes the current clause when it appears at the
034 * query block's own paren depth — not inside a nested expression. This stops
035 * window specs ({@code OVER (ORDER BY a)}), aggregate filters
036 * ({@code FILTER (WHERE ...)}), and parenthesised predicates from corrupting
037 * the clause state. The function-form keywords {@code LEFT(} / {@code RIGHT(}
038 * are also recognised as function calls (not JOIN keywords) when immediately
039 * followed by an open paren.
040 *
041 * <h2>Designated annotator</h2>
042 *
043 * <p>S21 is one of the designated role-annotator stages (plan forbidden #6), so
044 * it sets {@link TokenRole#KEYWORD_MASTER} on master keywords
045 * (SELECT/INSERT/UPDATE/DELETE/MERGE) and {@link TokenRole#KEYWORD_CLAUSE} on
046 * recognised clause keywords. It never mutates the wrapped {@code TSourceToken}.
047 * The per-token clause membership is returned as a {@link ClauseScopeResult}
048 * side structure (a token belongs to exactly one clause, so a small enum is
049 * clearer than seven boolean roles).
050 *
051 * <p>Iterative single pass; no recursion. Plan reference: §7.3/S21, §7.4/S21.
052 */
053public final class ClauseScopeAnnotator {
054
055    private static final Set<String> MASTER_KEYWORDS = setOf(
056        "SELECT", "INSERT", "UPDATE", "DELETE", "MERGE");
057
058    /** Keyword text -> the clause it starts. */
059    private static final Map<String, ClausePart> CLAUSE_KEYWORDS = buildClauseKeywords();
060
061    private static Map<String, ClausePart> buildClauseKeywords() {
062        Map<String, ClausePart> m = new HashMap<String, ClausePart>();
063        m.put("FROM", ClausePart.FROM);
064        m.put("WHERE", ClausePart.WHERE);
065        m.put("GROUP", ClausePart.GROUP_BY);
066        m.put("ORDER", ClausePart.ORDER_BY);
067        m.put("HAVING", ClausePart.HAVING);
068        m.put("INTO", ClausePart.INTO);
069        m.put("VALUES", ClausePart.INSERT_VALUES);
070        m.put("SET", ClausePart.UPDATE_SET);
071        // JOIN family — all map to the JOIN clause.
072        for (String j : new String[]{"JOIN", "INNER", "LEFT", "RIGHT", "FULL",
073            "CROSS", "NATURAL", "OUTER", "ON", "USING", "APPLY"}) {
074            m.put(j, ClausePart.JOIN);
075        }
076        // Set operators.
077        for (String s : new String[]{"UNION", "INTERSECT", "EXCEPT", "MINUS"}) {
078            m.put(s, ClausePart.SET_OP);
079        }
080        return Collections.unmodifiableMap(m);
081    }
082
083    /**
084     * Annotate {@code stream} clause membership using the SQL scope levels.
085     *
086     * @param stream   the token stream; must not be null
087     * @param sqlScope the S20 scope result for the same stream; must not be null
088     *                 and must cover the same token count
089     * @return a {@link ClauseScopeResult}; never null
090     * @throws NullPointerException if any argument is null
091     * @throws IllegalArgumentException if {@code sqlScope.size()} != stream size
092     */
093    public ClauseScopeResult annotate(Pp2TokenStream stream, SqlScopeResult sqlScope) {
094        if (stream == null) throw new NullPointerException("stream");
095        if (sqlScope == null) throw new NullPointerException("sqlScope");
096        int n = stream.size();
097        if (sqlScope.size() != n) {
098            throw new IllegalArgumentException(
099                "sqlScope size " + sqlScope.size() + " != stream size " + n);
100        }
101
102        ClausePart[] part = new ClausePart[n];
103        List<Integer> masterIndices = new ArrayList<Integer>();
104
105        int levels = Math.max(1, sqlScope.getMaxLevel() + 1);
106        // Current clause per SQL scope level. Index by level.
107        ClausePart[] current = new ClausePart[levels];
108        java.util.Arrays.fill(current, ClausePart.NONE);
109        // The paren depth at which each level's clauses live (the query block's
110        // base). Clause keywords only transition when paren depth == base[level].
111        int[] base = new int[levels];
112
113        int parenDepth = 0;
114        int prevLevel = 0;
115        for (int i = 0; i < n; i++) {
116            Pp2Token t = stream.get(i);
117            int level = sqlScope.levelAt(i);
118
119            // Descending into deeper subquery scope(s) resets each newly entered
120            // level's clause and records its base paren depth (just inside the
121            // query paren). Reset every level in (prevLevel, level], not just the
122            // target, in case S20 levels ever step by more than one.
123            if (level > prevLevel) {
124                for (int l = prevLevel + 1; l <= level; l++) {
125                    current[l] = ClausePart.NONE;
126                    base[l] = parenDepth;
127                }
128            }
129            prevLevel = level;
130
131            ETokenType type = t.getSourceToken().tokentype;
132            if (type == ETokenType.ttsemicolon) {
133                // Statement boundary: reset clause state so the next statement's
134                // header does not inherit the previous statement's clause.
135                java.util.Arrays.fill(current, ClausePart.NONE);
136                java.util.Arrays.fill(base, 0);
137                parenDepth = 0;
138                part[i] = ClausePart.NONE;
139            } else if (type == ETokenType.ttleftparenthesis) {
140                part[i] = current[level];
141                parenDepth++;
142            } else if (type == ETokenType.ttrightparenthesis) {
143                if (parenDepth > 0) parenDepth--;
144                part[i] = current[level];
145            } else if (type == ETokenType.ttkeyword) {
146                String text = t.getText();
147                String upper = text == null ? "" : text.toUpperCase(Locale.ROOT);
148                boolean atBase = parenDepth == base[level];
149                if (MASTER_KEYWORDS.contains(upper)) {
150                    t.addRole(TokenRole.KEYWORD_MASTER);
151                    masterIndices.add(i);
152                    if (atBase) {
153                        // SELECT opens its projection list; other masters leave the
154                        // clause NONE until a recognised sub-clause keyword appears.
155                        current[level] = "SELECT".equals(upper)
156                            ? ClausePart.SELECT_LIST : ClausePart.NONE;
157                    }
158                    part[i] = current[level];
159                } else if (CLAUSE_KEYWORDS.containsKey(upper)
160                        && !isFunctionForm(upper, stream, i)
161                        && transitionAllowed(CLAUSE_KEYWORDS.get(upper), current[level],
162                                             atBase)) {
163                    // A genuine clause transition.
164                    t.addRole(TokenRole.KEYWORD_CLAUSE);
165                    current[level] = CLAUSE_KEYWORDS.get(upper);
166                    part[i] = current[level];
167                } else {
168                    // Generic keyword, OR a clause keyword inside an expression
169                    // (window/filter/func/parenthesised predicate): stays in the
170                    // current clause without transitioning.
171                    part[i] = current[level];
172                }
173            } else {
174                part[i] = current[level];
175            }
176        }
177
178        return new ClauseScopeResult(part, Collections.unmodifiableList(masterIndices));
179    }
180
181    /**
182     * Whether a clause keyword may transition the current clause given the
183     * paren context. Transitions normally require being at the query block's
184     * base paren depth ({@code atBase}). The exception is JOIN-family keywords
185     * inside a table-expression group such as {@code FROM (t1 LEFT JOIN t2 ON ...)}:
186     * those legitimately appear above base, but only when the current clause is
187     * already FROM or JOIN (the table region). Window/filter/expression keywords
188     * are not JOIN-family, so they remain gated by base depth.
189     */
190    private static boolean transitionAllowed(ClausePart mapped, ClausePart current,
191                                             boolean atBase) {
192        if (atBase) return true;
193        return mapped == ClausePart.JOIN
194            && (current == ClausePart.FROM || current == ClausePart.JOIN);
195    }
196
197    /** Keywords that double as function names; treated as functions when followed by '('. */
198    private static final Set<String> FUNCTION_FORM_KEYWORDS = setOf("LEFT", "RIGHT");
199
200    /**
201     * True if {@code upper} is a keyword that doubles as a function name (LEFT,
202     * RIGHT) and is immediately followed (skipping comments) by an open paren —
203     * i.e. it is a function call, not a JOIN keyword. Other clause keywords that
204     * legitimately precede a paren (FROM/JOIN/WHERE/ON/APPLY before a subquery or
205     * parenthesised predicate) are not treated as function forms.
206     */
207    private static boolean isFunctionForm(String upper, Pp2TokenStream stream, int i) {
208        if (!FUNCTION_FORM_KEYWORDS.contains(upper)) return false;
209        for (int j = i + 1; j < stream.size(); j++) {
210            ETokenType type = stream.get(j).getSourceToken().tokentype;
211            if (type == ETokenType.ttsimplecomment
212                || type == ETokenType.ttbracketedcomment
213                || type == ETokenType.ttCPPComment) {
214                continue;
215            }
216            return type == ETokenType.ttleftparenthesis;
217        }
218        return false;
219    }
220
221    private static Set<String> setOf(String... values) {
222        Set<String> s = new HashSet<String>();
223        Collections.addAll(s, values);
224        return Collections.unmodifiableSet(s);
225    }
226
227    /** Per-token clause membership plus the master-keyword token indices. */
228    public static final class ClauseScopeResult {
229        private final ClausePart[] part;
230        private final List<Integer> masterIndices;
231
232        ClauseScopeResult(ClausePart[] part, List<Integer> masterIndices) {
233            this.part = part;
234            this.masterIndices = masterIndices;
235        }
236
237        /** Number of tokens covered. */
238        public int size() { return part.length; }
239
240        /** The clause the token at {@code index} belongs to. */
241        public ClausePart partAt(int index) { return part[index]; }
242
243        /** Indices of master keyword tokens (SELECT/INSERT/UPDATE/DELETE/MERGE), in order. */
244        public List<Integer> getMasterIndices() { return masterIndices; }
245    }
246}