001package gudusoft.gsqlparser.pp2.region;
002
003import gudusoft.gsqlparser.EDbVendor;
004import gudusoft.gsqlparser.ETokenType;
005import gudusoft.gsqlparser.TSourceToken;
006import gudusoft.gsqlparser.pp2.token.Pp2Token;
007import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;
008import gudusoft.gsqlparser.pp2.token.TokenRole;
009
010import java.util.ArrayList;
011import java.util.HashSet;
012import java.util.List;
013import java.util.Locale;
014import java.util.Set;
015
016/**
017 * Walks a {@link Pp2TokenStream} and produces a list of
018 * {@link StatementRange}s — one per statement in the source.
019 *
020 * <p>Terminators recognized:
021 * <ul>
022 *   <li>{@code ;} — standard SQL statement separator. Every dialect.</li>
023 *   <li>{@code GO} (keyword) — SQL Server / Sybase batch separator,
024 *       recognized only when the vendor is in the T-SQL family.</li>
025 * </ul>
026 *
027 * <h2>What this detector handles correctly</h2>
028 *
029 * <ul>
030 *   <li><b>Strings containing {@code ;}</b> — string literals tokenize as
031 *       a single {@code ttsqstring} token; the inner {@code ;} never
032 *       appears as a {@code ttsemicolon} so no special case is needed.</li>
033 *   <li><b>PL/SQL block-internal {@code ;}</b> — a {@code BEGIN ... END;}
034 *       block has internal semicolons at depth ≥ 1; only depth-0
035 *       semicolons split statements. The detector tracks BEGIN/END depth
036 *       and treats {@code END IF / LOOP / WHILE / REPEAT / FOR} as
037 *       inner-construct ends that do not pop the block-depth counter
038 *       (those keywords don't push depth, so there's nothing to pop).</li>
039 *   <li><b>CASE expressions inside a {@code BEGIN} block</b> —
040 *       {@code CASE} pushes its own {@code caseDepth} counter so its
041 *       unqualified {@code END} doesn't accidentally pop the surrounding
042 *       block-depth. {@code END CASE} pops {@code caseDepth} (and the
043 *       trailing {@code CASE} keyword is consumed as a qualifier, not as
044 *       a new {@code CASE} push).</li>
045 *   <li><b>NO_FORMAT zones</b> — tokens flagged with
046 *       {@link TokenRole#NO_FORMAT_ZONE} (by S9's
047 *       {@code ProtectedZoneDetector}) are treated as opaque: depth
048 *       is not adjusted, terminators are not consumed.</li>
049 *   <li><b>Truncated final statement</b> — a stream that ends without a
050 *       terminator yields a final range with
051 *       {@link StatementRange.Terminator#NONE}.</li>
052 * </ul>
053 *
054 * <h2>What this detector deliberately does NOT handle</h2>
055 *
056 * <ul>
057 *   <li>Standalone Oracle {@code DECLARE ... BEGIN ... END;} blocks where
058 *       the {@code DECLARE} clause has its own {@code ;} at depth 0 — the
059 *       detector will split there. S22's island recognizer can refine.
060 *       PL/SQL in practice almost always wraps in {@code BEGIN}.</li>
061 *   <li>Oracle SQL*Plus {@code /} on its own line as a block terminator —
062 *       not in the plan §7.3/S11 scope. The {@code /} appears as a normal
063 *       token; the next statement starts after it.</li>
064 *   <li>SQL Server {@code GO N} count syntax — the {@code GO} is treated
065 *       as a terminator regardless of any trailing count.</li>
066 * </ul>
067 *
068 * <p>Plan reference: §7.3/S11, §7.4/S11.
069 */
070public final class StatementBoundaryDetector {
071
072    /**
073     * Inner-construct keywords that may follow {@code END}. Each takes a
074     * different code path:
075     *
076     * <ul>
077     *   <li>{@code IF / LOOP / WHILE / REPEAT / FOR} — these constructs
078     *       don't push any depth counter (their internal scope is left to
079     *       S22's island recognizer), so the matching {@code END FOO} is
080     *       a no-op for the depth model.</li>
081     *   <li>{@code CASE} — paired with a {@code CASE} keyword earlier that
082     *       pushed {@code caseDepth}. {@code END CASE} pops
083     *       {@code caseDepth}; the trailing {@code CASE} keyword is
084     *       consumed as a qualifier (does NOT push again).</li>
085     * </ul>
086     */
087    private static final Set<String> END_QUALIFIERS;
088    static {
089        Set<String> q = new HashSet<String>();
090        q.add("IF");
091        q.add("LOOP");
092        q.add("CASE");
093        q.add("WHILE");
094        q.add("REPEAT");
095        q.add("FOR");
096        END_QUALIFIERS = q;
097    }
098
099    /**
100     * Detect statement boundaries.
101     *
102     * @param stream non-null token stream (typically already annotated by
103     *               S9's {@code ProtectedZoneDetector})
104     * @param vendor non-null vendor identity; used to decide whether
105     *               {@code GO} is a terminator
106     * @return an immutable list of {@link StatementRange}s in source order
107     * @throws NullPointerException if either argument is null
108     */
109    public List<StatementRange> detect(Pp2TokenStream stream, EDbVendor vendor) {
110        if (stream == null) throw new NullPointerException("stream");
111        if (vendor == null) throw new NullPointerException("vendor");
112        final boolean goIsTerminator = isTsqlFamily(vendor);
113        final int n = stream.size();
114        List<StatementRange> out = new ArrayList<StatementRange>();
115        int rangeStart = 0;
116        int blockDepth = 0;
117        int caseDepth = 0;
118        // When an "END CASE" sequence pops caseDepth, the following CASE
119        // keyword is the qualifier — not a new CASE statement. Skip its
120        // push effect.
121        boolean swallowNextCaseAsQualifier = false;
122
123        for (int i = 0; i < n; i++) {
124            Pp2Token wrapped = stream.get(i);
125            // Skip tokens inside a NO_FORMAT zone — opaque, no depth or
126            // terminator effects.
127            if (wrapped.hasRole(TokenRole.NO_FORMAT_ZONE)) {
128                continue;
129            }
130            TSourceToken t = wrapped.getSourceToken();
131            ETokenType type = t.tokentype;
132            String text = wrapped.getText();
133
134            if (type == ETokenType.ttkeyword && text != null) {
135                String upper = text.toUpperCase(Locale.ROOT);
136                if ("BEGIN".equals(upper)) {
137                    blockDepth++;
138                } else if ("CASE".equals(upper)) {
139                    if (swallowNextCaseAsQualifier) {
140                        swallowNextCaseAsQualifier = false;
141                    } else {
142                        caseDepth++;
143                    }
144                } else if ("END".equals(upper)) {
145                    String nextQual = nextSolidKeywordText(stream, i);
146                    String nextUpper = nextQual == null
147                        ? null
148                        : nextQual.toUpperCase(Locale.ROOT);
149                    if (nextUpper != null && END_QUALIFIERS.contains(nextUpper)) {
150                        // Qualified END. END CASE pops caseDepth and
151                        // marks the trailing CASE as a consumed qualifier;
152                        // END IF/LOOP/WHILE/REPEAT/FOR are no-ops because
153                        // those keywords don't push.
154                        if ("CASE".equals(nextUpper)) {
155                            if (caseDepth > 0) caseDepth--;
156                            swallowNextCaseAsQualifier = true;
157                        }
158                    } else {
159                        // Unqualified END. Pop caseDepth first (CASE
160                        // expressions outrank block scope), then blockDepth.
161                        if (caseDepth > 0) {
162                            caseDepth--;
163                        } else if (blockDepth > 0) {
164                            blockDepth--;
165                        }
166                    }
167                }
168            }
169
170            // Terminators only fire at top-level depth.
171            if (blockDepth != 0 || caseDepth != 0) continue;
172
173            if (type == ETokenType.ttsemicolon) {
174                out.add(makeRange(stream, rangeStart, i + 1,
175                    StatementRange.Terminator.SEMICOLON));
176                rangeStart = i + 1;
177            } else if (goIsTerminator
178                && type == ETokenType.ttkeyword
179                && "GO".equalsIgnoreCase(text)) {
180                out.add(makeRange(stream, rangeStart, i + 1,
181                    StatementRange.Terminator.GO));
182                rangeStart = i + 1;
183            }
184        }
185
186        // Truncated final statement (no terminator at end).
187        if (rangeStart < n) {
188            out.add(makeRange(stream, rangeStart, n,
189                StatementRange.Terminator.NONE));
190        }
191        return java.util.Collections.unmodifiableList(out);
192    }
193
194    /**
195     * Return the text of the next solid (non-comment, non-empty) token
196     * after {@code fromIndex}, or {@code null} if there is none. Tokens
197     * inside a {@link TokenRole#NO_FORMAT_ZONE} are skipped — they're
198     * opaque to the depth model.
199     */
200    private static String nextSolidKeywordText(Pp2TokenStream stream, int fromIndex) {
201        for (int j = fromIndex + 1; j < stream.size(); j++) {
202            Pp2Token next = stream.get(j);
203            if (next.hasRole(TokenRole.NO_FORMAT_ZONE)) continue;
204            if (next.hasRole(TokenRole.COMMENT_LINE)
205                || next.hasRole(TokenRole.COMMENT_BLOCK)) {
206                continue;
207            }
208            String nextText = next.getText();
209            if (nextText == null || nextText.isEmpty()) continue;
210            return nextText;
211        }
212        return null;
213    }
214
215    private static StatementRange makeRange(Pp2TokenStream stream,
216                                            int startIdx, int endIdxExclusive,
217                                            StatementRange.Terminator terminator) {
218        int startOffset = startIdx < stream.size()
219            ? (int) stream.get(startIdx).getSourceToken().offset
220            : 0;
221        int endOffset = startOffset;
222        if (endIdxExclusive > startIdx) {
223            Pp2Token last = stream.get(endIdxExclusive - 1);
224            String txt = last.getText();
225            int textLen = txt == null ? 0 : txt.length();
226            endOffset = (int) last.getSourceToken().offset + textLen;
227        }
228        return new StatementRange(startIdx, endIdxExclusive,
229            startOffset, endOffset, terminator);
230    }
231
232    /**
233     * True when {@code GO} is a batch separator for the vendor. Currently
234     * SQL Server and Sybase (and OB-T-SQL where applicable).
235     */
236    private static boolean isTsqlFamily(EDbVendor vendor) {
237        if (vendor == null) return false;
238        switch (vendor) {
239            case dbvmssql:
240            case dbvsybase:
241                return true;
242            default:
243                return false;
244        }
245    }
246}