package gudusoft.gsqlparser.pp2.region;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.ETokenType;
import gudusoft.gsqlparser.TSourceToken;
import gudusoft.gsqlparser.pp2.token.Pp2Token;
import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;
import gudusoft.gsqlparser.pp2.token.TokenRole;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

/**
 * Walks a {@link Pp2TokenStream} and produces a list of
 * {@link StatementRange}s — one per statement in the source.
 *
 * <p>Terminators recognized:
 * <ul>
 *   <li>{@code ;} — standard SQL statement separator. Every dialect.</li>
 *   <li>{@code GO} (keyword) — SQL Server / Sybase batch separator,
 *       recognized only when the vendor is in the T-SQL family.</li>
 * </ul>
 *
 * <h2>What this detector handles correctly</h2>
 *
 * <ul>
 *   <li><b>Strings containing {@code ;}</b> — string literals tokenize as
 *       a single {@code ttsqstring} token; the inner {@code ;} never
 *       appears as a {@code ttsemicolon} so no special case is needed.</li>
 *   <li><b>PL/SQL block-internal {@code ;}</b> — a {@code BEGIN ... END;}
 *       block has internal semicolons at depth ≥ 1; only depth-0
 *       semicolons split statements. {@code END IF / LOOP / WHILE /
 *       REPEAT / FOR} are inner-construct ends that do not close any
 *       scope (those keywords don't open one, so there's nothing to
 *       close).</li>
 *   <li><b>CASE nested with {@code BEGIN}</b> — {@code BEGIN} and
 *       {@code CASE} push onto one shared scope stack, so an unqualified
 *       {@code END} always closes the innermost open scope regardless of
 *       how blocks and CASE constructs are interleaved. {@code END CASE}
 *       closes the nearest open {@code CASE}; the trailing {@code CASE}
 *       keyword is consumed as a qualifier, not as a new {@code CASE}.</li>
 *   <li><b>Transaction-starting {@code BEGIN}</b> — {@code BEGIN TRAN},
 *       {@code BEGIN TRANSACTION}, {@code BEGIN DISTRIBUTED ...},
 *       {@code BEGIN WORK} and a bare {@code BEGIN;} have no matching
 *       {@code END} and therefore do not open a scope.</li>
 *   <li><b>NO_FORMAT zones</b> — tokens flagged with
 *       {@link TokenRole#NO_FORMAT_ZONE} (by S9's
 *       {@code ProtectedZoneDetector}) are treated as opaque: depth
 *       is not adjusted, terminators are not consumed.</li>
 *   <li><b>Truncated final statement</b> — a stream that ends without a
 *       terminator yields a final range with
 *       {@link StatementRange.Terminator#NONE}.</li>
 * </ul>
 *
 * <h2>What this detector deliberately does NOT handle</h2>
 *
 * <ul>
 *   <li>Standalone Oracle {@code DECLARE ... BEGIN ... END;} blocks where
 *       the {@code DECLARE} clause has its own {@code ;} at depth 0 — the
 *       detector will split there. S22's island recognizer can refine.
 *       PL/SQL in practice almost always wraps in {@code BEGIN}.</li>
 *   <li>Oracle SQL*Plus {@code /} on its own line as a block terminator —
 *       not in the plan §7.3/S11 scope. The {@code /} appears as a normal
 *       token; the next statement starts after it.</li>
 *   <li>SQL Server {@code GO N} count syntax — the {@code GO} is treated
 *       as a terminator regardless of any trailing count.</li>
 *   <li>Transaction-start spellings other than the ones listed above
 *       (for example {@code BEGIN} followed by an isolation-level
 *       clause) — such a {@code BEGIN} still opens a scope.</li>
 * </ul>
 *
 * <p>Plan reference: §7.3/S11, §7.4/S11.
 */
public final class StatementBoundaryDetector {

    /** Kinds of construct that an unqualified {@code END} can close. */
    private enum Scope { BLOCK, CASE }

    /**
     * Inner-construct keywords that may follow {@code END}. Each takes a
     * different code path:
     *
     * <ul>
     *   <li>{@code IF / LOOP / WHILE / REPEAT / FOR} — these constructs
     *       don't open a scope (their internal structure is left to
     *       S22's island recognizer), so the matching {@code END FOO} is
     *       a no-op for the depth model.</li>
     *   <li>{@code CASE} — paired with an earlier {@code CASE} keyword
     *       that opened a {@link Scope#CASE}. {@code END CASE} closes it;
     *       the trailing {@code CASE} keyword is consumed as a qualifier
     *       (does NOT open a new scope).</li>
     * </ul>
     */
    private static final Set<String> END_QUALIFIERS =
            Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
                    "IF", "LOOP", "CASE", "WHILE", "REPEAT", "FOR")));

    /**
     * Words that, when they directly follow {@code BEGIN}, mark it as the
     * start of a transaction rather than of a {@code BEGIN ... END} block.
     */
    private static final Set<String> TRANSACTION_WORDS =
            Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
                    "TRAN", "TRANSACTION", "WORK", "DISTRIBUTED")));

    /**
     * Detect statement boundaries.
     *
     * @param stream non-null token stream (typically already annotated by
     *               S9's {@code ProtectedZoneDetector})
     * @param vendor non-null vendor identity; used to decide whether
     *               {@code GO} is a terminator
     * @return an immutable list of {@link StatementRange}s in source order
     * @throws NullPointerException if either argument is null
     */
    public List<StatementRange> detect(Pp2TokenStream stream, EDbVendor vendor) {
        if (stream == null) {
            throw new NullPointerException("stream");
        }
        if (vendor == null) {
            throw new NullPointerException("vendor");
        }
        final boolean goIsTerminator = isTsqlFamily(vendor);
        final int n = stream.size();
        final List<StatementRange> out = new ArrayList<StatementRange>();
        // Innermost open scope is at the tail. A single stack (rather than
        // separate BEGIN and CASE counters) keeps interleaved nesting such
        // as CASE ... BEGIN ... END ... END CASE balanced.
        final Deque<Scope> openScopes = new ArrayDeque<Scope>();
        int rangeStart = 0;
        // Index of the CASE token that serves as the qualifier of an
        // "END CASE" pair; that exact token must not open a new scope.
        int caseQualifierIndex = -1;

        for (int i = 0; i < n; i++) {
            Pp2Token wrapped = stream.get(i);
            // Skip tokens inside a NO_FORMAT zone — opaque, no depth or
            // terminator effects.
            if (wrapped.hasRole(TokenRole.NO_FORMAT_ZONE)) {
                continue;
            }
            TSourceToken t = wrapped.getSourceToken();
            ETokenType type = t.tokentype;
            String text = wrapped.getText();

            if (type == ETokenType.ttkeyword && text != null) {
                String upper = text.toUpperCase(Locale.ROOT);
                if ("BEGIN".equals(upper)) {
                    // A transaction start has no matching END; pushing
                    // for it would suppress every later terminator.
                    if (!startsTransaction(stream, i)) {
                        openScopes.addLast(Scope.BLOCK);
                    }
                } else if ("CASE".equals(upper)) {
                    if (i != caseQualifierIndex) {
                        openScopes.addLast(Scope.CASE);
                    }
                } else if ("END".equals(upper)) {
                    int qualifierIndex = nextSolidIndex(stream, i);
                    String qualifier = upperTextAt(stream, qualifierIndex);
                    if (qualifier != null && END_QUALIFIERS.contains(qualifier)) {
                        // Qualified END. END CASE closes the nearest open
                        // CASE and marks the trailing CASE token as a
                        // consumed qualifier; END IF/LOOP/WHILE/REPEAT/FOR
                        // are no-ops because those keywords don't push.
                        if ("CASE".equals(qualifier)) {
                            openScopes.removeLastOccurrence(Scope.CASE);
                            caseQualifierIndex = qualifierIndex;
                        }
                    } else {
                        // Unqualified END closes the innermost open scope,
                        // if any (a stray END at top level is ignored).
                        openScopes.pollLast();
                    }
                }
            }

            // Terminators only fire at top-level depth.
            if (!openScopes.isEmpty()) {
                continue;
            }

            if (type == ETokenType.ttsemicolon) {
                out.add(makeRange(stream, rangeStart, i + 1,
                        StatementRange.Terminator.SEMICOLON));
                rangeStart = i + 1;
            } else if (goIsTerminator
                    && type == ETokenType.ttkeyword
                    && "GO".equalsIgnoreCase(text)) {
                out.add(makeRange(stream, rangeStart, i + 1,
                        StatementRange.Terminator.GO));
                rangeStart = i + 1;
            }
        }

        // Truncated final statement (no terminator at end).
        if (rangeStart < n) {
            out.add(makeRange(stream, rangeStart, n,
                    StatementRange.Terminator.NONE));
        }
        return Collections.unmodifiableList(out);
    }

    /**
     * Decide whether the {@code BEGIN} at {@code beginIndex} starts a
     * transaction instead of a block: true when the next solid token is a
     * semicolon or one of {@link #TRANSACTION_WORDS}.
     */
    private static boolean startsTransaction(Pp2TokenStream stream, int beginIndex) {
        int followerIndex = nextSolidIndex(stream, beginIndex);
        if (followerIndex < 0) {
            return false;
        }
        if (stream.get(followerIndex).getSourceToken().tokentype == ETokenType.ttsemicolon) {
            return true;
        }
        return TRANSACTION_WORDS.contains(upperTextAt(stream, followerIndex));
    }

    /**
     * Return the index of the next solid token after {@code fromIndex},
     * or {@code -1} if there is none. A token is solid when it is not a
     * comment and its text is neither null nor blank. Tokens inside a
     * {@link TokenRole#NO_FORMAT_ZONE} are skipped — they're opaque to
     * the depth model. The token found may be of any type, not only a
     * keyword.
     */
    private static int nextSolidIndex(Pp2TokenStream stream, int fromIndex) {
        final int size = stream.size();
        for (int j = fromIndex + 1; j < size; j++) {
            Pp2Token candidate = stream.get(j);
            if (candidate.hasRole(TokenRole.NO_FORMAT_ZONE)
                    || candidate.hasRole(TokenRole.COMMENT_LINE)
                    || candidate.hasRole(TokenRole.COMMENT_BLOCK)) {
                continue;
            }
            String candidateText = candidate.getText();
            // trim() so a whitespace-only token never masquerades as the
            // word that follows END or BEGIN.
            if (candidateText == null || candidateText.trim().isEmpty()) {
                continue;
            }
            return j;
        }
        return -1;
    }

    /**
     * Return the trimmed, upper-cased (locale-independent) text of the
     * token at {@code index}, or {@code null} when {@code index} is
     * negative or the token has no text.
     */
    private static String upperTextAt(Pp2TokenStream stream, int index) {
        if (index < 0) {
            return null;
        }
        String raw = stream.get(index).getText();
        return raw == null ? null : raw.trim().toUpperCase(Locale.ROOT);
    }

    /**
     * Build the {@link StatementRange} for tokens
     * {@code [startIdx, endIdxExclusive)}. The start offset is the offset
     * of the first token (0 if {@code startIdx} is past the stream end);
     * the end offset is the last token's offset plus its text length, or
     * the start offset when the range is empty.
     */
    private static StatementRange makeRange(Pp2TokenStream stream,
                                            int startIdx, int endIdxExclusive,
                                            StatementRange.Terminator terminator) {
        int startOffset = 0;
        if (startIdx < stream.size()) {
            startOffset = (int) stream.get(startIdx).getSourceToken().offset;
        }
        int endOffset = startOffset;
        if (endIdxExclusive > startIdx) {
            Pp2Token last = stream.get(endIdxExclusive - 1);
            String lastText = last.getText();
            int lastLength = lastText == null ? 0 : lastText.length();
            endOffset = (int) last.getSourceToken().offset + lastLength;
        }
        return new StatementRange(startIdx, endIdxExclusive,
                startOffset, endOffset, terminator);
    }

    /**
     * True when {@code GO} is a batch separator for the vendor. Currently
     * that is {@code dbvmssql} and {@code dbvsybase}; every other vendor
     * (and {@code null}) yields false.
     */
    private static boolean isTsqlFamily(EDbVendor vendor) {
        if (vendor == null) {
            return false;
        }
        switch (vendor) {
            case dbvmssql:
            case dbvsybase:
                return true;
            default:
                return false;
        }
    }
}