package gudusoft.gsqlparser.pp2.region;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.EResolverType;
import gudusoft.gsqlparser.ETokenType;
import gudusoft.gsqlparser.TGSqlParser;
import gudusoft.gsqlparser.TSourceToken;
import gudusoft.gsqlparser.TSyntaxError;
import gudusoft.gsqlparser.pp2.Pp2FormatOptions;
import gudusoft.gsqlparser.pp2.token.Pp2Token;
import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;
import gudusoft.gsqlparser.pp2.token.TokenRole;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Parses statement regions produced by
 * {@link StatementBoundaryDetector#detect(Pp2TokenStream, EDbVendor)} and
 * tags each one as {@link RegionParseOutcome.Status#AST_OK AST_OK},
 * {@link RegionParseOutcome.Status#AST_ERROR AST_ERROR}, or
 * {@link RegionParseOutcome.Status#TRIVIA TRIVIA}.
 *
 * <h2>Per-region parsing with fresh parsers on success</h2>
 *
 * <p>{@link #parseAll(List)} marks every {@code TRIVIA} range and every
 * non-trivia range whose source slice exceeds
 * {@link Pp2FormatOptions#maxRegionParseChars} (the latter becomes
 * {@code AST_ERROR} with an engine note) without touching a parser. Every
 * remaining range is parsed individually. The pool's single parser serves as
 * the <b>probe</b>; if the probe parse succeeds, the engine re-parses the
 * same slice into a freshly allocated {@link TGSqlParser} so the outcome's
 * {@link RegionParseOutcome#getParser()} contains exactly one statement and
 * can be handed to {@code FormatterFactory.pp(parser, opt)} without
 * rendering its siblings. {@code AST_ERROR} outcomes do not need a
 * dedicated parser — the lexical fallback (S14 / S31) renders them.
 *
 * <p>This is plan §13/R1's "fresh parser per region" contingency adopted as
 * the default. The cost is one extra parser allocation and re-parse per
 * {@code AST_OK} region; the benefit is that every outcome returned from a
 * single {@code parseAll} call is mutually valid in any consumption order
 * and {@code GuardedAstDelegate} (S13) can safely invoke pp's
 * statement-iterating renderer on each one without observing neighbours.
 *
 * <p>A previous draft of this engine attempted a "lazy full-script first"
 * optimisation: one {@code parse()} on the entire original SQL, with all
 * happy-path outcomes sharing the parser snapshot. That broke
 * {@code FormatterFactory.pp(parser, opt)} which iterates every statement
 * in {@code parser.sqlstatements} — each outcome would have rendered the
 * whole script. The optimisation is deferred (plan §16/Q3 / S37) and may
 * be re-introduced once a render API that targets a single statement
 * exists.
 *
 * <h2>Safety valves and failure modes</h2>
 *
 * <ul>
 *   <li>Regions whose source span exceeds
 *       {@link Pp2FormatOptions#maxRegionParseChars} skip the parse attempt
 *       entirely and are returned as {@code AST_ERROR} with an engine note —
 *       the lexical fallback (S14 / S31) handles the rendering.</li>
 *   <li>Any {@link Throwable} thrown by {@link TGSqlParser#parse()} is
 *       caught and converted to an {@code AST_ERROR} outcome. The pool is
 *       still reset before the next attempt, so a single misbehaving region
 *       does not corrupt the rest of the script.</li>
 *   <li>Trivia regions — those containing nothing but whitespace, line
 *       comments, block comments, and (optionally) the trailing terminator —
 *       are short-circuited to {@code TRIVIA} without invoking the parser.</li>
 * </ul>
 *
 * <h2>AST lifetime</h2>
 *
 * <p>The pool is size-1. Each {@link #parseRegion(StatementRange)} call
 * overwrites the prior region's parser state in place, which means a
 * {@link RegionParseOutcome#getParser() parser} reference from an earlier
 * {@code parseRegion} call becomes stale once the next call returns. See
 * {@link RegionParseOutcome the outcome class Javadoc} for the contract.
 * {@link #parseAll(List)}, by contrast, allocates a fresh parser per
 * {@code AST_OK} region, so outcomes returned from a single
 * {@code parseAll} call are mutually valid and outlive the engine call
 * itself (each carries an isolated parser of its own).
 *
 * <p>This class is single-threaded by construction (the pool's threading
 * contract); concurrent use is undefined.
 *
 * <p>Plan reference: §5.1, §7.3/S12, §7.4/S12, §10.4, §13/R1.
 */
public final class ParseRecoveryEngine {

    private final EDbVendor vendor;
    private final String originalSql;
    private final Pp2FormatOptions opts;
    private final Pp2TokenStream stream;
    private final ParserPool pool;

    /**
     * Creates an engine bound to one script.
     *
     * @param vendor      SQL dialect used for the pooled and per-outcome parsers
     * @param originalSql full original input; region slices are cut from it
     * @param stream      token stream the statement ranges index into
     * @param opts        formatter options (supplies {@code maxRegionParseChars})
     * @throws NullPointerException if any argument is null
     */
    public ParseRecoveryEngine(EDbVendor vendor,
                               String originalSql,
                               Pp2TokenStream stream,
                               Pp2FormatOptions opts) {
        if (vendor == null) throw new NullPointerException("vendor");
        if (originalSql == null) throw new NullPointerException("originalSql");
        if (stream == null) throw new NullPointerException("stream");
        if (opts == null) throw new NullPointerException("opts");
        this.vendor = vendor;
        this.originalSql = originalSql;
        this.opts = opts;
        this.stream = stream;
        this.pool = new ParserPool(vendor);
    }

    /** Pool access — exposed for tests / introspection. */
    public ParserPool getPool() {
        return pool;
    }

    /** EDbVendor this engine targets. */
    public EDbVendor getVendor() {
        return vendor;
    }

    /**
     * Parse all ranges. Each range goes through the following pipeline, in
     * source order:
     *
     * <ol>
     *   <li>A {@code TRIVIA} range is tagged as such (no parser involvement).</li>
     *   <li>A non-trivia range whose source slice exceeds
     *       {@link Pp2FormatOptions#maxRegionParseChars} is tagged
     *       {@code AST_ERROR} with an engine note (no parser involvement).</li>
     *   <li>Any other range gets a probe parse on the pool. On
     *       {@code AST_OK}, the slice is re-parsed into a freshly-allocated
     *       {@link TGSqlParser} so the outcome's parser holds exactly one
     *       statement and is safe to hand to
     *       {@code FormatterFactory.pp(parser, opt)}.</li>
     * </ol>
     *
     * @param ranges non-null list of statement ranges, in source order
     * @return unmodifiable list with one outcome per input range, in the
     *         same order
     * @throws NullPointerException if {@code ranges} or any element is null
     */
    public List<RegionParseOutcome> parseAll(List<StatementRange> ranges) {
        if (ranges == null) throw new NullPointerException("ranges");
        // Validate the whole input before doing any work so a bad element
        // never leaves the pool half-used.
        for (StatementRange r : ranges) {
            if (r == null) throw new NullPointerException("ranges contains null");
        }

        List<RegionParseOutcome> outcomes =
                new ArrayList<RegionParseOutcome>(ranges.size());
        for (StatementRange r : ranges) {
            // The slice is computed once here and threaded through, so the
            // parse step does not re-slice or re-classify the range.
            String slice = sliceFor(r);
            if (isTrivia(r)) {
                outcomes.add(RegionParseOutcome.trivia(r, slice));
            } else if (slice.length() > opts.maxRegionParseChars) {
                outcomes.add(oversized(r, slice));
            } else {
                // Each AST_OK outcome owns a dedicated parser so siblings do
                // not appear in pp(parser, opt) output.
                outcomes.add(parseRegionWithFreshParserOnSuccess(r, slice));
            }
        }
        return Collections.unmodifiableList(outcomes);
    }

    /**
     * Builds the {@code AST_ERROR} outcome used for regions that are too
     * large to parse; the note records both the limit and the actual size.
     */
    private RegionParseOutcome oversized(StatementRange r, String slice) {
        return RegionParseOutcome.astError(r, slice,
                Collections.<TSyntaxError>emptyList(),
                "region exceeds maxRegionParseChars=" + opts.maxRegionParseChars
                        + " (size=" + slice.length() + ")");
    }

    /**
     * Parse a single region on the pooled parser. Parser failures are never
     * propagated: they are reported as {@code AST_ERROR} outcomes.
     *
     * <p>Behaviour:
     * <ol>
     *   <li>If the region is {@link #isTrivia(StatementRange) trivia},
     *       returns {@link RegionParseOutcome.Status#TRIVIA TRIVIA} without
     *       touching the parser.</li>
     *   <li>If the region's source span exceeds
     *       {@code opts.maxRegionParseChars}, returns
     *       {@code AST_ERROR} with an engine note. Skips parsing entirely.</li>
     *   <li>Otherwise resets the pool, hands the region source slice to the
     *       parser, and captures the outcome. Any {@code Throwable} from
     *       {@code parse()} is wrapped as {@code AST_ERROR}.</li>
     * </ol>
     *
     * @param range the region to parse
     * @return the outcome for {@code range}
     * @throws NullPointerException if {@code range} is null
     */
    public RegionParseOutcome parseRegion(StatementRange range) {
        if (range == null) throw new NullPointerException("range");
        String src = sliceFor(range);

        if (isTrivia(range)) {
            return RegionParseOutcome.trivia(range, src);
        }
        if (src.length() > opts.maxRegionParseChars) {
            return oversized(range, src);
        }

        pool.reset();
        return parseSliceInto(pool.borrow(), range, src);
    }

    /**
     * Probe-then-promote variant used by {@link #parseAll(List)}: attempt the
     * parse via the pool, and on {@code AST_OK} re-run the parse on a
     * freshly-allocated parser dedicated to this outcome. {@code AST_ERROR}
     * outcomes do not need a dedicated parser; they're rendered by the
     * lexical fallback in S14/S31, not by
     * {@code FormatterFactory.pp(parser, opt)}.
     *
     * <p>Precondition: the caller has already ruled out trivia and oversized
     * regions, and {@code src} is {@code sliceFor(range)}.
     */
    private RegionParseOutcome parseRegionWithFreshParserOnSuccess(StatementRange range,
                                                                   String src) {
        // Probe with the pool first — cheap, and avoids allocating a parser
        // for AST_ERROR regions.
        pool.reset();
        RegionParseOutcome probe = parseSliceInto(pool.borrow(), range, src);
        if (probe.getStatus() != RegionParseOutcome.Status.AST_OK) {
            return probe;
        }
        // Promote the success onto a dedicated, fresh parser so the outcome's
        // AST is not invalidated by a later pool reset within this parseAll
        // call.
        return parseSliceInto(new TGSqlParser(vendor), range, src);
    }

    /**
     * Run a parse of {@code src} on the supplied parser and convert the
     * outcome to a {@link RegionParseOutcome}. Anything thrown by
     * {@code parse()} itself is converted to {@code AST_ERROR}.
     */
    private RegionParseOutcome parseSliceInto(TGSqlParser parser,
                                              StatementRange range,
                                              String src) {
        parser.sqltext = src;
        // pp2 is a FORMATTER: it needs the parse tree, never semantic name
        // resolution (column->table linking). Auto-running TSQLResolver2 on
        // every region parse dominates time AND heap — on a script with
        // hundreds of regions it pushes the formatter into GC thrash / OOM
        // under a constrained heap (-Xmx512m), even though the formatted output
        // is identical with or without resolution. Disabling the resolver per
        // region is the single biggest pp2-side scaling lever (plan §13/R14,
        // R16; surfaced in S36, gates S37). It is set here, after sqltext and
        // on every parse, so it is immune to ParserPool.reset() /
        // prepareForReuse(), which resets options.
        parser.setResolverType(EResolverType.NONE);

        int rc;
        try {
            rc = parser.parse();
        } catch (Throwable t) {
            // Snapshot whatever diagnostics the parser may have collected
            // before throwing; do not rely on getSyntaxErrors() being safe to
            // call, but try.
            List<TSyntaxError> partial = safeSyntaxErrors(parser);
            String detail = t.getMessage() != null ? ": " + t.getMessage() : "";
            return RegionParseOutcome.astError(range, src, partial,
                    "parser threw: " + t.getClass().getSimpleName() + detail);
        }

        if (rc == 0 && parser.getErrorCount() == 0) {
            // NOTE(review): astOk runs outside the try above; if its
            // multi-statement guard throws, that propagates — confirm.
            return RegionParseOutcome.astOk(range, src,
                    parser.getSqlstatements(), parser);
        }

        List<TSyntaxError> errors = safeSyntaxErrors(parser);
        // Only attach an engine note when the parser gave us nothing better.
        String note = errors.isEmpty()
                ? "parser returned " + rc + " with no diagnostics"
                : null;
        return RegionParseOutcome.astError(range, src, errors, note);
    }

    /**
     * True when the range contains no solid tokens — only whitespace,
     * comments, and at most a trailing terminator ({@code ;} or
     * vendor-specific {@code GO}). Trivia ranges are short-circuited to
     * {@code TRIVIA} and never reach the parser.
     *
     * <p>The last-token-as-terminator exclusion is keyed off
     * {@link StatementRange#getTerminator()} rather than text matching, so a
     * mid-statement identifier that happens to spell {@code GO} is still
     * treated as solid.
     *
     * @throws NullPointerException if {@code range} is null
     */
    public boolean isTrivia(StatementRange range) {
        if (range == null) throw new NullPointerException("range");
        int endExclusive = range.getEndTokenIndex();
        int lastIdx = endExclusive - 1;
        boolean hasTerminator =
                range.getTerminator() != StatementRange.Terminator.NONE;

        for (int i = range.getStartTokenIndex(); i < endExclusive; i++) {
            // Tolerate a range that runs past the end of the token stream.
            if (i >= stream.size()) break;
            // Skip the range's terminator slot for solid-detection purposes:
            // SEMICOLON is already filtered by isSolid via tokentype, but GO
            // is a keyword and would otherwise count as solid.
            if (hasTerminator && i == lastIdx) {
                continue;
            }
            if (isSolid(stream.get(i))) return false;
        }
        return true;
    }

    /**
     * Source slice covered by the range. Sliced from the original input via
     * the range's offsets (see the S11 hand-off note: parsing against
     * the original input preserves comment / whitespace context the parser
     * may key on for some shapes).
     *
     * <p>When the range's terminator is {@link StatementRange.Terminator#GO},
     * the trailing {@code GO} token is excluded from the slice. The GSP
     * MSSQL parser treats {@code GO} as a batch separator that splits the
     * stream into multiple {@code sqlstatements}, which would otherwise
     * trigger {@link RegionParseOutcome}'s multi-statement guard. Stripping
     * {@code GO} from the per-region slice keeps the parse contract
     * single-statement.
     *
     * <p>Offsets are clamped into {@code [0, originalSql.length()]} and the
     * end is never allowed to precede the start, so a malformed range yields
     * a shorter (possibly empty) slice rather than an exception.
     *
     * @throws NullPointerException if {@code range} is null
     */
    public String sliceFor(StatementRange range) {
        if (range == null) throw new NullPointerException("range");
        int length = originalSql.length();
        // Clamp start on BOTH sides: a start beyond the input would otherwise
        // drag `end` past the string and make substring() throw.
        int start = clamp(range.getStartOffset(), 0, length);
        int end = clamp(sliceEndForParse(range), start, length);
        return originalSql.substring(start, end);
    }

    /** Returns {@code value} limited to the inclusive range {@code [lo, hi]}. */
    private static int clamp(int value, int lo, int hi) {
        if (value < lo) return lo;
        if (value > hi) return hi;
        return value;
    }

    /**
     * Compute the end offset to use when slicing this range's source for the
     * parser. Equal to {@link StatementRange#getEndOffset()} except when the
     * range's terminator is {@code GO}, in which case the {@code GO}
     * keyword token is dropped from the slice.
     */
    private int sliceEndForParse(StatementRange range) {
        if (range.getTerminator() != StatementRange.Terminator.GO) {
            return range.getEndOffset();
        }
        int lastIdx = range.getEndTokenIndex() - 1;
        // Empty range, or one that points past the stream: nothing to strip.
        if (lastIdx < range.getStartTokenIndex() || lastIdx >= stream.size()) {
            return range.getEndOffset();
        }
        TSourceToken goTok = stream.get(lastIdx).getSourceToken();
        if (goTok == null) return range.getEndOffset();
        return (int) goTok.offset; // cut right before GO
    }

    // ------------------------------------------------------------------ //
    // Internals                                                          //
    // ------------------------------------------------------------------ //

    /**
     * Best-effort defensive copy of the parser's syntax errors. Returns an
     * empty list when the parser reports none or the accessor itself fails.
     */
    private static List<TSyntaxError> safeSyntaxErrors(TGSqlParser parser) {
        try {
            List<TSyntaxError> live = parser.getSyntaxErrors();
            if (live == null) return Collections.emptyList();
            return new ArrayList<TSyntaxError>(live);
        } catch (Throwable ignored) {
            // Deliberately swallowed: this runs on failure paths where the
            // parser may be in an inconsistent state, and diagnostics are
            // optional extras on the outcome.
            return Collections.emptyList();
        }
    }

    /**
     * True when the token carries real content, i.e. it is not whitespace, a
     * comment, a statement terminator, or a removed/empty token.
     */
    private static boolean isSolid(Pp2Token t) {
        // NO_FORMAT_ZONE: opaque, but counts as solid because the renderer
        // emits it verbatim — we don't want to swallow a NO_FORMAT-only
        // region as TRIVIA.
        if (t.hasRole(TokenRole.NO_FORMAT_ZONE)) return true;
        // Prefer pp2 role annotations (set by ProtectedZoneDetector S9 when
        // the caller annotated the stream). Roles are vendor-agnostic and
        // therefore the authoritative source for "is this token a comment".
        if (t.hasRole(TokenRole.COMMENT_LINE)
                || t.hasRole(TokenRole.COMMENT_BLOCK)) {
            return false;
        }
        TSourceToken st = t.getSourceToken();
        if (st == null) return false;
        ETokenType type = st.tokentype;
        if (type == null) return false;
        switch (type) {
            case ttwhitespace:
            case ttreturn:
            case ttsimplecomment:
            case ttbracketedcomment:
            case ttCPPComment:
            case ttsemicolon:
            case ttstmt_delimiter:
            case ttRemoved:
                return false;
            default:
                // Also short-circuit on empty text — e.g. a deleted token.
                String text = t.getText();
                return text != null && text.length() > 0;
        }
    }
}