001package gudusoft.gsqlparser.pp2.region;
002
003import gudusoft.gsqlparser.EDbVendor;
004import gudusoft.gsqlparser.EResolverType;
005import gudusoft.gsqlparser.ETokenType;
006import gudusoft.gsqlparser.TGSqlParser;
007import gudusoft.gsqlparser.TSourceToken;
008import gudusoft.gsqlparser.TSyntaxError;
009import gudusoft.gsqlparser.pp2.Pp2FormatOptions;
010import gudusoft.gsqlparser.pp2.token.Pp2Token;
011import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;
012import gudusoft.gsqlparser.pp2.token.TokenRole;
013
014import java.util.ArrayList;
015import java.util.Collections;
016import java.util.List;
017
018/**
019 * Parses statement regions produced by
020 * {@link StatementBoundaryDetector#detect(Pp2TokenStream, EDbVendor)} and
021 * tags each one as {@link RegionParseOutcome.Status#AST_OK AST_OK},
022 * {@link RegionParseOutcome.Status#AST_ERROR AST_ERROR}, or
023 * {@link RegionParseOutcome.Status#TRIVIA TRIVIA}.
024 *
025 * <h2>Per-region parsing with fresh parsers on success</h2>
026 *
027 * <p>{@link #parseAll(List)} pre-marks every {@code TRIVIA} range and every
028 * non-trivia range whose source slice exceeds
029 * {@link Pp2FormatOptions#maxRegionParseChars} (the latter becomes
030 * {@code AST_ERROR} with an engine note). Every remaining range is parsed
031 * individually. The pool's single parser serves as the <b>probe</b>; if the
032 * probe parse succeeds, the engine re-parses the same slice into a freshly
033 * allocated {@link TGSqlParser} so the outcome's
034 * {@link RegionParseOutcome#getParser()} contains exactly one statement and
035 * can be handed to {@code FormatterFactory.pp(parser, opt)} without
036 * rendering its siblings. {@code AST_ERROR} outcomes do not need a
037 * dedicated parser — the lexical fallback (S14 / S31) renders them.
038 *
039 * <p>This is plan §13/R1's "fresh parser per region" contingency adopted as
040 * the default. The cost is one extra parser allocation and re-parse per
041 * {@code AST_OK} region; the benefit is that every outcome returned from a
042 * single {@code parseAll} call is mutually valid in any consumption order
043 * and {@code GuardedAstDelegate} (S13) can safely invoke pp's
044 * statement-iterating renderer on each one without observing neighbours.
045 *
046 * <p>A previous draft of this engine attempted a "lazy full-script first"
047 * optimisation: one {@code parse()} on the entire original SQL, with all
048 * happy-path outcomes sharing the parser snapshot. That broke
049 * {@code FormatterFactory.pp(parser, opt)} which iterates every statement
050 * in {@code parser.sqlstatements} — each outcome would have rendered the
051 * whole script. The optimisation is deferred (plan §16/Q3 / S37) and may
052 * be re-introduced once a render API that targets a single statement
053 * exists.
054 *
055 * <h2>Safety valves and failure modes</h2>
056 *
057 * <ul>
058 *   <li>Regions whose source span exceeds
059 *       {@link Pp2FormatOptions#maxRegionParseChars} skip the parse attempt
060 *       entirely and are returned as {@code AST_ERROR} with an engine note —
061 *       the lexical fallback (S14 / S31) handles the rendering.</li>
062 *   <li>Any {@link Throwable} thrown by {@link TGSqlParser#parse()} is
063 *       caught and converted to an {@code AST_ERROR} outcome. The pool is
064 *       still reset before the next attempt, so a single misbehaving region
065 *       does not corrupt the rest of the script.</li>
066 *   <li>Trivia regions — those containing nothing but whitespace, line
067 *       comments, block comments, and (optionally) the trailing terminator —
068 *       are short-circuited to {@code TRIVIA} without invoking the parser.</li>
069 * </ul>
070 *
071 * <h2>AST lifetime</h2>
072 *
073 * <p>The pool is size-1. Each {@link #parseRegion(StatementRange)} call
074 * overwrites the prior region's parser state in place, which means a
075 * {@link RegionParseOutcome#getParser() parser} reference from an earlier
076 * {@code parseRegion} call becomes stale once the next call returns. See
077 * {@link RegionParseOutcome the outcome class Javadoc} for the contract.
078 * {@link #parseAll(List)}, by contrast, allocates a fresh parser per
079 * {@code AST_OK} region, so outcomes returned from a single
080 * {@code parseAll} call are mutually valid and outlive the engine call
081 * itself (each carries an isolated parser of its own).
082 *
083 * <p>This class is single-threaded by construction (the pool's threading
084 * contract); concurrent use is undefined.
085 *
086 * <p>Plan reference: §5.1, §7.3/S12, §7.4/S12, §10.4, §13/R1.
087 */
088public final class ParseRecoveryEngine {
089
090    private final EDbVendor vendor;
091    private final String originalSql;
092    private final Pp2FormatOptions opts;
093    private final Pp2TokenStream stream;
094    private final ParserPool pool;
095
096    public ParseRecoveryEngine(EDbVendor vendor,
097                               String originalSql,
098                               Pp2TokenStream stream,
099                               Pp2FormatOptions opts) {
100        if (vendor == null) throw new NullPointerException("vendor");
101        if (originalSql == null) throw new NullPointerException("originalSql");
102        if (stream == null) throw new NullPointerException("stream");
103        if (opts == null) throw new NullPointerException("opts");
104        this.vendor = vendor;
105        this.originalSql = originalSql;
106        this.opts = opts;
107        this.stream = stream;
108        this.pool = new ParserPool(vendor);
109    }
110
111    /** Pool access — exposed for tests / introspection. */
112    public ParserPool getPool() { return pool; }
113
114    /** EDbVendor this engine targets. */
115    public EDbVendor getVendor() { return vendor; }
116
117    /**
118     * Parse all ranges. The pipeline:
119     *
120     * <ol>
121     *   <li>Tag every {@code TRIVIA} range up front (no parser involvement).</li>
122     *   <li>Tag every non-trivia range whose source slice exceeds
123     *       {@link Pp2FormatOptions#maxRegionParseChars} as {@code AST_ERROR}
124     *       with an engine note.</li>
125     *   <li>For every remaining range, run a probe parse on the pool. On
126     *       {@code AST_OK}, re-parse the slice into a freshly-allocated
127     *       {@link TGSqlParser} so the outcome's parser holds exactly one
128     *       statement and is safe to hand to
129     *       {@code FormatterFactory.pp(parser, opt)}.</li>
130     * </ol>
131     *
132     * @param ranges non-null list of statement ranges, in source order
133     * @return one outcome per input range, in the same order
134     * @throws NullPointerException if {@code ranges} or any element is null
135     */
136    public List<RegionParseOutcome> parseAll(List<StatementRange> ranges) {
137        if (ranges == null) throw new NullPointerException("ranges");
138        for (StatementRange r : ranges) {
139            if (r == null) throw new NullPointerException("ranges contains null");
140        }
141
142        RegionParseOutcome[] outArr = new RegionParseOutcome[ranges.size()];
143
144        // Steps 1+2: classify trivia and pre-mark oversized non-trivia ranges.
145        // Both short-circuit without touching the parser.
146        List<Integer> needParse = new ArrayList<Integer>();
147        for (int i = 0; i < ranges.size(); i++) {
148            StatementRange r = ranges.get(i);
149            String slice = sliceFor(r);
150            if (isTrivia(r)) {
151                outArr[i] = RegionParseOutcome.trivia(r, slice);
152                continue;
153            }
154            if (slice.length() > opts.maxRegionParseChars) {
155                outArr[i] = oversized(r, slice);
156                continue;
157            }
158            needParse.add(i);
159        }
160
161        // Step 3: probe-with-pool, promote-to-fresh-parser-on-success per
162        // range. Each AST_OK outcome owns a dedicated parser so siblings do
163        // not appear in pp(parser, opt) output.
164        for (int idx : needParse) {
165            outArr[idx] = parseRegionWithFreshParserOnSuccess(ranges.get(idx));
166        }
167        return wrap(outArr);
168    }
169
170    private static List<RegionParseOutcome> wrap(RegionParseOutcome[] arr) {
171        return Collections.unmodifiableList(new ArrayList<RegionParseOutcome>(
172            java.util.Arrays.asList(arr)));
173    }
174
175    private RegionParseOutcome oversized(StatementRange r, String slice) {
176        return RegionParseOutcome.astError(r, slice,
177            Collections.<TSyntaxError>emptyList(),
178            "region exceeds maxRegionParseChars=" + opts.maxRegionParseChars
179                + " (size=" + slice.length() + ")");
180    }
181
182    /**
183     * Parse a single region. Never throws.
184     *
185     * <p>Behaviour:
186     * <ol>
187     *   <li>If the region is {@link #isTrivia(StatementRange) trivia},
188     *       returns {@link RegionParseOutcome.Status#TRIVIA TRIVIA} without
189     *       touching the parser.</li>
190     *   <li>If the region's source span exceeds
191     *       {@code opts.maxRegionParseChars}, returns
192     *       {@code AST_ERROR} with an engine note. Skips parsing entirely.</li>
193     *   <li>Otherwise resets the pool, hands the region source slice to the
194     *       parser, and captures the outcome. Any {@code Throwable} from
195     *       {@code parse()} is wrapped as {@code AST_ERROR}.</li>
196     * </ol>
197     */
198    public RegionParseOutcome parseRegion(StatementRange range) {
199        if (range == null) throw new NullPointerException("range");
200        String src = sliceFor(range);
201
202        if (isTrivia(range)) {
203            return RegionParseOutcome.trivia(range, src);
204        }
205        if (src.length() > opts.maxRegionParseChars) {
206            return oversized(range, src);
207        }
208
209        pool.reset();
210        TGSqlParser parser = pool.borrow();
211        return parseSliceInto(parser, range, src);
212    }
213
214    /**
215     * Per-region fallback variant: attempt the parse via the pool, and on
216     * {@code AST_OK} re-run the parse on a freshly-allocated parser dedicated
217     * to this outcome. {@code AST_ERROR} outcomes do not need a dedicated
218     * parser; they're rendered by the lexical fallback in S14/S31, not by
219     * {@code FormatterFactory.pp(parser, opt)}.
220     */
221    private RegionParseOutcome parseRegionWithFreshParserOnSuccess(StatementRange range) {
222        String src = sliceFor(range);
223        if (isTrivia(range)) return RegionParseOutcome.trivia(range, src);
224        if (src.length() > opts.maxRegionParseChars) return oversized(range, src);
225
226        // Probe with the pool first — cheap, and avoids allocating a parser
227        // for AST_ERROR regions.
228        pool.reset();
229        TGSqlParser pooled = pool.borrow();
230        RegionParseOutcome probe = parseSliceInto(pooled, range, src);
231        if (probe.getStatus() != RegionParseOutcome.Status.AST_OK) {
232            return probe;
233        }
234        // Promote the success onto a dedicated, fresh parser so the outcome's
235        // AST is not invalidated by a later pool reset within this parseAll
236        // call.
237        TGSqlParser fresh = new TGSqlParser(vendor);
238        return parseSliceInto(fresh, range, src);
239    }
240
241    /**
242     * Run a parse of {@code src} on the supplied parser and convert the
243     * outcome to a {@link RegionParseOutcome}. Never throws.
244     */
245    private RegionParseOutcome parseSliceInto(TGSqlParser parser,
246                                              StatementRange range,
247                                              String src) {
248        parser.sqltext = src;
249        // pp2 is a FORMATTER: it needs the parse tree, never semantic name
250        // resolution (column->table linking). Auto-running TSQLResolver2 on
251        // every region parse dominates time AND heap — on a script with
252        // hundreds of regions it pushes the formatter into GC thrash / OOM
253        // under a constrained heap (-Xmx512m), even though the formatted output
254        // is identical with or without resolution. Disabling the resolver per
255        // region is the single biggest pp2-side scaling lever (plan §13/R14,
256        // R16; surfaced in S36, gates S37). Set after sqltext and immune to
257        // ParserPool.reset()/prepareForReuse() which resets options.
258        parser.setResolverType(EResolverType.NONE);
259        int rc;
260        try {
261            rc = parser.parse();
262        } catch (Throwable t) {
263            // Snapshot whatever diagnostics the parser may have collected
264            // before throwing; do not rely on getSyntaxErrors() being safe to
265            // call, but try.
266            List<TSyntaxError> partial = safeSyntaxErrors(parser);
267            return RegionParseOutcome.astError(range, src, partial,
268                "parser threw: " + t.getClass().getSimpleName()
269                    + (t.getMessage() != null ? ": " + t.getMessage() : ""));
270        }
271
272        List<TSyntaxError> errors = safeSyntaxErrors(parser);
273        if (rc == 0 && parser.getErrorCount() == 0) {
274            return RegionParseOutcome.astOk(range, src,
275                parser.getSqlstatements(), parser);
276        }
277        return RegionParseOutcome.astError(range, src, errors,
278            errors.isEmpty() ? "parser returned " + rc + " with no diagnostics" : null);
279    }
280
281    /**
282     * True when the range contains no solid tokens — only whitespace,
283     * comments, and at most a trailing terminator ({@code ;} or
284     * vendor-specific {@code GO}). Trivia ranges are short-circuited to
285     * {@code TRIVIA} and never reach the parser.
286     *
287     * <p>The last-token-as-terminator exclusion is keyed off
288     * {@link StatementRange#getTerminator()} rather than text matching, so a
289     * mid-statement identifier that happens to spell {@code GO} is still
290     * treated as solid.
291     */
292    public boolean isTrivia(StatementRange range) {
293        if (range == null) throw new NullPointerException("range");
294        int lastIdx = range.getEndTokenIndex() - 1;
295        StatementRange.Terminator term = range.getTerminator();
296        for (int i = range.getStartTokenIndex(); i < range.getEndTokenIndex(); i++) {
297            if (i >= stream.size()) break;
298            // Skip the range's terminator slot for solid-detection purposes:
299            // SEMICOLON is already filtered by isSolid via tokentype, but GO
300            // is a keyword and would otherwise count as solid.
301            if (i == lastIdx && term != StatementRange.Terminator.NONE) {
302                continue;
303            }
304            Pp2Token t = stream.get(i);
305            if (isSolid(t)) return false;
306        }
307        return true;
308    }
309
310    /**
311     * Source slice covered by the range. Sliced from the original input via
312     * the range's byte offsets (see the S11 hand-off note: parsing against
313     * the original input preserves comment / whitespace context the parser
314     * may key on for some shapes).
315     *
316     * <p>When the range's terminator is {@link StatementRange.Terminator#GO},
317     * the trailing {@code GO} token is excluded from the slice. The GSP
318     * MSSQL parser treats {@code GO} as a batch separator that splits the
319     * stream into multiple {@code sqlstatements}, which would otherwise
320     * trigger {@link RegionParseOutcome}'s multi-statement guard. Stripping
321     * {@code GO} from the per-region slice keeps the parse contract
322     * single-statement.
323     */
324    public String sliceFor(StatementRange range) {
325        int start = range.getStartOffset();
326        int end = sliceEndForParse(range);
327        if (start < 0) start = 0;
328        if (end > originalSql.length()) end = originalSql.length();
329        if (end < start) end = start;
330        return originalSql.substring(start, end);
331    }
332
333    /**
334     * Compute the end offset to use when slicing this range's source for the
335     * parser. Equal to {@link StatementRange#getEndOffset()} except when the
336     * range's terminator is {@code GO}, in which case the {@code GO}
337     * keyword token is dropped from the slice.
338     */
339    private int sliceEndForParse(StatementRange range) {
340        if (range.getTerminator() != StatementRange.Terminator.GO) {
341            return range.getEndOffset();
342        }
343        int lastIdx = range.getEndTokenIndex() - 1;
344        if (lastIdx < range.getStartTokenIndex() || lastIdx >= stream.size()) {
345            return range.getEndOffset();
346        }
347        TSourceToken goTok = stream.get(lastIdx).getSourceToken();
348        if (goTok == null) return range.getEndOffset();
349        return (int) goTok.offset;                 // cut right before GO
350    }
351
352    // ------------------------------------------------------------------ //
353    //  Internals                                                          //
354    // ------------------------------------------------------------------ //
355
356    private static List<TSyntaxError> safeSyntaxErrors(TGSqlParser parser) {
357        try {
358            List<TSyntaxError> live = parser.getSyntaxErrors();
359            if (live == null) return Collections.emptyList();
360            return new ArrayList<TSyntaxError>(live);
361        } catch (Throwable t) {
362            return Collections.emptyList();
363        }
364    }
365
366    private static boolean isSolid(Pp2Token t) {
367        // NO_FORMAT_ZONE: opaque, but counts as solid because the renderer
368        // emits it verbatim — we don't want to swallow a NO_FORMAT-only
369        // region as TRIVIA.
370        if (t.hasRole(TokenRole.NO_FORMAT_ZONE)) return true;
371        // Prefer pp2 role annotations (set by ProtectedZoneDetector S9 when
372        // the caller annotated the stream). Roles are vendor-agnostic and
373        // therefore the authoritative source for "is this token a comment".
374        if (t.hasRole(TokenRole.COMMENT_LINE)
375            || t.hasRole(TokenRole.COMMENT_BLOCK)) {
376            return false;
377        }
378        TSourceToken st = t.getSourceToken();
379        if (st == null) return false;
380        ETokenType type = st.tokentype;
381        if (type == null) return false;
382        switch (type) {
383            case ttwhitespace:
384            case ttreturn:
385            case ttsimplecomment:
386            case ttbracketedcomment:
387            case ttCPPComment:
388            case ttsemicolon:
389            case ttstmt_delimiter:
390            case ttRemoved:
391                return false;
392            default:
393                // Also short-circuit on empty text — e.g. a deleted token.
394                String text = t.getText();
395                return text != null && text.length() > 0;
396        }
397    }
398}