001package gudusoft.gsqlparser.pp2.engine;
002
003import gudusoft.gsqlparser.EDbVendor;
004import gudusoft.gsqlparser.TGSqlParser;
005import gudusoft.gsqlparser.TSourceTokenList;
006import gudusoft.gsqlparser.pp.logger.PPLogger;
007import gudusoft.gsqlparser.pp2.FormatDiagnostic;
008import gudusoft.gsqlparser.pp2.FormatStatus;
009import gudusoft.gsqlparser.pp2.Pp2FormatOptions;
010import gudusoft.gsqlparser.pp2.Pp2FormatResult;
011import gudusoft.gsqlparser.pp2.RendererId;
012import gudusoft.gsqlparser.pp2.region.ParseRecoveryEngine;
013import gudusoft.gsqlparser.pp2.region.RegionParseOutcome;
014import gudusoft.gsqlparser.pp2.overlay.AstOverlayAnnotator;
015import gudusoft.gsqlparser.pp2.region.StatementBoundaryDetector;
016import gudusoft.gsqlparser.pp2.region.StatementRange;
017import gudusoft.gsqlparser.pp2.render.ConservativeTokenRenderer;
018import gudusoft.gsqlparser.pp2.render.GuardedAstDelegate;
019import gudusoft.gsqlparser.pp2.render.LexicalIslandRenderer;
020import gudusoft.gsqlparser.pp2.render.RegionAssembler;
021import gudusoft.gsqlparser.pp2.render.RenderedRegion;
022import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;
023import gudusoft.gsqlparser.pp2.token.Pp2TokenStreamBuilder;
024import gudusoft.gsqlparser.pp2.token.SourceSpanLedger;
025import gudusoft.gsqlparser.pp2.zone.ProtectedZoneDetector;
026
027import java.util.ArrayList;
028import java.util.Collections;
029import java.util.List;
030
031/**
032 * Fault-tolerant SQL formatter engine — Phase-2 MVP orchestrator.
033 *
034 * <h2>Pipeline summary</h2>
035 *
036 * <p>Each {@link #format(String, EDbVendor, Pp2FormatOptions)} call runs the
037 * following stages in order:
038 *
039 * <ol>
040 *   <li><b>Tokenize</b> — {@link TGSqlParser#tokenizeSqltext()} produces a
041 *       raw {@link TSourceTokenList}.</li>
042 *   <li><b>Token spine</b> — {@link Pp2TokenStreamBuilder} adapts the token
043 *       list into a {@link Pp2TokenStream}: folds whitespace into
044 *       {@code precedingBlanks} / {@code precedingLinebreaks} counts,
045 *       preserves comments as first-class tokens.</li>
046 *   <li><b>Ledger</b> — {@link SourceSpanLedger} records every byte of the
047 *       original input so the assembler can restore inter-region trivia
048 *       verbatim.</li>
049 *   <li><b>Zone annotation</b> — {@link ProtectedZoneDetector} annotates
050 *       {@code NO_FORMAT_ZONE}, {@code COMMENT_LINE}, {@code COMMENT_BLOCK},
051 *       and template-placeholder roles onto the token stream.</li>
052 *   <li><b>Boundary detection</b> — {@link StatementBoundaryDetector} walks
053 *       the annotated stream and emits one {@link StatementRange} per
054 *       statement.</li>
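 *   <li><b>Parse recovery</b> — {@link ParseRecoveryEngine} parses one
 *       region at a time ({@code parseRegion}); parsing is interleaved with
 *       the dispatch stage below so that at most one region's parser/AST is
 *       live at any moment. Each outcome is tagged
 *       {@code AST_OK | AST_ERROR | TRIVIA}.</li>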
 *   <li><b>Dispatch</b> — {@link EngineDispatch} routes each outcome to the
 *       appropriate renderer (plan §5.2 three-tier strategy):
 *       {@code AST_OK} → {@link GuardedAstDelegate} (fallback to a recovery
 *       renderer on guard failure); {@code AST_ERROR} →
 *       {@link LexicalIslandRenderer} or {@link ConservativeTokenRenderer};
 *       {@code TRIVIA} → passthrough.
 *       Each rendered text is wrapped in a {@link RenderedRegion}.</li>
064 *   <li><b>Assembly</b> — {@link RegionAssembler} interleaves the rendered
065 *       texts with the original inter-region trivia from the ledger,
066 *       producing the final output string.</li>
067 *   <li><b>Result</b> — a {@link Pp2FormatResult} carrying the assembled
068 *       text, {@link FormatStatus}, per-region {@link Pp2FormatResult.Region}
069 *       records, and all diagnostics accumulated across the pipeline.</li>
070 * </ol>
071 *
072 * <h2>Defensive behaviour</h2>
073 *
074 * <p>The engine never throws for non-null inputs. Any {@link Throwable} that
075 * escapes the pipeline — from the tokenizer, ledger builder, boundary
076 * detector, or assembler — is caught, logged via {@link PPLogger}, and the
077 * engine falls back to returning the original SQL unchanged with
078 * {@link FormatStatus#FAILED} and a {@link FormatDiagnostic.Severity#FATAL}
079 * diagnostic.
080 *
081 * <h2>Thread safety</h2>
082 *
083 * <p>The engine instance is stateless (all mutable objects are allocated per
084 * {@code format} call). Concurrent calls with different inputs are safe. The
085 * inner {@link ParseRecoveryEngine} allocates a fresh
086 * {@link gudusoft.gsqlparser.pp2.region.ParserPool} per call.
087 *
088 * <p>Plan reference: §5.1, §7.3/S16, §7.4/S16, §10.4.
089 */
090public final class Pp2Engine {
091
092    /**
093     * Format the given SQL string using the supplied database vendor and
094     * options. Never throws for non-null inputs.
095     *
096     * @param sql    the raw SQL to format; must not be null
097     * @param vendor the database dialect; governs tokenization, keyword
098     *               recognition, and boundary detection
099     * @param opts   formatting options; must not be null
100     * @return a {@link Pp2FormatResult} carrying the formatted text and
101     *         per-region metadata; never null
102     * @throws NullPointerException if any argument is null
103     */
104    public Pp2FormatResult format(String sql, EDbVendor vendor,
105                                   Pp2FormatOptions opts) {
106        if (sql == null) throw new NullPointerException("sql");
107        if (vendor == null) throw new NullPointerException("vendor");
108        if (opts == null) throw new NullPointerException("opts");
109
110        try {
111            return formatInternal(sql, vendor, opts);
112        } catch (Throwable t) {
113            // Top-level safety net: if anything escaped the pipeline, return
114            // the original SQL unchanged so the caller never gets an exception.
115            PPLogger.error(t);
116            PPLogger.info("Pp2Engine: unhandled throwable escaped pipeline; "
117                + "returning original SQL unchanged. vendor=" + vendor
118                + " sqlLen=" + sql.length());
119            List<FormatDiagnostic> diags = Collections.singletonList(
120                new FormatDiagnostic(FormatDiagnostic.Severity.FATAL, 0, sql.length(),
121                    "Pp2Engine: pipeline threw " + t.getClass().getSimpleName()
122                        + (t.getMessage() != null ? ": " + t.getMessage() : "")));
123            return new Pp2FormatResult(sql, FormatStatus.FAILED,
124                Collections.<Pp2FormatResult.Region>emptyList(), diags);
125        }
126    }
127
128    private Pp2FormatResult formatInternal(String sql, EDbVendor vendor,
129                                            Pp2FormatOptions opts) {
130        // All collaborators are allocated per call so the no-shared-mutable-state
131        // (and therefore thread-safety) guarantee holds unconditionally, even if
132        // a detector later grows internal scratch/cache state.
133        Pp2TokenStreamBuilder streamBuilder = new Pp2TokenStreamBuilder();
134        StatementBoundaryDetector boundaryDetector = new StatementBoundaryDetector();
135        ProtectedZoneDetector zoneDetector = new ProtectedZoneDetector();
136        RegionAssembler assembler = new RegionAssembler();
137
138        // Stage 1: tokenize.
139        TGSqlParser parser = new TGSqlParser(vendor);
140        parser.sqltext = sql;
141        parser.tokenizeSqltext();
142        TSourceTokenList tokenList = parser.getSourcetokenlist();
143
144        // Stage 2: build token spine.
145        Pp2TokenStreamBuilder.BuildResult buildResult = streamBuilder.build(tokenList);
146        Pp2TokenStream stream = buildResult.getStream();
147
148        // Stage 3: build source-span ledger (byte authority).
149        SourceSpanLedger ledger = SourceSpanLedger.build(sql, tokenList);
150
151        // Stage 4: annotate protected zones (NO_FORMAT_ZONE, comments, etc.).
152        zoneDetector.annotate(stream);
153
154        // Stage 5: detect statement boundaries.
155        List<StatementRange> ranges = boundaryDetector.detect(stream, vendor);
156
157        // Empty stream → no regions; return source unchanged.
158        if (ranges.isEmpty()) {
159            return new Pp2FormatResult(sql, FormatStatus.OK,
160                Collections.<Pp2FormatResult.Region>emptyList(),
161                new ArrayList<FormatDiagnostic>(ledger.getDiagnostics()));
162        }
163
164        // Stages 6+7 INTERLEAVED — parse and render each region in turn.
165        //
166        // Memory: an AST_OK region carries a live TGSqlParser (parse tables +
167        // AST). Parsing every region up front (parseAll) and only then
168        // rendering would keep N parsers alive simultaneously — on a script
169        // with thousands of valid statements that exhausts the heap (root-caused
170        // in S36: ~300KB/parser × N → OOM even at -Xmx2g). Interleaving keeps at
171        // most ONE region's parser/AST live at a time: each region is parsed,
172        // rendered, and then discarded before the next region's parse resets the
173        // shared pool. This also removes the need to promote AST_OK outcomes to
174        // a dedicated fresh parser (parseAll's strategy), cutting allocation
175        // further. The rendered output is identical — the AST content a region
176        // parses to is the same whichever parser instance holds it.
177        ParseRecoveryEngine recovery = new ParseRecoveryEngine(
178            vendor, sql, stream, opts);
179        GuardedAstDelegate astDelegate = new GuardedAstDelegate(vendor);
180        LexicalIslandRenderer lexicalIsland = new LexicalIslandRenderer(vendor);
181        ConservativeTokenRenderer conservative = new ConservativeTokenRenderer();
182        EngineDispatch dispatch = new EngineDispatch(astDelegate, lexicalIsland, conservative);
183
184        List<RenderedRegion> renderedRegions = new ArrayList<RenderedRegion>(ranges.size());
185        List<FormatDiagnostic> allDiags = new ArrayList<FormatDiagnostic>(ledger.getDiagnostics());
186        List<Pp2FormatResult.Region> regionRecords =
187            new ArrayList<Pp2FormatResult.Region>(ranges.size());
188
189        // S33 AST overlay (v3 bridge): off by default. When enabled, annotate
190        // each cleanly-parsed region's tokens with AST-derived roles. v2 does
191        // not render from these roles, so this never changes output here — it
192        // lands the annotation infrastructure a v3 renderer will consume.
193        AstOverlayAnnotator overlay = opts.astOverlayEnabled
194            ? new AstOverlayAnnotator() : null;
195
196        for (StatementRange range : ranges) {
197            RegionParseOutcome outcome = recovery.parseRegion(range);
198            if (overlay != null) {
199                applyAstOverlay(overlay, outcome, stream);
200            }
201            EngineDispatch.DispatchResult result = dispatch.dispatch(outcome, stream, opts);
202            renderedRegions.add(new RenderedRegion(
203                outcome.getRange(), result.text, result.rendererId));
204            allDiags.addAll(result.diagnostics);
205            regionRecords.add(toRegionRecord(outcome, result.rendererId));
206            // outcome is dropped here; the next parseRegion() resets the pool,
207            // freeing this region's parser/AST. At most one is ever live.
208        }
209
210        // Stage 8: assemble into final output.
211        String assembled = assembler.assemble(renderedRegions, ledger, opts);
212
213        // Stage 9: compute overall status and build result.
214        FormatStatus status = computeStatus(renderedRegions, allDiags);
215
216        return new Pp2FormatResult(assembled, status, regionRecords, allDiags);
217    }
218
219    /**
220     * Run the {@link AstOverlayAnnotator} on a single AST_OK region, stamping
221     * AST-derived roles directly onto the engine's whole-script {@code stream}
222     * (the tokens every renderer sees). The region was parsed from
223     * {@code originalSql.substring(range.getStartOffset(), ...)}, so the region
224     * AST's token offsets are region-relative; passing the range's start offset
225     * as the adjustment translates them onto the absolute-offset stream.
226     *
227     * <p>Best-effort: any failure is logged and swallowed so the feature flag
228     * can never break formatting. In v2 no renderer consumes these roles, so
229     * this stamping has no effect on output — it is the v3 evolution seam.
230     */
231    private static void applyAstOverlay(AstOverlayAnnotator overlay,
232                                        RegionParseOutcome outcome,
233                                        Pp2TokenStream stream) {
234        if (outcome.getStatus() != RegionParseOutcome.Status.AST_OK
235            || outcome.getStatement() == null) {
236            return;
237        }
238        try {
239            overlay.annotate(outcome.getStatement(), stream,
240                outcome.getRange().getStartOffset());
241        } catch (Throwable t) {
242            PPLogger.error(t);
243            PPLogger.info("Pp2Engine: AST overlay annotation failed for region "
244                + outcome.getRange() + "; continuing without overlay roles.");
245        }
246    }
247
248    /**
249     * Compute the overall {@link FormatStatus}:
250     * <ul>
251     *   <li>{@link FormatStatus#OK} — every region used {@link RendererId#GUARDED_AST};
252     *       no diagnostics at WARNING or above.</li>
253     *   <li>{@link FormatStatus#OK_WITH_RECOVERY} — at least one region used
254     *       the conservative renderer, or at least one WARNING/ERROR diagnostic
255     *       was emitted (but no FATAL).</li>
256     *   <li>{@link FormatStatus#FAILED} — at least one FATAL diagnostic.</li>
257     * </ul>
258     */
259    private static FormatStatus computeStatus(List<RenderedRegion> regions,
260                                               List<FormatDiagnostic> diags) {
261        for (FormatDiagnostic d : diags) {
262            if (d.getSeverity() == FormatDiagnostic.Severity.FATAL) {
263                return FormatStatus.FAILED;
264            }
265        }
266        for (RenderedRegion r : regions) {
267            if (isRecoveryRenderer(r.getRendererId())) {
268                return FormatStatus.OK_WITH_RECOVERY;
269            }
270        }
271        for (FormatDiagnostic d : diags) {
272            if (d.getSeverity() == FormatDiagnostic.Severity.WARNING
273                || d.getSeverity() == FormatDiagnostic.Severity.ERROR) {
274                return FormatStatus.OK_WITH_RECOVERY;
275            }
276        }
277        return FormatStatus.OK;
278    }
279
280    /** A region was recovered (not the clean AST path) if it used the island or conservative renderer. */
281    private static boolean isRecoveryRenderer(RendererId id) {
282        return id == RendererId.CONSERVATIVE || id == RendererId.LEXICAL_ISLAND;
283    }
284
285    /**
286     * Build one per-region {@link Pp2FormatResult.Region} record from a parse
287     * outcome and the renderer that produced its text. The region-local status
288     * is {@code OK} for {@code GUARDED_AST} and {@code OK_WITH_RECOVERY} for the
289     * island / conservative recovery renderers (or any AST_ERROR region).
290     * Called per region in the interleaved loop, so it never needs to retain
291     * the full outcome list.
292     */
293    private static Pp2FormatResult.Region toRegionRecord(RegionParseOutcome outcome,
294                                                          RendererId rendererId) {
295        FormatStatus regionStatus;
296        if (isRecoveryRenderer(rendererId)) {
297            regionStatus = FormatStatus.OK_WITH_RECOVERY;
298        } else if (outcome.getStatus() == RegionParseOutcome.Status.AST_ERROR) {
299            // Shouldn't happen (AST_ERROR → island/conservative), but be safe.
300            regionStatus = FormatStatus.OK_WITH_RECOVERY;
301        } else {
302            regionStatus = FormatStatus.OK;
303        }
304        return new Pp2FormatResult.Region(
305            outcome.getRange().getStartOffset(),
306            outcome.getRange().getEndOffset(),
307            rendererId,
308            regionStatus);
309    }
310}