package gudusoft.gsqlparser.pp2.engine;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.TGSqlParser;
import gudusoft.gsqlparser.TSourceTokenList;
import gudusoft.gsqlparser.pp.logger.PPLogger;
import gudusoft.gsqlparser.pp2.FormatDiagnostic;
import gudusoft.gsqlparser.pp2.FormatStatus;
import gudusoft.gsqlparser.pp2.Pp2FormatOptions;
import gudusoft.gsqlparser.pp2.Pp2FormatResult;
import gudusoft.gsqlparser.pp2.RendererId;
import gudusoft.gsqlparser.pp2.region.ParseRecoveryEngine;
import gudusoft.gsqlparser.pp2.region.RegionParseOutcome;
import gudusoft.gsqlparser.pp2.overlay.AstOverlayAnnotator;
import gudusoft.gsqlparser.pp2.region.StatementBoundaryDetector;
import gudusoft.gsqlparser.pp2.region.StatementRange;
import gudusoft.gsqlparser.pp2.render.ConservativeTokenRenderer;
import gudusoft.gsqlparser.pp2.render.GuardedAstDelegate;
import gudusoft.gsqlparser.pp2.render.LexicalIslandRenderer;
import gudusoft.gsqlparser.pp2.render.RegionAssembler;
import gudusoft.gsqlparser.pp2.render.RenderedRegion;
import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;
import gudusoft.gsqlparser.pp2.token.Pp2TokenStreamBuilder;
import gudusoft.gsqlparser.pp2.token.SourceSpanLedger;
import gudusoft.gsqlparser.pp2.zone.ProtectedZoneDetector;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Fault-tolerant SQL formatter engine — Phase-2 MVP orchestrator.
 *
 * <h2>Pipeline summary</h2>
 *
 * <p>Each {@link #format(String, EDbVendor, Pp2FormatOptions)} call runs the
 * following stages in order:
 *
 * <ol>
 *   <li><b>Tokenize</b> — {@link TGSqlParser#tokenizeSqltext()} produces a
 *       raw {@link TSourceTokenList}.</li>
 *   <li><b>Token spine</b> — {@link Pp2TokenStreamBuilder} adapts the token
 *       list into a {@link Pp2TokenStream}: folds whitespace into
 *       {@code precedingBlanks} / {@code precedingLinebreaks} counts,
 *       preserves comments as first-class tokens.</li>
 *   <li><b>Ledger</b> — {@link SourceSpanLedger} records every byte of the
 *       original input so the assembler can restore inter-region trivia
 *       verbatim.</li>
 *   <li><b>Zone annotation</b> — {@link ProtectedZoneDetector} annotates
 *       {@code NO_FORMAT_ZONE}, {@code COMMENT_LINE}, {@code COMMENT_BLOCK},
 *       and template-placeholder roles onto the token stream.</li>
 *   <li><b>Boundary detection</b> — {@link StatementBoundaryDetector} walks
 *       the annotated stream and emits one {@link StatementRange} per
 *       statement. If no range is produced the source is returned
 *       unchanged.</li>
 *   <li><b>Parse recovery + dispatch (interleaved, per region)</b> — for
 *       each range in turn, {@link ParseRecoveryEngine} parses that single
 *       region ({@code parseRegion}, <em>not</em> the up-front
 *       {@code parseAll}) and tags the outcome
 *       {@code AST_OK | AST_ERROR | TRIVIA}; {@link EngineDispatch} then
 *       immediately routes the outcome to a renderer (plan §5.2):
 *       {@code AST_OK} → {@link GuardedAstDelegate} (fallback to a recovery
 *       renderer on guard failure); {@code AST_ERROR} → a recovery renderer
 *       ({@link LexicalIslandRenderer} or {@link ConservativeTokenRenderer});
 *       {@code TRIVIA} → passthrough. Each rendered text is wrapped in a
 *       {@link RenderedRegion}. Parsing and rendering are interleaved so
 *       that at most one region's parser/AST is live at a time.</li>
 *   <li><b>Assembly</b> — {@link RegionAssembler} interleaves the rendered
 *       texts with the original inter-region trivia from the ledger,
 *       producing the final output string.</li>
 *   <li><b>Result</b> — a {@link Pp2FormatResult} carrying the assembled
 *       text, {@link FormatStatus}, per-region {@link Pp2FormatResult.Region}
 *       records, and all diagnostics accumulated across the pipeline.</li>
 * </ol>
 *
 * <h2>Defensive behaviour</h2>
 *
 * <p>The engine never throws for non-null inputs. Any {@link Throwable} that
 * escapes the pipeline — from the tokenizer, ledger builder, boundary
 * detector, or assembler — is caught, logged via {@link PPLogger}, and the
 * engine falls back to returning the original SQL unchanged with
 * {@link FormatStatus#FAILED} and a {@link FormatDiagnostic.Severity#FATAL}
 * diagnostic.
 *
 * <h2>Thread safety</h2>
 *
 * <p>The engine instance is stateless (all mutable objects are allocated per
 * {@code format} call). Concurrent calls with different inputs are safe. A
 * fresh {@link ParseRecoveryEngine} is created per call, so its parser pool
 * ({@link gudusoft.gsqlparser.pp2.region.ParserPool}) is not shared between
 * calls.
 *
 * <p>Plan reference: §5.1, §7.3/S16, §7.4/S16, §10.4.
 */
public final class Pp2Engine {

    /**
     * Format the given SQL string using the supplied database vendor and
     * options. Never throws for non-null inputs: any failure inside the
     * pipeline is converted into a {@link FormatStatus#FAILED} result that
     * carries the original SQL and a single FATAL diagnostic.
     *
     * @param sql    the raw SQL to format; must not be null
     * @param vendor the database dialect; governs tokenization, keyword
     *               recognition, and boundary detection
     * @param opts   formatting options; must not be null
     * @return a {@link Pp2FormatResult} carrying the formatted text and
     *         per-region metadata; never null
     * @throws NullPointerException if any argument is null
     */
    public Pp2FormatResult format(String sql, EDbVendor vendor,
                                  Pp2FormatOptions opts) {
        if (sql == null) throw new NullPointerException("sql");
        if (vendor == null) throw new NullPointerException("vendor");
        if (opts == null) throw new NullPointerException("opts");

        try {
            return formatInternal(sql, vendor, opts);
        } catch (Throwable t) {
            // Top-level safety net: if anything escaped the pipeline, return
            // the original SQL unchanged so the caller never gets an exception.
            // Catching Throwable (not just Exception) is deliberate — it is the
            // documented "never throws" contract of this entry point.
            PPLogger.error(t);
            PPLogger.info("Pp2Engine: unhandled throwable escaped pipeline; "
                    + "returning original SQL unchanged. vendor=" + vendor
                    + " sqlLen=" + sql.length());
            List<FormatDiagnostic> diags = Collections.singletonList(
                    new FormatDiagnostic(FormatDiagnostic.Severity.FATAL, 0, sql.length(),
                            "Pp2Engine: pipeline threw " + t.getClass().getSimpleName()
                                    + (t.getMessage() != null ? ": " + t.getMessage() : "")));
            return new Pp2FormatResult(sql, FormatStatus.FAILED,
                    Collections.<Pp2FormatResult.Region>emptyList(), diags);
        }
    }

    /**
     * Run the full pipeline once. May throw; {@link #format} is the only
     * caller and wraps this in the top-level safety net.
     *
     * @param sql    the raw SQL to format; non-null (checked by the caller)
     * @param vendor the database dialect; non-null (checked by the caller)
     * @param opts   formatting options; non-null (checked by the caller)
     * @return the assembled result with status, per-region records and
     *         accumulated diagnostics
     */
    private Pp2FormatResult formatInternal(String sql, EDbVendor vendor,
                                           Pp2FormatOptions opts) {
        // All collaborators are allocated per call so the no-shared-mutable-state
        // (and therefore thread-safety) guarantee holds unconditionally, even if
        // a detector later grows internal scratch/cache state.
        Pp2TokenStreamBuilder streamBuilder = new Pp2TokenStreamBuilder();
        StatementBoundaryDetector boundaryDetector = new StatementBoundaryDetector();
        ProtectedZoneDetector zoneDetector = new ProtectedZoneDetector();
        RegionAssembler assembler = new RegionAssembler();

        // Stage 1: tokenize.
        TGSqlParser parser = new TGSqlParser(vendor);
        parser.sqltext = sql;
        parser.tokenizeSqltext();
        TSourceTokenList tokenList = parser.getSourcetokenlist();

        // Stage 2: build token spine.
        Pp2TokenStreamBuilder.BuildResult buildResult = streamBuilder.build(tokenList);
        Pp2TokenStream stream = buildResult.getStream();

        // Stage 3: build source-span ledger (byte authority).
        SourceSpanLedger ledger = SourceSpanLedger.build(sql, tokenList);

        // Stage 4: annotate protected zones (NO_FORMAT_ZONE, comments, etc.).
        zoneDetector.annotate(stream);

        // Stage 5: detect statement boundaries.
        List<StatementRange> ranges = boundaryDetector.detect(stream, vendor);

        // Empty stream → no regions; return source unchanged. The status is
        // still derived from the ledger diagnostics (rather than hard-coded
        // to OK) so a FATAL/ERROR/WARNING raised while building the ledger is
        // reflected in the result exactly as it is on the non-empty path.
        if (ranges.isEmpty()) {
            List<FormatDiagnostic> ledgerDiags =
                    new ArrayList<FormatDiagnostic>(ledger.getDiagnostics());
            FormatStatus emptyStatus = computeStatus(
                    Collections.<RenderedRegion>emptyList(), ledgerDiags);
            return new Pp2FormatResult(sql, emptyStatus,
                    Collections.<Pp2FormatResult.Region>emptyList(),
                    ledgerDiags);
        }

        // Stages 6+7 INTERLEAVED — parse and render each region in turn.
        //
        // Memory: an AST_OK region carries a live TGSqlParser (parse tables +
        // AST). Parsing every region up front (parseAll) and only then
        // rendering would keep N parsers alive simultaneously — on a script
        // with thousands of valid statements that exhausts the heap (root-caused
        // in S36: ~300KB/parser × N → OOM even at -Xmx2g). Interleaving keeps at
        // most ONE region's parser/AST live at a time: each region is parsed,
        // rendered, and then discarded before the next region's parse resets the
        // shared pool. This also removes the need to promote AST_OK outcomes to
        // a dedicated fresh parser (parseAll's strategy), cutting allocation
        // further. The rendered output is identical — the AST content a region
        // parses to is the same whichever parser instance holds it.
        ParseRecoveryEngine recovery = new ParseRecoveryEngine(
                vendor, sql, stream, opts);
        GuardedAstDelegate astDelegate = new GuardedAstDelegate(vendor);
        LexicalIslandRenderer lexicalIsland = new LexicalIslandRenderer(vendor);
        ConservativeTokenRenderer conservative = new ConservativeTokenRenderer();
        EngineDispatch dispatch = new EngineDispatch(astDelegate, lexicalIsland, conservative);

        List<RenderedRegion> renderedRegions = new ArrayList<RenderedRegion>(ranges.size());
        List<FormatDiagnostic> allDiags = new ArrayList<FormatDiagnostic>(ledger.getDiagnostics());
        List<Pp2FormatResult.Region> regionRecords =
                new ArrayList<Pp2FormatResult.Region>(ranges.size());

        // S33 AST overlay (v3 bridge): off by default. When enabled, annotate
        // each cleanly-parsed region's tokens with AST-derived roles. v2 does
        // not render from these roles, so this never changes output here — it
        // lands the annotation infrastructure a v3 renderer will consume.
        AstOverlayAnnotator overlay = opts.astOverlayEnabled
                ? new AstOverlayAnnotator() : null;

        for (StatementRange range : ranges) {
            RegionParseOutcome outcome = recovery.parseRegion(range);
            if (overlay != null) {
                applyAstOverlay(overlay, outcome, stream);
            }
            EngineDispatch.DispatchResult result = dispatch.dispatch(outcome, stream, opts);
            renderedRegions.add(new RenderedRegion(
                    outcome.getRange(), result.text, result.rendererId));
            allDiags.addAll(result.diagnostics);
            regionRecords.add(toRegionRecord(outcome, result.rendererId));
            // outcome is dropped here; the next parseRegion() resets the pool,
            // freeing this region's parser/AST. At most one is ever live.
        }

        // Stage 8: assemble into final output.
        String assembled = assembler.assemble(renderedRegions, ledger, opts);

        // Stage 9: compute overall status and build result.
        FormatStatus status = computeStatus(renderedRegions, allDiags);

        return new Pp2FormatResult(assembled, status, regionRecords, allDiags);
    }

    /**
     * Run the {@link AstOverlayAnnotator} on a single AST_OK region, stamping
     * AST-derived roles directly onto the engine's whole-script {@code stream}
     * (the tokens every renderer sees). The region was parsed from
     * {@code originalSql.substring(range.getStartOffset(), ...)}, so the region
     * AST's token offsets are region-relative; passing the range's start offset
     * as the adjustment translates them onto the absolute-offset stream.
     *
     * <p>Regions that are not {@code AST_OK}, or that carry no statement, are
     * skipped silently.
     *
     * <p>Best-effort: any failure is logged and swallowed so the feature flag
     * can never break formatting. In v2 no renderer consumes these roles, so
     * this stamping has no effect on output — it is the v3 evolution seam.
     *
     * @param overlay the annotator to run; non-null
     * @param outcome the parse outcome of the region being processed
     * @param stream  the whole-script token stream to annotate in place
     */
    private static void applyAstOverlay(AstOverlayAnnotator overlay,
                                        RegionParseOutcome outcome,
                                        Pp2TokenStream stream) {
        if (outcome.getStatus() != RegionParseOutcome.Status.AST_OK
                || outcome.getStatement() == null) {
            return;
        }
        try {
            overlay.annotate(outcome.getStatement(), stream,
                    outcome.getRange().getStartOffset());
        } catch (Throwable t) {
            // Deliberate swallow: the overlay is an optional annotation pass
            // and must never be able to fail the format call.
            PPLogger.error(t);
            PPLogger.info("Pp2Engine: AST overlay annotation failed for region "
                    + outcome.getRange() + "; continuing without overlay roles.");
        }
    }

    /**
     * Compute the overall {@link FormatStatus}, in priority order:
     * <ul>
     *   <li>{@link FormatStatus#FAILED} — at least one FATAL diagnostic.</li>
     *   <li>{@link FormatStatus#OK_WITH_RECOVERY} — no FATAL, and at least
     *       one region was rendered by a recovery renderer
     *       ({@link RendererId#LEXICAL_ISLAND} or
     *       {@link RendererId#CONSERVATIVE}), or at least one WARNING/ERROR
     *       diagnostic was emitted.</li>
     *   <li>{@link FormatStatus#OK} — none of the above: no region used a
     *       recovery renderer and no diagnostic is at WARNING or above.</li>
     * </ul>
     *
     * @param regions the rendered regions (may be empty)
     * @param diags   every diagnostic accumulated so far (may be empty)
     * @return the aggregate status for the whole format call
     */
    private static FormatStatus computeStatus(List<RenderedRegion> regions,
                                              List<FormatDiagnostic> diags) {
        // Single pass over the diagnostics: FATAL wins outright; remember
        // whether anything at WARNING/ERROR level was seen for later.
        boolean sawWarningOrError = false;
        for (FormatDiagnostic d : diags) {
            FormatDiagnostic.Severity severity = d.getSeverity();
            if (severity == FormatDiagnostic.Severity.FATAL) {
                return FormatStatus.FAILED;
            }
            if (severity == FormatDiagnostic.Severity.WARNING
                    || severity == FormatDiagnostic.Severity.ERROR) {
                sawWarningOrError = true;
            }
        }
        if (sawWarningOrError) {
            return FormatStatus.OK_WITH_RECOVERY;
        }
        for (RenderedRegion r : regions) {
            if (isRecoveryRenderer(r.getRendererId())) {
                return FormatStatus.OK_WITH_RECOVERY;
            }
        }
        return FormatStatus.OK;
    }

    /**
     * A region was recovered (not the clean AST path) if it used the island
     * or conservative renderer.
     *
     * @param id the renderer that produced a region's text
     * @return {@code true} for {@link RendererId#CONSERVATIVE} or
     *         {@link RendererId#LEXICAL_ISLAND}, {@code false} otherwise
     */
    private static boolean isRecoveryRenderer(RendererId id) {
        return id == RendererId.CONSERVATIVE || id == RendererId.LEXICAL_ISLAND;
    }

    /**
     * Build one per-region {@link Pp2FormatResult.Region} record from a parse
     * outcome and the renderer that produced its text. The region-local status
     * is {@code OK_WITH_RECOVERY} when a recovery renderer (island /
     * conservative) was used or the region parsed as {@code AST_ERROR}, and
     * {@code OK} otherwise. Called per region in the interleaved loop, so it
     * never needs to retain the full outcome list.
     *
     * @param outcome    the parse outcome for the region
     * @param rendererId the renderer that produced the region's text
     * @return a record spanning the outcome's range start/end offsets
     */
    private static Pp2FormatResult.Region toRegionRecord(RegionParseOutcome outcome,
                                                         RendererId rendererId) {
        // AST_ERROR without a recovery renderer shouldn't happen
        // (AST_ERROR → island/conservative), but be safe and still flag it.
        boolean recovered = isRecoveryRenderer(rendererId)
                || outcome.getStatus() == RegionParseOutcome.Status.AST_ERROR;
        FormatStatus regionStatus = recovered
                ? FormatStatus.OK_WITH_RECOVERY
                : FormatStatus.OK;
        return new Pp2FormatResult.Region(
                outcome.getRange().getStartOffset(),
                outcome.getRange().getEndOffset(),
                rendererId,
                regionStatus);
    }
}