001package gudusoft.gsqlparser.parser; 002 003import gudusoft.gsqlparser.EDbVendor; 004import gudusoft.gsqlparser.EErrorType; 005import gudusoft.gsqlparser.ESqlStatementType; 006import gudusoft.gsqlparser.ETokenStatus; 007import gudusoft.gsqlparser.ETokenType; 008import gudusoft.gsqlparser.TBaseType; 009import gudusoft.gsqlparser.TCustomLexer; 010import gudusoft.gsqlparser.TCustomParser; 011import gudusoft.gsqlparser.TCustomSqlStatement; 012import gudusoft.gsqlparser.TSourceToken; 013import gudusoft.gsqlparser.TSourceTokenList; 014import gudusoft.gsqlparser.TStatementList; 015import gudusoft.gsqlparser.TSyntaxError; 016import gudusoft.gsqlparser.compiler.TContext; 017import gudusoft.gsqlparser.compiler.TFrame; 018import gudusoft.gsqlparser.compiler.TGlobalScope; 019import gudusoft.gsqlparser.sqlcmds.ISqlCmds; 020import gudusoft.gsqlparser.sqlcmds.SqlCmdsFactory; 021import gudusoft.gsqlparser.sqlenv.TSQLEnv; 022import gudusoft.gsqlparser.stmt.TRoutine; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.FileInputStream; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.io.StringReader; 031import java.nio.charset.Charset; 032import java.util.ArrayList; 033import java.util.List; 034 035/** 036 * Abstract base class providing common logic and template methods for SQL parsing. 037 * 038 * <p>This class implements the <b>Template Method Pattern</b>, defining the skeleton 039 * of the parsing algorithm while allowing subclasses to override specific steps. 040 * It provides default implementations for common operations and hooks for 041 * vendor-specific customization. 
042 * 043 * <p><b>Design Pattern:</b> Template Method 044 * <ul> 045 * <li><b>Template Methods:</b> {@link #parse(ParserContext)}, {@link #tokenize(ParserContext)}</li> 046 * <li><b>Abstract Methods:</b> Must be implemented by subclasses</li> 047 * <li><b>Hook Methods:</b> Optional overrides for customization</li> 048 * </ul> 049 * 050 * <p><b>Parsing Algorithm (Template Method):</b> 051 * <ol> 052 * <li>Get lexer ({@link #getLexer(ParserContext)})</li> 053 * <li>Tokenize SQL ({@link #performTokenization(ParserContext, TCustomLexer)})</li> 054 * <li>Process tokens ({@link #processTokensBeforeParse(ParserContext, TSourceTokenList)})</li> 055 * <li>Get parser(s) ({@link #getParser(ParserContext, TSourceTokenList)})</li> 056 * <li>Parse SQL ({@link #performParsing(ParserContext, TCustomParser, TCustomParser, TSourceTokenList)})</li> 057 * <li>Semantic analysis ({@link #performSemanticAnalysis(ParserContext, TStatementList)})</li> 058 * </ol> 059 * 060 * <p><b>Subclass Responsibilities:</b> 061 * <pre> 062 * public class OracleSqlParser extends AbstractSqlParser { 063 * public OracleSqlParser() { 064 * super(EDbVendor.dbvoracle); 065 * this.delimiterChar = '/'; 066 * } 067 * 068 * // Must implement abstract methods 069 * protected TCustomLexer getLexer(ParserContext context) { 070 * return new TLexerOracle(); 071 * } 072 * 073 * protected TCustomParser getParser(ParserContext context, TSourceTokenList tokens) { 074 * return new TParserOracleSql(tokens); 075 * } 076 * 077 * // ... 
other abstract methods
 *
 *     // Optionally override hook methods
 *     protected void processTokensBeforeParse(ParserContext context, TSourceTokenList tokens) {
 *         // Oracle-specific token processing
 *     }
 * }
 * </pre>
 *
 * @see SqlParser
 * @see ParserContext
 * @see SqlParseResult
 * @since 3.2.0.0
 */
public abstract class AbstractSqlParser implements SqlParser {

    // Database vendor this parser targets; fixed at construction time.
    protected final EDbVendor vendor;

    // Statement delimiter character used by the lexer (default ';'; Oracle subclasses use '/').
    protected char delimiterChar = ';';

    // Default delimiter in string form, mirroring delimiterChar.
    protected String defaultDelimiterStr = ";";

    // Syntax errors collected during parsing; cleared at the start of each parse/extraction.
    protected List<TSyntaxError> syntaxErrors = new ArrayList<>();

    // ========== Core Parsing Components (Reused Across Parse Operations) ==========

    /**
     * Token list container - created once in constructor, cleared before each parse.
     * <p>This follows the component reuse pattern to avoid allocation overhead.
     */
    protected TSourceTokenList sourcetokenlist;

    /**
     * Statement list container - created once in constructor, cleared before each extraction.
     * <p>This follows the component reuse pattern to avoid allocation overhead.
     */
    protected TStatementList sqlstatements;

    /**
     * Current parser context for the ongoing parse operation.
     * <p>Set at the beginning of each parse operation, contains input SQL and options.
     * NOTE(review): this makes the parser stateful and NOT safe for concurrent use.
     */
    protected ParserContext parserContext;

    /**
     * SQL command resolver for identifying statement types (SELECT, INSERT, etc.).
     * <p>Initialized lazily using SqlCmdsFactory.get(vendor) - vendor-specific implementation.
     */
    protected ISqlCmds sqlcmds;

    /**
     * Token handler callback for processing tokens as they are created.
     * <p>Optional callback that gets invoked for each token created during tokenization.
     * May also be injected per-parse from the context (see performTokenization).
     */
    private gudusoft.gsqlparser.ITokenHandle tokenHandle = null;

    /**
     * The lexer instance used for tokenization.
     * <p>Subclasses should set this field in their constructor to their specific lexer instance.
     * This allows common tokenization logic in AbstractSqlParser to access the lexer generically.
     */
    protected TCustomLexer lexer = null;

    // ========== Semantic Analysis Infrastructure ==========

    /**
     * Global context for semantic analysis.
     * <p>Created during performParsing phase, contains SQL environment and statement references.
     */
    protected TContext globalContext;

    /**
     * SQL environment for semantic analysis.
     * <p>Vendor-specific environment configuration, used by resolver and semantic analysis.
     */
    protected TSQLEnv sqlEnv;

    /**
     * Frame stack for scope management during parsing.
     * <p>Used to track nested scopes (global, statement, block-level) during parsing.
     */
    protected java.util.Stack<TFrame> frameStack;

    /**
     * Global frame pushed to frame stack during parsing.
     * <p>Represents the outermost scope, must be popped after parsing completes.
     */
    protected TFrame globalFrame;

    /**
     * Pairs a ready-to-use SQL reader with the charset name it was opened with.
     * <p>Produced by the reader-preparation step and consumed by performTokenization,
     * which wires the reader into the lexer.
     */
    protected static class PreparedSqlReader {
        private final BufferedReader reader;
        private final String charset;

        protected PreparedSqlReader(BufferedReader reader, String charset) {
            this.reader = reader;
            this.charset = charset;
        }

        /** Returns the buffered reader over the SQL input. */
        public BufferedReader getReader() {
            return reader;
        }

        /** Returns the charset name used when opening the reader (may be null/empty). */
        public String getCharset() {
            return charset;
        }
    }

    /**
     * Construct parser for given database vendor.
     *
     * @param vendor the database vendor
     * @throws IllegalArgumentException if {@code vendor} is null
     */
    protected AbstractSqlParser(EDbVendor vendor) {
        if (vendor == null) {
            throw new IllegalArgumentException("vendor cannot be null");
        }
        this.vendor = vendor;

        // Initialize reusable containers (cleared before each use)
        this.sourcetokenlist = new TSourceTokenList();
        this.sqlstatements = new TStatementList();

        // Note: parserContext is set at the beginning of each parse operation
        // Note: sqlcmds is initialized lazily when first needed
    }

    @Override
    public EDbVendor getVendor() {
        return vendor;
    }

    /**
     * Set an event handler which will be fired when a new source token is created by the lexer during tokenization.
     *
     * @param tokenHandle the event handler to process the new created source token
     */
    public void setTokenHandle(gudusoft.gsqlparser.ITokenHandle tokenHandle) {
        this.tokenHandle = tokenHandle;
    }

    /**
     * Template method for full parsing.
     *
     * <p>This method defines the skeleton of the parsing algorithm.
     * Subclasses should NOT override this method; instead, they should
     * override the abstract methods and hook methods called by this template.
222 * 223 * <p><b>Algorithm:</b> 224 * <ol> 225 * <li>Create lexer</li> 226 * <li>Tokenize (time tracked)</li> 227 * <li>Process tokens (vendor-specific preprocessing)</li> 228 * <li>Create parser(s)</li> 229 * <li>Parse (time tracked)</li> 230 * <li>Semantic analysis (time tracked)</li> 231 * <li>Interpreter (time tracked)</li> 232 * </ol> 233 * 234 * @param context immutable context with all inputs 235 * @return immutable result with all outputs 236 */ 237 @Override 238 public final SqlParseResult parse(ParserContext context) { 239 // Clear syntax errors from previous parse 240 syntaxErrors.clear(); 241 242 try { 243 // Step 1: Get raw statements (internally calls tokenize() and extractRawStatements()) 244 SqlParseResult rawResult = getrawsqlstatements(context); 245 246 if (rawResult.getErrorCode() != 0) { 247 return rawResult; 248 } 249 250 // Get tokens, lexer, and RAW STATEMENTS from raw result 251 TSourceTokenList tokens = rawResult.getSourceTokenList(); 252 TCustomLexer lexer = rawResult.getLexer(); 253 TStatementList rawStatements = rawResult.getSqlStatements(); 254 255 // Step 2: Get parser(s) 256 TCustomParser parser = getParser(context, tokens); 257 TCustomParser secondaryParser = getSecondaryParser(context, tokens); 258 259 // Step 3: Full parsing (build AST for each raw statement) 260 SqlParseResult.Builder resultBuilder = new SqlParseResult.Builder(); 261 resultBuilder.lexer(lexer); 262 resultBuilder.sourceTokenList(tokens); 263 resultBuilder.tokenizationTimeMs(rawResult.getTokenizationTimeMs()); 264 resultBuilder.parser(parser); 265 266 long parseStart = System.currentTimeMillis(); 267 // Pass raw statements to performParsing - it will build AST for each statement 268 TStatementList statements = performParsing(context, parser, secondaryParser, tokens, rawStatements); 269 if (statements == null) { 270 statements = new TStatementList(); 271 } 272 resultBuilder.sqlStatements(statements); 273 resultBuilder.parsingTimeMs(System.currentTimeMillis() - 
parseStart); 274 275 // Step 4: Semantic analysis 276 if (!context.isOnlyNeedRawParseTree()) { 277 long semanticStart = System.currentTimeMillis(); 278 performSemanticAnalysis(context, statements); 279 resultBuilder.semanticAnalysisTimeMs(System.currentTimeMillis() - semanticStart); 280 } 281 282 // Step 5: Interpreter 283 if (!context.isOnlyNeedRawParseTree() && syntaxErrors.isEmpty()) { 284 long interpreterStart = System.currentTimeMillis(); 285 performInterpreter(context, statements); 286 resultBuilder.interpreterTimeMs(System.currentTimeMillis() - interpreterStart); 287 } 288 289 resultBuilder.syntaxErrors(syntaxErrors instanceof ArrayList ? 290 (ArrayList<TSyntaxError>) syntaxErrors : new ArrayList<>(syntaxErrors)); 291 resultBuilder.errorCode(syntaxErrors.isEmpty() ? 0 : syntaxErrors.size()); 292 resultBuilder.errorMessage(syntaxErrors.isEmpty() ? "" : 293 String.format("Parsing completed with %d error(s)", syntaxErrors.size())); 294 295 return resultBuilder.build(); 296 } catch (Exception e) { 297 e.printStackTrace(); 298 SqlParseResult.Builder resultBuilder = new SqlParseResult.Builder(); 299 resultBuilder.errorCode(1); 300 String errorMsg = "Parsing failed: " + e.getMessage(); 301 resultBuilder.errorMessage(errorMsg); 302 System.out.println(errorMsg+"File:\t"+context.getSqlFilename()); 303 if (context.isDumpResolverLog()) { 304 e.printStackTrace(); 305 } 306 return resultBuilder.build(); 307 } 308 } 309 310 /** 311 * Template method for tokenization only (without full parsing). 312 * 313 * <p>This method is used by {@code getrawsqlstatements()} which only 314 * needs tokenization and raw statement extraction, without detailed 315 * syntax checking or semantic analysis. 
316 * 317 * <p><b>Algorithm:</b> 318 * <ol> 319 * <li>Get lexer</li> 320 * <li>Tokenize (time tracked)</li> 321 * <li>Extract raw statements (no parsing)</li> 322 * </ol> 323 * 324 * @param context immutable context with all inputs 325 * @return immutable result with tokens and raw statements 326 */ 327 @Override 328 public final SqlParseResult tokenize(ParserContext context) { 329 SqlParseResult.Builder resultBuilder = new SqlParseResult.Builder(); 330 331 try { 332 // Step 1: Get lexer (vendor-specific instance, may be cached) 333 TCustomLexer lexer = getLexer(context); 334 if (lexer == null) { 335 throw new IllegalStateException("getLexer() returned null"); 336 } 337 resultBuilder.lexer(lexer); 338 339 // Step 2: Perform tokenization 340 long tokenStart = System.currentTimeMillis(); 341 TSourceTokenList tokens = performTokenization(context, lexer); 342 if (tokens == null) { 343 throw new IllegalStateException("performTokenization() returned null"); 344 } 345 346 // Step 3: Post-tokenization processing (CRITICAL for correct behavior) 347 // These steps must run after tokenization to prepare tokens for parsing 348 349 // Step 3a: Post-tokenization normalization 350 doAfterTokenize(tokens); 351 352 // Step 3b: Reset token chain (CRITICAL FIX) 353 // Links all tokens via getNextTokenInChain() - required for TObjectName.toString() 354 TBaseType.resetTokenChain(tokens, 0); 355 356 // Step 3c: Process tokens using token table 357 // Vendor-specific token code adjustments (e.g., BigQuery/Snowflake DO keyword handling) 358 processTokensInTokenTable(context, lexer, tokens); 359 360 // Step 3d: Pre-parse token processing 361 // Pre-parse preprocessing (e.g., Snowflake duplicate semicolon removal) 362 processTokensBeforeParse(context, tokens); 363 364 resultBuilder.sourceTokenList(tokens); 365 resultBuilder.tokenizationTimeMs(System.currentTimeMillis() - tokenStart); 366 367 // Success 368 resultBuilder.errorCode(0); 369 resultBuilder.errorMessage(""); 370 371 } catch 
(Exception e) { 372 // Error occurred 373 resultBuilder.errorCode(1); 374 String errorMsg = "Tokenization failed: " + e.getMessage(); 375 resultBuilder.errorMessage(errorMsg); 376 377 // Log error if enabled 378 if (context.isDumpResolverLog()) { 379 e.printStackTrace(); 380 } 381 } 382 383 return resultBuilder.build(); 384 } 385 386 /** 387 * Template method for extracting raw statements without full parsing. 388 * 389 * <p>This method performs tokenization and raw statement extraction, 390 * but skips the expensive full parsing and semantic analysis steps. 391 * 392 * <p><b>Algorithm:</b> 393 * <ol> 394 * <li>Tokenize SQL (via {@link #tokenize(ParserContext)})</li> 395 * <li>Extract raw statements (via {@link #extractRawStatements(ParserContext, TSourceTokenList, TCustomLexer, long)})</li> 396 * <li>Return result with tokens and raw statements</li> 397 * </ol> 398 * 399 * <p><b>Equivalent to legacy API:</b> {@code TGSqlParser.getrawsqlstatements()} 400 * 401 * @param context immutable context with all inputs 402 * @return immutable result with tokens and raw statements (no AST) 403 */ 404 @Override 405 public final SqlParseResult getrawsqlstatements(ParserContext context) { 406 try { 407 // Step 1: Tokenize with all post-processing (calls tokenize()) 408 SqlParseResult tokenizeResult = tokenize(context); 409 410 // Check tokenization result 411 if (tokenizeResult.getErrorCode() != 0) { 412 return tokenizeResult; 413 } 414 415 // Get tokens and lexer from tokenize result 416 TSourceTokenList tokens = tokenizeResult.getSourceTokenList(); 417 TCustomLexer lexer = tokenizeResult.getLexer(); 418 long tokenizationTimeMs = tokenizeResult.getTokenizationTimeMs(); 419 420 // Step 2: Extract raw statements (vendor-specific) 421 // Vendor implementation creates builder, populates it, and returns complete result 422 SqlParseResult extractResult = extractRawStatements(context, tokens, lexer, tokenizationTimeMs); 423 424 return extractResult; 425 } catch (Exception e) { 426 
e.printStackTrace(); 427 SqlParseResult.Builder resultBuilder = new SqlParseResult.Builder(); 428 resultBuilder.errorCode(1); 429 resultBuilder.errorMessage("Raw statement extraction failed: " + e.getMessage() ); 430 if (context.isDumpResolverLog()) { 431 e.printStackTrace(); 432 } 433 return resultBuilder.build(); 434 } 435 } 436 437 438 439 // ========== Abstract Methods (MUST be implemented by subclasses) ========== 440 441 /** 442 * Get the lexer for this vendor. 443 * 444 * <p><b>Subclass Responsibility:</b> Return vendor-specific lexer instance. 445 * The lexer may be created fresh or cached/reused for performance. 446 * 447 * <p><b>Example:</b> 448 * <pre> 449 * protected TCustomLexer getLexer(ParserContext context) { 450 * TLexerOracle lexer = new TLexerOracle(); 451 * lexer.delimiterchar = delimiterChar; 452 * lexer.defaultDelimiterStr = defaultDelimiterStr; 453 * return lexer; 454 * } 455 * </pre> 456 * 457 * @param context the parser context 458 * @return configured lexer instance (never null) 459 */ 460 protected abstract TCustomLexer getLexer(ParserContext context); 461 462 /** 463 * Get the main parser for this vendor. 464 * 465 * <p><b>Subclass Responsibility:</b> Return vendor-specific parser instance. 466 * The parser may be created fresh or cached/reused for performance. 467 * If reusing, the token list should be updated. 468 * 469 * <p><b>Example:</b> 470 * <pre> 471 * protected TCustomParser getParser(ParserContext context, TSourceTokenList tokens) { 472 * TParserOracleSql parser = new TParserOracleSql(tokens); 473 * parser.lexer = getLexer(context); 474 * return parser; 475 * } 476 * </pre> 477 * 478 * @param context the parser context 479 * @param tokens the source token list 480 * @return configured parser instance (never null) 481 */ 482 protected abstract TCustomParser getParser(ParserContext context, TSourceTokenList tokens); 483 484 /** 485 * Perform tokenization using vendor-specific lexer. 
     *
     * <p><b>Template Method:</b> This method implements the common tokenization
     * algorithm across all database vendors. Subclasses customize through one hook:
     * {@link #tokenizeVendorSql()} - Call vendor-specific tokenization logic
     *
     * <p><b>Algorithm:</b>
     * <ol>
     * <li>Store parser context</li>
     * <li>Prepare SQL reader (file/string with charset detection)</li>
     * <li>Configure lexer with input reader and charset</li>
     * <li>Reset lexer state</li>
     * <li>Clear token list and reset position</li>
     * <li>Reset token table cache</li>
     * <li>Call {@link #tokenizeVendorSql()} hook</li>
     * <li>Return populated token list</li>
     * </ol>
     *
     * @param context parser context with SQL input configuration
     * @param lexer the lexer instance (same object as the {@code lexer} field set by the subclass)
     * @return token list populated by vendor-specific tokenization
     * @throws RuntimeException if tokenization fails (original exception preserved as cause)
     */
    protected TSourceTokenList performTokenization(ParserContext context, TCustomLexer lexer) {
        this.parserContext = context;

        // Set token handle from context if provided (allows TGSqlParser.setTokenHandle() to work)
        if (context.getTokenHandle() != null) {
            this.tokenHandle = context.getTokenHandle();
        }

        try {
            // Reader preparation handles file/string input and charset detection.
            PreparedSqlReader prepared = prepareSqlReader(context);
            BufferedReader finputstream = prepared.getReader();
            String effectiveCharset = prepared.getCharset();

            // Configure lexer with input (lexer is vendor-specific flexer from subclass)
            lexer.yyinput = finputstream;
            if (effectiveCharset != null && !effectiveCharset.isEmpty()) {
                lexer.setSqlCharset(effectiveCharset);
            }
            lexer.reset();

            // Reset token list (reused container; curpos rewound to before-first)
            this.sourcetokenlist.clear();
            this.sourcetokenlist.curpos = -1;

            // Reset token table cache
            lexer.resetTokenTable();

            // HOOK: Call vendor-specific tokenization (populates this.sourcetokenlist)
            tokenizeVendorSql();

            return this.sourcetokenlist;

        } catch (Exception e) {
            throw new RuntimeException("Tokenization failed: " + e.getMessage(), e);
        }
    }

    /**
     * Call vendor-specific tokenization logic.
     *
     * <p><b>Hook Method:</b> Called by {@link #performTokenization} to execute
     * vendor-specific SQL-to-token conversion logic.
     *
     * <p><b>Subclass Responsibility:</b> Call the vendor-specific tokenization method
     * (e.g., dooraclesqltexttotokenlist, domssqlsqltexttotokenlist) which reads
     * from lexer and populates sourcetokenlist.
     *
     * <p><b>Example (Oracle):</b>
     * <pre>
     * protected void tokenizeVendorSql() {
     *     dooraclesqltexttotokenlist();
     * }
     * </pre>
     *
     * <p><b>Example (MSSQL):</b>
     * <pre>
     * protected void tokenizeVendorSql() {
     *     domssqlsqltexttotokenlist();
     * }
     * </pre>
     *
     * <p><b>Example (PostgreSQL):</b>
     * <pre>
     * protected void tokenizeVendorSql() {
     *     dopostgresqltexttotokenlist();
     * }
     * </pre>
     */
    protected abstract void tokenizeVendorSql();

    /**
     * Extract raw statements without full parsing (public API).
     *
     * <p>This public method allows external callers (like TGSqlParser) to extract
     * raw statements from an already-tokenized source list without re-tokenization.
     *
     * @param context the parser context
     * @param tokens the source token list (already tokenized)
     * @return statement list (never null)
     * @since 3.2.0.0
     */
    public final TStatementList doExtractRawStatements(ParserContext context, TSourceTokenList tokens) {
        // Create a dummy lexer since we already have tokens
        TCustomLexer lexer = getLexer(context);

        // Call vendor-specific extraction and extract statement list from result
        // (tokenization time is 0 because no tokenization happens here)
        SqlParseResult result = extractRawStatements(context, tokens, lexer, 0);
        return result.getSqlStatements() != null ? result.getSqlStatements() : new TStatementList();
    }

    /**
     * Extract raw statements without full parsing.
     *
     * <p><b>Template Method:</b> This method implements the common algorithm for
     * extracting raw statements across all database vendors. Subclasses customize
     * the process through two hook methods:
     * <ul>
     * <li>{@link #setupVendorParsersForExtraction()} - Initialize vendor parsers</li>
     * <li>{@link #extractVendorRawStatements(SqlParseResult.Builder)} - Call vendor extraction logic</li>
     * </ul>
     *
     * <p><b>Algorithm:</b>
     * <ol>
     * <li>Create SqlParseResult.Builder</li>
     * <li>Set common fields (lexer, tokens, tokenization time)</li>
     * <li>Store context and tokens for extraction</li>
     * <li>Initialize SQL command resolver</li>
     * <li>Call {@link #setupVendorParsersForExtraction()} hook</li>
     * <li>Time the extraction</li>
     * <li>Call {@link #extractVendorRawStatements(SqlParseResult.Builder)} hook</li>
     * <li>Set parsing time</li>
     * <li>Build and return result</li>
     * </ol>
     *
     * @param context the parser context
     * @param tokens the source token list
     * @param lexer the lexer instance (for including in result)
     * @param tokenizationTimeMs tokenization time from tokenize() step
     * @return complete SqlParseResult with raw statements and metadata
     */
    protected SqlParseResult extractRawStatements(ParserContext context,
                                                  TSourceTokenList tokens,
                                                  TCustomLexer lexer,
                                                  long tokenizationTimeMs) {
        // Create builder for result construction
        SqlParseResult.Builder builder = new SqlParseResult.Builder();

        // Set common result fields
        builder.lexer(lexer);
        builder.sourceTokenList(tokens);
        builder.tokenizationTimeMs(tokenizationTimeMs);

        // CRITICAL: Include parser(s) in result so TGSqlParser can use them in common parsing loop
        TCustomParser parser = getParser(context, tokens);
        builder.parser(parser);

        // Include secondary parser for vendors that have one (e.g., Oracle PL/SQL parser)
        TCustomParser secondaryParser = getSecondaryParser(context, tokens);
        if (secondaryParser != null) {
            builder.secondaryParser(secondaryParser);
        }

        // Store context and tokens for extraction
        this.sourcetokenlist = tokens;
        if (this.sqlstatements == null) {
            this.sqlstatements = new TStatementList();
        } else {
            this.sqlstatements.clear();
        }
        this.syntaxErrors.clear(); // Clear syntax errors from previous extraction
        this.parserContext = context;

        // Initialize SQL command resolver (if not already done)
        if (this.sqlcmds == null) {
            this.sqlcmds = SqlCmdsFactory.get(vendor);
        }

        // HOOK 1: Vendor-specific parser setup (sqlcmds injection, token list update)
        setupVendorParsersForExtraction();

        // Time the extraction
        long extractStart = System.currentTimeMillis();

        // HOOK 2: Call vendor-specific raw statement extraction
        extractVendorRawStatements(builder);

        builder.parsingTimeMs(System.currentTimeMillis() - extractStart);

        // Add extracted statements to result
        builder.sqlStatements(this.sqlstatements);

        // Add collected syntax errors to result (copied if the field is not already an ArrayList)
        if (!syntaxErrors.isEmpty()) {
            builder.syntaxErrors(syntaxErrors instanceof ArrayList ?
                    (ArrayList<TSyntaxError>) syntaxErrors : new ArrayList<>(syntaxErrors));
        }

        return builder.build();
    }

    /**
     * Setup vendor-specific parsers for raw statement extraction.
     *
     * <p><b>Hook Method:</b> Called by {@link #extractRawStatements} after initializing
     * sqlcmds but before calling the vendor-specific extraction logic.
     *
     * <p><b>Subclass Responsibility:</b> Inject sqlcmds into vendor parser(s) and
     * update their token lists.
 Examples:
     * <ul>
     * <li><b>Single parser (MSSQL):</b> Inject into fparser only</li>
     * <li><b>Dual parsers (Oracle):</b> Inject into both fparser and fplsqlparser</li>
     * </ul>
     *
     * <p><b>Example (MSSQL):</b>
     * <pre>
     * protected void setupVendorParsersForExtraction() {
     *     this.fparser.sqlcmds = this.sqlcmds;
     *     this.fparser.sourcetokenlist = this.sourcetokenlist;
     * }
     * </pre>
     *
     * <p><b>Example (Oracle with dual parsers):</b>
     * <pre>
     * protected void setupVendorParsersForExtraction() {
     *     this.fparser.sqlcmds = this.sqlcmds;
     *     this.fplsqlparser.sqlcmds = this.sqlcmds;
     *     this.fparser.sourcetokenlist = this.sourcetokenlist;
     *     this.fplsqlparser.sourcetokenlist = this.sourcetokenlist;
     * }
     * </pre>
     */
    protected abstract void setupVendorParsersForExtraction();

    /**
     * Call vendor-specific raw statement extraction logic.
     *
     * <p><b>Hook Method:</b> Called by {@link #extractRawStatements} to execute
     * the vendor-specific logic for identifying statement boundaries.
     *
     * <p><b>Subclass Responsibility:</b> Call the vendor-specific extraction method
     * (e.g., dooraclegetrawsqlstatements, domssqlgetrawsqlstatements) passing the
     * builder. The extraction method will populate the builder with raw statements.
     *
     * <p><b>Example (Oracle):</b>
     * <pre>
     * protected void extractVendorRawStatements(SqlParseResult.Builder builder) {
     *     dooraclegetrawsqlstatements(builder);
     * }
     * </pre>
     *
     * <p><b>Example (MSSQL):</b>
     * <pre>
     * protected void extractVendorRawStatements(SqlParseResult.Builder builder) {
     *     domssqlgetrawsqlstatements(builder);
     * }
     * </pre>
     *
     * @param builder the result builder to populate with raw statements
     */
    protected abstract void extractVendorRawStatements(SqlParseResult.Builder builder);

    /**
     * Perform actual parsing with syntax checking.
     *
     * <p><b>Subclass Responsibility:</b> Parse SQL using vendor-specific parser
     * and optional secondary parser (e.g., PL/SQL for Oracle).
     *
     * <p><b>Important:</b> This method receives raw statements that have already been
     * extracted by {@link #getrawsqlstatements(ParserContext)}. Subclasses should NOT
     * re-extract statements - just parse each statement to build the AST.
     *
     * <p><b>Example:</b>
     * <pre>
     * protected TStatementList performParsing(ParserContext context,
     *                                         TCustomParser parser,
     *                                         TCustomParser secondaryParser,
     *                                         TSourceTokenList tokens,
     *                                         TStatementList rawStatements) {
     *     // Use the passed-in rawStatements (DO NOT re-extract!)
     *     for (int i = 0; i &lt; rawStatements.size(); i++) {
     *         TCustomSqlStatement stmt = rawStatements.get(i);
     *         stmt.parsestatement(...); // Build AST for each statement
     *     }
     *     return rawStatements;
     * }
     * </pre>
     *
     * @param context the parser context
     * @param parser the main parser instance
     * @param secondaryParser secondary parser (may be null)
     * @param tokens the source token list
     * @param rawStatements raw statements already extracted (never null)
     * @return statement list with parsed AST (never null)
     */
    protected abstract TStatementList performParsing(ParserContext context,
                                                     TCustomParser parser,
                                                     TCustomParser secondaryParser,
                                                     TSourceTokenList tokens,
                                                     TStatementList rawStatements);

    // ========== Hook Methods (MAY be overridden by subclasses) ==========

    /**
     * Get secondary parser (e.g., PL/SQL for Oracle).
     *
     * <p><b>Hook Method:</b> Default implementation returns null.
     * Override if vendor needs a secondary parser.
     * The parser may be created fresh or cached/reused for performance.
796 * 797 * <p><b>Example (Oracle):</b> 798 * <pre> 799 * protected TCustomParser getSecondaryParser(ParserContext context, TSourceTokenList tokens) { 800 * TParserOraclePLSql plsqlParser = new TParserOraclePLSql(tokens); 801 * plsqlParser.lexer = getLexer(context); 802 * return plsqlParser; 803 * } 804 * </pre> 805 * 806 * @param context the parser context 807 * @param tokens the source token list 808 * @return secondary parser instance, or null if not needed 809 */ 810 protected TCustomParser getSecondaryParser(ParserContext context, TSourceTokenList tokens) { 811 return null; // Most vendors don't need this 812 } 813 814 /** 815 * Post-tokenization normalization. 816 * <p> 817 * Handles matching parentheses wrapping around SQL and marks semicolons 818 * before closing parens to be ignored. 819 * <p> 820 * Extracted from: TGSqlParser.doAfterTokenize() (lines 5123-5161) 821 * 822 * @param tokens the source token list (mutable) 823 */ 824 protected void doAfterTokenize(TSourceTokenList tokens) { 825 int leftParenCount = 0; 826 int rightParenCount = 0; 827 int leftIndex = 0; 828 int rightIndex = tokens.size() - 1; 829 830 // Count opening parentheses at the beginning 831 while (leftIndex < tokens.size() && tokens.get(leftIndex).tokencode == '(') { 832 leftParenCount++; 833 leftIndex++; 834 } 835 836 // Count closing parentheses at the end 837 while (rightIndex >= 0 && tokens.get(rightIndex).tokencode == ')') { 838 rightParenCount++; 839 rightIndex--; 840 } 841 842 // Set matching parentheses to be ignored 843 int parensToIgnore = Math.min(leftParenCount, rightParenCount); 844 // if there is a semicolon before the right parenthesis, set the semicolon to be ignored 845 // mantisbt/view.php?id=3690 846 847 if ((parensToIgnore > 0) && (tokens.get(tokens.size() - 1 - (parensToIgnore - 1) - 1).tokencode == ';')){ 848 // set to whitespace that this semicolon will be ignored during getting raw sql 849 tokens.get(tokens.size() - 1 - (parensToIgnore - 1) - 1).tokentype = 
ETokenType.ttwhitespace; 850 // set to ignore by yacc that this semicolon will be ignored during parsing 851 tokens.get(tokens.size() - 1 - (parensToIgnore - 1) - 1).tokenstatus = ETokenStatus.tsignorebyyacc; 852 } 853 } 854 855 /** 856 * Process tokens using token table (vendor-specific token code adjustments). 857 * <p> 858 * Currently handles BigQuery and Snowflake to convert DO keywords to identifiers 859 * when there's no corresponding WHILE/FOR. 860 * <p> 861 * Extracted from: TGSqlParser.processTokensInTokenTable() (lines 5186-5209) 862 * 863 * @param context the parser context 864 * @param lexer the lexer (for accessing TOKEN_TABLE) 865 * @param tokens the source token list (mutable) 866 */ 867 protected void processTokensInTokenTable(ParserContext context, TCustomLexer lexer, TSourceTokenList tokens) { 868 // Get token table from lexer 869 long[][] TOKEN_TABLE1 = lexer.TOKEN_TABLE; 870 871 switch (vendor){ 872 case dbvbigquery: 873 case dbvsnowflake: 874 // case 1, DO keyword: if no corresponding FOR, WHILE etc keywords found, 875 // set DO keyword's token code to TBaseType.ident 876 if (TOKEN_TABLE1[TBaseType.rrw_do][0] > 0){ 877 if ((TOKEN_TABLE1[TBaseType.rrw_while][0] == 0) && (TOKEN_TABLE1[TBaseType.rrw_for][0] == 0)){ 878 for(int i=0; i<tokens.size(); i++){ 879 TSourceToken st = tokens.get(i); 880 if (st.tokencode == TBaseType.rrw_do){ 881 st.tokencode = TBaseType.ident; 882 } 883 } 884 } 885 } 886 break; 887 } 888 } 889 890 /** 891 * Process tokens before parsing (vendor-specific adjustments). 892 * 893 * <p><b>Hook Method:</b> Default implementation handles Snowflake consecutive semicolons. 894 * Override if vendor needs additional token preprocessing. 
895 * 896 * <p>Extracted from: TGSqlParser.processTokensBeforeParse() (lines 5165-5184) 897 * 898 * <p><b>Example:</b> 899 * <pre> 900 * protected void processTokensBeforeParse(ParserContext context, TSourceTokenList tokens) { 901 * super.processTokensBeforeParse(context, tokens); // Call base implementation 902 * // Add vendor-specific processing... 903 * } 904 * </pre> 905 * 906 * @param context the parser context 907 * @param tokens the source token list (mutable) 908 */ 909 protected void processTokensBeforeParse(ParserContext context, TSourceTokenList tokens) { 910 // For performance, only process for Snowflake as this is currently only needed there 911 // mantisbt/view.php?id=3579 912 if (vendor != EDbVendor.dbvsnowflake) return; 913 914 // If there are consecutive semicolon tokens, mark the second semicolon token as deleted 915 for(int i=0; i<tokens.size(); i++){ 916 TSourceToken st = tokens.get(i); 917 if (st.tokencode == ';'){ 918 TSourceToken nextToken = st.nextSolidToken(); 919 if (nextToken != null){ 920 if (nextToken.tokencode == ';'){ 921 nextToken.tokenstatus = ETokenStatus.tsdeleted; 922 } 923 } 924 } 925 } 926 } 927 928 /** 929 * Perform semantic analysis on parsed statements. 930 * 931 * <p><b>Hook Method:</b> Default implementation does nothing. 932 * Override to provide vendor-specific semantic analysis. 933 * 934 * <p><b>Typical Implementation:</b> 935 * <ul> 936 * <li>Column-to-table resolution (TSQLResolver)</li> 937 * <li>Dataflow analysis</li> 938 * <li>Reference resolution</li> 939 * <li>Scope resolution</li> 940 * </ul> 941 * 942 * @param context the parser context 943 * @param statements the parsed statements (mutable) 944 */ 945 protected void performSemanticAnalysis(ParserContext context, TStatementList statements) { 946 // Default implementation: no semantic analysis 947 // Subclasses can override for vendor-specific behavior 948 } 949 950 /** 951 * Perform interpretation/evaluation on parsed statements. 
952 * 953 * <p><b>Hook Method:</b> Default implementation does nothing. 954 * Override to provide AST interpretation/evaluation. 955 * 956 * <p><b>Typical Implementation:</b> 957 * <ul> 958 * <li>Execute simple SQL statements</li> 959 * <li>Evaluate expressions</li> 960 * <li>Constant folding</li> 961 * <li>Static analysis</li> 962 * </ul> 963 * 964 * @param context the parser context 965 * @param statements the parsed statements (mutable) 966 */ 967 protected void performInterpreter(ParserContext context, TStatementList statements) { 968 // Default implementation: no interpreter 969 // Subclasses can override to provide AST interpretation 970 } 971 972 /** 973 * Copy error messages from a statement to the parser's error collection. 974 * 975 * <p>This method should be called by performParsing implementations 976 * when a statement has syntax errors. 977 * 978 * @param statement the statement with errors 979 */ 980 protected void copyErrorsFromStatement(TCustomSqlStatement statement) { 981 if (statement == null || statement.getSyntaxErrors() == null) { 982 return; 983 } 984 985 for (int i = 0; i < statement.getSyntaxErrors().size(); i++) { 986 this.syntaxErrors.add(new TSyntaxError((TSyntaxError) statement.getSyntaxErrors().get(i))); 987 } 988 } 989 990 /** 991 * Attempt error recovery for CREATE TABLE/INDEX statements with unsupported options. 992 * 993 * <p>When parsing CREATE TABLE or CREATE INDEX statements, the parser may encounter 994 * vendor-specific options that are not in the grammar. This method implements the 995 * legacy error recovery behavior by marking unsupported tokens after the main 996 * definition as SQL*Plus commands (effectively ignoring them). 
     *
     * <p><b>Recovery Strategy:</b>
     * <ol>
     *   <li>Find the closing ')' of the column/index definitions (nested=0)</li>
     *   <li>Mark all remaining tokens (except ';') as sqlpluscmd to ignore them</li>
     *   <li>Clear errors and re-parse the statement</li>
     * </ol>
     *
     * <p><b>When to call:</b> After parsing a statement that has errors.
     * Only recovers if ENABLE_ERROR_RECOVER_IN_CREATE_TABLE is true.
     *
     * @param statement the statement to attempt recovery on
     * @param parseResult the result code from parsing (0 = success)
     * @param onlyNeedRawParseTree whether only raw parse tree is needed
     * @return new parse result after recovery attempt, or original if no recovery
     */
    protected int attemptErrorRecovery(TCustomSqlStatement statement, int parseResult, boolean onlyNeedRawParseTree) {
        boolean doRecover = TBaseType.ENABLE_ERROR_RECOVER_IN_CREATE_TABLE;

        // Recovery is attempted only when the previous parse failed or reported errors.
        if (doRecover && ((parseResult != 0) || (statement.getErrorCount() > 0))) {
            // Limited to CREATE TABLE, and CREATE INDEX for all vendors except Couchbase,
            // and only when strict CREATE TABLE parsing is disabled.
            if (((statement.sqlstatementtype == ESqlStatementType.sstcreatetable)
                    || ((statement.sqlstatementtype == ESqlStatementType.sstcreateindex) && (this.vendor != EDbVendor.dbvcouchbase))
                ) && (!TBaseType.c_createTableStrictParsing)
            ) {
                // Only parse main body of create table/index, ignore unsupported options after closing ')'
                int nested = 0;                       // parenthesis nesting depth
                boolean isIgnore = false;             // true once the main definition's closing ')' was seen
                boolean isFoundIgnoreToken = false;   // true if any solid token follows the definition
                TSourceToken firstIgnoreToken = null; // first solid token after the definition (for Oracle check)

                for (int k = 0; k < statement.sourcetokenlist.size(); k++) {
                    TSourceToken st = statement.sourcetokenlist.get(k);
                    if (isIgnore) {
                        // Everything after the definition (except ';') is re-tagged as
                        // a SQL*Plus command so the grammar skips it on re-parse.
                        if (st.issolidtoken() && (st.tokencode != ';')) {
                            isFoundIgnoreToken = true;
                            if (firstIgnoreToken == null) {
                                firstIgnoreToken = st;
                            }
                        }
                        if (st.tokencode != ';') {
                            st.tokencode = TBaseType.sqlpluscmd;
                        }
                        continue;
                    }
                    if (st.tokencode == (int) ')') {
                        nested--;
                        if (nested == 0) {
                            // Check if next token is "AS ( SELECT" (table created from select);
                            // in that case the tail is real SQL and must NOT be ignored.
                            boolean isSelect = false;
                            TSourceToken st1 = st.searchToken(TBaseType.rrw_as, 1);
                            if (st1 != null) {
                                TSourceToken st2 = st.searchToken((int) '(', 2);
                                if (st2 != null) {
                                    TSourceToken st3 = st.searchToken(TBaseType.rrw_select, 3);
                                    isSelect = (st3 != null);
                                }
                            }
                            if (!isSelect) isIgnore = true;
                        }
                    }
                    if ((st.tokencode == (int) '(') || (st.tokencode == TBaseType.left_parenthesis_2)) {
                        nested++;
                    }
                }

                // For Oracle, validate that ignored tokens are valid table properties;
                // otherwise the tail is genuinely bad SQL and re-parsing won't help.
                if ((this.vendor == EDbVendor.dbvoracle) && (firstIgnoreToken != null)
                        && (!TBaseType.searchOracleTablePros(firstIgnoreToken.toString()))) {
                    // Not a valid Oracle table property, don't ignore
                    isFoundIgnoreToken = false;
                }

                if (isFoundIgnoreToken) {
                    // Drop the old errors and re-parse with the tail tokens masked out.
                    statement.clearError();
                    parseResult = statement.parsestatement(null, false, onlyNeedRawParseTree);
                }
            }
        }

        return parseResult;
    }

    /**
     * Get the syntax errors collected during parsing.
     *
     * @return list of syntax errors (never null)
     */
    public List<TSyntaxError> getSyntaxErrors() {
        return syntaxErrors;
    }

    /**
     * Get the count of syntax errors.
     *
     * @return number of syntax errors
     */
    public int getErrorCount() {
        return syntaxErrors.size();
    }

    /**
     * Check if a token is a dollar function delimiter ($$, $tag$, etc.) for PostgreSQL-family databases.
     * <p>
     * Migrated from TGSqlParser.isDollarFunctionDelimiter() (lines 5074-5080).
     * <p>
     * Dollar-quoted strings are used in PostgreSQL-family databases to delimit function bodies.
     * Each vendor has its own delimiter token code.
1104 * 1105 * @param tokencode the token code to check 1106 * @param dbVendor the database vendor 1107 * @return true if the token is a dollar function delimiter for the given vendor 1108 */ 1109 protected boolean isDollarFunctionDelimiter(int tokencode, EDbVendor dbVendor) { 1110 return ((tokencode == TBaseType.rrw_postgresql_function_delimiter) && (dbVendor == EDbVendor.dbvpostgresql)) 1111 || ((tokencode == TBaseType.rrw_greenplum_function_delimiter) && (dbVendor == EDbVendor.dbvgreenplum)) 1112 || ((tokencode == TBaseType.rrw_redshift_function_delimiter) && (dbVendor == EDbVendor.dbvredshift)) 1113 || ((tokencode == TBaseType.rrw_snowflake_function_delimiter) && (dbVendor == EDbVendor.dbvsnowflake)) 1114 || ((tokencode == TBaseType.rrw_clickhouse_function_delimiter) && (dbVendor == EDbVendor.dbvclickhouse)); 1115 } 1116 1117 /** 1118 * Hook method called when a raw statement is complete. 1119 * <p> 1120 * This method is called by vendor-specific raw statement extraction methods 1121 * (e.g., dooraclegetrawsqlstatements) when a statement boundary is detected. 1122 * It sets up the statement with parser references and adds it to the statement list. 
1123 * 1124 * @param context parser context 1125 * @param statement the completed statement 1126 * @param mainParser main parser instance 1127 * @param secondaryParser secondary parser instance (may be null) 1128 * @param statementList statement list to add to 1129 * @param isLastStatement true if this is the last statement 1130 * @param builder optional result builder (used during raw statement extraction, may be null) 1131 */ 1132 protected void onRawStatementComplete(ParserContext context, 1133 TCustomSqlStatement statement, 1134 TCustomParser mainParser, 1135 TCustomParser secondaryParser, 1136 TStatementList statementList, 1137 boolean isLastStatement, 1138 SqlParseResult.Builder builder) { 1139 if (statement == null || statementList == null) { 1140 return; 1141 } 1142 1143 // CRITICAL: Set gsqlparser reference NOW (before parsing) so nested statements 1144 // can access parser's dbvendor via getGsqlparser().getDbVendor() 1145 // This matches legacy behavior from doongetrawsqlstatementevent() 1146 if (context != null && context.getGsqlparser() != null) { 1147 // Cast to TGSqlParser - we know the type from buildContext() 1148 statement.setGsqlparser((gudusoft.gsqlparser.TGSqlParser) context.getGsqlparser()); 1149 } 1150 statement.parser = mainParser; 1151 statement.plsqlparser = secondaryParser; 1152 1153 if (statement.sourcetokenlist != null && statement.sourcetokenlist.size() > 0) { 1154 TSourceToken startToken = statement.sourcetokenlist.get(0); 1155 TSourceToken endToken = statement.sourcetokenlist.get(statement.sourcetokenlist.size() - 1); 1156 1157 statement.setStartToken(startToken); 1158 statement.setEndToken(endToken); 1159 1160 if (!isLastStatement && context != null && endToken != null) { 1161 builder.lastTokenOfStatementBeenValidated(endToken); 1162 } 1163 } 1164 1165 // Vendor-specific statement completion logic (migrated from TGSqlParser.doongetrawsqlstatementevent lines 5129-5178) 1166 onRawStatementCompleteVendorSpecific(statement); 1167 1168 
statementList.add(statement); 1169 1170 } 1171 1172 /** 1173 * Hook for vendor-specific logic when a raw statement is completed. 1174 * <p> 1175 * Migrated from TGSqlParser.doongetrawsqlstatementevent() (lines 5129-5178). 1176 * <p> 1177 * This method is called after basic statement setup but before adding to the statement list. 1178 * Subclasses can override to add vendor-specific token manipulations or metadata. 1179 * <p> 1180 * Default implementation handles PostgreSQL-family routine body processing. 1181 * 1182 * @param statement the completed statement 1183 */ 1184 protected void onRawStatementCompleteVendorSpecific(TCustomSqlStatement statement) { 1185 // Handle PostgreSQL-family databases: Mark non-SQL/PLSQL routine body tokens 1186 // Migrated from TGSqlParser.doongetrawsqlstatementevent() lines 5143-5178 1187 if (((this.vendor == EDbVendor.dbvpostgresql) || (this.vendor == EDbVendor.dbvgreenplum) 1188 || (this.vendor == EDbVendor.dbvredshift) || (this.vendor == EDbVendor.dbvsnowflake) 1189 || (this.vendor == EDbVendor.dbvclickhouse)) 1190 && (statement instanceof TRoutine)) { 1191 1192 TRoutine routine = (TRoutine) statement; 1193 if (!routine.isBodyInSQL()) { 1194 TSourceToken st; 1195 boolean inBody = false; 1196 String routineBodyStr = ""; 1197 1198 for (int i = 0; i < statement.sourcetokenlist.size(); i++) { 1199 st = statement.sourcetokenlist.get(i); 1200 1201 // Check for dollar function delimiter ($$, $tag$, etc.) 
1202 if (isDollarFunctionDelimiter(st.tokencode, this.vendor)) { 1203 if (!inBody) { 1204 inBody = true; 1205 routineBodyStr = st.toString(); 1206 } else { 1207 inBody = false; 1208 routineBodyStr += st.toString(); 1209 break; 1210 } 1211 continue; 1212 } 1213 1214 if (inBody) { 1215 // Mark body tokens as sqlpluscmd so they're not parsed as SQL 1216 st.tokencode = TBaseType.sqlpluscmd; 1217 routineBodyStr += st.toString(); 1218 } 1219 } 1220 1221 routine.setRoutineBody(routineBodyStr); 1222 } 1223 } 1224 } 1225 1226 private static final int ENCODING_UTF16 = 1; 1227 private static final int ENCODING_UTF32 = 2; 1228 private static final int ENCODING_UTF8_BOM = 3; 1229 1230 protected PreparedSqlReader prepareSqlReader(ParserContext context) throws IOException { 1231 BufferedReader reader; 1232 String effectiveCharset = context.getSqlCharset(); 1233 1234 if (context.getSqlText() != null) { 1235 reader = new BufferedReader(new StringReader(context.getSqlText())); 1236 return new PreparedSqlReader(reader, effectiveCharset); 1237 } 1238 1239 if (context.getSqlFilename() != null && !context.getSqlFilename().isEmpty()) { 1240 FileInputStream fileStream = new FileInputStream(context.getSqlFilename()); 1241 BufferedInputStream bufferedStream = new BufferedInputStream(fileStream, 8); 1242 int encodingType = detectEncodingFromBom(bufferedStream); 1243 String charsetToUse = resolveCharsetName(encodingType, context.getSqlCharset()); 1244 InputStreamReader streamReader = new InputStreamReader(bufferedStream, charsetToUse); 1245 reader = new BufferedReader(streamReader); 1246 skipBomIfPresent(reader, encodingType); 1247 return new PreparedSqlReader(reader, charsetToUse); 1248 } 1249 1250 InputStream contextStream = context.getSqlInputStream(); 1251 if (contextStream != null) { 1252 BufferedInputStream bufferedStream = (contextStream instanceof BufferedInputStream) 1253 ? 
(BufferedInputStream) contextStream 1254 : new BufferedInputStream(contextStream, 8); 1255 int encodingType = detectEncodingFromBom(bufferedStream); 1256 String charsetToUse = resolveCharsetName(encodingType, context.getSqlCharset()); 1257 InputStreamReader streamReader = new InputStreamReader(bufferedStream, charsetToUse); 1258 reader = new BufferedReader(streamReader); 1259 skipBomIfPresent(reader, encodingType); 1260 return new PreparedSqlReader(reader, charsetToUse); 1261 } 1262 1263 // Default: empty input is valid, return reader for empty string 1264 reader = new BufferedReader(new StringReader("")); 1265 return new PreparedSqlReader(reader, effectiveCharset); 1266 } 1267 1268 private int detectEncodingFromBom(BufferedInputStream stream) throws IOException { 1269 if (stream == null || !stream.markSupported()) { 1270 return 0; 1271 } 1272 1273 byte[] bom = new byte[4]; 1274 stream.mark(bom.length + 1); 1275 int read = stream.read(bom, 0, bom.length); 1276 stream.reset(); 1277 1278 if (read < 2) { 1279 return 0; 1280 } 1281 1282 if (((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) 1283 || ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF))) { 1284 if (read >= 4 && (((bom[2] == (byte) 0xFF) && (bom[3] == (byte) 0xFE)) 1285 || ((bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)))) { 1286 return ENCODING_UTF32; 1287 } 1288 return ENCODING_UTF16; 1289 } 1290 1291 if (read >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) { 1292 return ENCODING_UTF8_BOM; 1293 } 1294 1295 return 0; 1296 } 1297 1298 private String resolveCharsetName(int encodingType, String contextCharset) { 1299 switch (encodingType) { 1300 case ENCODING_UTF16: 1301 return "UTF-16"; 1302 case ENCODING_UTF32: 1303 return "UTF-32"; 1304 case ENCODING_UTF8_BOM: 1305 return "UTF-8"; 1306 default: 1307 if (contextCharset != null && !contextCharset.isEmpty()) { 1308 return contextCharset; 1309 } 1310 return Charset.defaultCharset().name(); 1311 } 1312 } 1313 1314 
private void skipBomIfPresent(BufferedReader reader, int encodingType) throws IOException { 1315 if (encodingType != ENCODING_UTF8_BOM || reader == null || !reader.markSupported()) { 1316 return; 1317 } 1318 1319 reader.mark(1); 1320 int ch = reader.read(); 1321 if (ch != 0xFEFF && ch != -1) { 1322 reader.reset(); 1323 } 1324 } 1325 1326 // ========== Utility Methods ========== 1327 1328 /** 1329 * Initialize global context and frame stack for statement parsing. 1330 * <p> 1331 * This method sets up the semantic analysis infrastructure required during 1332 * the parsing phase. It creates: 1333 * <ul> 1334 * <li>Global context (TContext) for semantic analysis</li> 1335 * <li>SQL environment (TSQLEnv) with vendor-specific configuration</li> 1336 * <li>Frame stack for scope management</li> 1337 * <li>Global scope frame as the outermost scope</li> 1338 * </ul> 1339 * 1340 * <p><b>When to call:</b> At the beginning of performParsing(), before parsing statements. 1341 * 1342 * <p><b>Cleanup required:</b> Must call {@code globalFrame.popMeFromStack(frameStack)} 1343 * after all statements are parsed to clean up the frame stack. 1344 * 1345 * <p><b>Extracted from:</b> Identical implementations in OracleSqlParser and MssqlSqlParser 1346 * to eliminate ~16 lines of duplicate code per parser. 
1347 */ 1348 protected void initializeGlobalContext() { 1349 // Initialize global context for semantic analysis 1350 this.globalContext = new TContext(); 1351 this.sqlEnv = new TSQLEnv(this.vendor) { 1352 @Override 1353 public void initSQLEnv() { 1354 // Vendor-specific initialization can be added by subclasses if needed 1355 } 1356 }; 1357 this.globalContext.setSqlEnv(this.sqlEnv, this.sqlstatements); 1358 1359 // Create global scope frame 1360 this.frameStack = new java.util.Stack<TFrame>(); 1361 TGlobalScope globalScope = new TGlobalScope(); 1362 globalScope.resetCurrentStmtIndex(); 1363 globalScope.setSqlEnv(this.sqlEnv); 1364 this.globalFrame = new TFrame(globalScope); 1365 this.globalFrame.pushMeToStack(this.frameStack); 1366 } 1367 1368 /** 1369 * Handle exceptions that occur during individual statement parsing. 1370 * <p> 1371 * This method provides robust error handling that allows parsing to continue 1372 * even when individual statements throw exceptions. It: 1373 * <ul> 1374 * <li>Creates a detailed {@link TSyntaxError} with exception information</li> 1375 * <li>Captures statement location (line, column) from first token</li> 1376 * <li>Includes statement number, exception type, and message</li> 1377 * <li>Optionally logs full stack trace if debugging is enabled</li> 1378 * <li>Adds error to {@link #syntaxErrors} list for user feedback</li> 1379 * </ul> 1380 * 1381 * <p><b>Benefits:</b> 1382 * <ul> 1383 * <li>Parsing continues for remaining statements after exception</li> 1384 * <li>Users get complete error feedback for all statements</li> 1385 * <li>Developers get stack traces for debugging parser issues</li> 1386 * </ul> 1387 * 1388 * <p><b>Example error message:</b><br> 1389 * {@code "Exception during parsing statement 3: NullPointerException - Cannot invoke..."} 1390 * 1391 * <p><b>Extracted from:</b> Identical implementations in OracleSqlParser and MssqlSqlParser 1392 * to eliminate ~51 lines of duplicate code per parser. 
1393 * 1394 * @param stmt the statement that failed to parse 1395 * @param statementIndex 0-based index of the statement in the statement list 1396 * @param ex the exception that was thrown during parsing 1397 */ 1398 protected void handleStatementParsingException(TCustomSqlStatement stmt, int statementIndex, Exception ex) { 1399 // Create user-friendly error message with context 1400 String errorMsg = String.format("Exception during parsing statement %d: %s - %s", 1401 statementIndex + 1, // Convert to 1-based for user readability 1402 ex.getClass().getSimpleName(), 1403 ex.getMessage() != null ? ex.getMessage() : "No details"); 1404 1405 // Get first token of statement for error location 1406 TSourceToken firstToken = null; 1407 if (stmt.sourcetokenlist != null && stmt.sourcetokenlist.size() > 0) { 1408 firstToken = stmt.sourcetokenlist.get(0); 1409 } 1410 1411 // Create syntax error with exception details 1412 TSyntaxError syntaxError; 1413 if (firstToken != null) { 1414 // Use token location for accurate error reporting 1415 syntaxError = new TSyntaxError( 1416 firstToken.getAstext(), 1417 firstToken.lineNo, 1418 firstToken.columnNo, 1419 errorMsg, 1420 EErrorType.sperror, 1421 TBaseType.MSG_ERROR_SYNTAX_ERROR, 1422 stmt, 1423 firstToken.posinlist 1424 ); 1425 } else { 1426 // Fallback if no token info available 1427 syntaxError = new TSyntaxError( 1428 "", 1429 0, 1430 0, 1431 errorMsg, 1432 EErrorType.sperror, 1433 TBaseType.MSG_ERROR_SYNTAX_ERROR, 1434 stmt, 1435 -1 1436 ); 1437 } 1438 1439 this.syntaxErrors.add(syntaxError); 1440 1441 // Log to console if debugging enabled 1442 if (TBaseType.DUMP_RESOLVER_LOG_TO_CONSOLE) { 1443 System.err.println("ERROR: " + errorMsg); 1444 ex.printStackTrace(); 1445 } 1446 } 1447 1448 /** 1449 * Hook method for vendor-specific post-processing after a statement is parsed. 1450 * <p> 1451 * This method is called after each statement is successfully parsed but before 1452 * error recovery and error collection. 
Subclasses can override this to perform 1453 * vendor-specific operations such as: 1454 * <ul> 1455 * <li>Checking for vendor-specific syntax errors in nested statements</li> 1456 * <li>Validating vendor-specific constraints</li> 1457 * <li>Collecting vendor-specific metadata</li> 1458 * </ul> 1459 * 1460 * <p><b>Default implementation:</b> Does nothing (no-op). 1461 * 1462 * <p><b>Example override (Oracle):</b><br> 1463 * <pre>{@code 1464 * @Override 1465 * protected void afterStatementParsed(TCustomSqlStatement stmt) { 1466 * if (stmt.isoracleplsql()) { 1467 * findAllSyntaxErrorsInPlsql(stmt); 1468 * } 1469 * } 1470 * }</pre> 1471 * 1472 * <p><b>When called:</b> After {@code stmt.parsestatement()} succeeds, 1473 * before {@code handleCreateTableErrorRecovery()} and {@code copyErrorsFromStatement()}. 1474 * 1475 * @param stmt the statement that was just parsed 1476 */ 1477 protected void afterStatementParsed(TCustomSqlStatement stmt) { 1478 // Default: no additional processing 1479 // Subclasses override to add vendor-specific post-processing 1480 } 1481 1482 /** 1483 * Get next source token from the lexer. 1484 * <p> 1485 * This method wraps the lexer's yylexwrap() call and performs several important tasks: 1486 * <ul> 1487 * <li>Fetches the next raw token from the lexer</li> 1488 * <li>Combines consecutive whitespace/newline tokens for cleaner token stream</li> 1489 * <li>Sets token metadata (vendor, status, container, position in list)</li> 1490 * <li>Optionally calls token handler callback</li> 1491 * </ul> 1492 * 1493 * <p><b>Token Consolidation Rules:</b> 1494 * <ul> 1495 * <li>Whitespace after a newline is merged into the newline token</li> 1496 * <li>Consecutive newlines are merged into a single newline token</li> 1497 * </ul> 1498 * 1499 * <p><b>Implementation Note:</b> 1500 * This method is extracted from TGSqlParser.getanewsourcetoken() and made 1501 * available to all database-specific parsers to avoid code duplication. 
     *
     * @return next source token, or null if end of input
     */
    protected TSourceToken getanewsourcetoken() {
        TSourceToken pst = null, prevst;

        while (true) {
            pst = new TSourceToken("");
            if (lexer.yylexwrap(pst) == 0) {
                // Lexer exhausted: signal end of input with null
                pst = null;
                break;
            }

            pst.setDbvendor(vendor);
            pst.tokenstatus = ETokenStatus.tsoriginal;

            // Normalize line-break text (currently a pass-through, see towinlinebreak)
            if (pst.tokentype == ETokenType.ttreturn) {
                pst.setAstext(towinlinebreak(pst.getAstext()));
            }

            // Combine space & linebreak after a linebreak into one
            if ((pst.tokentype == ETokenType.ttwhitespace)
                    && (sourcetokenlist.curpos >= 0)) {
                prevst = sourcetokenlist.get(sourcetokenlist.curpos);
                if (prevst.tokentype == ETokenType.ttreturn) {
                    // Can't discard whitespace after linebreak, it will be used
                    // to judge whether / at the beginning of the line is a sqlplus cmd or not
                    // check isValidPlaceForDivToSqlplusCmd for more
                    prevst.setAstext(prevst.getAstext() + pst.getAstext());
                    continue;  // merged into previous token; fetch the next one
                }
            }

            // Combine consecutive newlines
            if ((pst.tokentype == ETokenType.ttreturn)
                    && (sourcetokenlist.curpos >= 0)) {
                prevst = sourcetokenlist.get(sourcetokenlist.curpos);

                if (prevst.tokentype == ETokenType.ttreturn) {
                    prevst.setAstext(prevst.getAstext() + pst.getAstext());
                    continue;  // merged into previous token; fetch the next one
                }

                // Note: The original code has a commented section about merging
                // whitespace with newline. We're preserving the behavior here
                // which does NOT merge preceding whitespace with newline.
            }

            break;
        }

        if (pst != null) {
            // Register the token in the shared list and record its position
            pst.container = sourcetokenlist;
            sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
            pst.posinlist = sourcetokenlist.curpos;

            // Optional token handler callback
            if (tokenHandle != null) {
                tokenHandle.processToken(pst);
            }
        }

        // NOTE(review): called even when pst is null at end of input — presumably
        // setTokenTableValue tolerates null; confirm before relying on it.
        lexer.setTokenTableValue(pst);
        return pst;
    }

    /**
     * Convert line breaks to Windows format.
     * <p>
     * Currently returns the input unchanged. This method exists for compatibility
     * with the original TGSqlParser implementation.
     *
     * @param s Input string
     * @return String with Windows line breaks (currently unchanged)
     */
    protected String towinlinebreak(String s) {
        return s;
        // if (s == null) return null;
        // return s.replace("\n", "\r\n");
    }

    /**
     * Get the delimiter character for this vendor.
     *
     * @return delimiter character (e.g., ';', '/', '$')
     */
    public char getDelimiterChar() {
        return delimiterChar;
    }

    /**
     * Get the default delimiter string for this vendor.
     *
     * @return default delimiter string
     */
    public String getDefaultDelimiterStr() {
        return defaultDelimiterStr;
    }

    @Override
    public String toString() {
        return getClass().getSimpleName() + "{vendor=" + vendor + "}";
    }
}