001package gudusoft.gsqlparser.parser; 002 003import gudusoft.gsqlparser.EDbVendor; 004import gudusoft.gsqlparser.EErrorType; 005import gudusoft.gsqlparser.ESqlStatementType; 006import gudusoft.gsqlparser.ETokenStatus; 007import gudusoft.gsqlparser.ETokenType; 008import gudusoft.gsqlparser.TBaseType; 009import gudusoft.gsqlparser.TCustomLexer; 010import gudusoft.gsqlparser.TCustomParser; 011import gudusoft.gsqlparser.TCustomSqlStatement; 012import gudusoft.gsqlparser.TSourceToken; 013import gudusoft.gsqlparser.TSourceTokenList; 014import gudusoft.gsqlparser.TStatementList; 015import gudusoft.gsqlparser.TSyntaxError; 016import gudusoft.gsqlparser.compiler.TContext; 017import gudusoft.gsqlparser.compiler.TFrame; 018import gudusoft.gsqlparser.compiler.TGlobalScope; 019import gudusoft.gsqlparser.sqlcmds.ISqlCmds; 020import gudusoft.gsqlparser.sqlcmds.SqlCmdsFactory; 021import gudusoft.gsqlparser.sqlenv.TSQLEnv; 022import gudusoft.gsqlparser.stmt.TRoutine; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.FileInputStream; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.io.StringReader; 031import java.nio.charset.Charset; 032import java.util.ArrayList; 033import java.util.List; 034 035/** 036 * Abstract base class providing common logic and template methods for SQL parsing. 037 * 038 * <p>This class implements the <b>Template Method Pattern</b>, defining the skeleton 039 * of the parsing algorithm while allowing subclasses to override specific steps. 040 * It provides default implementations for common operations and hooks for 041 * vendor-specific customization. 
042 * 043 * <p><b>Design Pattern:</b> Template Method 044 * <ul> 045 * <li><b>Template Methods:</b> {@link #parse(ParserContext)}, {@link #tokenize(ParserContext)}</li> 046 * <li><b>Abstract Methods:</b> Must be implemented by subclasses</li> 047 * <li><b>Hook Methods:</b> Optional overrides for customization</li> 048 * </ul> 049 * 050 * <p><b>Parsing Algorithm (Template Method):</b> 051 * <ol> 052 * <li>Get lexer ({@link #getLexer(ParserContext)})</li> 053 * <li>Tokenize SQL ({@link #performTokenization(ParserContext, TCustomLexer)})</li> 054 * <li>Process tokens ({@link #processTokensBeforeParse(ParserContext, TSourceTokenList)})</li> 055 * <li>Get parser(s) ({@link #getParser(ParserContext, TSourceTokenList)})</li> 056 * <li>Parse SQL ({@link #performParsing(ParserContext, TCustomParser, TCustomParser, TSourceTokenList)})</li> 057 * <li>Semantic analysis ({@link #performSemanticAnalysis(ParserContext, TStatementList)})</li> 058 * </ol> 059 * 060 * <p><b>Subclass Responsibilities:</b> 061 * <pre> 062 * public class OracleSqlParser extends AbstractSqlParser { 063 * public OracleSqlParser() { 064 * super(EDbVendor.dbvoracle); 065 * this.delimiterChar = '/'; 066 * } 067 * 068 * // Must implement abstract methods 069 * protected TCustomLexer getLexer(ParserContext context) { 070 * return new TLexerOracle(); 071 * } 072 * 073 * protected TCustomParser getParser(ParserContext context, TSourceTokenList tokens) { 074 * return new TParserOracleSql(tokens); 075 * } 076 * 077 * // ... 
other abstract methods
 *
 *     // Optionally override hook methods
 *     protected void processTokensBeforeParse(ParserContext context, TSourceTokenList tokens) {
 *         // Oracle-specific token processing
 *     }
 * }
 * </pre>
 *
 * @see SqlParser
 * @see ParserContext
 * @see SqlParseResult
 * @since 3.2.0.0
 */
public abstract class AbstractSqlParser implements SqlParser {

    // Database vendor this parser targets; fixed at construction time.
    protected final EDbVendor vendor;

    // Statement delimiter character used by the lexer (default ';'; Oracle subclasses use '/').
    protected char delimiterChar = ';';

    // Default delimiter in string form, mirroring delimiterChar.
    protected String defaultDelimiterStr = ";";

    // Syntax errors collected during parsing; cleared at the start of each parse/extraction.
    protected List<TSyntaxError> syntaxErrors = new ArrayList<>();

    // ========== Core Parsing Components (Reused Across Parse Operations) ==========

    /**
     * Token list container - created once in constructor, cleared before each parse.
     * <p>This follows the component reuse pattern to avoid allocation overhead.
     */
    protected TSourceTokenList sourcetokenlist;

    /**
     * Statement list container - created once in constructor, cleared before each extraction.
     * <p>This follows the component reuse pattern to avoid allocation overhead.
     */
    protected TStatementList sqlstatements;

    /**
     * Current parser context for the ongoing parse operation.
     * <p>Set at the beginning of each parse operation, contains input SQL and options.
     * NOTE(review): this makes the parser stateful and NOT safe for concurrent use.
     */
    protected ParserContext parserContext;

    /**
     * SQL command resolver for identifying statement types (SELECT, INSERT, etc.).
     * <p>Initialized lazily using SqlCmdsFactory.get(vendor) - vendor-specific implementation.
     */
    protected ISqlCmds sqlcmds;

    /**
     * Token handler callback for processing tokens as they are created.
     * <p>Optional callback that gets invoked for each token created during tokenization.
     * May also be injected per-parse from the context (see performTokenization).
     */
    private gudusoft.gsqlparser.ITokenHandle tokenHandle = null;

    /**
     * The lexer instance used for tokenization.
     * <p>Subclasses should set this field in their constructor to their specific lexer instance.
     * This allows common tokenization logic in AbstractSqlParser to access the lexer generically.
     */
    protected TCustomLexer lexer = null;

    // ========== Semantic Analysis Infrastructure ==========

    /**
     * Global context for semantic analysis.
     * <p>Created during performParsing phase, contains SQL environment and statement references.
     */
    protected TContext globalContext;

    /**
     * SQL environment for semantic analysis.
     * <p>Vendor-specific environment configuration, used by resolver and semantic analysis.
     */
    protected TSQLEnv sqlEnv;

    /**
     * Frame stack for scope management during parsing.
     * <p>Used to track nested scopes (global, statement, block-level) during parsing.
     */
    protected java.util.Stack<TFrame> frameStack;

    /**
     * Global frame pushed to frame stack during parsing.
     * <p>Represents the outermost scope, must be popped after parsing completes.
     */
    protected TFrame globalFrame;

    /**
     * Pairs a ready-to-use SQL reader with the charset name it was opened with.
     * <p>Produced by the reader-preparation step and consumed by performTokenization,
     * which wires the reader into the lexer.
     */
    protected static class PreparedSqlReader {
        private final BufferedReader reader;
        private final String charset;

        protected PreparedSqlReader(BufferedReader reader, String charset) {
            this.reader = reader;
            this.charset = charset;
        }

        /** Returns the buffered reader over the SQL input. */
        public BufferedReader getReader() {
            return reader;
        }

        /** Returns the charset name used when opening the reader (may be null/empty). */
        public String getCharset() {
            return charset;
        }
    }

    /**
     * Construct parser for given database vendor.
     *
     * @param vendor the database vendor
     * @throws IllegalArgumentException if {@code vendor} is null
     */
    protected AbstractSqlParser(EDbVendor vendor) {
        if (vendor == null) {
            throw new IllegalArgumentException("vendor cannot be null");
        }
        this.vendor = vendor;

        // Initialize reusable containers (cleared before each use)
        this.sourcetokenlist = new TSourceTokenList();
        this.sqlstatements = new TStatementList();

        // Note: parserContext is set at the beginning of each parse operation
        // Note: sqlcmds is initialized lazily when first needed
    }

    @Override
    public EDbVendor getVendor() {
        return vendor;
    }

    /**
     * Set an event handler which will be fired when a new source token is created by the lexer during tokenization.
     *
     * @param tokenHandle the event handler to process the new created source token
     */
    public void setTokenHandle(gudusoft.gsqlparser.ITokenHandle tokenHandle) {
        this.tokenHandle = tokenHandle;
    }

    /**
     * Template method for full parsing.
     *
     * <p>This method defines the skeleton of the parsing algorithm.
     * Subclasses should NOT override this method; instead, they should
     * override the abstract methods and hook methods called by this template.
222 * 223 * <p><b>Algorithm:</b> 224 * <ol> 225 * <li>Create lexer</li> 226 * <li>Tokenize (time tracked)</li> 227 * <li>Process tokens (vendor-specific preprocessing)</li> 228 * <li>Create parser(s)</li> 229 * <li>Parse (time tracked)</li> 230 * <li>Semantic analysis (time tracked)</li> 231 * <li>Interpreter (time tracked)</li> 232 * </ol> 233 * 234 * @param context immutable context with all inputs 235 * @return immutable result with all outputs 236 */ 237 @Override 238 public final SqlParseResult parse(ParserContext context) { 239 // Clear syntax errors from previous parse 240 syntaxErrors.clear(); 241 242 try { 243 // Step 1: Get raw statements (internally calls tokenize() and extractRawStatements()) 244 SqlParseResult rawResult = getrawsqlstatements(context); 245 246 if (rawResult.getErrorCode() != 0) { 247 return rawResult; 248 } 249 250 // Get tokens, lexer, and RAW STATEMENTS from raw result 251 TSourceTokenList tokens = rawResult.getSourceTokenList(); 252 TCustomLexer lexer = rawResult.getLexer(); 253 TStatementList rawStatements = rawResult.getSqlStatements(); 254 255 // Step 2: Get parser(s) 256 TCustomParser parser = getParser(context, tokens); 257 TCustomParser secondaryParser = getSecondaryParser(context, tokens); 258 259 // Step 3: Full parsing (build AST for each raw statement) 260 SqlParseResult.Builder resultBuilder = new SqlParseResult.Builder(); 261 resultBuilder.lexer(lexer); 262 resultBuilder.sourceTokenList(tokens); 263 resultBuilder.tokenizationTimeMs(rawResult.getTokenizationTimeMs()); 264 resultBuilder.parser(parser); 265 266 long parseStart = System.currentTimeMillis(); 267 // Pass raw statements to performParsing - it will build AST for each statement 268 TStatementList statements = performParsing(context, parser, secondaryParser, tokens, rawStatements); 269 if (statements == null) { 270 statements = new TStatementList(); 271 } 272 resultBuilder.sqlStatements(statements); 273 resultBuilder.parsingTimeMs(System.currentTimeMillis() - 
parseStart); 274 275 // Step 4: Semantic analysis 276 if (!context.isOnlyNeedRawParseTree()) { 277 long semanticStart = System.currentTimeMillis(); 278 performSemanticAnalysis(context, statements); 279 resultBuilder.semanticAnalysisTimeMs(System.currentTimeMillis() - semanticStart); 280 } 281 282 // Step 5: Interpreter 283 if (!context.isOnlyNeedRawParseTree() && syntaxErrors.isEmpty()) { 284 long interpreterStart = System.currentTimeMillis(); 285 performInterpreter(context, statements); 286 resultBuilder.interpreterTimeMs(System.currentTimeMillis() - interpreterStart); 287 } 288 289 resultBuilder.syntaxErrors(syntaxErrors instanceof ArrayList ? 290 (ArrayList<TSyntaxError>) syntaxErrors : new ArrayList<>(syntaxErrors)); 291 resultBuilder.errorCode(syntaxErrors.isEmpty() ? 0 : syntaxErrors.size()); 292 resultBuilder.errorMessage(syntaxErrors.isEmpty() ? "" : 293 String.format("Parsing completed with %d error(s)", syntaxErrors.size())); 294 295 return resultBuilder.build(); 296 } catch (Exception e) { 297 e.printStackTrace(); 298 SqlParseResult.Builder resultBuilder = new SqlParseResult.Builder(); 299 resultBuilder.errorCode(1); 300 String errorMsg = "Parsing failed: " + e.getMessage(); 301 resultBuilder.errorMessage(errorMsg); 302 System.out.println(errorMsg+"File:\t"+context.getSqlFilename()); 303 if (context.isDumpResolverLog()) { 304 e.printStackTrace(); 305 } 306 return resultBuilder.build(); 307 } 308 } 309 310 /** 311 * Template method for tokenization only (without full parsing). 312 * 313 * <p>This method is used by {@code getrawsqlstatements()} which only 314 * needs tokenization and raw statement extraction, without detailed 315 * syntax checking or semantic analysis. 
316 * 317 * <p><b>Algorithm:</b> 318 * <ol> 319 * <li>Get lexer</li> 320 * <li>Tokenize (time tracked)</li> 321 * <li>Extract raw statements (no parsing)</li> 322 * </ol> 323 * 324 * @param context immutable context with all inputs 325 * @return immutable result with tokens and raw statements 326 */ 327 @Override 328 public final SqlParseResult tokenize(ParserContext context) { 329 SqlParseResult.Builder resultBuilder = new SqlParseResult.Builder(); 330 331 try { 332 // Step 1: Get lexer (vendor-specific instance, may be cached) 333 TCustomLexer lexer = getLexer(context); 334 if (lexer == null) { 335 throw new IllegalStateException("getLexer() returned null"); 336 } 337 resultBuilder.lexer(lexer); 338 339 // Step 2: Perform tokenization 340 long tokenStart = System.currentTimeMillis(); 341 TSourceTokenList tokens = performTokenization(context, lexer); 342 if (tokens == null) { 343 throw new IllegalStateException("performTokenization() returned null"); 344 } 345 346 // Step 3: Post-tokenization processing (CRITICAL for correct behavior) 347 // These steps must run after tokenization to prepare tokens for parsing 348 349 // Step 3a: Post-tokenization normalization 350 doAfterTokenize(tokens); 351 352 // Step 3b: Reset token chain (CRITICAL FIX) 353 // Links all tokens via getNextTokenInChain() - required for TObjectName.toString() 354 TBaseType.resetTokenChain(tokens, 0); 355 356 // Step 3c: Process tokens using token table 357 // Vendor-specific token code adjustments (e.g., BigQuery/Snowflake DO keyword handling) 358 processTokensInTokenTable(context, lexer, tokens); 359 360 // Step 3d: Pre-parse token processing 361 // Pre-parse preprocessing (e.g., Snowflake duplicate semicolon removal) 362 processTokensBeforeParse(context, tokens); 363 364 resultBuilder.sourceTokenList(tokens); 365 resultBuilder.tokenizationTimeMs(System.currentTimeMillis() - tokenStart); 366 367 // Success 368 resultBuilder.errorCode(0); 369 resultBuilder.errorMessage(""); 370 371 } catch 
(Exception e) { 372 // Error occurred 373 resultBuilder.errorCode(1); 374 String errorMsg = "Tokenization failed: " + e.getMessage(); 375 resultBuilder.errorMessage(errorMsg); 376 377 // Log error if enabled 378 if (context.isDumpResolverLog()) { 379 e.printStackTrace(); 380 } 381 } 382 383 return resultBuilder.build(); 384 } 385 386 /** 387 * Template method for extracting raw statements without full parsing. 388 * 389 * <p>This method performs tokenization and raw statement extraction, 390 * but skips the expensive full parsing and semantic analysis steps. 391 * 392 * <p><b>Algorithm:</b> 393 * <ol> 394 * <li>Tokenize SQL (via {@link #tokenize(ParserContext)})</li> 395 * <li>Extract raw statements (via {@link #extractRawStatements(ParserContext, TSourceTokenList, TCustomLexer, long)})</li> 396 * <li>Return result with tokens and raw statements</li> 397 * </ol> 398 * 399 * <p><b>Equivalent to legacy API:</b> {@code TGSqlParser.getrawsqlstatements()} 400 * 401 * @param context immutable context with all inputs 402 * @return immutable result with tokens and raw statements (no AST) 403 */ 404 @Override 405 public final SqlParseResult getrawsqlstatements(ParserContext context) { 406 try { 407 // Step 1: Tokenize with all post-processing (calls tokenize()) 408 SqlParseResult tokenizeResult = tokenize(context); 409 410 // Check tokenization result 411 if (tokenizeResult.getErrorCode() != 0) { 412 return tokenizeResult; 413 } 414 415 // Get tokens and lexer from tokenize result 416 TSourceTokenList tokens = tokenizeResult.getSourceTokenList(); 417 TCustomLexer lexer = tokenizeResult.getLexer(); 418 long tokenizationTimeMs = tokenizeResult.getTokenizationTimeMs(); 419 420 // Step 2: Extract raw statements (vendor-specific) 421 // Vendor implementation creates builder, populates it, and returns complete result 422 SqlParseResult extractResult = extractRawStatements(context, tokens, lexer, tokenizationTimeMs); 423 424 return extractResult; 425 } catch (Exception e) { 426 
e.printStackTrace(); 427 SqlParseResult.Builder resultBuilder = new SqlParseResult.Builder(); 428 resultBuilder.errorCode(1); 429 resultBuilder.errorMessage("Raw statement extraction failed: " + e.getMessage() ); 430 if (context.isDumpResolverLog()) { 431 e.printStackTrace(); 432 } 433 return resultBuilder.build(); 434 } 435 } 436 437 438 439 // ========== Abstract Methods (MUST be implemented by subclasses) ========== 440 441 /** 442 * Get the lexer for this vendor. 443 * 444 * <p><b>Subclass Responsibility:</b> Return vendor-specific lexer instance. 445 * The lexer may be created fresh or cached/reused for performance. 446 * 447 * <p><b>Example:</b> 448 * <pre> 449 * protected TCustomLexer getLexer(ParserContext context) { 450 * TLexerOracle lexer = new TLexerOracle(); 451 * lexer.delimiterchar = delimiterChar; 452 * lexer.defaultDelimiterStr = defaultDelimiterStr; 453 * return lexer; 454 * } 455 * </pre> 456 * 457 * @param context the parser context 458 * @return configured lexer instance (never null) 459 */ 460 protected abstract TCustomLexer getLexer(ParserContext context); 461 462 /** 463 * Get the main parser for this vendor. 464 * 465 * <p><b>Subclass Responsibility:</b> Return vendor-specific parser instance. 466 * The parser may be created fresh or cached/reused for performance. 467 * If reusing, the token list should be updated. 468 * 469 * <p><b>Example:</b> 470 * <pre> 471 * protected TCustomParser getParser(ParserContext context, TSourceTokenList tokens) { 472 * TParserOracleSql parser = new TParserOracleSql(tokens); 473 * parser.lexer = getLexer(context); 474 * return parser; 475 * } 476 * </pre> 477 * 478 * @param context the parser context 479 * @param tokens the source token list 480 * @return configured parser instance (never null) 481 */ 482 protected abstract TCustomParser getParser(ParserContext context, TSourceTokenList tokens); 483 484 /** 485 * Perform tokenization using vendor-specific lexer. 
     *
     * <p><b>Template Method:</b> This method implements the common tokenization
     * algorithm across all database vendors. Subclasses customize through one hook:
     * {@link #tokenizeVendorSql()} - Call vendor-specific tokenization logic
     *
     * <p><b>Algorithm:</b>
     * <ol>
     * <li>Store parser context</li>
     * <li>Prepare SQL reader (file/string with charset detection)</li>
     * <li>Configure lexer with input reader and charset</li>
     * <li>Reset lexer state</li>
     * <li>Clear token list and reset position</li>
     * <li>Reset token table cache</li>
     * <li>Call {@link #tokenizeVendorSql()} hook</li>
     * <li>Return populated token list</li>
     * </ol>
     *
     * @param context parser context with SQL input configuration
     * @param lexer the lexer instance (same object as the {@code lexer} field set by the subclass)
     * @return token list populated by vendor-specific tokenization
     * @throws RuntimeException if tokenization fails (original exception preserved as cause)
     */
    protected TSourceTokenList performTokenization(ParserContext context, TCustomLexer lexer) {
        this.parserContext = context;

        // Set token handle from context if provided (allows TGSqlParser.setTokenHandle() to work)
        if (context.getTokenHandle() != null) {
            this.tokenHandle = context.getTokenHandle();
        }

        try {
            // Reader preparation handles file/string input and charset detection.
            PreparedSqlReader prepared = prepareSqlReader(context);
            BufferedReader finputstream = prepared.getReader();
            String effectiveCharset = prepared.getCharset();

            // Configure lexer with input (lexer is vendor-specific flexer from subclass)
            lexer.yyinput = finputstream;
            if (effectiveCharset != null && !effectiveCharset.isEmpty()) {
                lexer.setSqlCharset(effectiveCharset);
            }
            lexer.reset();

            // Reset token list (reused container; curpos rewound to before-first)
            this.sourcetokenlist.clear();
            this.sourcetokenlist.curpos = -1;

            // Reset token table cache
            lexer.resetTokenTable();

            // HOOK: Call vendor-specific tokenization (populates this.sourcetokenlist)
            tokenizeVendorSql();

            return this.sourcetokenlist;

        } catch (Exception e) {
            throw new RuntimeException("Tokenization failed: " + e.getMessage(), e);
        }
    }

    /**
     * Call vendor-specific tokenization logic.
     *
     * <p><b>Hook Method:</b> Called by {@link #performTokenization} to execute
     * vendor-specific SQL-to-token conversion logic.
     *
     * <p><b>Subclass Responsibility:</b> Call the vendor-specific tokenization method
     * (e.g., dooraclesqltexttotokenlist, domssqlsqltexttotokenlist) which reads
     * from lexer and populates sourcetokenlist.
     *
     * <p><b>Example (Oracle):</b>
     * <pre>
     * protected void tokenizeVendorSql() {
     *     dooraclesqltexttotokenlist();
     * }
     * </pre>
     *
     * <p><b>Example (MSSQL):</b>
     * <pre>
     * protected void tokenizeVendorSql() {
     *     domssqlsqltexttotokenlist();
     * }
     * </pre>
     *
     * <p><b>Example (PostgreSQL):</b>
     * <pre>
     * protected void tokenizeVendorSql() {
     *     dopostgresqltexttotokenlist();
     * }
     * </pre>
     */
    protected abstract void tokenizeVendorSql();

    /**
     * Extract raw statements without full parsing (public API).
     *
     * <p>This public method allows external callers (like TGSqlParser) to extract
     * raw statements from an already-tokenized source list without re-tokenization.
     *
     * @param context the parser context
     * @param tokens the source token list (already tokenized)
     * @return statement list (never null)
     * @since 3.2.0.0
     */
    public final TStatementList doExtractRawStatements(ParserContext context, TSourceTokenList tokens) {
        // Create a dummy lexer since we already have tokens
        TCustomLexer lexer = getLexer(context);

        // Call vendor-specific extraction and extract statement list from result
        // (tokenization time is 0 because no tokenization happens here)
        SqlParseResult result = extractRawStatements(context, tokens, lexer, 0);
        return result.getSqlStatements() != null ? result.getSqlStatements() : new TStatementList();
    }

    /**
     * Extract raw statements without full parsing.
     *
     * <p><b>Template Method:</b> This method implements the common algorithm for
     * extracting raw statements across all database vendors. Subclasses customize
     * the process through two hook methods:
     * <ul>
     * <li>{@link #setupVendorParsersForExtraction()} - Initialize vendor parsers</li>
     * <li>{@link #extractVendorRawStatements(SqlParseResult.Builder)} - Call vendor extraction logic</li>
     * </ul>
     *
     * <p><b>Algorithm:</b>
     * <ol>
     * <li>Create SqlParseResult.Builder</li>
     * <li>Set common fields (lexer, tokens, tokenization time)</li>
     * <li>Store context and tokens for extraction</li>
     * <li>Initialize SQL command resolver</li>
     * <li>Call {@link #setupVendorParsersForExtraction()} hook</li>
     * <li>Time the extraction</li>
     * <li>Call {@link #extractVendorRawStatements(SqlParseResult.Builder)} hook</li>
     * <li>Set parsing time</li>
     * <li>Build and return result</li>
     * </ol>
     *
     * @param context the parser context
     * @param tokens the source token list
     * @param lexer the lexer instance (for including in result)
     * @param tokenizationTimeMs tokenization time from tokenize() step
     * @return complete SqlParseResult with raw statements and metadata
     */
    protected SqlParseResult extractRawStatements(ParserContext context,
                                                  TSourceTokenList tokens,
                                                  TCustomLexer lexer,
                                                  long tokenizationTimeMs) {
        // Create builder for result construction
        SqlParseResult.Builder builder = new SqlParseResult.Builder();

        // Set common result fields
        builder.lexer(lexer);
        builder.sourceTokenList(tokens);
        builder.tokenizationTimeMs(tokenizationTimeMs);

        // CRITICAL: Include parser(s) in result so TGSqlParser can use them in common parsing loop
        TCustomParser parser = getParser(context, tokens);
        builder.parser(parser);

        // Include secondary parser for vendors that have one (e.g., Oracle PL/SQL parser)
        TCustomParser secondaryParser = getSecondaryParser(context, tokens);
        if (secondaryParser != null) {
            builder.secondaryParser(secondaryParser);
        }

        // Store context and tokens for extraction
        this.sourcetokenlist = tokens;
        if (this.sqlstatements == null) {
            this.sqlstatements = new TStatementList();
        } else {
            this.sqlstatements.clear();
        }
        this.syntaxErrors.clear(); // Clear syntax errors from previous extraction
        this.parserContext = context;

        // Initialize SQL command resolver (if not already done)
        if (this.sqlcmds == null) {
            this.sqlcmds = SqlCmdsFactory.get(vendor);
        }

        // HOOK 1: Vendor-specific parser setup (sqlcmds injection, token list update)
        setupVendorParsersForExtraction();

        // Time the extraction
        long extractStart = System.currentTimeMillis();

        // HOOK 2: Call vendor-specific raw statement extraction
        extractVendorRawStatements(builder);

        builder.parsingTimeMs(System.currentTimeMillis() - extractStart);

        // Add extracted statements to result
        builder.sqlStatements(this.sqlstatements);

        // Add collected syntax errors to result (copied if the field is not already an ArrayList)
        if (!syntaxErrors.isEmpty()) {
            builder.syntaxErrors(syntaxErrors instanceof ArrayList ?
                    (ArrayList<TSyntaxError>) syntaxErrors : new ArrayList<>(syntaxErrors));
        }

        return builder.build();
    }

    /**
     * Setup vendor-specific parsers for raw statement extraction.
     *
     * <p><b>Hook Method:</b> Called by {@link #extractRawStatements} after initializing
     * sqlcmds but before calling the vendor-specific extraction logic.
     *
     * <p><b>Subclass Responsibility:</b> Inject sqlcmds into vendor parser(s) and
     * update their token lists.
 Examples:
     * <ul>
     * <li><b>Single parser (MSSQL):</b> Inject into fparser only</li>
     * <li><b>Dual parsers (Oracle):</b> Inject into both fparser and fplsqlparser</li>
     * </ul>
     *
     * <p><b>Example (MSSQL):</b>
     * <pre>
     * protected void setupVendorParsersForExtraction() {
     *     this.fparser.sqlcmds = this.sqlcmds;
     *     this.fparser.sourcetokenlist = this.sourcetokenlist;
     * }
     * </pre>
     *
     * <p><b>Example (Oracle with dual parsers):</b>
     * <pre>
     * protected void setupVendorParsersForExtraction() {
     *     this.fparser.sqlcmds = this.sqlcmds;
     *     this.fplsqlparser.sqlcmds = this.sqlcmds;
     *     this.fparser.sourcetokenlist = this.sourcetokenlist;
     *     this.fplsqlparser.sourcetokenlist = this.sourcetokenlist;
     * }
     * </pre>
     */
    protected abstract void setupVendorParsersForExtraction();

    /**
     * Call vendor-specific raw statement extraction logic.
     *
     * <p><b>Hook Method:</b> Called by {@link #extractRawStatements} to execute
     * the vendor-specific logic for identifying statement boundaries.
     *
     * <p><b>Subclass Responsibility:</b> Call the vendor-specific extraction method
     * (e.g., dooraclegetrawsqlstatements, domssqlgetrawsqlstatements) passing the
     * builder. The extraction method will populate the builder with raw statements.
     *
     * <p><b>Example (Oracle):</b>
     * <pre>
     * protected void extractVendorRawStatements(SqlParseResult.Builder builder) {
     *     dooraclegetrawsqlstatements(builder);
     * }
     * </pre>
     *
     * <p><b>Example (MSSQL):</b>
     * <pre>
     * protected void extractVendorRawStatements(SqlParseResult.Builder builder) {
     *     domssqlgetrawsqlstatements(builder);
     * }
     * </pre>
     *
     * @param builder the result builder to populate with raw statements
     */
    protected abstract void extractVendorRawStatements(SqlParseResult.Builder builder);

    /**
     * Perform actual parsing with syntax checking.
     *
     * <p><b>Subclass Responsibility:</b> Parse SQL using vendor-specific parser
     * and optional secondary parser (e.g., PL/SQL for Oracle).
     *
     * <p><b>Important:</b> This method receives raw statements that have already been
     * extracted by {@link #getrawsqlstatements(ParserContext)}. Subclasses should NOT
     * re-extract statements - just parse each statement to build the AST.
     *
     * <p><b>Example:</b>
     * <pre>
     * protected TStatementList performParsing(ParserContext context,
     *                                         TCustomParser parser,
     *                                         TCustomParser secondaryParser,
     *                                         TSourceTokenList tokens,
     *                                         TStatementList rawStatements) {
     *     // Use the passed-in rawStatements (DO NOT re-extract!)
     *     for (int i = 0; i &lt; rawStatements.size(); i++) {
     *         TCustomSqlStatement stmt = rawStatements.get(i);
     *         stmt.parsestatement(...); // Build AST for each statement
     *     }
     *     return rawStatements;
     * }
     * </pre>
     *
     * @param context the parser context
     * @param parser the main parser instance
     * @param secondaryParser secondary parser (may be null)
     * @param tokens the source token list
     * @param rawStatements raw statements already extracted (never null)
     * @return statement list with parsed AST (never null)
     */
    protected abstract TStatementList performParsing(ParserContext context,
                                                     TCustomParser parser,
                                                     TCustomParser secondaryParser,
                                                     TSourceTokenList tokens,
                                                     TStatementList rawStatements);

    // ========== Hook Methods (MAY be overridden by subclasses) ==========

    /**
     * Get secondary parser (e.g., PL/SQL for Oracle).
     *
     * <p><b>Hook Method:</b> Default implementation returns null.
     * Override if vendor needs a secondary parser.
     * The parser may be created fresh or cached/reused for performance.
796 * 797 * <p><b>Example (Oracle):</b> 798 * <pre> 799 * protected TCustomParser getSecondaryParser(ParserContext context, TSourceTokenList tokens) { 800 * TParserOraclePLSql plsqlParser = new TParserOraclePLSql(tokens); 801 * plsqlParser.lexer = getLexer(context); 802 * return plsqlParser; 803 * } 804 * </pre> 805 * 806 * @param context the parser context 807 * @param tokens the source token list 808 * @return secondary parser instance, or null if not needed 809 */ 810 protected TCustomParser getSecondaryParser(ParserContext context, TSourceTokenList tokens) { 811 return null; // Most vendors don't need this 812 } 813 814 /** 815 * Post-tokenization normalization. 816 * <p> 817 * Handles matching parentheses wrapping around SQL and marks semicolons 818 * before closing parens to be ignored. 819 * <p> 820 * Extracted from: TGSqlParser.doAfterTokenize() (lines 5123-5161) 821 * 822 * @param tokens the source token list (mutable) 823 */ 824 protected void doAfterTokenize(TSourceTokenList tokens) { 825 int leftParenCount = 0; 826 int rightParenCount = 0; 827 int leftIndex = 0; 828 int rightIndex = tokens.size() - 1; 829 830 // Count opening parentheses at the beginning 831 while (leftIndex < tokens.size() && tokens.get(leftIndex).tokencode == '(') { 832 leftParenCount++; 833 leftIndex++; 834 } 835 836 // Count closing parentheses at the end 837 while (rightIndex >= 0 && tokens.get(rightIndex).tokencode == ')') { 838 rightParenCount++; 839 rightIndex--; 840 } 841 842 // Set matching parentheses to be ignored 843 int parensToIgnore = Math.min(leftParenCount, rightParenCount); 844 // if there is a semicolon before the right parenthesis, set the semicolon to be ignored 845 // mantisbt/view.php?id=3690 846 847 if ((parensToIgnore > 0) && (tokens.get(tokens.size() - 1 - (parensToIgnore - 1) - 1).tokencode == ';')){ 848 // set to whitespace that this semicolon will be ignored during getting raw sql 849 tokens.get(tokens.size() - 1 - (parensToIgnore - 1) - 1).tokentype = 
ETokenType.ttwhitespace; 850 // set to ignore by yacc that this semicolon will be ignored during parsing 851 tokens.get(tokens.size() - 1 - (parensToIgnore - 1) - 1).tokenstatus = ETokenStatus.tsignorebyyacc; 852 } 853 } 854 855 /** 856 * Process tokens using token table (vendor-specific token code adjustments). 857 * <p> 858 * Currently handles BigQuery and Snowflake to convert DO keywords to identifiers 859 * when there's no corresponding WHILE/FOR. 860 * <p> 861 * Extracted from: TGSqlParser.processTokensInTokenTable() (lines 5186-5209) 862 * 863 * @param context the parser context 864 * @param lexer the lexer (for accessing TOKEN_TABLE) 865 * @param tokens the source token list (mutable) 866 */ 867 protected void processTokensInTokenTable(ParserContext context, TCustomLexer lexer, TSourceTokenList tokens) { 868 // Get token table from lexer 869 long[][] TOKEN_TABLE1 = lexer.TOKEN_TABLE; 870 871 switch (vendor){ 872 case dbvbigquery: 873 case dbvsnowflake: 874 // case 1, DO keyword: if no corresponding FOR, WHILE etc keywords found, 875 // set DO keyword's token code to TBaseType.ident 876 if (TOKEN_TABLE1[TBaseType.rrw_do][0] > 0){ 877 if ((TOKEN_TABLE1[TBaseType.rrw_while][0] == 0) && (TOKEN_TABLE1[TBaseType.rrw_for][0] == 0)){ 878 for(int i=0; i<tokens.size(); i++){ 879 TSourceToken st = tokens.get(i); 880 if (st.tokencode == TBaseType.rrw_do){ 881 st.tokencode = TBaseType.ident; 882 } 883 } 884 } 885 } 886 break; 887 } 888 } 889 890 /** 891 * Process tokens before parsing (vendor-specific adjustments). 892 * 893 * <p><b>Hook Method:</b> Default implementation handles Snowflake consecutive semicolons. 894 * Override if vendor needs additional token preprocessing. 
895 * 896 * <p>Extracted from: TGSqlParser.processTokensBeforeParse() (lines 5165-5184) 897 * 898 * <p><b>Example:</b> 899 * <pre> 900 * protected void processTokensBeforeParse(ParserContext context, TSourceTokenList tokens) { 901 * super.processTokensBeforeParse(context, tokens); // Call base implementation 902 * // Add vendor-specific processing... 903 * } 904 * </pre> 905 * 906 * @param context the parser context 907 * @param tokens the source token list (mutable) 908 */ 909 protected void processTokensBeforeParse(ParserContext context, TSourceTokenList tokens) { 910 // For performance, only process for Snowflake as this is currently only needed there 911 // mantisbt/view.php?id=3579 912 if (vendor != EDbVendor.dbvsnowflake) return; 913 914 // If there are consecutive semicolon tokens, mark the second semicolon token as deleted 915 for(int i=0; i<tokens.size(); i++){ 916 TSourceToken st = tokens.get(i); 917 if (st.tokencode == ';'){ 918 TSourceToken nextToken = st.nextSolidToken(); 919 if (nextToken != null){ 920 if (nextToken.tokencode == ';'){ 921 nextToken.tokenstatus = ETokenStatus.tsdeleted; 922 } 923 } 924 } 925 } 926 } 927 928 /** 929 * Perform semantic analysis on parsed statements. 930 * 931 * <p><b>Hook Method:</b> Default implementation does nothing. 932 * Override to provide vendor-specific semantic analysis. 933 * 934 * <p><b>Typical Implementation:</b> 935 * <ul> 936 * <li>Column-to-table resolution (TSQLResolver)</li> 937 * <li>Dataflow analysis</li> 938 * <li>Reference resolution</li> 939 * <li>Scope resolution</li> 940 * </ul> 941 * 942 * @param context the parser context 943 * @param statements the parsed statements (mutable) 944 */ 945 protected void performSemanticAnalysis(ParserContext context, TStatementList statements) { 946 // Default implementation: no semantic analysis 947 // Subclasses can override for vendor-specific behavior 948 } 949 950 /** 951 * Perform interpretation/evaluation on parsed statements. 
952 * 953 * <p><b>Hook Method:</b> Default implementation does nothing. 954 * Override to provide AST interpretation/evaluation. 955 * 956 * <p><b>Typical Implementation:</b> 957 * <ul> 958 * <li>Execute simple SQL statements</li> 959 * <li>Evaluate expressions</li> 960 * <li>Constant folding</li> 961 * <li>Static analysis</li> 962 * </ul> 963 * 964 * @param context the parser context 965 * @param statements the parsed statements (mutable) 966 */ 967 protected void performInterpreter(ParserContext context, TStatementList statements) { 968 // Default implementation: no interpreter 969 // Subclasses can override to provide AST interpretation 970 } 971 972 /** 973 * Copy error messages from a statement to the parser's error collection. 974 * 975 * <p>This method should be called by performParsing implementations 976 * when a statement has syntax errors. 977 * 978 * @param statement the statement with errors 979 */ 980 protected void copyErrorsFromStatement(TCustomSqlStatement statement) { 981 if (statement == null || statement.getSyntaxErrors() == null) { 982 return; 983 } 984 985 for (int i = 0; i < statement.getSyntaxErrors().size(); i++) { 986 this.syntaxErrors.add(new TSyntaxError((TSyntaxError) statement.getSyntaxErrors().get(i))); 987 } 988 } 989 990 /** 991 * Attempt error recovery for CREATE TABLE/INDEX statements with unsupported options. 992 * 993 * <p>When parsing CREATE TABLE or CREATE INDEX statements, the parser may encounter 994 * vendor-specific options that are not in the grammar. This method implements the 995 * legacy error recovery behavior by marking unsupported tokens after the main 996 * definition as SQL*Plus commands (effectively ignoring them). 
     *
     * <p><b>Recovery Strategy:</b>
     * <ol>
     *   <li>Find the closing ')' of the column/index definitions (nested=0)</li>
     *   <li>Mark all remaining tokens (except ';') as sqlpluscmd to ignore them</li>
     *   <li>Clear errors and re-parse the statement</li>
     * </ol>
     *
     * <p><b>When to call:</b> After parsing a statement that has errors.
     * Only recovers if ENABLE_ERROR_RECOVER_IN_CREATE_TABLE is true.
     *
     * @param statement the statement to attempt recovery on
     * @param parseResult the result code from parsing (0 = success)
     * @param onlyNeedRawParseTree whether only raw parse tree is needed
     * @return new parse result after recovery attempt, or original if no recovery
     */
    protected int attemptErrorRecovery(TCustomSqlStatement statement, int parseResult, boolean onlyNeedRawParseTree) {
        boolean doRecover = TBaseType.ENABLE_ERROR_RECOVER_IN_CREATE_TABLE;

        // Recovery is attempted only when the previous parse failed or reported errors.
        if (doRecover && ((parseResult != 0) || (statement.getErrorCount() > 0))) {
            // Limited to CREATE TABLE, and CREATE INDEX for all vendors except Couchbase,
            // and only when strict CREATE TABLE parsing is disabled.
            if (((statement.sqlstatementtype == ESqlStatementType.sstcreatetable)
                    || ((statement.sqlstatementtype == ESqlStatementType.sstcreateindex) && (this.vendor != EDbVendor.dbvcouchbase))
                ) && (!TBaseType.c_createTableStrictParsing)
            ) {
                // Only parse main body of create table/index, ignore unsupported options after closing ')'
                int nested = 0;                       // parenthesis nesting depth
                boolean isIgnore = false;             // true once the main definition's closing ')' was seen
                boolean isFoundIgnoreToken = false;   // true if any solid token follows the definition
                TSourceToken firstIgnoreToken = null; // first solid token after the definition (for Oracle check)

                for (int k = 0; k < statement.sourcetokenlist.size(); k++) {
                    TSourceToken st = statement.sourcetokenlist.get(k);
                    if (isIgnore) {
                        // Everything after the definition (except ';') is re-tagged as
                        // a SQL*Plus command so the grammar skips it on re-parse.
                        if (st.issolidtoken() && (st.tokencode != ';')) {
                            isFoundIgnoreToken = true;
                            if (firstIgnoreToken == null) {
                                firstIgnoreToken = st;
                            }
                        }
                        if (st.tokencode != ';') {
                            st.tokencode = TBaseType.sqlpluscmd;
                        }
                        continue;
                    }
                    if (st.tokencode == (int) ')') {
                        nested--;
                        if (nested == 0) {
                            // Check if next token is "AS ( SELECT" (table created from select);
                            // in that case the tail is real SQL and must NOT be ignored.
                            boolean isSelect = false;
                            TSourceToken st1 = st.searchToken(TBaseType.rrw_as, 1);
                            if (st1 != null) {
                                TSourceToken st2 = st.searchToken((int) '(', 2);
                                if (st2 != null) {
                                    TSourceToken st3 = st.searchToken(TBaseType.rrw_select, 3);
                                    isSelect = (st3 != null);
                                }
                            }
                            if (!isSelect) isIgnore = true;
                        }
                    }
                    if ((st.tokencode == (int) '(') || (st.tokencode == TBaseType.left_parenthesis_2)) {
                        nested++;
                    }
                }

                // For Oracle, validate that ignored tokens are valid table properties;
                // otherwise the tail is genuinely bad SQL and re-parsing won't help.
                if ((this.vendor == EDbVendor.dbvoracle) && (firstIgnoreToken != null)
                        && (!TBaseType.searchOracleTablePros(firstIgnoreToken.toString()))) {
                    // Not a valid Oracle table property, don't ignore
                    isFoundIgnoreToken = false;
                }

                if (isFoundIgnoreToken) {
                    // Drop the old errors and re-parse with the tail tokens masked out.
                    statement.clearError();
                    parseResult = statement.parsestatement(null, false, onlyNeedRawParseTree);
                }
            }
        }

        return parseResult;
    }

    /**
     * Get the syntax errors collected during parsing.
     *
     * @return list of syntax errors (never null)
     */
    public List<TSyntaxError> getSyntaxErrors() {
        return syntaxErrors;
    }

    /**
     * Get the count of syntax errors.
     *
     * @return number of syntax errors
     */
    public int getErrorCount() {
        return syntaxErrors.size();
    }

    /**
     * Check if a token is a dollar function delimiter ($$, $tag$, etc.) for PostgreSQL-family databases.
     * <p>
     * Migrated from TGSqlParser.isDollarFunctionDelimiter() (lines 5074-5080).
     * <p>
     * Dollar-quoted strings are used in PostgreSQL-family databases to delimit function bodies.
     * Each vendor has its own delimiter token code.
1104 * 1105 * @param tokencode the token code to check 1106 * @param dbVendor the database vendor 1107 * @return true if the token is a dollar function delimiter for the given vendor 1108 */ 1109 protected boolean isDollarFunctionDelimiter(int tokencode, EDbVendor dbVendor) { 1110 return ((tokencode == TBaseType.rrw_postgresql_function_delimiter) && (dbVendor == EDbVendor.dbvpostgresql)) 1111 || ((tokencode == TBaseType.rrw_greenplum_function_delimiter) && (dbVendor == EDbVendor.dbvgreenplum)) 1112 || ((tokencode == TBaseType.rrw_redshift_function_delimiter) && (dbVendor == EDbVendor.dbvredshift)) 1113 || ((tokencode == TBaseType.rrw_snowflake_function_delimiter) && (dbVendor == EDbVendor.dbvsnowflake)) 1114 || ((tokencode == TBaseType.rrw_clickhouse_function_delimiter) && (dbVendor == EDbVendor.dbvclickhouse)); 1115 } 1116 1117 /** 1118 * Hook method called when a raw statement is complete. 1119 * <p> 1120 * This method is called by vendor-specific raw statement extraction methods 1121 * (e.g., dooraclegetrawsqlstatements) when a statement boundary is detected. 1122 * It sets up the statement with parser references and adds it to the statement list. 
1123 * 1124 * @param context parser context 1125 * @param statement the completed statement 1126 * @param mainParser main parser instance 1127 * @param secondaryParser secondary parser instance (may be null) 1128 * @param statementList statement list to add to 1129 * @param isLastStatement true if this is the last statement 1130 * @param builder optional result builder (used during raw statement extraction, may be null) 1131 */ 1132 protected void onRawStatementComplete(ParserContext context, 1133 TCustomSqlStatement statement, 1134 TCustomParser mainParser, 1135 TCustomParser secondaryParser, 1136 TStatementList statementList, 1137 boolean isLastStatement, 1138 SqlParseResult.Builder builder) { 1139 if (statement == null || statementList == null) { 1140 return; 1141 } 1142 1143 // CRITICAL: Set gsqlparser reference NOW (before parsing) so nested statements 1144 // can access parser's dbvendor via getGsqlparser().getDbVendor() 1145 // This matches legacy behavior from doongetrawsqlstatementevent() 1146 if (context != null && context.getGsqlparser() != null) { 1147 // Cast to TGSqlParser - we know the type from buildContext() 1148 statement.setGsqlparser((gudusoft.gsqlparser.TGSqlParser) context.getGsqlparser()); 1149 } 1150 statement.parser = mainParser; 1151 statement.plsqlparser = secondaryParser; 1152 1153 if (statement.sourcetokenlist != null && statement.sourcetokenlist.size() > 0) { 1154 TSourceToken startToken = statement.sourcetokenlist.get(0); 1155 TSourceToken endToken = statement.sourcetokenlist.get(statement.sourcetokenlist.size() - 1); 1156 1157 statement.setStartToken(startToken); 1158 statement.setEndToken(endToken); 1159 1160 if (!isLastStatement && context != null && endToken != null) { 1161 builder.lastTokenOfStatementBeenValidated(endToken); 1162 } 1163 } 1164 1165 // Vendor-specific statement completion logic (migrated from TGSqlParser.doongetrawsqlstatementevent lines 5129-5178) 1166 onRawStatementCompleteVendorSpecific(statement); 1167 1168 
statementList.add(statement); 1169 1170 } 1171 1172 /** 1173 * Hook for vendor-specific logic when a raw statement is completed. 1174 * <p> 1175 * Migrated from TGSqlParser.doongetrawsqlstatementevent() (lines 5129-5178). 1176 * <p> 1177 * This method is called after basic statement setup but before adding to the statement list. 1178 * Subclasses can override to add vendor-specific token manipulations or metadata. 1179 * <p> 1180 * Default implementation handles PostgreSQL-family routine body processing. 1181 * 1182 * @param statement the completed statement 1183 */ 1184 protected void onRawStatementCompleteVendorSpecific(TCustomSqlStatement statement) { 1185 // Handle PostgreSQL-family databases: Mark non-SQL/PLSQL routine body tokens 1186 // Migrated from TGSqlParser.doongetrawsqlstatementevent() lines 5143-5178 1187 if (((this.vendor == EDbVendor.dbvpostgresql) || (this.vendor == EDbVendor.dbvgreenplum) 1188 || (this.vendor == EDbVendor.dbvredshift) || (this.vendor == EDbVendor.dbvsnowflake) 1189 || (this.vendor == EDbVendor.dbvclickhouse)) 1190 && (statement instanceof TRoutine)) { 1191 1192 TRoutine routine = (TRoutine) statement; 1193 if (!routine.isBodyInSQL()) { 1194 TSourceToken st; 1195 boolean inBody = false; 1196 String routineBodyStr = ""; 1197 1198 for (int i = 0; i < statement.sourcetokenlist.size(); i++) { 1199 st = statement.sourcetokenlist.get(i); 1200 1201 // Check for dollar function delimiter ($$, $tag$, etc.) 
1202 if (isDollarFunctionDelimiter(st.tokencode, this.vendor)) { 1203 if (!inBody) { 1204 inBody = true; 1205 routineBodyStr = st.toString(); 1206 } else { 1207 inBody = false; 1208 routineBodyStr += st.toString(); 1209 break; 1210 } 1211 continue; 1212 } 1213 1214 if (inBody) { 1215 // Mark body tokens as sqlpluscmd so they're not parsed as SQL 1216 st.tokencode = TBaseType.sqlpluscmd; 1217 routineBodyStr += st.toString(); 1218 } 1219 } 1220 1221 routine.setRoutineBody(routineBodyStr); 1222 } 1223 } 1224 } 1225 1226 private static final int ENCODING_UTF16 = 1; 1227 private static final int ENCODING_UTF32 = 2; 1228 private static final int ENCODING_UTF8_BOM = 3; 1229 1230 protected PreparedSqlReader prepareSqlReader(ParserContext context) throws IOException { 1231 BufferedReader reader; 1232 String effectiveCharset = context.getSqlCharset(); 1233 1234 if (context.getSqlText() != null) { 1235 reader = new BufferedReader(new StringReader(context.getSqlText())); 1236 return new PreparedSqlReader(reader, effectiveCharset); 1237 } 1238 1239 if (context.getSqlFilename() != null && !context.getSqlFilename().isEmpty()) { 1240 FileInputStream fileStream = new FileInputStream(context.getSqlFilename()); 1241 BufferedInputStream bufferedStream = new BufferedInputStream(fileStream, 8); 1242 int encodingType = detectEncodingFromBom(bufferedStream); 1243 String charsetToUse = resolveCharsetName(encodingType, context.getSqlCharset()); 1244 InputStreamReader streamReader = new InputStreamReader(bufferedStream, charsetToUse); 1245 reader = new BufferedReader(streamReader); 1246 skipBomIfPresent(reader, encodingType); 1247 return new PreparedSqlReader(reader, charsetToUse); 1248 } 1249 1250 InputStream contextStream = context.getSqlInputStream(); 1251 if (contextStream != null) { 1252 BufferedInputStream bufferedStream = (contextStream instanceof BufferedInputStream) 1253 ? 
(BufferedInputStream) contextStream 1254 : new BufferedInputStream(contextStream, 8); 1255 int encodingType = detectEncodingFromBom(bufferedStream); 1256 String charsetToUse = resolveCharsetName(encodingType, context.getSqlCharset()); 1257 InputStreamReader streamReader = new InputStreamReader(bufferedStream, charsetToUse); 1258 reader = new BufferedReader(streamReader); 1259 skipBomIfPresent(reader, encodingType); 1260 return new PreparedSqlReader(reader, charsetToUse); 1261 } 1262 1263 // Default: empty input is valid, return reader for empty string 1264 reader = new BufferedReader(new StringReader("")); 1265 return new PreparedSqlReader(reader, effectiveCharset); 1266 } 1267 1268 private int detectEncodingFromBom(BufferedInputStream stream) throws IOException { 1269 if (stream == null || !stream.markSupported()) { 1270 return 0; 1271 } 1272 1273 byte[] bom = new byte[4]; 1274 stream.mark(bom.length + 1); 1275 int read = stream.read(bom, 0, bom.length); 1276 stream.reset(); 1277 1278 if (read < 2) { 1279 return 0; 1280 } 1281 1282 if (((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) 1283 || ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF))) { 1284 if (read >= 4 && (((bom[2] == (byte) 0xFF) && (bom[3] == (byte) 0xFE)) 1285 || ((bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)))) { 1286 return ENCODING_UTF32; 1287 } 1288 return ENCODING_UTF16; 1289 } 1290 1291 if (read >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) { 1292 return ENCODING_UTF8_BOM; 1293 } 1294 1295 return 0; 1296 } 1297 1298 private String resolveCharsetName(int encodingType, String contextCharset) { 1299 switch (encodingType) { 1300 case ENCODING_UTF16: 1301 return "UTF-16"; 1302 case ENCODING_UTF32: 1303 return "UTF-32"; 1304 case ENCODING_UTF8_BOM: 1305 return "UTF-8"; 1306 default: 1307 if (contextCharset != null && !contextCharset.isEmpty()) { 1308 return contextCharset; 1309 } 1310 return Charset.defaultCharset().name(); 1311 } 1312 } 1313 1314 
private void skipBomIfPresent(BufferedReader reader, int encodingType) throws IOException { 1315 if (encodingType != ENCODING_UTF8_BOM || reader == null || !reader.markSupported()) { 1316 return; 1317 } 1318 1319 reader.mark(1); 1320 int ch = reader.read(); 1321 if (ch != 0xFEFF && ch != -1) { 1322 reader.reset(); 1323 } 1324 } 1325 1326 // ========== Utility Methods ========== 1327 1328 /** 1329 * Initialize global context and frame stack for statement parsing. 1330 * <p> 1331 * This method sets up the semantic analysis infrastructure required during 1332 * the parsing phase. It creates: 1333 * <ul> 1334 * <li>Global context (TContext) for semantic analysis</li> 1335 * <li>SQL environment (TSQLEnv) with vendor-specific configuration</li> 1336 * <li>Frame stack for scope management</li> 1337 * <li>Global scope frame as the outermost scope</li> 1338 * </ul> 1339 * 1340 * <p><b>When to call:</b> At the beginning of performParsing(), before parsing statements. 1341 * 1342 * <p><b>Cleanup required:</b> Must call {@code globalFrame.popMeFromStack(frameStack)} 1343 * after all statements are parsed to clean up the frame stack. 1344 * 1345 * <p><b>Extracted from:</b> Identical implementations in OracleSqlParser and MssqlSqlParser 1346 * to eliminate ~16 lines of duplicate code per parser. 
1347 */ 1348 protected void initializeGlobalContext() { 1349 // Initialize global context for semantic analysis 1350 this.globalContext = new TContext(); 1351 this.sqlEnv = new TSQLEnv(this.vendor) { 1352 @Override 1353 public void initSQLEnv() { 1354 // Vendor-specific initialization can be added by subclasses if needed 1355 } 1356 }; 1357 this.globalContext.setSqlEnv(this.sqlEnv, this.sqlstatements); 1358 1359 // Create global scope frame 1360 this.frameStack = new java.util.Stack<TFrame>(); 1361 TGlobalScope globalScope = new TGlobalScope(); 1362 globalScope.resetCurrentStmtIndex(); 1363 globalScope.setSqlEnv(this.sqlEnv); 1364 this.globalFrame = new TFrame(globalScope); 1365 this.globalFrame.pushMeToStack(this.frameStack); 1366 } 1367 1368 /** 1369 * Handle exceptions that occur during individual statement parsing. 1370 * <p> 1371 * This method provides robust error handling that allows parsing to continue 1372 * even when individual statements throw exceptions. It: 1373 * <ul> 1374 * <li>Creates a detailed {@link TSyntaxError} with exception information</li> 1375 * <li>Captures statement location (line, column) from first token</li> 1376 * <li>Includes statement number, exception type, and message</li> 1377 * <li>Optionally logs full stack trace if debugging is enabled</li> 1378 * <li>Adds error to {@link #syntaxErrors} list for user feedback</li> 1379 * </ul> 1380 * 1381 * <p><b>Benefits:</b> 1382 * <ul> 1383 * <li>Parsing continues for remaining statements after exception</li> 1384 * <li>Users get complete error feedback for all statements</li> 1385 * <li>Developers get stack traces for debugging parser issues</li> 1386 * </ul> 1387 * 1388 * <p><b>Example error message:</b><br> 1389 * {@code "Exception during parsing statement 3: NullPointerException - Cannot invoke..."} 1390 * 1391 * <p><b>Extracted from:</b> Identical implementations in OracleSqlParser and MssqlSqlParser 1392 * to eliminate ~51 lines of duplicate code per parser. 
1393 * 1394 * @param stmt the statement that failed to parse 1395 * @param statementIndex 0-based index of the statement in the statement list 1396 * @param ex the exception that was thrown during parsing 1397 */ 1398 protected void handleStatementParsingException(TCustomSqlStatement stmt, int statementIndex, Exception ex) { 1399 // Create user-friendly error message with context 1400 String errorMsg = String.format("Exception during parsing statement %d: %s - %s", 1401 statementIndex + 1, // Convert to 1-based for user readability 1402 ex.getClass().getSimpleName(), 1403 ex.getMessage() != null ? ex.getMessage() : "No details"); 1404 1405 // Get first token of statement for error location 1406 TSourceToken firstToken = null; 1407 if (stmt.sourcetokenlist != null && stmt.sourcetokenlist.size() > 0) { 1408 firstToken = stmt.sourcetokenlist.get(0); 1409 } 1410 1411 // Create syntax error with exception details 1412 TSyntaxError syntaxError; 1413 if (firstToken != null) { 1414 // Use token location for accurate error reporting 1415 syntaxError = new TSyntaxError( 1416 firstToken.getAstext(), 1417 firstToken.lineNo, 1418 firstToken.columnNo, 1419 errorMsg, 1420 EErrorType.sperror, 1421 TBaseType.MSG_ERROR_SYNTAX_ERROR, 1422 stmt, 1423 firstToken.posinlist 1424 ); 1425 } else { 1426 // Fallback if no token info available 1427 syntaxError = new TSyntaxError( 1428 "", 1429 0, 1430 0, 1431 errorMsg, 1432 EErrorType.sperror, 1433 TBaseType.MSG_ERROR_SYNTAX_ERROR, 1434 stmt, 1435 -1 1436 ); 1437 } 1438 1439 this.syntaxErrors.add(syntaxError); 1440 1441 // Log to console if debugging enabled 1442 if (TBaseType.DUMP_RESOLVER_LOG_TO_CONSOLE) { 1443 System.err.println("ERROR: " + errorMsg); 1444 ex.printStackTrace(); 1445 } 1446 } 1447 1448 /** 1449 * Hook method for vendor-specific post-processing after a statement is parsed. 1450 * <p> 1451 * This method is called after each statement is successfully parsed but before 1452 * error recovery and error collection. 
Subclasses can override this to perform 1453 * vendor-specific operations such as: 1454 * <ul> 1455 * <li>Checking for vendor-specific syntax errors in nested statements</li> 1456 * <li>Validating vendor-specific constraints</li> 1457 * <li>Collecting vendor-specific metadata</li> 1458 * </ul> 1459 * 1460 * <p><b>Default implementation:</b> Does nothing (no-op). 1461 * 1462 * <p><b>Example override (Oracle):</b><br> 1463 * <pre>{@code 1464 * @Override 1465 * protected void afterStatementParsed(TCustomSqlStatement stmt) { 1466 * if (stmt.isoracleplsql()) { 1467 * findAllSyntaxErrorsInPlsql(stmt); 1468 * } 1469 * } 1470 * }</pre> 1471 * 1472 * <p><b>When called:</b> After {@code stmt.parsestatement()} succeeds, 1473 * before {@code handleCreateTableErrorRecovery()} and {@code copyErrorsFromStatement()}. 1474 * 1475 * @param stmt the statement that was just parsed 1476 */ 1477 protected void afterStatementParsed(TCustomSqlStatement stmt) { 1478 // Default: no additional processing 1479 // Subclasses override to add vendor-specific post-processing 1480 } 1481 1482 /** 1483 * Get next source token from the lexer. 1484 * <p> 1485 * This method wraps the lexer's yylexwrap() call and performs several important tasks: 1486 * <ul> 1487 * <li>Fetches the next raw token from the lexer</li> 1488 * <li>Combines consecutive whitespace/newline tokens for cleaner token stream</li> 1489 * <li>Sets token metadata (vendor, status, container, position in list)</li> 1490 * <li>Optionally calls token handler callback</li> 1491 * </ul> 1492 * 1493 * <p><b>Token Consolidation Rules:</b> 1494 * <ul> 1495 * <li>Whitespace after a newline is merged into the newline token</li> 1496 * <li>Consecutive newlines are merged into a single newline token</li> 1497 * </ul> 1498 * 1499 * <p><b>Implementation Note:</b> 1500 * This method is extracted from TGSqlParser.getanewsourcetoken() and made 1501 * available to all database-specific parsers to avoid code duplication. 
     *
     * @return next source token, or null if end of input
     */
    protected TSourceToken getanewsourcetoken() {
        TSourceToken pst = null, prevst;

        while (true) {
            pst = new TSourceToken("");
            if (lexer.yylexwrap(pst) == 0) {
                // Lexer exhausted: signal end of input with null
                pst = null;
                break;
            }

            pst.setDbvendor(vendor);
            pst.tokenstatus = ETokenStatus.tsoriginal;

            // Normalize line-break text (currently a pass-through, see towinlinebreak)
            if (pst.tokentype == ETokenType.ttreturn) {
                pst.setAstext(towinlinebreak(pst.getAstext()));
            }

            // Combine space & linebreak after a linebreak into one
            if ((pst.tokentype == ETokenType.ttwhitespace)
                    && (sourcetokenlist.curpos >= 0)) {
                prevst = sourcetokenlist.get(sourcetokenlist.curpos);
                if (prevst.tokentype == ETokenType.ttreturn) {
                    // Can't discard whitespace after linebreak, it will be used
                    // to judge whether / at the beginning of the line is a sqlplus cmd or not
                    // check isValidPlaceForDivToSqlplusCmd for more
                    prevst.setAstext(prevst.getAstext() + pst.getAstext());
                    continue;  // merged into previous token; fetch the next one
                }
            }

            // Combine consecutive newlines
            if ((pst.tokentype == ETokenType.ttreturn)
                    && (sourcetokenlist.curpos >= 0)) {
                prevst = sourcetokenlist.get(sourcetokenlist.curpos);

                if (prevst.tokentype == ETokenType.ttreturn) {
                    prevst.setAstext(prevst.getAstext() + pst.getAstext());
                    continue;  // merged into previous token; fetch the next one
                }

                // Note: The original code has a commented section about merging
                // whitespace with newline. We're preserving the behavior here
                // which does NOT merge preceding whitespace with newline.
            }

            break;
        }

        if (pst != null) {
            // Register the token in the shared list and record its position
            pst.container = sourcetokenlist;
            sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
            pst.posinlist = sourcetokenlist.curpos;

            // Optional token handler callback
            if (tokenHandle != null) {
                tokenHandle.processToken(pst);
            }
        }

        // NOTE(review): called even when pst is null at end of input — presumably
        // setTokenTableValue tolerates null; confirm before relying on it.
        lexer.setTokenTableValue(pst);
        return pst;
    }

    /**
     * Convert line breaks to Windows format.
     * <p>
     * Currently returns the input unchanged. This method exists for compatibility
     * with the original TGSqlParser implementation.
     *
     * @param s Input string
     * @return String with Windows line breaks (currently unchanged)
     */
    protected String towinlinebreak(String s) {
        return s;
        // if (s == null) return null;
        // return s.replace("\n", "\r\n");
    }

    /**
     * Get the delimiter character for this vendor.
     *
     * @return delimiter character (e.g., ';', '/', '$')
     */
    public char getDelimiterChar() {
        return delimiterChar;
    }

    /**
     * Get the default delimiter string for this vendor.
     *
     * @return default delimiter string
     */
    public String getDefaultDelimiterStr() {
        return defaultDelimiterStr;
    }

    @Override
    public String toString() {
        return getClass().getSimpleName() + "{vendor=" + vendor + "}";
    }
}