package gudusoft.gsqlparser.parser;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.TBaseType;
import gudusoft.gsqlparser.TCustomLexer;
import gudusoft.gsqlparser.TCustomParser;
import gudusoft.gsqlparser.TCustomSqlStatement;
import gudusoft.gsqlparser.TLexerImpala;
import gudusoft.gsqlparser.TParserImpala;
import gudusoft.gsqlparser.TSourceToken;
import gudusoft.gsqlparser.TSourceTokenList;
import gudusoft.gsqlparser.TStatementList;
import gudusoft.gsqlparser.TSyntaxError;
import gudusoft.gsqlparser.EFindSqlStateType;
import gudusoft.gsqlparser.ETokenType;
import gudusoft.gsqlparser.ETokenStatus;
import gudusoft.gsqlparser.ESqlStatementType;
import gudusoft.gsqlparser.EErrorType;
import gudusoft.gsqlparser.stmt.TUnknownSqlStatement;
import gudusoft.gsqlparser.sqlcmds.ISqlCmds;
import gudusoft.gsqlparser.sqlcmds.SqlCmdsFactory;
import gudusoft.gsqlparser.compiler.TContext;
import gudusoft.gsqlparser.sqlenv.TSQLEnv;
import gudusoft.gsqlparser.compiler.TGlobalScope;
import gudusoft.gsqlparser.compiler.TFrame;
import gudusoft.gsqlparser.resolver.TSQLResolver;
import gudusoft.gsqlparser.TLog;
import gudusoft.gsqlparser.compiler.TASTEvaluator;

import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Stack;

/**
 * Apache Impala SQL parser implementation.
 *
 * <p>This parser handles Impala-specific SQL syntax including:
 * <ul>
 *   <li>Hive-compatible SQL dialect</li>
 *   <li>Backtick-quoted identifiers (`schema.table`)</li>
 *   <li>Semicolon statement delimiters</li>
 *   <li>Impala-specific functions and syntax</li>
 * </ul>
 *
 * <p><b>Design Notes:</b>
 * <ul>
 *   <li>Extends {@link AbstractSqlParser} using the template method pattern</li>
 *   <li>Uses {@link TLexerImpala} for tokenization</li>
 *   <li>Uses {@link TParserImpala} for parsing</li>
 *   <li>Shares tokenization logic with Hive (dohivetexttotokenlist)</li>
 *   <li>Delimiter character: ';' for SQL statements</li>
 * </ul>
 *
 * <p><b>Usage Example:</b>
 * <pre>
 * // Get Impala parser from factory
 * SqlParser parser = SqlParserFactory.get(EDbVendor.dbvimpala);
 *
 * // Build context
 * ParserContext context = new ParserContext.Builder(EDbVendor.dbvimpala)
 *     .sqlText("SELECT * FROM employees WHERE dept_id = 10")
 *     .build();
 *
 * // Parse
 * SqlParseResult result = parser.parse(context);
 *
 * // Access statements
 * TStatementList statements = result.getSqlStatements();
 * </pre>
 *
 * @see SqlParser
 * @see AbstractSqlParser
 * @see TLexerImpala
 * @see TParserImpala
 * @since 3.2.0.0
 */
public class ImpalaSqlParser extends AbstractSqlParser {

    // ========== Parser Components ==========

    /** The Impala lexer used for tokenization. Public for backward compatibility. */
    public TLexerImpala flexer;

    /** Impala parser (for Impala statements) */
    private TParserImpala fparser;

    /** Current statement being built during raw statement extraction */
    private TCustomSqlStatement gcurrentsqlstatement;

    // Note: Global context and frame stack fields inherited from AbstractSqlParser:
    //   - protected TContext globalContext
    //   - protected TSQLEnv sqlEnv
    //   - protected Stack<TFrame> frameStack
    //   - protected TFrame globalFrame

    /**
     * Construct Apache Impala SQL parser.
     * <p>
     * Configures the parser for Impala database with default delimiter (;).
     * <p>
     * Following the original TGSqlParser pattern, the lexer and parser are
     * created once in the constructor and reused for all parsing operations.
     */
    public ImpalaSqlParser() {
        super(EDbVendor.dbvimpala);
        this.delimiterChar = ';';
        this.defaultDelimiterStr = ";";

        // Create lexer once - will be reused for all parsing operations
        this.flexer = new TLexerImpala();
        this.flexer.delimiterchar = this.delimiterChar;
        this.flexer.defaultDelimiterStr = this.defaultDelimiterStr;

        // Set parent's lexer reference for shared tokenization logic
        this.lexer = this.flexer;

        // Create parser once - will be reused for all parsing operations
        this.fparser = new TParserImpala(null);
        this.fparser.lexer = this.flexer;
    }

    // ========== AbstractSqlParser Abstract Methods Implementation ==========

    /**
     * Return the Impala lexer instance.
     */
    @Override
    protected TCustomLexer getLexer(ParserContext context) {
        return this.flexer;
    }

    /**
     * Return the Impala SQL parser instance with updated token list.
     */
    @Override
    protected TCustomParser getParser(ParserContext context, TSourceTokenList tokens) {
        this.fparser.sourcetokenlist = tokens;
        return this.fparser;
    }

    /**
     * Call Impala-specific tokenization logic.
     * <p>
     * Delegates to doimpalatexttotokenlist which internally calls dohivetexttotokenlist.
     * Impala shares the same tokenization logic as Hive.
     */
    @Override
    protected void tokenizeVendorSql() {
        doimpalatexttotokenlist();
    }

    /**
     * Setup Impala parser for raw statement extraction.
     * <p>
     * Impala uses a single parser, so we inject sqlcmds and update
     * the token list for the main parser only.
     */
    @Override
    protected void setupVendorParsersForExtraction() {
        // Inject sqlcmds into parser (required for make_stmt)
        this.fparser.sqlcmds = this.sqlcmds;

        // Update token list for parser
        this.fparser.sourcetokenlist = this.sourcetokenlist;
    }

    /**
     * Call Impala-specific raw statement extraction logic.
     * <p>
     * Delegates to doimpalagetrawsqlstatements which internally calls dohivegetrawsqlstatements.
     * Impala shares the same statement extraction logic as Hive.
     */
    @Override
    protected void extractVendorRawStatements(SqlParseResult.Builder builder) {
        doimpalagetrawsqlstatements(builder);
    }

    /**
     * Perform full parsing of statements with syntax checking.
     * <p>
     * This method orchestrates the parsing of all statements: it wires the
     * shared sqlcmds and global context into the parser, parses each raw
     * statement in turn (with per-statement exception isolation so one bad
     * statement cannot abort the batch), applies CREATE TABLE/INDEX error
     * recovery when enabled, and finally unwinds the global frame stack.
     */
    @Override
    protected TStatementList performParsing(ParserContext context,
                                            TCustomParser parser,
                                            TCustomParser secondaryParser,
                                            TSourceTokenList tokens,
                                            TStatementList rawStatements) {
        // Store references
        this.fparser = (TParserImpala) parser;
        this.sourcetokenlist = tokens;
        this.parserContext = context;

        // Use the raw statements passed from AbstractSqlParser.parse()
        this.sqlstatements = rawStatements;

        // Initialize statement parsing infrastructure
        this.sqlcmds = SqlCmdsFactory.get(vendor);

        // Inject sqlcmds into parser (required for make_stmt and other methods)
        this.fparser.sqlcmds = this.sqlcmds;

        // Initialize global context for semantic analysis
        initializeGlobalContext();

        // Parse each statement with exception handling for robustness
        for (int i = 0; i < sqlstatements.size(); i++) {
            TCustomSqlStatement stmt = sqlstatements.getRawSql(i);

            try {
                stmt.setFrameStack(frameStack);

                // Parse the statement
                int parseResult = stmt.parsestatement(null, false, context.isOnlyNeedRawParseTree());

                // Handle error recovery for CREATE TABLE/INDEX
                boolean doRecover = TBaseType.ENABLE_ERROR_RECOVER_IN_CREATE_TABLE;
                if (doRecover && ((parseResult != 0) || (stmt.getErrorCount() > 0))) {
                    handleCreateTableErrorRecovery(stmt);
                }

                // Collect syntax errors
                if ((parseResult != 0) || (stmt.getErrorCount() > 0)) {
                    copyErrorsFromStatement(stmt);
                }

            } catch (Exception ex) {
                // Use inherited exception handler from AbstractSqlParser.
                // This provides consistent error handling across all database parsers.
                handleStatementParsingException(stmt, i, ex);
            }
        }

        // Clean up frame stack
        if (globalFrame != null) {
            globalFrame.popMeFromStack(frameStack);
        }

        return this.sqlstatements;
    }

    // Note: initializeGlobalContext() inherited from AbstractSqlParser
    // Note: No override of afterStatementParsed() needed - default (no-op) is appropriate for Impala

    /**
     * Handle error recovery for CREATE TABLE/INDEX statements.
     * <p>
     * When strict parsing is disabled, everything after the closing paren of a
     * CREATE TABLE/INDEX body (except a trailing semicolon) is downgraded to
     * sqlpluscmd tokens and the statement is re-parsed, unless the statement is
     * a CREATE TABLE ... AS (SELECT ...) form, which keeps its tail.
     *
     * @param stmt the statement whose token list is inspected and, if
     *             recoverable tokens are found, re-parsed after cleanup
     */
    private void handleCreateTableErrorRecovery(TCustomSqlStatement stmt) {
        if (((stmt.sqlstatementtype == ESqlStatementType.sstcreatetable)
                || (stmt.sqlstatementtype == ESqlStatementType.sstcreateindex))
                && (!TBaseType.c_createTableStrictParsing)) {

            int nested = 0;
            boolean isIgnore = false, isFoundIgnoreToken = false;
            TSourceToken firstIgnoreToken = null;

            for (int k = 0; k < stmt.sourcetokenlist.size(); k++) {
                TSourceToken st = stmt.sourcetokenlist.get(k);
                if (isIgnore) {
                    if (st.issolidtoken() && (st.tokencode != ';')) {
                        isFoundIgnoreToken = true;
                        if (firstIgnoreToken == null) {
                            firstIgnoreToken = st;
                        }
                    }
                    if (st.tokencode != ';') {
                        st.tokencode = TBaseType.sqlpluscmd;
                    }
                    continue;
                }
                if (st.tokencode == (int) ')') {
                    nested--;
                    if (nested == 0) {
                        // At the closing paren of the column list: keep the tail
                        // only for the "AS ( SELECT" form, otherwise ignore it.
                        boolean isSelect = false;
                        TSourceToken st1 = st.searchToken(TBaseType.rrw_as, 1);
                        if (st1 != null) {
                            TSourceToken st2 = st.searchToken((int) '(', 2);
                            if (st2 != null) {
                                TSourceToken st3 = st.searchToken(TBaseType.rrw_select, 3);
                                isSelect = (st3 != null);
                            }
                        }
                        if (!isSelect) isIgnore = true;
                    }
                } else if (st.tokencode == (int) '(') {
                    nested++;
                }
            }

            if (isFoundIgnoreToken) {
                stmt.clearError();
                stmt.parsestatement(null, false);
            }
        }
    }

    /**
     * Perform Impala-specific semantic analysis using TSQLResolver.
     * Only runs when the resolver is enabled and parsing produced no errors.
     */
    @Override
    protected void performSemanticAnalysis(ParserContext context, TStatementList statements) {
        if (TBaseType.isEnableResolver() && getSyntaxErrors().isEmpty()) {
            TSQLResolver resolver = new TSQLResolver(globalContext, statements);
            resolver.resolve();
        }
    }

    /**
     * Perform interpretation/evaluation on parsed statements.
     * Only runs when the interpreter is enabled and parsing produced no errors.
     */
    @Override
    protected void performInterpreter(ParserContext context, TStatementList statements) {
        if (TBaseType.ENABLE_INTERPRETER && getSyntaxErrors().isEmpty()) {
            TLog.clearLogs();
            TGlobalScope interpreterScope = new TGlobalScope(sqlEnv);
            TLog.enableInterpreterLogOnly();
            TASTEvaluator astEvaluator = new TASTEvaluator(statements, interpreterScope);
            astEvaluator.eval();
        }
    }

    // ========== Impala-Specific Tokenization ==========

    /**
     * Impala-specific tokenization logic.
     * <p>
     * Extracted from: TGSqlParser.doimpalatexttotokenlist() (line 4600)
     * Delegates to dohivetexttotokenlist as Impala uses the same tokenization as Hive.
     */
    private void doimpalatexttotokenlist() {
        dohivetexttotokenlist();
    }

    /**
     * Hive/Impala-specific tokenization logic.
     * <p>
     * Extracted from: TGSqlParser.dohivetexttotokenlist() (lines 4558-4598)
     * <p>
     * Handles:
     * <ul>
     *   <li>Basic token processing</li>
     *   <li>MAP keyword disambiguation</li>
     *   <li>Backtick-quoted qualified names (`schema.table_name`)</li>
     * </ul>
     */
    private void dohivetexttotokenlist() {
        TSourceToken asourcetoken;
        int yychar;

        asourcetoken = getanewsourcetoken();
        if (asourcetoken == null) return;
        yychar = asourcetoken.tokencode;

        while (yychar > 0) {
            // asourcetoken is null after a backtick split (already added below)
            if (asourcetoken != null) {
                sourcetokenlist.add(asourcetoken);
            }
            asourcetoken = getanewsourcetoken();
            if (asourcetoken == null) break;

            // Handle MAP keyword disambiguation: MAP preceded by ')' is an identifier
            if (asourcetoken.tokencode == TBaseType.rrw_map) {
                TSourceToken token = asourcetoken.searchToken(')', -1);
                if (token != null) {
                    asourcetoken.tokencode = TBaseType.ident;
                }
            } else if (asourcetoken.tokencode == '(') {
                // Reserved for future function identification logic
                // TSourceToken token = asourcetoken.searchToken(TBaseType.ident,-1);
                // if (token != null){
                //     token.tokencode = TBaseType.HIVE_FUNC_IDENT;
                // }
            }

            yychar = asourcetoken.tokencode;

            // Handle backtick-quoted qualified names: `schema.table_name`
            if ((asourcetoken.tokencode == TBaseType.ident)
                    && (asourcetoken.toString().startsWith("`")) && (asourcetoken.toString().endsWith("`"))
                    && (asourcetoken.toString().indexOf(".") > 0)) {
                yychar = splitQualifiedNameInBacktick(asourcetoken);
                asourcetoken = null;
            }
        }
    }

    /**
     * Split a backtick-quoted qualified identifier into separate tokens.
     * <p>
     * Extracted from: TGSqlParser.splitQualifiedNameInBacktick() (lines 3458-3503)
     * <p>
     * For example, `schema.table_name` is split into:
     * <ul>
     *   <li>`schema` (identifier)</li>
     *   <li>. (period)</li>
     *   <li>`table_name` (identifier)</li>
     * </ul>
     *
     * @param asourcetoken The qualified identifier token to split
     * @return The token code of the last token created
     */
    private int splitQualifiedNameInBacktick(TSourceToken asourcetoken) {
        int yychar = 0;

        List<String> elephantList = Arrays.asList(TBaseType.getTextWithoutQuoted(asourcetoken.toString()).split("\\."));
        int p = 0, offset = 0;
        for (String s : elephantList) {
            TSourceToken pst = new TSourceToken("`" + s + "`");
            pst.tokencode = asourcetoken.tokencode;
            pst.tokentype = asourcetoken.tokentype;
            pst.tokenstatus = asourcetoken.tokenstatus;
            pst.lineNo = asourcetoken.lineNo;
            pst.columnNo = asourcetoken.columnNo + offset;
            if (p == 0) offset++; // this counts the first ` token
            offset = offset + s.length();
            pst.container = sourcetokenlist;
            if (p > 0) { // For tokens after the first, increment position pointer
                sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
            }
            pst.posinlist = sourcetokenlist.curpos;

            sourcetokenlist.add(pst);
            yychar = pst.tokencode;

            if (p != elephantList.size() - 1) {
                // Add period token between backtick-quoted identifiers
                TSourceToken periodst = new TSourceToken(".");
                periodst.tokencode = '.';
                periodst.tokentype = ETokenType.ttperiod;
                periodst.tokenstatus = asourcetoken.tokenstatus;
                periodst.lineNo = asourcetoken.lineNo;
                periodst.columnNo = asourcetoken.columnNo + offset;
                offset++;
                periodst.container = sourcetokenlist;
                sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
                periodst.posinlist = sourcetokenlist.curpos;
                sourcetokenlist.add(periodst);
                yychar = periodst.tokencode;
            }

            p++;
        }

        return yychar;
    }

    // ========== Impala-Specific Raw Statement Extraction ==========

    /**
     * Impala-specific raw statement extraction logic.
     * <p>
     * Extracted from: TGSqlParser.doimpalagetrawsqlstatements() (lines 11013-11015)
     * Delegates to dohivegetrawsqlstatements as Impala uses the same statement extraction as Hive.
     */
    private void doimpalagetrawsqlstatements(SqlParseResult.Builder builder) {
        dohivegetrawsqlstatements(builder);
    }

    /**
     * Hive/Impala-specific raw statement extraction logic.
     * <p>
     * Extracted from: TGSqlParser.dohivegetrawsqlstatements() (lines 11017-11145)
     * <p>
     * Handles:
     * <ul>
     *   <li>Semicolon statement delimiters</li>
     *   <li>Token adjustments (CHARSET, DATE function, SORT BY)</li>
     *   <li>Continuous semicolons as comments</li>
     *   <li>Error token handling</li>
     * </ul>
     */
    private void dohivegetrawsqlstatements(SqlParseResult.Builder builder) {
        if (TBaseType.assigned(sqlstatements)) sqlstatements.clear();
        if (!TBaseType.assigned(sourcetokenlist)) {
            builder.errorCode(-1);
            return;
        }

        gcurrentsqlstatement = null;
        EFindSqlStateType gst = EFindSqlStateType.stnormal;
        TSourceToken lcprevsolidtoken = null, ast = null;

        for (int i = 0; i < sourcetokenlist.size(); i++) {

            // Remember the previous solid (non-whitespace/comment) token
            if ((ast != null) && (ast.issolidtoken()))
                lcprevsolidtoken = ast;

            ast = sourcetokenlist.get(i);
            sourcetokenlist.curpos = i;

            // Token adjustments specific to Hive/Impala
            if (ast.tokencode == TBaseType.hive_CharSetName) {
                // CHARSET name not followed by a charset literal is a plain identifier
                TSourceToken st1 = ast.searchToken(TBaseType.hive_CharSetLiteral, 1);
                if (st1 == null) {
                    ast.tokencode = TBaseType.ident;
                }
            } else if (ast.tokencode == TBaseType.rrw_date) {
                // DATE followed by '(' is the DATE() function, not the keyword
                TSourceToken st1 = ast.nextSolidToken();
                if (st1 != null) {
                    if (st1.tokencode == '(') {
                        ast.tokencode = TBaseType.rrw_hive_DATE_FUNCTION;
                    }
                }
            } else if (ast.tokencode == TBaseType.rrw_sort) {
                // SORT without a following BY is a plain identifier
                TSourceToken st1 = ast.searchToken(TBaseType.rrw_by, 1);
                if (st1 == null) {
                    ast.tokencode = TBaseType.ident;
                }
            }

            switch (gst) {
                case sterror: {
                    // Accumulate tokens into the invalid statement until a semicolon ends it
                    if (ast.tokentype == ETokenType.ttsemicolon) {
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                        onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, false, builder);
                        gst = EFindSqlStateType.stnormal;
                    } else {
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    }
                    break;
                } //sterror

                case stnormal: {
                    if ((ast.tokencode == TBaseType.cmtdoublehyphen)
                            || (ast.tokencode == TBaseType.cmtslashstar)
                            || (ast.tokencode == TBaseType.lexspace)
                            || (ast.tokencode == TBaseType.lexnewline)
                            || (ast.tokentype == ETokenType.ttsemicolon)) {
                        if (gcurrentsqlstatement != null) {
                            gcurrentsqlstatement.sourcetokenlist.add(ast);
                        }

                        if ((lcprevsolidtoken != null) && (ast.tokentype == ETokenType.ttsemicolon)) {
                            if (lcprevsolidtoken.tokentype == ETokenType.ttsemicolon) {
                                // ;;;; continuous semicolon, treat it as comment
                                ast.tokentype = ETokenType.ttsimplecomment;
                                ast.tokencode = TBaseType.cmtdoublehyphen;
                            }
                        }

                        continue;
                    }

                    gcurrentsqlstatement = sqlcmds.issql(ast, gst, gcurrentsqlstatement);

                    if (gcurrentsqlstatement != null) {
                        gst = EFindSqlStateType.stsql;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    } else {
                        // error token found; message text preserved as-is
                        // (NOTE: "tokenlize" typo is historical, callers may match on it)
                        this.syntaxErrors.add(new TSyntaxError(ast.getAstext(), ast.lineNo, (ast.columnNo < 0 ? 0 : ast.columnNo)
                                , "Error when tokenlize", EErrorType.spwarning, TBaseType.MSG_WARNING_ERROR_WHEN_TOKENIZE, null, ast.posinlist));

                        ast.tokentype = ETokenType.tttokenlizererrortoken;
                        gst = EFindSqlStateType.sterror;

                        gcurrentsqlstatement = new TUnknownSqlStatement(vendor);
                        gcurrentsqlstatement.sqlstatementtype = ESqlStatementType.sstinvalid;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    }

                    break;
                } // stnormal

                case stsql: {
                    if (ast.tokentype == ETokenType.ttsemicolon) {
                        gst = EFindSqlStateType.stnormal;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                        gcurrentsqlstatement.semicolonended = ast;
                        onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, false, builder);
                        continue;
                    }

                    gcurrentsqlstatement.sourcetokenlist.add(ast);
                    break;
                } //case stsql

            } //switch
        } //for

        // last statement (input ended without a trailing semicolon)
        if ((gcurrentsqlstatement != null) &&
                ((gst == EFindSqlStateType.stsql) || (gst == EFindSqlStateType.sterror))) {
            onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, true, builder);
        }

        // Set results in builder
        builder.sqlStatements(this.sqlstatements);
        builder.errorCode(syntaxErrors.size());
        builder.errorMessage(syntaxErrors.isEmpty() ? "" : String.format("Extraction completed with %d error(s)", syntaxErrors.size()));
    }

    @Override
    public String toString() {
        return "ImpalaSqlParser{vendor=" + vendor + "}";
    }
}