package gudusoft.gsqlparser.parser;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.TBaseType;
import gudusoft.gsqlparser.TCustomLexer;
import gudusoft.gsqlparser.TCustomParser;
import gudusoft.gsqlparser.TCustomSqlStatement;
import gudusoft.gsqlparser.TLexerHive;
import gudusoft.gsqlparser.TParserHive;
import gudusoft.gsqlparser.TSourceToken;
import gudusoft.gsqlparser.TSourceTokenList;
import gudusoft.gsqlparser.TStatementList;
import gudusoft.gsqlparser.TSyntaxError;
import gudusoft.gsqlparser.EFindSqlStateType;
import gudusoft.gsqlparser.ETokenType;
import gudusoft.gsqlparser.ETokenStatus;
import gudusoft.gsqlparser.ESqlStatementType;
import gudusoft.gsqlparser.EErrorType;
import gudusoft.gsqlparser.stmt.TUnknownSqlStatement;
import gudusoft.gsqlparser.sqlcmds.ISqlCmds;
import gudusoft.gsqlparser.sqlcmds.SqlCmdsFactory;
import gudusoft.gsqlparser.compiler.TContext;
import gudusoft.gsqlparser.sqlenv.TSQLEnv;
import gudusoft.gsqlparser.compiler.TGlobalScope;
import gudusoft.gsqlparser.compiler.TFrame;
import gudusoft.gsqlparser.resolver.TSQLResolver;
import gudusoft.gsqlparser.TLog;
import gudusoft.gsqlparser.compiler.TASTEvaluator;

import java.util.Arrays;
import java.util.List;
import java.util.Stack;

/**
 * Apache Hive SQL parser implementation.
 *
 * <p>This parser handles Hive-specific SQL syntax including:
 * <ul>
 * <li>Hive DDL statements (CREATE TABLE/DATABASE with Hive-specific options)</li>
 * <li>Hive DML statements (INSERT OVERWRITE, LOAD DATA, etc.)</li>
 * <li>HiveQL functions and extensions</li>
 * <li>Backtick-quoted identifiers including qualified names (`schema.table`)</li>
 * <li>Hive-specific keywords and data types</li>
 * </ul>
 *
 * <p><b>Design Notes:</b>
 * <ul>
 * <li>Extends {@link AbstractSqlParser} using the template method pattern</li>
 * <li>Uses {@link TLexerHive} for tokenization</li>
 * <li>Uses {@link TParserHive} for parsing</li>
 * <li>Delimiter character: ';' for SQL statements</li>
 * <li>Splits backtick-quoted qualified names (`schema.table`) into individual tokens</li>
 * </ul>
 *
 * <p><b>Usage Example:</b>
 * <pre>
 * // Get Hive parser from factory
 * SqlParser parser = SqlParserFactory.get(EDbVendor.dbvhive);
 *
 * // Build context
 * ParserContext context = new ParserContext.Builder(EDbVendor.dbvhive)
 *         .sqlText("SELECT * FROM `default.employee` WHERE dept = 'IT'")
 *         .build();
 *
 * // Parse
 * SqlParseResult result = parser.parse(context);
 *
 * // Access statements
 * TStatementList statements = result.getSqlStatements();
 * </pre>
 *
 * @see SqlParser
 * @see AbstractSqlParser
 * @see TLexerHive
 * @see TParserHive
 * @since 3.2.0.0
 */
public class HiveSqlParser extends AbstractSqlParser {

    /**
     * Construct Hive SQL parser.
     * <p>
     * Configures the parser for Hive database with default delimiter (;).
     * <p>
     * Following the original TGSqlParser pattern, the lexer and parser are
     * created once in the constructor and reused for all parsing operations.
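     * <p>
     * Illustrative sketch: because the components are cached, one instance can
     * serve repeated parse calls (factory and builder as in the class-level
     * example above):
     * <pre>
     * SqlParser parser = SqlParserFactory.get(EDbVendor.dbvhive);
     * SqlParseResult r1 = parser.parse(
     *         new ParserContext.Builder(EDbVendor.dbvhive).sqlText("SELECT 1 FROM t").build());
     * SqlParseResult r2 = parser.parse(
     *         new ParserContext.Builder(EDbVendor.dbvhive).sqlText("SELECT 2 FROM t").build());
     * // both calls reuse the same TLexerHive/TParserHive instances
     * </pre>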
     */
    public HiveSqlParser() {
        super(EDbVendor.dbvhive);
        this.delimiterChar = ';';
        this.defaultDelimiterStr = ";";

        // Create the lexer once - it is reused for all parsing operations
        this.flexer = new TLexerHive();
        this.flexer.delimiterchar = this.delimiterChar;
        this.flexer.defaultDelimiterStr = this.defaultDelimiterStr;

        // Set the parent's lexer reference for shared tokenization logic
        this.lexer = this.flexer;

        // Create the parser once - it is reused for all parsing operations
        this.fparser = new TParserHive(null);
        this.fparser.lexer = this.flexer;
    }

    // ========== Parser Components ==========

    /** The Hive lexer used for tokenization */
    public TLexerHive flexer;

    /** SQL parser (for Hive statements) */
    private TParserHive fparser;

    /** Current statement being built during extraction */
    private TCustomSqlStatement gcurrentsqlstatement;

    /** Parser context for the current operation */
    private ParserContext parserContext;

    // Note: global context and frame stack fields are inherited from AbstractSqlParser:
    // - protected TContext globalContext
    // - protected TSQLEnv sqlEnv
    // - protected Stack<TFrame> frameStack
    // - protected TFrame globalFrame
    // - protected TSourceTokenList sourcetokenlist
    // - protected TStatementList sqlstatements
    // - protected ISqlCmds sqlcmds
    // - protected TCustomLexer lexer

    // ========== AbstractSqlParser Abstract Methods Implementation ==========

    /**
     * Return the Hive lexer instance.
     */
    @Override
    protected TCustomLexer getLexer(ParserContext context) {
        return this.flexer;
    }

    /**
     * Return the Hive SQL parser instance with an updated token list.
     */
    @Override
    protected TCustomParser getParser(ParserContext context, TSourceTokenList tokens) {
        this.fparser.sourcetokenlist = tokens;
        return this.fparser;
    }

    /**
     * Hive does not use a secondary parser (unlike Oracle with PL/SQL).
     */
    @Override
    protected TCustomParser getSecondaryParser(ParserContext context, TSourceTokenList tokens) {
        return null;
    }

    /**
     * Invoke the Hive-specific tokenization logic.
     * <p>
     * Delegates to dohivetexttotokenlist, which handles Hive's
     * keyword recognition, backtick-quoted identifiers, and
     * qualified name splitting.
     */
    @Override
    protected void tokenizeVendorSql() {
        dohivetexttotokenlist();
    }

    /**
     * Set up the Hive parser for raw statement extraction.
     * <p>
     * Hive uses a single parser, so we inject sqlcmds and update
     * the token list for the main parser only.
     */
    @Override
    protected void setupVendorParsersForExtraction() {
        // Inject sqlcmds into the parser (required for make_stmt)
        this.fparser.sqlcmds = this.sqlcmds;

        // Update the parser's token list
        this.fparser.sourcetokenlist = this.sourcetokenlist;
    }

    /**
     * Invoke the Hive-specific raw statement extraction logic.
     * <p>
     * Delegates to dohivegetrawsqlstatements, which handles Hive's
     * statement delimiters (semicolons).
     * <p>
     * Note: parserContext is already set by AbstractSqlParser before this is called.
     */
    @Override
    protected void extractVendorRawStatements(SqlParseResult.Builder builder) {
        // Errors are tracked internally and added to the syntaxErrors list;
        // the returned error count is not needed here.
        dohivegetrawsqlstatements(builder);

        // Set the extracted statements in the builder
        builder.sqlStatements(this.sqlstatements);
    }

    // ========== Tokenization Methods ==========

    /**
     * Tokenize Hive SQL text into a list of tokens.
     * <p>
     * This method handles Hive-specific token processing:
     * <ul>
     * <li>Splits backtick-quoted qualified names (`schema.table`) into separate tokens</li>
     * <li>Handles MAP keyword disambiguation</li>
     * <li>Handles all standard SQL tokens (keywords, identifiers, operators, etc.)</li>
     * </ul>
     * <p>
     * Migrated from TGSqlParser.dohivetexttotokenlist()
     */
    private void dohivetexttotokenlist() {

        TSourceToken asourcetoken;
        int yychar;

        asourcetoken = getanewsourcetoken();
        if (asourcetoken == null) return;
        yychar = asourcetoken.tokencode;

        while (yychar > 0) {
            if (asourcetoken != null) {
                sourcetokenlist.add(asourcetoken);
            }
            asourcetoken = getanewsourcetoken();
            if (asourcetoken == null) break;
            if (asourcetoken.tokencode == TBaseType.rrw_map) {
                // MAP adjacent to ')' is used as an identifier, not the MAP keyword
                TSourceToken token = asourcetoken.searchToken(')', -1);
                if (token != null) {
                    asourcetoken.tokencode = TBaseType.ident;
                }
            } else if (asourcetoken.tokencode == '(') {
//                TSourceToken token = asourcetoken.searchToken(TBaseType.ident, -1);
//                if (token != null) {
//                    token.tokencode = TBaseType.HIVE_FUNC_IDENT;
//                }
            }
            yychar = asourcetoken.tokencode;

            // `schema.table_name`
            if ((asourcetoken.tokencode == TBaseType.ident)
                    && (asourcetoken.toString().startsWith("`")) && (asourcetoken.toString().endsWith("`"))
                    && (asourcetoken.toString().indexOf(".") > 0)
            ) {
                yychar = splitQualifiedNameInBacktick(asourcetoken);
                asourcetoken = null;
            }

        }

    }

    /**
     * Turn one token `schema.table_name` into three tokens: `schema` . `table_name`
     * <p>
     * This helper splits backtick-quoted qualified names into individual
     * identifier and period tokens, preserving line/column information for each part.
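     * <p>
     * Illustrative example (positions follow the offset arithmetic below): given
     * the single input token {@code `default.employee`} at line 1, column 15,
     * the output is
     * <pre>
     *   `default`     line 1, column 15
     *   .             line 1, column 23
     *   `employee`    line 1, column 24
     * </pre>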
     * <p>
     * Migrated from TGSqlParser.splitQualifiedNameInBacktick()
     *
     * @param asourcetoken the token to split
     * @return the token code of the last token created
     */
    private int splitQualifiedNameInBacktick(TSourceToken asourcetoken) {
        int yychar = 0;

        List<String> parts = Arrays.asList(TBaseType.getTextWithoutQuoted(asourcetoken.toString()).split("\\."));
        int p = 0, offset = 0;
        for (String s : parts) {
            TSourceToken pst = new TSourceToken("`" + s + "`");
            pst.tokencode = asourcetoken.tokencode;
            pst.tokentype = asourcetoken.tokentype;
            pst.tokenstatus = asourcetoken.tokenstatus;
            pst.lineNo = asourcetoken.lineNo;
            pst.columnNo = asourcetoken.columnNo + offset;
            if (p == 0) offset++; // account for the opening backtick of the original token
            offset = offset + s.length();
            pst.container = sourcetokenlist;
            if (p > 0) {
                // The first token reuses the position of the original (pre-split) token;
                // from the second token onward, advance the list position pointer first.
                sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
            }
            pst.posinlist = sourcetokenlist.curpos;

            sourcetokenlist.add(pst);
            yychar = pst.tokencode;

            if (p != parts.size() - 1) {
                // `schema.table_name`: add a period token between the backtick-quoted identifiers.
                TSourceToken periodst = new TSourceToken(".");
                periodst.tokencode = '.';
                periodst.tokentype = ETokenType.ttperiod;
                periodst.tokenstatus = asourcetoken.tokenstatus;
                periodst.lineNo = asourcetoken.lineNo;
                periodst.columnNo = asourcetoken.columnNo + offset;
                offset++;
                periodst.container = sourcetokenlist;
                sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
                periodst.posinlist = sourcetokenlist.curpos;
                sourcetokenlist.add(periodst);
                yychar = periodst.tokencode;
            }

            p++;
        }

        return yychar;
    }

    // ========== Raw Statement Extraction ==========

    /**
     * Extract raw SQL statements from the token list.
     * <p>
     * This method separates individual SQL statements without full syntax checking.
     * It handles Hive-specific syntax including:
     * <ul>
     * <li>Token code adjustments (CharSetName, DATE function, SORT keyword)</li>
     * <li>Semicolon-terminated statements</li>
     * <li>Continuous semicolons (treated as comments)</li>
     * </ul>
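     * <p>
     * Illustrative example: the input
     * <pre>
     *   SELECT * FROM t1; LOAD DATA INPATH '/tmp/x' INTO TABLE t2;
     * </pre>
     * yields two raw statements, each carrying its own token list and
     * terminated by its own semicolon token.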
     * <p>
     * Migrated from TGSqlParser.dohivegetrawsqlstatements()
     *
     * @param builder the result builder to populate
     * @return the number of errors encountered, or -1 if no token list is available
     */
    private int dohivegetrawsqlstatements(SqlParseResult.Builder builder) {

        if (TBaseType.assigned(sqlstatements)) sqlstatements.clear();
        if (!TBaseType.assigned(sourcetokenlist)) return -1;

        gcurrentsqlstatement = null;
        EFindSqlStateType gst = EFindSqlStateType.stnormal;
        TSourceToken lcprevsolidtoken = null, ast = null;

        for (int i = 0; i < sourcetokenlist.size(); i++) {

            if ((ast != null) && (ast.issolidtoken()))
                lcprevsolidtoken = ast;

            ast = sourcetokenlist.get(i);
            sourcetokenlist.curpos = i;

            if (ast.tokencode == TBaseType.hive_CharSetName) {
                // A charset name not followed by a charset literal is a plain identifier
                TSourceToken st1 = ast.searchToken(TBaseType.hive_CharSetLiteral, 1);
                if (st1 == null) {
                    ast.tokencode = TBaseType.ident;
                }
            } else if (ast.tokencode == TBaseType.rrw_date) {
                // DATE followed by '(' is the Hive DATE() function, not the keyword
                TSourceToken st1 = ast.nextSolidToken(); //ast.searchToken('(',1);
                if (st1 != null) {
                    if (st1.tokencode == '(') {
                        ast.tokencode = TBaseType.rrw_hive_DATE_FUNCTION;
                    }
                }
            } else if (ast.tokencode == TBaseType.rrw_sort) {
                // SORT not followed by BY is a plain identifier
                TSourceToken st1 = ast.searchToken(TBaseType.rrw_by, 1);
                if (st1 == null) {
                    ast.tokencode = TBaseType.ident;
                }
            }

            switch (gst) {
                case sterror: {
                    if (ast.tokentype == ETokenType.ttsemicolon) {
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                        onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, false, builder);
                        gst = EFindSqlStateType.stnormal;
                    } else {
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    }
                    break;
                } // sterror

                case stnormal: {
                    if ((ast.tokencode == TBaseType.cmtdoublehyphen)
                            || (ast.tokencode == TBaseType.cmtslashstar)
                            || (ast.tokencode == TBaseType.lexspace)
                            || (ast.tokencode == TBaseType.lexnewline)
                            || (ast.tokentype == ETokenType.ttsemicolon)) {
                        if (gcurrentsqlstatement != null) {
                            gcurrentsqlstatement.sourcetokenlist.add(ast);
                        }

                        if ((lcprevsolidtoken != null) && (ast.tokentype == ETokenType.ttsemicolon)) {
                            if (lcprevsolidtoken.tokentype == ETokenType.ttsemicolon) {
                                // ;;;; continuous semicolons, treat as a comment
                                ast.tokentype = ETokenType.ttsimplecomment;
                                ast.tokencode = TBaseType.cmtdoublehyphen;
                            }
                        }

                        continue;
                    }

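                    // Ask the vendor sqlcmds registry whether this token can start a SQL
                    // statement; a non-null result moves the scanner into the stsql state.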
                    gcurrentsqlstatement = sqlcmds.issql(ast, gst, gcurrentsqlstatement);

                    if (gcurrentsqlstatement != null) {
                        gst = EFindSqlStateType.stsql;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    } else {
                        // Error token found
                        this.syntaxErrors.add(new TSyntaxError(ast.getAstext(), ast.lineNo,
                                (ast.columnNo < 0 ? 0 : ast.columnNo),
                                "Error when tokenlize", EErrorType.spwarning,
                                TBaseType.MSG_WARNING_ERROR_WHEN_TOKENIZE, null, ast.posinlist));

                        ast.tokentype = ETokenType.tttokenlizererrortoken;
                        gst = EFindSqlStateType.sterror;

                        // Wrap the bad token in an invalid statement so scanning can continue
                        gcurrentsqlstatement = new TUnknownSqlStatement(vendor);
                        gcurrentsqlstatement.sqlstatementtype = ESqlStatementType.sstinvalid;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    }

                    break;
                } // stnormal

                case stsql: {
                    if (ast.tokentype == ETokenType.ttsemicolon) {
                        gst = EFindSqlStateType.stnormal;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                        gcurrentsqlstatement.semicolonended = ast;
                        onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, false, builder);
                        continue;
                    }

                    gcurrentsqlstatement.sourcetokenlist.add(ast);
                    break;
                } // stsql

            } // switch
        } // for

        // Last statement (no trailing semicolon)
        if ((gcurrentsqlstatement != null) &&
                ((gst == EFindSqlStateType.stsql) || (gst == EFindSqlStateType.sterror))) {
            onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, true, builder);
        }

        return syntaxErrors.size();
    }

    // ========== Statement Parsing ==========

    /**
     * Parse all raw SQL statements.
     * <p>
     * This method performs full syntax analysis of each statement:
     * <ul>
     * <li>Initializes the global context and SQL environment</li>
     * <li>Parses each statement using TParserHive</li>
     * <li>Handles errors with optional error recovery</li>
     * <li>Collects syntax errors for reporting</li>
     * </ul>
     * <p>
     * Migrated from TGSqlParser.performParsing()
     *
     * @param context         the parser context
     * @param parser          the main parser (TParserHive)
     * @param secondaryParser the secondary parser (null for Hive)
     * @param tokens          the source token list
     * @param rawStatements   raw statements already extracted (never null)
     * @return the parsed statement list
     */
    @Override
    protected TStatementList performParsing(ParserContext context, TCustomParser parser, TCustomParser secondaryParser, TSourceTokenList tokens, TStatementList rawStatements) {
        this.parserContext = context;
        this.fparser = (TParserHive) parser;
        this.sourcetokenlist = tokens;
        this.sqlstatements = rawStatements;

        // Initialize sqlcmds for this parsing operation
        if (this.sqlcmds == null) {
            this.sqlcmds = SqlCmdsFactory.get(vendor);
        }

        // CRITICAL: inject sqlcmds into the parser (required for make_stmt to work)
        this.fparser.sqlcmds = this.sqlcmds;

        // Initialize the global context (inherited method from AbstractSqlParser)
        initializeGlobalContext();

        // Parse each statement
        for (int i = 0; i < sqlstatements.size(); i++) {
            TCustomSqlStatement stmt = sqlstatements.getRawSql(i);

            try {
                // Set the frame stack for nested scope resolution
                stmt.setFrameStack(frameStack);

                // Parse the statement
                int parseResult = stmt.parsestatement(null, false, context.isOnlyNeedRawParseTree());

                // Attempt error recovery using the inherited method
                parseResult = attemptErrorRecovery(stmt, parseResult, context.isOnlyNeedRawParseTree());

                // Collect errors from the statement
                if ((parseResult != 0) || (stmt.getErrorCount() > 0)) {
                    copyErrorsFromStatement(stmt);
                }

            } catch (Exception ex) {
                // Use the inherited exception handler from AbstractSqlParser
                handleStatementParsingException(stmt, i, ex);
                continue;
            }
        }

        // Clean up the frame stack
        if (globalFrame != null) globalFrame.popMeFromStack(frameStack);

        return sqlstatements;
    }

    // ========== Semantic Analysis ==========

    /**
     * Perform semantic analysis on parsed statements.
     * <p>
     * Runs TSQLResolver to build relationships between tables and columns,
     * resolve references, and perform type checking.
     */
    @Override
    protected void performSemanticAnalysis(ParserContext context, TStatementList statements) {
        if (TBaseType.isEnableResolver() && getSyntaxErrors().isEmpty()) {
            TSQLResolver resolver = new TSQLResolver(globalContext, statements);
            resolver.resolve();
        }
    }

    // ========== Interpretation ==========

    /**
     * Perform interpretation/evaluation on statements.
     * <p>
     * This is the hook where TASTEvaluator would run compile-time constant
     * expression evaluation; Hive does not require interpretation currently,
     * so this is a no-op.
     */
    @Override
    protected void performInterpreter(ParserContext context, TStatementList statements) {
        // Hive does not require interpretation currently
    }

    @Override
    public String toString() {
        return "HiveSqlParser{vendor=" + vendor + "}";
    }
}