package gudusoft.gsqlparser.parser;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.TBaseType;
import gudusoft.gsqlparser.TCustomLexer;
import gudusoft.gsqlparser.TCustomParser;
import gudusoft.gsqlparser.TCustomSqlStatement;
import gudusoft.gsqlparser.TLexerImpala;
import gudusoft.gsqlparser.TParserImpala;
import gudusoft.gsqlparser.TSourceToken;
import gudusoft.gsqlparser.TSourceTokenList;
import gudusoft.gsqlparser.TStatementList;
import gudusoft.gsqlparser.TSyntaxError;
import gudusoft.gsqlparser.EFindSqlStateType;
import gudusoft.gsqlparser.ETokenType;
import gudusoft.gsqlparser.ETokenStatus;
import gudusoft.gsqlparser.ESqlStatementType;
import gudusoft.gsqlparser.EErrorType;
import gudusoft.gsqlparser.stmt.TUnknownSqlStatement;
import gudusoft.gsqlparser.sqlcmds.ISqlCmds;
import gudusoft.gsqlparser.sqlcmds.SqlCmdsFactory;
import gudusoft.gsqlparser.compiler.TContext;
import gudusoft.gsqlparser.sqlenv.TSQLEnv;
import gudusoft.gsqlparser.compiler.TGlobalScope;
import gudusoft.gsqlparser.compiler.TFrame;
import gudusoft.gsqlparser.resolver.TSQLResolver;
import gudusoft.gsqlparser.TLog;
import gudusoft.gsqlparser.compiler.TASTEvaluator;

import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Stack;

/**
 * Apache Impala SQL parser implementation.
 *
 * <p>This parser handles Impala-specific SQL syntax including:
 * <ul>
 *   <li>Hive-compatible SQL dialect</li>
 *   <li>Backtick-quoted identifiers (`schema.table`)</li>
 *   <li>Semicolon statement delimiters</li>
 *   <li>Impala-specific functions and syntax</li>
 * </ul>
 *
 * <p><b>Design Notes:</b>
 * <ul>
 *   <li>Extends {@link AbstractSqlParser} using the template method pattern</li>
 *   <li>Uses {@link TLexerImpala} for tokenization</li>
 *   <li>Uses {@link TParserImpala} for parsing</li>
 *   <li>Shares tokenization logic with Hive (dohivetexttotokenlist)</li>
 *   <li>Delimiter character: ';' for SQL statements</li>
 * </ul>
 *
 * <p><b>Usage Example:</b>
 * <pre>
 * // Get Impala parser from factory
 * SqlParser parser = SqlParserFactory.get(EDbVendor.dbvimpala);
 *
 * // Build context
 * ParserContext context = new ParserContext.Builder(EDbVendor.dbvimpala)
 *     .sqlText("SELECT * FROM employees WHERE dept_id = 10")
 *     .build();
 *
 * // Parse
 * SqlParseResult result = parser.parse(context);
 *
 * // Access statements
 * TStatementList statements = result.getSqlStatements();
 * </pre>
 *
 * @see SqlParser
 * @see AbstractSqlParser
 * @see TLexerImpala
 * @see TParserImpala
 * @since 3.2.0.0
 */
public class ImpalaSqlParser extends AbstractSqlParser {

    // ========== Parser Components ==========

    /** The Impala lexer used for tokenization. Public for backward compatibility. */
    public TLexerImpala flexer;

    /** Impala parser (for Impala statements) */
    private TParserImpala fparser;

    /** Current statement being built during raw statement extraction */
    private TCustomSqlStatement gcurrentsqlstatement;

    // Note: Global context and frame stack fields inherited from AbstractSqlParser:
    //   - protected TContext globalContext
    //   - protected TSQLEnv sqlEnv
    //   - protected Stack<TFrame> frameStack
    //   - protected TFrame globalFrame

    /**
     * Construct Apache Impala SQL parser.
     * <p>
     * Configures the parser for Impala database with default delimiter (;).
     * <p>
     * Following the original TGSqlParser pattern, the lexer and parser are
     * created once in the constructor and reused for all parsing operations.
     */
    public ImpalaSqlParser() {
        super(EDbVendor.dbvimpala);
        this.delimiterChar = ';';
        this.defaultDelimiterStr = ";";

        // Create lexer once - will be reused for all parsing operations
        this.flexer = new TLexerImpala();
        this.flexer.delimiterchar = this.delimiterChar;
        this.flexer.defaultDelimiterStr = this.defaultDelimiterStr;

        // Set parent's lexer reference for shared tokenization logic
        this.lexer = this.flexer;

        // Create parser once - will be reused for all parsing operations
        this.fparser = new TParserImpala(null);
        this.fparser.lexer = this.flexer;
    }

    // ========== AbstractSqlParser Abstract Methods Implementation ==========

    /**
     * Return the Impala lexer instance.
     */
    @Override
    protected TCustomLexer getLexer(ParserContext context) {
        return this.flexer;
    }

    /**
     * Return the Impala SQL parser instance with updated token list.
     */
    @Override
    protected TCustomParser getParser(ParserContext context, TSourceTokenList tokens) {
        this.fparser.sourcetokenlist = tokens;
        return this.fparser;
    }

    /**
     * Call Impala-specific tokenization logic.
     * <p>
     * Delegates to doimpalatexttotokenlist which internally calls dohivetexttotokenlist.
     * Impala shares the same tokenization logic as Hive.
     */
    @Override
    protected void tokenizeVendorSql() {
        doimpalatexttotokenlist();
    }

    /**
     * Setup Impala parser for raw statement extraction.
     * <p>
     * Impala uses a single parser, so we inject sqlcmds and update
     * the token list for the main parser only.
     */
    @Override
    protected void setupVendorParsersForExtraction() {
        // Inject sqlcmds into parser (required for make_stmt)
        this.fparser.sqlcmds = this.sqlcmds;

        // Update token list for parser
        this.fparser.sourcetokenlist = this.sourcetokenlist;
    }

    /**
     * Call Impala-specific raw statement extraction logic.
     * <p>
     * Delegates to doimpalagetrawsqlstatements which internally calls dohivegetrawsqlstatements.
     * Impala shares the same statement extraction logic as Hive.
     */
    @Override
    protected void extractVendorRawStatements(SqlParseResult.Builder builder) {
        doimpalagetrawsqlstatements(builder);
    }

    /**
     * Perform full parsing of statements with syntax checking.
     * <p>
     * This method orchestrates the parsing of all statements: it wires the
     * shared sqlcmds and global context into the parser, parses each raw
     * statement in turn (with per-statement exception isolation so one bad
     * statement cannot abort the batch), applies CREATE TABLE/INDEX error
     * recovery when enabled, and finally unwinds the global frame stack.
     */
    @Override
    protected TStatementList performParsing(ParserContext context,
                                            TCustomParser parser,
                                            TCustomParser secondaryParser,
                                            TSourceTokenList tokens,
                                            TStatementList rawStatements) {
        // Store references
        this.fparser = (TParserImpala) parser;
        this.sourcetokenlist = tokens;
        this.parserContext = context;

        // Use the raw statements passed from AbstractSqlParser.parse()
        this.sqlstatements = rawStatements;

        // Initialize statement parsing infrastructure
        this.sqlcmds = SqlCmdsFactory.get(vendor);

        // Inject sqlcmds into parser (required for make_stmt and other methods)
        this.fparser.sqlcmds = this.sqlcmds;

        // Initialize global context for semantic analysis
        initializeGlobalContext();

        // Parse each statement with exception handling for robustness
        for (int i = 0; i < sqlstatements.size(); i++) {
            TCustomSqlStatement stmt = sqlstatements.getRawSql(i);

            try {
                stmt.setFrameStack(frameStack);

                // Parse the statement
                int parseResult = stmt.parsestatement(null, false, context.isOnlyNeedRawParseTree());

                // Handle error recovery for CREATE TABLE/INDEX
                boolean doRecover = TBaseType.ENABLE_ERROR_RECOVER_IN_CREATE_TABLE;
                if (doRecover && ((parseResult != 0) || (stmt.getErrorCount() > 0))) {
                    handleCreateTableErrorRecovery(stmt);
                }

                // Collect syntax errors
                if ((parseResult != 0) || (stmt.getErrorCount() > 0)) {
                    copyErrorsFromStatement(stmt);
                }

            } catch (Exception ex) {
                // Use inherited exception handler from AbstractSqlParser.
                // This provides consistent error handling across all database parsers.
                handleStatementParsingException(stmt, i, ex);
            }
        }

        // Clean up frame stack
        if (globalFrame != null) {
            globalFrame.popMeFromStack(frameStack);
        }

        return this.sqlstatements;
    }

    // Note: initializeGlobalContext() inherited from AbstractSqlParser
    // Note: No override of afterStatementParsed() needed - default (no-op) is appropriate for Impala

    /**
     * Handle error recovery for CREATE TABLE/INDEX statements.
     * <p>
     * When strict parsing is disabled, everything after the closing paren of a
     * CREATE TABLE/INDEX body (except a trailing semicolon) is downgraded to
     * sqlpluscmd tokens and the statement is re-parsed, unless the statement is
     * a CREATE TABLE ... AS (SELECT ...) form, which keeps its tail.
     *
     * @param stmt the statement whose token list is inspected and, if
     *             recoverable tokens are found, re-parsed after cleanup
     */
    private void handleCreateTableErrorRecovery(TCustomSqlStatement stmt) {
        if (((stmt.sqlstatementtype == ESqlStatementType.sstcreatetable)
                || (stmt.sqlstatementtype == ESqlStatementType.sstcreateindex))
                && (!TBaseType.c_createTableStrictParsing)) {

            int nested = 0;
            boolean isIgnore = false, isFoundIgnoreToken = false;
            TSourceToken firstIgnoreToken = null;

            for (int k = 0; k < stmt.sourcetokenlist.size(); k++) {
                TSourceToken st = stmt.sourcetokenlist.get(k);
                if (isIgnore) {
                    if (st.issolidtoken() && (st.tokencode != ';')) {
                        isFoundIgnoreToken = true;
                        if (firstIgnoreToken == null) {
                            firstIgnoreToken = st;
                        }
                    }
                    if (st.tokencode != ';') {
                        st.tokencode = TBaseType.sqlpluscmd;
                    }
                    continue;
                }
                if (st.tokencode == (int) ')') {
                    nested--;
                    if (nested == 0) {
                        // At the closing paren of the column list: keep the tail
                        // only for the "AS ( SELECT" form, otherwise ignore it.
                        boolean isSelect = false;
                        TSourceToken st1 = st.searchToken(TBaseType.rrw_as, 1);
                        if (st1 != null) {
                            TSourceToken st2 = st.searchToken((int) '(', 2);
                            if (st2 != null) {
                                TSourceToken st3 = st.searchToken(TBaseType.rrw_select, 3);
                                isSelect = (st3 != null);
                            }
                        }
                        if (!isSelect) isIgnore = true;
                    }
                } else if (st.tokencode == (int) '(') {
                    nested++;
                }
            }

            if (isFoundIgnoreToken) {
                stmt.clearError();
                stmt.parsestatement(null, false);
            }
        }
    }

    /**
     * Perform Impala-specific semantic analysis using TSQLResolver.
     * Only runs when the resolver is enabled and parsing produced no errors.
     */
    @Override
    protected void performSemanticAnalysis(ParserContext context, TStatementList statements) {
        if (TBaseType.isEnableResolver() && getSyntaxErrors().isEmpty()) {
            TSQLResolver resolver = new TSQLResolver(globalContext, statements);
            resolver.resolve();
        }
    }

    /**
     * Perform interpretation/evaluation on parsed statements.
     * Only runs when the interpreter is enabled and parsing produced no errors.
     */
    @Override
    protected void performInterpreter(ParserContext context, TStatementList statements) {
        if (TBaseType.ENABLE_INTERPRETER && getSyntaxErrors().isEmpty()) {
            TLog.clearLogs();
            TGlobalScope interpreterScope = new TGlobalScope(sqlEnv);
            TLog.enableInterpreterLogOnly();
            TASTEvaluator astEvaluator = new TASTEvaluator(statements, interpreterScope);
            astEvaluator.eval();
        }
    }

    // ========== Impala-Specific Tokenization ==========

    /**
     * Impala-specific tokenization logic.
     * <p>
     * Extracted from: TGSqlParser.doimpalatexttotokenlist() (line 4600)
     * Delegates to dohivetexttotokenlist as Impala uses the same tokenization as Hive.
     */
    private void doimpalatexttotokenlist() {
        dohivetexttotokenlist();
    }

    /**
     * Hive/Impala-specific tokenization logic.
     * <p>
     * Extracted from: TGSqlParser.dohivetexttotokenlist() (lines 4558-4598)
     * <p>
     * Handles:
     * <ul>
     *   <li>Basic token processing</li>
     *   <li>MAP keyword disambiguation</li>
     *   <li>Backtick-quoted qualified names (`schema.table_name`)</li>
     * </ul>
     */
    private void dohivetexttotokenlist() {
        TSourceToken asourcetoken;
        int yychar;

        asourcetoken = getanewsourcetoken();
        if (asourcetoken == null) return;
        yychar = asourcetoken.tokencode;

        while (yychar > 0) {
            // asourcetoken is null after a backtick split (already added below)
            if (asourcetoken != null) {
                sourcetokenlist.add(asourcetoken);
            }
            asourcetoken = getanewsourcetoken();
            if (asourcetoken == null) break;

            // Handle MAP keyword disambiguation: MAP preceded by ')' is an identifier
            if (asourcetoken.tokencode == TBaseType.rrw_map) {
                TSourceToken token = asourcetoken.searchToken(')', -1);
                if (token != null) {
                    asourcetoken.tokencode = TBaseType.ident;
                }
            } else if (asourcetoken.tokencode == '(') {
                // Reserved for future function identification logic
                // TSourceToken token = asourcetoken.searchToken(TBaseType.ident,-1);
                // if (token != null){
                //     token.tokencode = TBaseType.HIVE_FUNC_IDENT;
                // }
            }

            yychar = asourcetoken.tokencode;

            // Handle backtick-quoted qualified names: `schema.table_name`
            if ((asourcetoken.tokencode == TBaseType.ident)
                    && (asourcetoken.toString().startsWith("`")) && (asourcetoken.toString().endsWith("`"))
                    && (asourcetoken.toString().indexOf(".") > 0)) {
                yychar = splitQualifiedNameInBacktick(asourcetoken);
                asourcetoken = null;
            }
        }
    }

    /**
     * Split a backtick-quoted qualified identifier into separate tokens.
     * <p>
     * Extracted from: TGSqlParser.splitQualifiedNameInBacktick() (lines 3458-3503)
     * <p>
     * For example, `schema.table_name` is split into:
     * <ul>
     *   <li>`schema` (identifier)</li>
     *   <li>. (period)</li>
     *   <li>`table_name` (identifier)</li>
     * </ul>
     *
     * @param asourcetoken The qualified identifier token to split
     * @return The token code of the last token created
     */
    private int splitQualifiedNameInBacktick(TSourceToken asourcetoken) {
        int yychar = 0;

        List<String> elephantList = Arrays.asList(TBaseType.getTextWithoutQuoted(asourcetoken.toString()).split("\\."));
        int p = 0, offset = 0;
        for (String s : elephantList) {
            TSourceToken pst = new TSourceToken("`" + s + "`");
            pst.tokencode = asourcetoken.tokencode;
            pst.tokentype = asourcetoken.tokentype;
            pst.tokenstatus = asourcetoken.tokenstatus;
            pst.lineNo = asourcetoken.lineNo;
            pst.columnNo = asourcetoken.columnNo + offset;
            if (p == 0) offset++; // this counts the first ` token
            offset = offset + s.length();
            pst.container = sourcetokenlist;
            if (p > 0) { // For tokens after the first, increment position pointer
                sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
            }
            pst.posinlist = sourcetokenlist.curpos;

            sourcetokenlist.add(pst);
            yychar = pst.tokencode;

            if (p != elephantList.size() - 1) {
                // Add period token between backtick-quoted identifiers
                TSourceToken periodst = new TSourceToken(".");
                periodst.tokencode = '.';
                periodst.tokentype = ETokenType.ttperiod;
                periodst.tokenstatus = asourcetoken.tokenstatus;
                periodst.lineNo = asourcetoken.lineNo;
                periodst.columnNo = asourcetoken.columnNo + offset;
                offset++;
                periodst.container = sourcetokenlist;
                sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
                periodst.posinlist = sourcetokenlist.curpos;
                sourcetokenlist.add(periodst);
                yychar = periodst.tokencode;
            }

            p++;
        }

        return yychar;
    }

    // ========== Impala-Specific Raw Statement Extraction ==========

    /**
     * Impala-specific raw statement extraction logic.
     * <p>
     * Extracted from: TGSqlParser.doimpalagetrawsqlstatements() (lines 11013-11015)
     * Delegates to dohivegetrawsqlstatements as Impala uses the same statement extraction as Hive.
     */
    private void doimpalagetrawsqlstatements(SqlParseResult.Builder builder) {
        dohivegetrawsqlstatements(builder);
    }

    /**
     * Hive/Impala-specific raw statement extraction logic.
     * <p>
     * Extracted from: TGSqlParser.dohivegetrawsqlstatements() (lines 11017-11145)
     * <p>
     * Handles:
     * <ul>
     *   <li>Semicolon statement delimiters</li>
     *   <li>Token adjustments (CHARSET, DATE function, SORT BY)</li>
     *   <li>Continuous semicolons as comments</li>
     *   <li>Error token handling</li>
     * </ul>
     */
    private void dohivegetrawsqlstatements(SqlParseResult.Builder builder) {
        if (TBaseType.assigned(sqlstatements)) sqlstatements.clear();
        if (!TBaseType.assigned(sourcetokenlist)) {
            builder.errorCode(-1);
            return;
        }

        gcurrentsqlstatement = null;
        EFindSqlStateType gst = EFindSqlStateType.stnormal;
        TSourceToken lcprevsolidtoken = null, ast = null;

        for (int i = 0; i < sourcetokenlist.size(); i++) {

            // Remember the previous solid (non-whitespace/comment) token
            if ((ast != null) && (ast.issolidtoken()))
                lcprevsolidtoken = ast;

            ast = sourcetokenlist.get(i);
            sourcetokenlist.curpos = i;

            // Token adjustments specific to Hive/Impala
            if (ast.tokencode == TBaseType.hive_CharSetName) {
                // CHARSET name not followed by a charset literal is a plain identifier
                TSourceToken st1 = ast.searchToken(TBaseType.hive_CharSetLiteral, 1);
                if (st1 == null) {
                    ast.tokencode = TBaseType.ident;
                }
            } else if (ast.tokencode == TBaseType.rrw_date) {
                // DATE followed by '(' is the DATE() function, not the keyword
                TSourceToken st1 = ast.nextSolidToken();
                if (st1 != null) {
                    if (st1.tokencode == '(') {
                        ast.tokencode = TBaseType.rrw_hive_DATE_FUNCTION;
                    }
                }
            } else if (ast.tokencode == TBaseType.rrw_sort) {
                // SORT without a following BY is a plain identifier
                TSourceToken st1 = ast.searchToken(TBaseType.rrw_by, 1);
                if (st1 == null) {
                    ast.tokencode = TBaseType.ident;
                }
            }

            switch (gst) {
                case sterror: {
                    // Accumulate tokens into the invalid statement until a semicolon ends it
                    if (ast.tokentype == ETokenType.ttsemicolon) {
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                        onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, false, builder);
                        gst = EFindSqlStateType.stnormal;
                    } else {
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    }
                    break;
                } //sterror

                case stnormal: {
                    if ((ast.tokencode == TBaseType.cmtdoublehyphen)
                            || (ast.tokencode == TBaseType.cmtslashstar)
                            || (ast.tokencode == TBaseType.lexspace)
                            || (ast.tokencode == TBaseType.lexnewline)
                            || (ast.tokentype == ETokenType.ttsemicolon)) {
                        if (gcurrentsqlstatement != null) {
                            gcurrentsqlstatement.sourcetokenlist.add(ast);
                        }

                        if ((lcprevsolidtoken != null) && (ast.tokentype == ETokenType.ttsemicolon)) {
                            if (lcprevsolidtoken.tokentype == ETokenType.ttsemicolon) {
                                // ;;;; continuous semicolon, treat it as comment
                                ast.tokentype = ETokenType.ttsimplecomment;
                                ast.tokencode = TBaseType.cmtdoublehyphen;
                            }
                        }

                        continue;
                    }

                    gcurrentsqlstatement = sqlcmds.issql(ast, gst, gcurrentsqlstatement);

                    if (gcurrentsqlstatement != null) {
                        gst = EFindSqlStateType.stsql;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    } else {
                        // error token found; message text preserved as-is
                        // (NOTE: "tokenlize" typo is historical, callers may match on it)
                        this.syntaxErrors.add(new TSyntaxError(ast.getAstext(), ast.lineNo, (ast.columnNo < 0 ? 0 : ast.columnNo)
                                , "Error when tokenlize", EErrorType.spwarning, TBaseType.MSG_WARNING_ERROR_WHEN_TOKENIZE, null, ast.posinlist));

                        ast.tokentype = ETokenType.tttokenlizererrortoken;
                        gst = EFindSqlStateType.sterror;

                        gcurrentsqlstatement = new TUnknownSqlStatement(vendor);
                        gcurrentsqlstatement.sqlstatementtype = ESqlStatementType.sstinvalid;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    }

                    break;
                } // stnormal

                case stsql: {
                    if (ast.tokentype == ETokenType.ttsemicolon) {
                        gst = EFindSqlStateType.stnormal;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                        gcurrentsqlstatement.semicolonended = ast;
                        onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, false, builder);
                        continue;
                    }

                    gcurrentsqlstatement.sourcetokenlist.add(ast);
                    break;
                } //case stsql

            } //switch
        } //for

        // last statement (input ended without a trailing semicolon)
        if ((gcurrentsqlstatement != null) &&
                ((gst == EFindSqlStateType.stsql) || (gst == EFindSqlStateType.sterror))) {
            onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, true, builder);
        }

        // Set results in builder
        builder.sqlStatements(this.sqlstatements);
        builder.errorCode(syntaxErrors.size());
        builder.errorMessage(syntaxErrors.isEmpty() ? "" : String.format("Extraction completed with %d error(s)", syntaxErrors.size()));
    }

    @Override
    public String toString() {
        return "ImpalaSqlParser{vendor=" + vendor + "}";
    }
}