package gudusoft.gsqlparser.parser;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.TBaseType;
import gudusoft.gsqlparser.TCustomLexer;
import gudusoft.gsqlparser.TCustomParser;
import gudusoft.gsqlparser.TCustomSqlStatement;
import gudusoft.gsqlparser.TLexerHive;
import gudusoft.gsqlparser.TParserHive;
import gudusoft.gsqlparser.TSourceToken;
import gudusoft.gsqlparser.TSourceTokenList;
import gudusoft.gsqlparser.TStatementList;
import gudusoft.gsqlparser.TSyntaxError;
import gudusoft.gsqlparser.EFindSqlStateType;
import gudusoft.gsqlparser.ETokenType;
import gudusoft.gsqlparser.ETokenStatus;
import gudusoft.gsqlparser.ESqlStatementType;
import gudusoft.gsqlparser.EErrorType;
import gudusoft.gsqlparser.stmt.TUnknownSqlStatement;
import gudusoft.gsqlparser.sqlcmds.ISqlCmds;
import gudusoft.gsqlparser.sqlcmds.SqlCmdsFactory;
import gudusoft.gsqlparser.compiler.TContext;
import gudusoft.gsqlparser.sqlenv.TSQLEnv;
import gudusoft.gsqlparser.compiler.TGlobalScope;
import gudusoft.gsqlparser.compiler.TFrame;
import gudusoft.gsqlparser.resolver.TSQLResolver;
import gudusoft.gsqlparser.TLog;
import gudusoft.gsqlparser.compiler.TASTEvaluator;

import java.util.Arrays;
import java.util.List;
import java.util.Stack;

/**
 * Apache Hive SQL parser implementation.
 *
 * <p>This parser handles Hive-specific SQL syntax including:
 * <ul>
 * <li>Hive DDL statements (CREATE TABLE/DATABASE with Hive-specific options)</li>
 * <li>Hive DML statements (INSERT OVERWRITE, LOAD DATA, etc.)</li>
 * <li>HiveQL functions and extensions</li>
 * <li>Backtick-quoted identifiers including qualified names (`schema.table`)</li>
 * <li>Hive-specific keywords and data types</li>
 * </ul>
 *
 * <p><b>Design Notes:</b>
 * <ul>
 * <li>Extends {@link AbstractSqlParser} using the template method pattern</li>
 * <li>Uses {@link TLexerHive} for tokenization</li>
 * <li>Uses {@link TParserHive} for parsing</li>
 * <li>Delimiter character: ';' for SQL statements</li>
 * <li>Splits backtick-quoted qualified names (`schema.table`) into individual tokens</li>
 * </ul>
 *
 * <p><b>Usage Example:</b>
 * <pre>
 * // Get Hive parser from factory
 * SqlParser parser = SqlParserFactory.get(EDbVendor.dbvhive);
 *
 * // Build context
 * ParserContext context = new ParserContext.Builder(EDbVendor.dbvhive)
 *         .sqlText("SELECT * FROM `default.employee` WHERE dept = 'IT'")
 *         .build();
 *
 * // Parse
 * SqlParseResult result = parser.parse(context);
 *
 * // Access statements
 * TStatementList statements = result.getSqlStatements();
 * </pre>
 *
 * @see SqlParser
 * @see AbstractSqlParser
 * @see TLexerHive
 * @see TParserHive
 * @since 3.2.0.0
 */
public class HiveSqlParser extends AbstractSqlParser {

    /**
     * Construct Hive SQL parser.
     * <p>
     * Configures the parser for Hive database with default delimiter (;).
     * <p>
     * Following the original TGSqlParser pattern, the lexer and parser are
     * created once in the constructor and reused for all parsing operations.
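     * <p>
     * Illustrative sketch: because the components are cached, one instance can
     * serve repeated parse calls (factory and builder as in the class-level
     * example above):
     * <pre>
     * SqlParser parser = SqlParserFactory.get(EDbVendor.dbvhive);
     * SqlParseResult r1 = parser.parse(
     *         new ParserContext.Builder(EDbVendor.dbvhive).sqlText("SELECT 1 FROM t").build());
     * SqlParseResult r2 = parser.parse(
     *         new ParserContext.Builder(EDbVendor.dbvhive).sqlText("SELECT 2 FROM t").build());
     * // both calls reuse the same TLexerHive/TParserHive instances
     * </pre>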
     */
    public HiveSqlParser() {
        super(EDbVendor.dbvhive);
        this.delimiterChar = ';';
        this.defaultDelimiterStr = ";";

        // Create the lexer once - it is reused for all parsing operations
        this.flexer = new TLexerHive();
        this.flexer.delimiterchar = this.delimiterChar;
        this.flexer.defaultDelimiterStr = this.defaultDelimiterStr;

        // Set the parent's lexer reference for shared tokenization logic
        this.lexer = this.flexer;

        // Create the parser once - it is reused for all parsing operations
        this.fparser = new TParserHive(null);
        this.fparser.lexer = this.flexer;
    }

    // ========== Parser Components ==========

    /** The Hive lexer used for tokenization */
    public TLexerHive flexer;

    /** SQL parser (for Hive statements) */
    private TParserHive fparser;

    /** Current statement being built during extraction */
    private TCustomSqlStatement gcurrentsqlstatement;

    /** Parser context for the current operation */
    private ParserContext parserContext;

    // Note: global context and frame stack fields are inherited from AbstractSqlParser:
    // - protected TContext globalContext
    // - protected TSQLEnv sqlEnv
    // - protected Stack<TFrame> frameStack
    // - protected TFrame globalFrame
    // - protected TSourceTokenList sourcetokenlist
    // - protected TStatementList sqlstatements
    // - protected ISqlCmds sqlcmds
    // - protected TCustomLexer lexer

    // ========== AbstractSqlParser Abstract Methods Implementation ==========

    /**
     * Return the Hive lexer instance.
     */
    @Override
    protected TCustomLexer getLexer(ParserContext context) {
        return this.flexer;
    }

    /**
     * Return the Hive SQL parser instance with an updated token list.
     */
    @Override
    protected TCustomParser getParser(ParserContext context, TSourceTokenList tokens) {
        this.fparser.sourcetokenlist = tokens;
        return this.fparser;
    }

    /**
     * Hive does not use a secondary parser (unlike Oracle with PL/SQL).
     */
    @Override
    protected TCustomParser getSecondaryParser(ParserContext context, TSourceTokenList tokens) {
        return null;
    }

    /**
     * Invoke the Hive-specific tokenization logic.
     * <p>
     * Delegates to dohivetexttotokenlist, which handles Hive's
     * keyword recognition, backtick-quoted identifiers, and
     * qualified name splitting.
     */
    @Override
    protected void tokenizeVendorSql() {
        dohivetexttotokenlist();
    }

    /**
     * Set up the Hive parser for raw statement extraction.
     * <p>
     * Hive uses a single parser, so we inject sqlcmds and update
     * the token list for the main parser only.
     */
    @Override
    protected void setupVendorParsersForExtraction() {
        // Inject sqlcmds into the parser (required for make_stmt)
        this.fparser.sqlcmds = this.sqlcmds;

        // Update the parser's token list
        this.fparser.sourcetokenlist = this.sourcetokenlist;
    }

    /**
     * Invoke the Hive-specific raw statement extraction logic.
     * <p>
     * Delegates to dohivegetrawsqlstatements, which handles Hive's
     * statement delimiters (semicolons).
     * <p>
     * Note: parserContext is already set by AbstractSqlParser before this is called.
     */
    @Override
    protected void extractVendorRawStatements(SqlParseResult.Builder builder) {
        // Errors are tracked internally and added to the syntaxErrors list;
        // the returned error count is not needed here.
        dohivegetrawsqlstatements(builder);

        // Set the extracted statements in the builder
        builder.sqlStatements(this.sqlstatements);
    }

    // ========== Tokenization Methods ==========

    /**
     * Tokenize Hive SQL text into a list of tokens.
     * <p>
     * This method handles Hive-specific token processing:
     * <ul>
     * <li>Splits backtick-quoted qualified names (`schema.table`) into separate tokens</li>
     * <li>Handles MAP keyword disambiguation</li>
     * <li>Handles all standard SQL tokens (keywords, identifiers, operators, etc.)</li>
     * </ul>
     * <p>
     * Migrated from TGSqlParser.dohivetexttotokenlist()
     */
    private void dohivetexttotokenlist() {

        TSourceToken asourcetoken;
        int yychar;

        asourcetoken = getanewsourcetoken();
        if (asourcetoken == null) return;
        yychar = asourcetoken.tokencode;

        while (yychar > 0) {
            if (asourcetoken != null) {
                sourcetokenlist.add(asourcetoken);
            }
            asourcetoken = getanewsourcetoken();
            if (asourcetoken == null) break;
            if (asourcetoken.tokencode == TBaseType.rrw_map) {
                // MAP adjacent to ')' is used as an identifier, not the MAP keyword
                TSourceToken token = asourcetoken.searchToken(')', -1);
                if (token != null) {
                    asourcetoken.tokencode = TBaseType.ident;
                }
            } else if (asourcetoken.tokencode == '(') {
//                TSourceToken token = asourcetoken.searchToken(TBaseType.ident, -1);
//                if (token != null) {
//                    token.tokencode = TBaseType.HIVE_FUNC_IDENT;
//                }
            }
            yychar = asourcetoken.tokencode;

            // `schema.table_name`
            if ((asourcetoken.tokencode == TBaseType.ident)
                    && (asourcetoken.toString().startsWith("`")) && (asourcetoken.toString().endsWith("`"))
                    && (asourcetoken.toString().indexOf(".") > 0)
            ) {
                yychar = splitQualifiedNameInBacktick(asourcetoken);
                asourcetoken = null;
            }

        }

    }

    /**
     * Turn one token `schema.table_name` into three tokens: `schema` . `table_name`
     * <p>
     * This helper splits backtick-quoted qualified names into individual
     * identifier and period tokens, preserving line/column information for each part.
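     * <p>
     * Illustrative example (positions follow the offset arithmetic below): given
     * the single input token {@code `default.employee`} at line 1, column 15,
     * the output is
     * <pre>
     *   `default`     line 1, column 15
     *   .             line 1, column 23
     *   `employee`    line 1, column 24
     * </pre>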
     * <p>
     * Migrated from TGSqlParser.splitQualifiedNameInBacktick()
     *
     * @param asourcetoken the token to split
     * @return the token code of the last token created
     */
    private int splitQualifiedNameInBacktick(TSourceToken asourcetoken) {
        int yychar = 0;

        List<String> parts = Arrays.asList(TBaseType.getTextWithoutQuoted(asourcetoken.toString()).split("\\."));
        int p = 0, offset = 0;
        for (String s : parts) {
            TSourceToken pst = new TSourceToken("`" + s + "`");
            pst.tokencode = asourcetoken.tokencode;
            pst.tokentype = asourcetoken.tokentype;
            pst.tokenstatus = asourcetoken.tokenstatus;
            pst.lineNo = asourcetoken.lineNo;
            pst.columnNo = asourcetoken.columnNo + offset;
            if (p == 0) offset++; // account for the opening backtick of the original token
            offset = offset + s.length();
            pst.container = sourcetokenlist;
            if (p > 0) {
                // The first token reuses the position of the original (pre-split) token;
                // from the second token onward, advance the list position pointer first.
                sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
            }
            pst.posinlist = sourcetokenlist.curpos;

            sourcetokenlist.add(pst);
            yychar = pst.tokencode;

            if (p != parts.size() - 1) {
                // `schema.table_name`: add a period token between the backtick-quoted identifiers.
                TSourceToken periodst = new TSourceToken(".");
                periodst.tokencode = '.';
                periodst.tokentype = ETokenType.ttperiod;
                periodst.tokenstatus = asourcetoken.tokenstatus;
                periodst.lineNo = asourcetoken.lineNo;
                periodst.columnNo = asourcetoken.columnNo + offset;
                offset++;
                periodst.container = sourcetokenlist;
                sourcetokenlist.curpos = sourcetokenlist.curpos + 1;
                periodst.posinlist = sourcetokenlist.curpos;
                sourcetokenlist.add(periodst);
                yychar = periodst.tokencode;
            }

            p++;
        }

        return yychar;
    }

    // ========== Raw Statement Extraction ==========

    /**
     * Extract raw SQL statements from the token list.
     * <p>
     * This method separates individual SQL statements without full syntax checking.
     * It handles Hive-specific syntax including:
     * <ul>
     * <li>Token code adjustments (CharSetName, DATE function, SORT keyword)</li>
     * <li>Semicolon-terminated statements</li>
     * <li>Continuous semicolons (treated as comments)</li>
     * </ul>
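     * <p>
     * Illustrative example: the input
     * <pre>
     *   SELECT * FROM t1; LOAD DATA INPATH '/tmp/x' INTO TABLE t2;
     * </pre>
     * yields two raw statements, each carrying its own token list and
     * terminated by its own semicolon token.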
     * <p>
     * Migrated from TGSqlParser.dohivegetrawsqlstatements()
     *
     * @param builder the result builder to populate
     * @return the number of errors encountered, or -1 if no token list is available
     */
    private int dohivegetrawsqlstatements(SqlParseResult.Builder builder) {

        if (TBaseType.assigned(sqlstatements)) sqlstatements.clear();
        if (!TBaseType.assigned(sourcetokenlist)) return -1;

        gcurrentsqlstatement = null;
        EFindSqlStateType gst = EFindSqlStateType.stnormal;
        TSourceToken lcprevsolidtoken = null, ast = null;

        for (int i = 0; i < sourcetokenlist.size(); i++) {

            if ((ast != null) && (ast.issolidtoken()))
                lcprevsolidtoken = ast;

            ast = sourcetokenlist.get(i);
            sourcetokenlist.curpos = i;

            if (ast.tokencode == TBaseType.hive_CharSetName) {
                // A charset name not followed by a charset literal is a plain identifier
                TSourceToken st1 = ast.searchToken(TBaseType.hive_CharSetLiteral, 1);
                if (st1 == null) {
                    ast.tokencode = TBaseType.ident;
                }
            } else if (ast.tokencode == TBaseType.rrw_date) {
                // DATE followed by '(' is the Hive DATE() function, not the keyword
                TSourceToken st1 = ast.nextSolidToken(); //ast.searchToken('(',1);
                if (st1 != null) {
                    if (st1.tokencode == '(') {
                        ast.tokencode = TBaseType.rrw_hive_DATE_FUNCTION;
                    }
                }
            } else if (ast.tokencode == TBaseType.rrw_sort) {
                // SORT not followed by BY is a plain identifier
                TSourceToken st1 = ast.searchToken(TBaseType.rrw_by, 1);
                if (st1 == null) {
                    ast.tokencode = TBaseType.ident;
                }
            }

            switch (gst) {
                case sterror: {
                    if (ast.tokentype == ETokenType.ttsemicolon) {
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                        onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, false, builder);
                        gst = EFindSqlStateType.stnormal;
                    } else {
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    }
                    break;
                } // sterror

                case stnormal: {
                    if ((ast.tokencode == TBaseType.cmtdoublehyphen)
                            || (ast.tokencode == TBaseType.cmtslashstar)
                            || (ast.tokencode == TBaseType.lexspace)
                            || (ast.tokencode == TBaseType.lexnewline)
                            || (ast.tokentype == ETokenType.ttsemicolon)) {
                        if (gcurrentsqlstatement != null) {
                            gcurrentsqlstatement.sourcetokenlist.add(ast);
                        }

                        if ((lcprevsolidtoken != null) && (ast.tokentype == ETokenType.ttsemicolon)) {
                            if (lcprevsolidtoken.tokentype == ETokenType.ttsemicolon) {
                                // ;;;; continuous semicolons, treat as a comment
                                ast.tokentype = ETokenType.ttsimplecomment;
                                ast.tokencode = TBaseType.cmtdoublehyphen;
                            }
                        }

                        continue;
                    }

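                    // Ask the vendor sqlcmds registry whether this token can start a SQL
                    // statement; a non-null result moves the scanner into the stsql state.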
                    gcurrentsqlstatement = sqlcmds.issql(ast, gst, gcurrentsqlstatement);

                    if (gcurrentsqlstatement != null) {
                        gst = EFindSqlStateType.stsql;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    } else {
                        // Error token found
                        this.syntaxErrors.add(new TSyntaxError(ast.getAstext(), ast.lineNo,
                                (ast.columnNo < 0 ? 0 : ast.columnNo),
                                "Error when tokenlize", EErrorType.spwarning,
                                TBaseType.MSG_WARNING_ERROR_WHEN_TOKENIZE, null, ast.posinlist));

                        ast.tokentype = ETokenType.tttokenlizererrortoken;
                        gst = EFindSqlStateType.sterror;

                        // Wrap the bad token in an invalid statement so scanning can continue
                        gcurrentsqlstatement = new TUnknownSqlStatement(vendor);
                        gcurrentsqlstatement.sqlstatementtype = ESqlStatementType.sstinvalid;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                    }

                    break;
                } // stnormal

                case stsql: {
                    if (ast.tokentype == ETokenType.ttsemicolon) {
                        gst = EFindSqlStateType.stnormal;
                        gcurrentsqlstatement.sourcetokenlist.add(ast);
                        gcurrentsqlstatement.semicolonended = ast;
                        onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, false, builder);
                        continue;
                    }

                    gcurrentsqlstatement.sourcetokenlist.add(ast);
                    break;
                } // stsql

            } // switch
        } // for

        // Last statement (no trailing semicolon)
        if ((gcurrentsqlstatement != null) &&
                ((gst == EFindSqlStateType.stsql) || (gst == EFindSqlStateType.sterror))) {
            onRawStatementComplete(parserContext, gcurrentsqlstatement, fparser, null, sqlstatements, true, builder);
        }

        return syntaxErrors.size();
    }

    // ========== Statement Parsing ==========

    /**
     * Parse all raw SQL statements.
     * <p>
     * This method performs full syntax analysis of each statement:
     * <ul>
     * <li>Initializes the global context and SQL environment</li>
     * <li>Parses each statement using TParserHive</li>
     * <li>Handles errors with optional error recovery</li>
     * <li>Collects syntax errors for reporting</li>
     * </ul>
     * <p>
     * Migrated from TGSqlParser.performParsing()
     *
     * @param context         the parser context
     * @param parser          the main parser (TParserHive)
     * @param secondaryParser the secondary parser (null for Hive)
     * @param tokens          the source token list
     * @param rawStatements   raw statements already extracted (never null)
     * @return the parsed statement list
     */
    @Override
    protected TStatementList performParsing(ParserContext context, TCustomParser parser, TCustomParser secondaryParser, TSourceTokenList tokens, TStatementList rawStatements) {
        this.parserContext = context;
        this.fparser = (TParserHive) parser;
        this.sourcetokenlist = tokens;
        this.sqlstatements = rawStatements;

        // Initialize sqlcmds for this parsing operation
        if (this.sqlcmds == null) {
            this.sqlcmds = SqlCmdsFactory.get(vendor);
        }

        // CRITICAL: inject sqlcmds into the parser (required for make_stmt to work)
        this.fparser.sqlcmds = this.sqlcmds;

        // Initialize the global context (inherited method from AbstractSqlParser)
        initializeGlobalContext();

        // Parse each statement
        for (int i = 0; i < sqlstatements.size(); i++) {
            TCustomSqlStatement stmt = sqlstatements.getRawSql(i);

            try {
                // Set the frame stack for nested scope resolution
                stmt.setFrameStack(frameStack);

                // Parse the statement
                int parseResult = stmt.parsestatement(null, false, context.isOnlyNeedRawParseTree());

                // Attempt error recovery using the inherited method
                parseResult = attemptErrorRecovery(stmt, parseResult, context.isOnlyNeedRawParseTree());

                // Collect errors from the statement
                if ((parseResult != 0) || (stmt.getErrorCount() > 0)) {
                    copyErrorsFromStatement(stmt);
                }

            } catch (Exception ex) {
                // Use the inherited exception handler from AbstractSqlParser
                handleStatementParsingException(stmt, i, ex);
                continue;
            }
        }

        // Clean up the frame stack
        if (globalFrame != null) globalFrame.popMeFromStack(frameStack);

        return sqlstatements;
    }

    // ========== Semantic Analysis ==========

    /**
     * Perform semantic analysis on parsed statements.
     * <p>
     * Runs TSQLResolver to build relationships between tables and columns,
     * resolve references, and perform type checking.
     */
    @Override
    protected void performSemanticAnalysis(ParserContext context, TStatementList statements) {
        if (TBaseType.isEnableResolver() && getSyntaxErrors().isEmpty()) {
            TSQLResolver resolver = new TSQLResolver(globalContext, statements);
            resolver.resolve();
        }
    }

    // ========== Interpretation ==========

    /**
     * Perform interpretation/evaluation on statements.
     * <p>
     * This is the hook where TASTEvaluator would run compile-time constant
     * expression evaluation; Hive does not require interpretation currently,
     * so this is a no-op.
     */
    @Override
    protected void performInterpreter(ParserContext context, TStatementList statements) {
        // Hive does not require interpretation currently
    }

    @Override
    public String toString() {
        return "HiveSqlParser{vendor=" + vendor + "}";
    }
}