001package gudusoft.gsqlparser.parser; 002 003import gudusoft.gsqlparser.EDbVendor; 004import gudusoft.gsqlparser.TBaseType; 005import gudusoft.gsqlparser.TCustomLexer; 006import gudusoft.gsqlparser.TCustomParser; 007import gudusoft.gsqlparser.TCustomSqlStatement; 008import gudusoft.gsqlparser.TLexerDoris; 009import gudusoft.gsqlparser.TParserDoris; 010import gudusoft.gsqlparser.TSourceToken; 011import gudusoft.gsqlparser.TSourceTokenList; 012import gudusoft.gsqlparser.TStatementList; 013import gudusoft.gsqlparser.TSyntaxError; 014import gudusoft.gsqlparser.EFindSqlStateType; 015import gudusoft.gsqlparser.ETokenType; 016import gudusoft.gsqlparser.ETokenStatus; 017import gudusoft.gsqlparser.ESqlStatementType; 018import gudusoft.gsqlparser.EErrorType; 019import gudusoft.gsqlparser.stmt.TUnknownSqlStatement; 020import gudusoft.gsqlparser.sqlcmds.ISqlCmds; 021import gudusoft.gsqlparser.sqlcmds.SqlCmdsFactory; 022import gudusoft.gsqlparser.compiler.TContext; 023import gudusoft.gsqlparser.sqlenv.TSQLEnv; 024import gudusoft.gsqlparser.compiler.TGlobalScope; 025import gudusoft.gsqlparser.compiler.TFrame; 026 027import java.io.BufferedReader; 028import java.util.ArrayList; 029import java.util.List; 030import java.util.Stack; 031 032/** 033 * Apache Doris SQL parser implementation. 034 * 035 * <p>Doris is MySQL-compatible, so this parser is based on the MySQL parser 036 * with extensions for Doris-specific features including: 037 * <ul> 038 * <li>OLAP data models (DUPLICATE KEY, AGGREGATE KEY, UNIQUE KEY)</li> 039 * <li>DISTRIBUTED BY clause</li> 040 * <li>PROPERTIES clause</li> 041 * <li>Doris-specific data types (LARGEINT, HLL, BITMAP, etc.)</li> 042 * <li>Load and export statements (BROKER LOAD, EXPORT, ROUTINE LOAD)</li> 043 * </ul> 044 * 045 * <p><b>Design Notes:</b> 046 * <ul> 047 * <li>Extends {@link AbstractSqlParser}</li> 048 * <li>Uses {@link TLexerDoris} and {@link TParserDoris}</li> 049 * <li>Delimiter character: ';'</li> 050 * </ul> 051 * 052 * @see SqlParser 053 * @see AbstractSqlParser 054 * @see TLexerDoris 055 * @see TParserDoris 056 * @since 3.2.0.0 057 */ 058public class DorisSqlParser extends AbstractSqlParser { 059 060 // ========== Lexer and Parser Instances ========== 061 062 /** The Doris lexer used for tokenization (public for TGSqlParser.getFlexer()) */ 063 public TLexerDoris flexer; 064 private TParserDoris fparser; 065 066 // ========== State Variables for Raw Statement Extraction ========== 067 private String userDelimiterStr; 068 private char curdelimiterchar; 069 private boolean waitingDelimiter; 070 071 // ========== Constructor ========== 072 073 /** 074 * Construct Doris SQL parser. 075 * <p> 076 * Configures the parser for Apache Doris database with default delimiter: semicolon (;) 077 */ 078 public DorisSqlParser() { 079 super(EDbVendor.dbvdoris); 080 081 // Set delimiter character - Doris uses semicolon like standard MySQL 082 this.delimiterChar = ';'; 083 this.defaultDelimiterStr = ";"; 084 085 // Create lexer once - will be reused for all parsing operations 086 this.flexer = new TLexerDoris(); 087 this.flexer.delimiterchar = this.delimiterChar; 088 this.flexer.defaultDelimiterStr = this.defaultDelimiterStr; 089 090 // CRITICAL: Set lexer for inherited getanewsourcetoken() method 091 this.lexer = this.flexer; 092 093 // Create parser once - will be reused for all parsing operations 094 this.fparser = new TParserDoris(null); 095 this.fparser.lexer = this.flexer; 096 } 097 098 // ========== AbstractSqlParser Abstract Methods Implementation ========== 099 100 @Override 101 protected TCustomLexer getLexer(ParserContext context) { 102 return this.flexer; 103 } 104 105 @Override 106 protected TCustomParser getParser(ParserContext context, TSourceTokenList tokens) { 107 this.fparser.sourcetokenlist = tokens; 108 return this.fparser; 109 } 110 111 @Override 112 protected void tokenizeVendorSql() { 113 dodoristexttotokenlist(); 114 } 115 116 @Override 117 protected void setupVendorParsersForExtraction() { 118 this.fparser.sqlcmds = this.sqlcmds; 119 this.fparser.sourcetokenlist = this.sourcetokenlist; 120 } 121 122 @Override 123 protected void extractVendorRawStatements(SqlParseResult.Builder builder) { 124 dodorisgetrawsqlstatements(builder); 125 } 126 127 @Override 128 protected TStatementList performParsing(ParserContext context, 129 TCustomParser parser, 130 TCustomParser secondaryParser, 131 TSourceTokenList tokens, 132 TStatementList rawStatements) { 133 this.sourcetokenlist = tokens; 134 this.parserContext = context; 135 this.sqlstatements = rawStatements; 136 137 this.sqlcmds = SqlCmdsFactory.get(vendor); 138 this.fparser.sqlcmds = this.sqlcmds; 139 140 initializeGlobalContext(); 141 142 for (int i = 0; i < sqlstatements.size(); i++) { 143 TCustomSqlStatement stmt = sqlstatements.getRawSql(i); 144 145 try { 146 stmt.setFrameStack(frameStack); 147 int parseResult = stmt.parsestatement(null, false, context.isOnlyNeedRawParseTree()); 148 149 boolean doRecover = TBaseType.ENABLE_ERROR_RECOVER_IN_CREATE_TABLE; 150 if (doRecover && ((parseResult != 0) || (stmt.getErrorCount() > 0))) { 151 handleCreateTableErrorRecovery(stmt); 152 } 153 154 if ((parseResult != 0) || (stmt.getErrorCount() > 0)) { 155 copyErrorsFromStatement(stmt); 156 } 157 } catch (Exception ex) { 158 handleStatementParsingException(stmt, i, ex); 159 continue; 160 } 161 } 162 163 if (globalFrame != null) { 164 globalFrame.popMeFromStack(frameStack); 165 } 166 167 return this.sqlstatements; 168 } 169 170 private void handleCreateTableErrorRecovery(TCustomSqlStatement stmt) { 171 if ((stmt.sqlstatementtype != ESqlStatementType.sstcreatetable) || TBaseType.c_createTableStrictParsing) { 172 return; 173 } 174 175 int nested = 0; 176 boolean isIgnore = false, isFoundIgnoreToken = false; 177 TSourceToken firstIgnoreToken = null; 178 179 for (int k = 0; k < stmt.sourcetokenlist.size(); k++) { 180 TSourceToken st = stmt.sourcetokenlist.get(k); 181 if (isIgnore) { 182 if (st.issolidtoken() && (st.tokencode != ';')) { 183 isFoundIgnoreToken = true; 184 if (firstIgnoreToken == null) { 185 firstIgnoreToken = st; 186 } 187 } 188 if (st.tokencode != ';') { 189 st.tokencode = TBaseType.sqlpluscmd; 190 } 191 continue; 192 } 193 if (st.tokencode == (int) ')') { 194 nested--; 195 if (nested == 0) { 196 boolean isSelect = false; 197 TSourceToken st1 = st.searchToken(TBaseType.rrw_as, 1); 198 if (st1 != null) { 199 TSourceToken st2 = st.searchToken((int) '(', 2); 200 if (st2 != null) { 201 TSourceToken st3 = st.searchToken(TBaseType.rrw_select, 3); 202 isSelect = (st3 != null); 203 } 204 } 205 if (!isSelect) isIgnore = true; 206 } 207 } else if (st.tokencode == (int) '(') { 208 nested++; 209 } 210 } 211 212 if (isFoundIgnoreToken) { 213 stmt.clearError(); 214 stmt.parsestatement(null, false, this.parserContext.isOnlyNeedRawParseTree()); 215 } 216 } 217 218 // ========== Doris-Specific Tokenization ========== 219 220 private void dodoristexttotokenlist() { 221 TSourceToken asourcetoken, lcprevst; 222 int yychar; 223 224 asourcetoken = getanewsourcetoken(); 225 if (asourcetoken == null) return; 226 yychar = asourcetoken.tokencode; 227 228 while (yychar > 0) { 229 sourcetokenlist.add(asourcetoken); 230 asourcetoken = getanewsourcetoken(); 231 if (asourcetoken == null) break; 232 233 if (asourcetoken.tokencode == TBaseType.rrw_rollup) { 234 lcprevst = getprevsolidtoken(asourcetoken); 235 if (lcprevst != null) { 236 if (lcprevst.tokencode == TBaseType.rrw_with) 237 lcprevst.tokencode = TBaseType.with_rollup; 238 } 239 } 240 241 yychar = asourcetoken.tokencode; 242 } 243 } 244 245 private TSourceToken getprevsolidtoken(TSourceToken ptoken) { 246 TSourceToken ret = null; 247 TSourceTokenList lctokenlist = ptoken.container; 248 249 if (lctokenlist != null) { 250 if ((ptoken.posinlist > 0) && (lctokenlist.size() > ptoken.posinlist - 1)) { 251 if (!( 252 (lctokenlist.get(ptoken.posinlist - 1).tokentype == ETokenType.ttwhitespace) 253 || (lctokenlist.get(ptoken.posinlist - 1).tokentype == ETokenType.ttreturn) 254 || (lctokenlist.get(ptoken.posinlist - 1).tokentype == ETokenType.ttsimplecomment) 255 || (lctokenlist.get(ptoken.posinlist - 1).tokentype == ETokenType.ttbracketedcomment) 256 )) { 257 ret = lctokenlist.get(ptoken.posinlist - 1); 258 } else { 259 ret = lctokenlist.nextsolidtoken(ptoken.posinlist - 1, -1, false); 260 } 261 } 262 } 263 return ret; 264 } 265 266 // ========== Doris-Specific Raw Statement Extraction ========== 267 268 private void dodorisgetrawsqlstatements(SqlParseResult.Builder builder) { 269 TCustomSqlStatement gcurrentsqlstatement = null; 270 EFindSqlStateType gst = EFindSqlStateType.stnormal; 271 272 userDelimiterStr = defaultDelimiterStr; 273 274 if (TBaseType.assigned(sqlstatements)) sqlstatements.clear(); 275 if (!TBaseType.assigned(sourcetokenlist)) { 276 builder.sqlStatements(this.sqlstatements); 277 builder.errorCode(1); 278 builder.errorMessage("No source token list available"); 279 return; 280 } 281 282 for (int i = 0; i < sourcetokenlist.size(); i++) { 283 TSourceToken ast = sourcetokenlist.get(i); 284 sourcetokenlist.curpos = i; 285 286 performRawStatementTokenTransformations(ast); 287 288 switch (gst) { 289 case sterror: { 290 if (ast.tokentype == ETokenType.ttsemicolon) { 291 appendToken(gcurrentsqlstatement, ast); 292 onRawStatementComplete(this.parserContext, gcurrentsqlstatement, this.fparser, null, this.sqlstatements, false, builder); 293 gst = EFindSqlStateType.stnormal; 294 } else { 295 appendToken(gcurrentsqlstatement, ast); 296 } 297 break; 298 } 299 300 case stnormal: { 301 if ((ast.tokencode == TBaseType.cmtdoublehyphen) 302 || (ast.tokencode == TBaseType.cmtslashstar) 303 || (ast.tokencode == TBaseType.lexspace) 304 || (ast.tokencode == TBaseType.lexnewline) 305 || (ast.tokentype == ETokenType.ttsemicolon)) { 306 if (TBaseType.assigned(gcurrentsqlstatement)) { 307 appendToken(gcurrentsqlstatement, ast); 308 } 309 continue; 310 } 311 312 gcurrentsqlstatement = sqlcmds.issql(ast, gst, gcurrentsqlstatement); 313 314 if (TBaseType.assigned(gcurrentsqlstatement)) { 315 gst = EFindSqlStateType.stsql; 316 appendToken(gcurrentsqlstatement, ast); 317 } 318 319 if (!TBaseType.assigned(gcurrentsqlstatement)) { 320 this.syntaxErrors.add(new TSyntaxError(ast.getAstext(), ast.lineNo, (ast.columnNo < 0 ? 0 : ast.columnNo), 321 "Error when tokenize", EErrorType.spwarning, TBaseType.MSG_WARNING_ERROR_WHEN_TOKENIZE, null, ast.posinlist)); 322 323 ast.tokentype = ETokenType.tttokenlizererrortoken; 324 gst = EFindSqlStateType.sterror; 325 326 gcurrentsqlstatement = new TUnknownSqlStatement(vendor); 327 gcurrentsqlstatement.sqlstatementtype = ESqlStatementType.sstinvalid; 328 appendToken(gcurrentsqlstatement, ast); 329 } 330 break; 331 } 332 333 case stsql: { 334 if (ast.tokentype == ETokenType.ttsemicolon) { 335 gst = EFindSqlStateType.stnormal; 336 appendToken(gcurrentsqlstatement, ast); 337 gcurrentsqlstatement.semicolonended = ast; 338 onRawStatementComplete(this.parserContext, gcurrentsqlstatement, this.fparser, null, this.sqlstatements, false, builder); 339 continue; 340 } 341 342 if (ast.tokencode == TBaseType.cmtdoublehyphen) { 343 if (ast.toString().trim().endsWith(TBaseType.sqlflow_stmt_delimiter_str)) { 344 gst = EFindSqlStateType.stnormal; 345 onRawStatementComplete(this.parserContext, gcurrentsqlstatement, this.fparser, null, this.sqlstatements, false, builder); 346 continue; 347 } 348 } 349 350 appendToken(gcurrentsqlstatement, ast); 351 break; 352 } 353 354 default: 355 break; 356 } 357 } 358 359 // Last statement 360 if (TBaseType.assigned(gcurrentsqlstatement) && ((gst == EFindSqlStateType.stsql) || (gst == EFindSqlStateType.sterror))) { 361 onRawStatementComplete(this.parserContext, gcurrentsqlstatement, this.fparser, null, this.sqlstatements, true, builder); 362 } 363 364 builder.sqlStatements(this.sqlstatements); 365 builder.syntaxErrors(syntaxErrors instanceof ArrayList ? 366 (ArrayList<TSyntaxError>) syntaxErrors : new ArrayList<>(syntaxErrors)); 367 builder.errorCode(syntaxErrors.isEmpty() ? 0 : syntaxErrors.size()); 368 } 369 370 private void performRawStatementTokenTransformations(TSourceToken ast) { 371 // Doris-specific token transformations can be added here 372 // For now, handle common MySQL-compatible transformations 373 if (ast.tokencode == TBaseType.rrw_date) { 374 TSourceToken st1 = ast.nextSolidToken(); 375 if (st1 != null) { 376 if (st1.tokencode == '(') { 377 ast.tokencode = TBaseType.rrw_mysql_date_function; 378 } else if (st1.tokencode == TBaseType.sconst) { 379 ast.tokencode = TBaseType.rrw_mysql_date_const; 380 } 381 } 382 } else if (ast.tokencode == TBaseType.rrw_time) { 383 TSourceToken st1 = ast.nextSolidToken(); 384 if (st1 != null) { 385 if (st1.tokencode == TBaseType.sconst) { 386 ast.tokencode = TBaseType.rrw_mysql_time_const; 387 } 388 } 389 } else if (ast.tokencode == TBaseType.rrw_timestamp) { 390 TSourceToken st1 = ast.nextSolidToken(); 391 if (st1 != null) { 392 if (st1.tokencode == TBaseType.sconst) { 393 ast.tokencode = TBaseType.rrw_mysql_timestamp_constant; 394 } 395 } 396 } 397 } 398 399 private void appendToken(TCustomSqlStatement statement, TSourceToken token) { 400 if (statement == null || token == null) { 401 return; 402 } 403 token.stmt = statement; 404 statement.sourcetokenlist.add(token); 405 } 406 407 @Override 408 public String toString() { 409 return "DorisSqlParser{vendor=" + vendor + "}"; 410 } 411}