001package gudusoft.gsqlparser.parser.powerquery; 002 003import gudusoft.gsqlparser.ETokenType; 004import gudusoft.gsqlparser.TBaseType; 005import gudusoft.gsqlparser.TSourceToken; 006import gudusoft.gsqlparser.TSourceTokenList; 007 008import java.io.BufferedReader; 009import java.io.IOException; 010 011/** 012 * Hand-written Power Query M-Language tokenizer. 013 * 014 * <p>Reads raw M text and produces a {@link TSourceTokenList} compatible 015 * with the rest of the GSP machinery. Emits: 016 * 017 * <ul> 018 * <li>{@code IDENT} ({@link TBaseType#ident}) — simple identifiers</li> 019 * <li>{@code QIDENT} ({@link PowerQueryTokenCodes#QIDENT}) — {@code #"..."}</li> 020 * <li>{@code SCONST} ({@link TBaseType#sconst}) — string literals, raw form preserved in token text</li> 021 * <li>{@code ICONST} / {@code FCONST} — integer / decimal numbers</li> 022 * <li>M keywords ({@link PowerQueryTokenCodes#RW_LET} …)</li> 023 * <li>Single-char punct (ASCII code as token code)</li> 024 * <li>{@code FAT_ARROW} ({@code =>}), {@code DOUBLE_DOT} ({@code ..}), 025 * {@code ELLIPSIS} ({@code ...})</li> 026 * <li>{@code _COMMENT} ({@code //}) and block-comment tokens — treated as 027 * whitespace by downstream, but preserved so the token list can be 028 * inspected for bug reports</li> 029 * </ul> 030 * 031 * <p>The tokenizer is deliberately simple and forgiving: unknown characters 032 * become single-character tokens rather than errors. The document parser 033 * decides what to do with them. 034 */ 035public class PowerQueryTokenizer { 036 037 private final String text; 038 private int pos; 039 private long lineNo = 1; 040 private long colNo = 1; 041 private long offset = 0; 042 043 public PowerQueryTokenizer(String text) { 044 this.text = text != null ? text : ""; 045 } 046 047 /** Convenience constructor that slurps a {@link BufferedReader}. */ 048 public static PowerQueryTokenizer from(BufferedReader reader) throws IOException { 049 StringBuilder sb = new StringBuilder(); 050 char[] buf = new char[4096]; 051 int n; 052 while ((n = reader.read(buf)) != -1) { 053 sb.append(buf, 0, n); 054 } 055 return new PowerQueryTokenizer(sb.toString()); 056 } 057 058 /** Tokenize {@code text} and append every produced token to {@code out}. */ 059 public void tokenizeInto(TSourceTokenList out) { 060 while (pos < text.length()) { 061 int startLine = (int) lineNo; 062 int startCol = (int) colNo; 063 long startOffset = offset; 064 char c = text.charAt(pos); 065 066 if (c == ' ' || c == '\t' || c == '\r') { 067 advance(); 068 continue; 069 } 070 if (c == '\n') { 071 advance(); 072 continue; 073 } 074 075 // Line comment 076 if (c == '/' && peek(1) == '/') { 077 StringBuilder raw = new StringBuilder(); 078 while (pos < text.length() && text.charAt(pos) != '\n') { 079 raw.append(text.charAt(pos)); 080 advance(); 081 } 082 appendToken(out, raw.toString(), TBaseType.cmtdoublehyphen, 083 ETokenType.ttsimplecomment, startLine, startCol, startOffset); 084 continue; 085 } 086 087 // Block comment /* ... */ 088 if (c == '/' && peek(1) == '*') { 089 StringBuilder raw = new StringBuilder(); 090 raw.append("/*"); 091 advance(); advance(); 092 int depth = 1; 093 while (pos < text.length() && depth > 0) { 094 char d = text.charAt(pos); 095 if (d == '*' && peek(1) == '/') { 096 raw.append("*/"); 097 advance(); advance(); 098 depth--; 099 } else if (d == '/' && peek(1) == '*') { 100 raw.append("/*"); 101 advance(); advance(); 102 depth++; 103 } else { 104 raw.append(d); 105 advance(); 106 } 107 } 108 appendToken(out, raw.toString(), TBaseType.cmtslashstar, 109 ETokenType.ttbracketedcomment, startLine, startCol, startOffset); 110 continue; 111 } 112 113 // Quoted identifier #"..." 114 if (c == '#' && peek(1) == '"') { 115 StringBuilder raw = new StringBuilder(); 116 raw.append('#').append('"'); 117 advance(); advance(); 118 while (pos < text.length()) { 119 char d = text.charAt(pos); 120 if (d == '"' && peek(1) == '"') { 121 raw.append("\"\""); 122 advance(); advance(); 123 } else if (d == '"') { 124 raw.append('"'); 125 advance(); 126 break; 127 } else { 128 raw.append(d); 129 advance(); 130 } 131 } 132 appendToken(out, raw.toString(), PowerQueryTokenCodes.QIDENT, 133 ETokenType.ttdqstring, startLine, startCol, startOffset); 134 continue; 135 } 136 137 // String literal "..." 138 if (c == '"') { 139 StringBuilder raw = new StringBuilder(); 140 raw.append('"'); 141 advance(); 142 while (pos < text.length()) { 143 char d = text.charAt(pos); 144 if (d == '"' && peek(1) == '"') { 145 raw.append("\"\""); 146 advance(); advance(); 147 } else if (d == '"') { 148 raw.append('"'); 149 advance(); 150 break; 151 } else if (d == '#' && peek(1) == '(') { 152 int close = text.indexOf(')', pos + 2); 153 if (close < 0) { 154 raw.append(d); 155 advance(); 156 } else { 157 while (pos <= close) { 158 raw.append(text.charAt(pos)); 159 advance(); 160 } 161 } 162 } else { 163 raw.append(d); 164 advance(); 165 } 166 } 167 appendToken(out, raw.toString(), TBaseType.sconst, 168 ETokenType.ttsqstring, startLine, startCol, startOffset); 169 continue; 170 } 171 172 // Ellipsis ... or double-dot .. 173 if (c == '.' && peek(1) == '.' && peek(2) == '.') { 174 advance(); advance(); advance(); 175 appendToken(out, "...", PowerQueryTokenCodes.ELLIPSIS, 176 ETokenType.ttunknown, startLine, startCol, startOffset); 177 continue; 178 } 179 if (c == '.' && peek(1) == '.') { 180 advance(); advance(); 181 appendToken(out, "..", PowerQueryTokenCodes.DOUBLE_DOT, 182 ETokenType.ttunknown, startLine, startCol, startOffset); 183 continue; 184 } 185 186 // Fat arrow => 187 if (c == '=' && peek(1) == '>') { 188 advance(); advance(); 189 appendToken(out, "=>", PowerQueryTokenCodes.FAT_ARROW, 190 ETokenType.ttunknown, startLine, startCol, startOffset); 191 continue; 192 } 193 194 // Comparison operators <>, <=, >= 195 if (c == '<' && peek(1) == '>') { 196 advance(); advance(); 197 appendToken(out, "<>", TBaseType.not_equal, 198 ETokenType.ttunknown, startLine, startCol, startOffset); 199 continue; 200 } 201 if (c == '<' && peek(1) == '=') { 202 advance(); advance(); 203 appendToken(out, "<=", TBaseType.less_equal, 204 ETokenType.ttunknown, startLine, startCol, startOffset); 205 continue; 206 } 207 if (c == '>' && peek(1) == '=') { 208 advance(); advance(); 209 appendToken(out, ">=", TBaseType.great_equal, 210 ETokenType.ttunknown, startLine, startCol, startOffset); 211 continue; 212 } 213 214 // Number (integer or decimal, with optional exponent) 215 if (isDigit(c) || (c == '.' && isDigit(peek(1)))) { 216 StringBuilder raw = new StringBuilder(); 217 boolean seenDot = false; 218 boolean seenExp = false; 219 while (pos < text.length()) { 220 char d = text.charAt(pos); 221 if (isDigit(d)) { 222 raw.append(d); 223 advance(); 224 } else if (d == '.' && !seenDot && !seenExp && isDigit(peek(1))) { 225 raw.append(d); 226 advance(); 227 seenDot = true; 228 } else if ((d == 'e' || d == 'E') && !seenExp) { 229 raw.append(d); 230 advance(); 231 seenExp = true; 232 if (pos < text.length() 233 && (text.charAt(pos) == '+' || text.charAt(pos) == '-')) { 234 raw.append(text.charAt(pos)); 235 advance(); 236 } 237 } else { 238 break; 239 } 240 } 241 int code = (seenDot || seenExp) ? TBaseType.fconst : TBaseType.iconst; 242 appendToken(out, raw.toString(), code, 243 ETokenType.ttnumber, startLine, startCol, startOffset); 244 continue; 245 } 246 247 // Identifier or keyword 248 if (isIdentStart(c)) { 249 StringBuilder raw = new StringBuilder(); 250 while (pos < text.length() && isIdentPart(text.charAt(pos))) { 251 raw.append(text.charAt(pos)); 252 advance(); 253 } 254 String word = raw.toString(); 255 int kwCode = PowerQueryTokenCodes.keywordCode( 256 word.toUpperCase(java.util.Locale.ROOT)); 257 if (kwCode >= 0) { 258 appendToken(out, word, kwCode, 259 ETokenType.ttkeyword, startLine, startCol, startOffset); 260 } else { 261 appendToken(out, word, TBaseType.ident, 262 ETokenType.ttidentifier, startLine, startCol, startOffset); 263 } 264 continue; 265 } 266 267 // Single-character punct (use ASCII value as tokencode) 268 int code = c; 269 ETokenType kind = classifyPunct(c); 270 String raw = String.valueOf(c); 271 advance(); 272 appendToken(out, raw, code, kind, startLine, startCol, startOffset); 273 } 274 } 275 276 private void appendToken(TSourceTokenList out, String text, int code, 277 ETokenType kind, int line, int col, long startOffset) { 278 TSourceToken tok = new TSourceToken(); 279 tok.setAstext(text); 280 tok.tokencode = code; 281 tok.tokentype = kind; 282 tok.lineNo = line; 283 tok.columnNo = col; 284 tok.offset = startOffset; 285 tok.container = out; 286 tok.posinlist = out.size(); 287 out.add(tok); 288 } 289 290 private ETokenType classifyPunct(char c) { 291 switch (c) { 292 case ';': return ETokenType.ttsemicolon; 293 case ':': return ETokenType.ttcolon; 294 case ',': return ETokenType.ttcomma; 295 case '.': return ETokenType.ttperiod; 296 case '=': return ETokenType.ttequals; 297 case '?': return ETokenType.ttquestionmark; 298 case '+': return ETokenType.ttplussign; 299 case '-': return ETokenType.ttminussign; 300 case '*': return ETokenType.ttasterisk; 301 default: return ETokenType.ttunknown; 302 } 303 } 304 305 private void advance() { 306 if (pos >= text.length()) return; 307 char c = text.charAt(pos); 308 if (c == '\n') { 309 lineNo++; 310 colNo = 1; 311 } else { 312 colNo++; 313 } 314 pos++; 315 offset++; 316 } 317 318 private char peek(int delta) { 319 int idx = pos + delta; 320 if (idx < 0 || idx >= text.length()) return '\0'; 321 return text.charAt(idx); 322 } 323 324 private static boolean isDigit(char c) { 325 return c >= '0' && c <= '9'; 326 } 327 328 private static boolean isIdentStart(char c) { 329 return c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); 330 } 331 332 private static boolean isIdentPart(char c) { 333 // M identifiers are [A-Za-z_][A-Za-z0-9_]* — no dot; dot is a separator. 334 return isIdentStart(c) || isDigit(c); 335 } 336}