Source code

001package gudusoft.gsqlparser.parser.powerquery;
002
003import gudusoft.gsqlparser.ETokenType;
004import gudusoft.gsqlparser.TBaseType;
005import gudusoft.gsqlparser.TSourceToken;
006import gudusoft.gsqlparser.TSourceTokenList;
007
008import java.io.BufferedReader;
009import java.io.IOException;
010
011/**
012 * Hand-written Power Query M-Language tokenizer.
013 *
014 * <p>Reads raw M text and produces a {@link TSourceTokenList} compatible
015 * with the rest of the GSP machinery.  Emits:
016 *
017 * <ul>
018 *   <li>{@code IDENT} ({@link TBaseType#ident}) — simple identifiers</li>
019 *   <li>{@code QIDENT} ({@link PowerQueryTokenCodes#QIDENT}) — {@code #"..."}</li>
020 *   <li>{@code SCONST} ({@link TBaseType#sconst}) — string literals, raw form preserved in token text</li>
021 *   <li>{@code ICONST} / {@code FCONST} — integer / decimal numbers</li>
022 *   <li>M keywords ({@link PowerQueryTokenCodes#RW_LET} …)</li>
023 *   <li>Single-char punct (ASCII code as token code)</li>
024 *   <li>{@code FAT_ARROW} ({@code =>}), {@code DOUBLE_DOT} ({@code ..}),
025 *       {@code ELLIPSIS} ({@code ...})</li>
026 *   <li>{@code _COMMENT} ({@code //}) and block-comment tokens — treated as
027 *       whitespace by downstream, but preserved so the token list can be
028 *       inspected for bug reports</li>
029 * </ul>
030 *
031 * <p>The tokenizer is deliberately simple and forgiving: unknown characters
032 * become single-character tokens rather than errors.  The document parser
033 * decides what to do with them.
034 */
035public class PowerQueryTokenizer {
036
037    private final String text;
038    private int pos;
039    private long lineNo = 1;
040    private long colNo = 1;
041    private long offset = 0;
042
043    public PowerQueryTokenizer(String text) {
044        this.text = text != null ? text : "";
045    }
046
047    /** Convenience constructor that slurps a {@link BufferedReader}. */
048    public static PowerQueryTokenizer from(BufferedReader reader) throws IOException {
049        StringBuilder sb = new StringBuilder();
050        char[] buf = new char[4096];
051        int n;
052        while ((n = reader.read(buf)) != -1) {
053            sb.append(buf, 0, n);
054        }
055        return new PowerQueryTokenizer(sb.toString());
056    }
057
058    /** Tokenize {@code text} and append every produced token to {@code out}. */
059    public void tokenizeInto(TSourceTokenList out) {
060        while (pos < text.length()) {
061            int startLine = (int) lineNo;
062            int startCol  = (int) colNo;
063            long startOffset = offset;
064            char c = text.charAt(pos);
065
066            if (c == ' ' || c == '\t' || c == '\r') {
067                advance();
068                continue;
069            }
070            if (c == '\n') {
071                advance();
072                continue;
073            }
074
075            // Line comment
076            if (c == '/' && peek(1) == '/') {
077                StringBuilder raw = new StringBuilder();
078                while (pos < text.length() && text.charAt(pos) != '\n') {
079                    raw.append(text.charAt(pos));
080                    advance();
081                }
082                appendToken(out, raw.toString(), TBaseType.cmtdoublehyphen,
083                        ETokenType.ttsimplecomment, startLine, startCol, startOffset);
084                continue;
085            }
086
087            // Block comment /* ... */
088            if (c == '/' && peek(1) == '*') {
089                StringBuilder raw = new StringBuilder();
090                raw.append("/*");
091                advance(); advance();
092                int depth = 1;
093                while (pos < text.length() && depth > 0) {
094                    char d = text.charAt(pos);
095                    if (d == '*' && peek(1) == '/') {
096                        raw.append("*/");
097                        advance(); advance();
098                        depth--;
099                    } else if (d == '/' && peek(1) == '*') {
100                        raw.append("/*");
101                        advance(); advance();
102                        depth++;
103                    } else {
104                        raw.append(d);
105                        advance();
106                    }
107                }
108                appendToken(out, raw.toString(), TBaseType.cmtslashstar,
109                        ETokenType.ttbracketedcomment, startLine, startCol, startOffset);
110                continue;
111            }
112
113            // Quoted identifier  #"..."
114            if (c == '#' && peek(1) == '"') {
115                StringBuilder raw = new StringBuilder();
116                raw.append('#').append('"');
117                advance(); advance();
118                while (pos < text.length()) {
119                    char d = text.charAt(pos);
120                    if (d == '"' && peek(1) == '"') {
121                        raw.append("\"\"");
122                        advance(); advance();
123                    } else if (d == '"') {
124                        raw.append('"');
125                        advance();
126                        break;
127                    } else {
128                        raw.append(d);
129                        advance();
130                    }
131                }
132                appendToken(out, raw.toString(), PowerQueryTokenCodes.QIDENT,
133                        ETokenType.ttdqstring, startLine, startCol, startOffset);
134                continue;
135            }
136
137            // String literal  "..."
138            if (c == '"') {
139                StringBuilder raw = new StringBuilder();
140                raw.append('"');
141                advance();
142                while (pos < text.length()) {
143                    char d = text.charAt(pos);
144                    if (d == '"' && peek(1) == '"') {
145                        raw.append("\"\"");
146                        advance(); advance();
147                    } else if (d == '"') {
148                        raw.append('"');
149                        advance();
150                        break;
151                    } else if (d == '#' && peek(1) == '(') {
152                        int close = text.indexOf(')', pos + 2);
153                        if (close < 0) {
154                            raw.append(d);
155                            advance();
156                        } else {
157                            while (pos <= close) {
158                                raw.append(text.charAt(pos));
159                                advance();
160                            }
161                        }
162                    } else {
163                        raw.append(d);
164                        advance();
165                    }
166                }
167                appendToken(out, raw.toString(), TBaseType.sconst,
168                        ETokenType.ttsqstring, startLine, startCol, startOffset);
169                continue;
170            }
171
172            // Ellipsis ... or double-dot ..
173            if (c == '.' && peek(1) == '.' && peek(2) == '.') {
174                advance(); advance(); advance();
175                appendToken(out, "...", PowerQueryTokenCodes.ELLIPSIS,
176                        ETokenType.ttunknown, startLine, startCol, startOffset);
177                continue;
178            }
179            if (c == '.' && peek(1) == '.') {
180                advance(); advance();
181                appendToken(out, "..", PowerQueryTokenCodes.DOUBLE_DOT,
182                        ETokenType.ttunknown, startLine, startCol, startOffset);
183                continue;
184            }
185
186            // Fat arrow  =>
187            if (c == '=' && peek(1) == '>') {
188                advance(); advance();
189                appendToken(out, "=>", PowerQueryTokenCodes.FAT_ARROW,
190                        ETokenType.ttunknown, startLine, startCol, startOffset);
191                continue;
192            }
193
194            // Comparison operators <>, <=, >=
195            if (c == '<' && peek(1) == '>') {
196                advance(); advance();
197                appendToken(out, "<>", TBaseType.not_equal,
198                        ETokenType.ttunknown, startLine, startCol, startOffset);
199                continue;
200            }
201            if (c == '<' && peek(1) == '=') {
202                advance(); advance();
203                appendToken(out, "<=", TBaseType.less_equal,
204                        ETokenType.ttunknown, startLine, startCol, startOffset);
205                continue;
206            }
207            if (c == '>' && peek(1) == '=') {
208                advance(); advance();
209                appendToken(out, ">=", TBaseType.great_equal,
210                        ETokenType.ttunknown, startLine, startCol, startOffset);
211                continue;
212            }
213
214            // Number  (integer or decimal, with optional exponent)
215            if (isDigit(c) || (c == '.' && isDigit(peek(1)))) {
216                StringBuilder raw = new StringBuilder();
217                boolean seenDot = false;
218                boolean seenExp = false;
219                while (pos < text.length()) {
220                    char d = text.charAt(pos);
221                    if (isDigit(d)) {
222                        raw.append(d);
223                        advance();
224                    } else if (d == '.' && !seenDot && !seenExp && isDigit(peek(1))) {
225                        raw.append(d);
226                        advance();
227                        seenDot = true;
228                    } else if ((d == 'e' || d == 'E') && !seenExp) {
229                        raw.append(d);
230                        advance();
231                        seenExp = true;
232                        if (pos < text.length()
233                                && (text.charAt(pos) == '+' || text.charAt(pos) == '-')) {
234                            raw.append(text.charAt(pos));
235                            advance();
236                        }
237                    } else {
238                        break;
239                    }
240                }
241                int code = (seenDot || seenExp) ? TBaseType.fconst : TBaseType.iconst;
242                appendToken(out, raw.toString(), code,
243                        ETokenType.ttnumber, startLine, startCol, startOffset);
244                continue;
245            }
246
247            // Identifier or keyword
248            if (isIdentStart(c)) {
249                StringBuilder raw = new StringBuilder();
250                while (pos < text.length() && isIdentPart(text.charAt(pos))) {
251                    raw.append(text.charAt(pos));
252                    advance();
253                }
254                String word = raw.toString();
255                int kwCode = PowerQueryTokenCodes.keywordCode(
256                        word.toUpperCase(java.util.Locale.ROOT));
257                if (kwCode >= 0) {
258                    appendToken(out, word, kwCode,
259                            ETokenType.ttkeyword, startLine, startCol, startOffset);
260                } else {
261                    appendToken(out, word, TBaseType.ident,
262                            ETokenType.ttidentifier, startLine, startCol, startOffset);
263                }
264                continue;
265            }
266
267            // Single-character punct (use ASCII value as tokencode)
268            int code = c;
269            ETokenType kind = classifyPunct(c);
270            String raw = String.valueOf(c);
271            advance();
272            appendToken(out, raw, code, kind, startLine, startCol, startOffset);
273        }
274    }
275
276    private void appendToken(TSourceTokenList out, String text, int code,
277                             ETokenType kind, int line, int col, long startOffset) {
278        TSourceToken tok = new TSourceToken();
279        tok.setAstext(text);
280        tok.tokencode = code;
281        tok.tokentype = kind;
282        tok.lineNo = line;
283        tok.columnNo = col;
284        tok.offset = startOffset;
285        tok.container = out;
286        tok.posinlist = out.size();
287        out.add(tok);
288    }
289
290    private ETokenType classifyPunct(char c) {
291        switch (c) {
292            case ';': return ETokenType.ttsemicolon;
293            case ':': return ETokenType.ttcolon;
294            case ',': return ETokenType.ttcomma;
295            case '.': return ETokenType.ttperiod;
296            case '=': return ETokenType.ttequals;
297            case '?': return ETokenType.ttquestionmark;
298            case '+': return ETokenType.ttplussign;
299            case '-': return ETokenType.ttminussign;
300            case '*': return ETokenType.ttasterisk;
301            default:  return ETokenType.ttunknown;
302        }
303    }
304
305    private void advance() {
306        if (pos >= text.length()) return;
307        char c = text.charAt(pos);
308        if (c == '\n') {
309            lineNo++;
310            colNo = 1;
311        } else {
312            colNo++;
313        }
314        pos++;
315        offset++;
316    }
317
318    private char peek(int delta) {
319        int idx = pos + delta;
320        if (idx < 0 || idx >= text.length()) return '\0';
321        return text.charAt(idx);
322    }
323
324    private static boolean isDigit(char c) {
325        return c >= '0' && c <= '9';
326    }
327
328    private static boolean isIdentStart(char c) {
329        return c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
330    }
331
332    private static boolean isIdentPart(char c) {
333        // M identifiers are [A-Za-z_][A-Za-z0-9_]* — no dot; dot is a separator.
334        return isIdentStart(c) || isDigit(c);
335    }
336}