Source code

001package gudusoft.gsqlparser.common.structured;
002
003import java.util.ArrayList;
004import java.util.List;
005
006/**
007 * Recursive descent parser for vendor-neutral structured type strings.
008 * Currently supports {@link StructuredSchemaDialect#SPARK_SQL_TYPE_STRING}:
009 *
010 * <pre>
011 * type       := primitive | array | struct | map
012 * array      := ARRAY '&lt;' type '&gt;'
013 * struct     := STRUCT '&lt;' field (',' field)* '&gt;'
014 * field      := identifier ':' type
015 * map        := MAP '&lt;' type ',' type '&gt;'
016 * primitive  := STRING | INT | BIGINT | DOUBLE | FLOAT | BOOLEAN | DATE
017 *             | TIMESTAMP | DECIMAL ('(' ... ')')? | identifier
018 * identifier := backquoted_identifier | bare_identifier
019 * </pre>
020 *
021 * <p>Inputs may include one layer of surrounding single or double quotes
022 * (matching the SQL string literal); they are stripped before parsing, and
023 * doubled quotes ({@code ''}) inside the literal are unescaped to a single
024 * quote.
025 */
026public final class StructuredSchemaParser {
027
028    private StructuredSchemaParser() {}
029
030    public static StructuredType parse(String schemaText, StructuredSchemaDialect dialect) {
031        if (schemaText == null) {
032            throw new StructuredSchemaParseException("schema text must not be null");
033        }
034        if (dialect == null) {
035            throw new StructuredSchemaParseException("dialect must not be null");
036        }
037        String stripped = stripQuotes(schemaText.trim());
038        switch (dialect) {
039            case SPARK_SQL_TYPE_STRING:
040                return new Cursor(stripped).readTypeAndExpectEnd();
041            default:
042                throw new StructuredSchemaParseException("unsupported dialect: " + dialect);
043        }
044    }
045
046    private static String stripQuotes(String s) {
047        if (s.length() >= 2) {
048            char first = s.charAt(0);
049            char last = s.charAt(s.length() - 1);
050            if ((first == '\'' && last == '\'') || (first == '"' && last == '"')) {
051                String inner = s.substring(1, s.length() - 1);
052                String doubled = String.valueOf(first) + first;
053                return inner.replace(doubled, String.valueOf(first));
054            }
055        }
056        return s;
057    }
058
059    private static final class Cursor {
060        private final String text;
061        private int pos;
062
063        Cursor(String text) {
064            this.text = text;
065            this.pos = 0;
066        }
067
068        StructuredType readTypeAndExpectEnd() {
069            StructuredType t = readType();
070            skipWs();
071            if (pos != text.length()) {
072                throw new StructuredSchemaParseException(
073                        "unexpected trailing input at position " + pos + ": " + remaining());
074            }
075            return t;
076        }
077
078        StructuredType readType() {
079            skipWs();
080            if (pos >= text.length()) {
081                throw new StructuredSchemaParseException("unexpected end of schema");
082            }
083            String ident = readIdentifier();
084            String upper = ident.toUpperCase();
085            switch (upper) {
086                case "ARRAY":
087                    return readArray();
088                case "STRUCT":
089                    return readStruct();
090                case "MAP":
091                    return readMap();
092                case "DECIMAL":
093                case "NUMERIC":
094                    skipOptionalParenArgs();
095                    return new StructuredPrimitiveType(upper);
096                default:
097                    return new StructuredPrimitiveType(upper);
098            }
099        }
100
101        StructuredArrayType readArray() {
102            expect('<');
103            StructuredType element = readType();
104            skipWs();
105            expect('>');
106            return new StructuredArrayType(element);
107        }
108
109        StructuredStructType readStruct() {
110            expect('<');
111            List<StructuredStructField> fields = new ArrayList<>();
112            skipWs();
113            if (peek() != '>') {
114                fields.add(readField());
115                skipWs();
116                while (peek() == ',') {
117                    pos++;
118                    fields.add(readField());
119                    skipWs();
120                }
121            }
122            expect('>');
123            return new StructuredStructType(fields);
124        }
125
126        StructuredStructField readField() {
127            skipWs();
128            String name = readIdentifier();
129            skipWs();
130            expect(':');
131            StructuredType t = readType();
132            return new StructuredStructField(name, t);
133        }
134
135        StructuredMapType readMap() {
136            expect('<');
137            StructuredType k = readType();
138            skipWs();
139            expect(',');
140            StructuredType v = readType();
141            skipWs();
142            expect('>');
143            return new StructuredMapType(k, v);
144        }
145
146        String readIdentifier() {
147            skipWs();
148            if (pos >= text.length()) {
149                throw new StructuredSchemaParseException("expected identifier at end of schema");
150            }
151            char c = text.charAt(pos);
152            if (c == '`') {
153                int start = ++pos;
154                while (pos < text.length() && text.charAt(pos) != '`') {
155                    pos++;
156                }
157                if (pos >= text.length()) {
158                    throw new StructuredSchemaParseException("unterminated backquoted identifier");
159                }
160                String ident = text.substring(start, pos);
161                pos++; // consume closing backtick
162                return ident;
163            }
164            int start = pos;
165            while (pos < text.length()) {
166                char ch = text.charAt(pos);
167                if (Character.isLetterOrDigit(ch) || ch == '_') {
168                    pos++;
169                } else {
170                    break;
171                }
172            }
173            if (start == pos) {
174                throw new StructuredSchemaParseException(
175                        "expected identifier at position " + pos + ": " + remaining());
176            }
177            return text.substring(start, pos);
178        }
179
180        void skipOptionalParenArgs() {
181            skipWs();
182            if (pos < text.length() && text.charAt(pos) == '(') {
183                int depth = 0;
184                while (pos < text.length()) {
185                    char c = text.charAt(pos++);
186                    if (c == '(') depth++;
187                    else if (c == ')') {
188                        depth--;
189                        if (depth == 0) return;
190                    }
191                }
192                throw new StructuredSchemaParseException("unterminated parenthesized type args");
193            }
194        }
195
196        void skipWs() {
197            while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
198                pos++;
199            }
200        }
201
202        char peek() {
203            skipWs();
204            return pos < text.length() ? text.charAt(pos) : '\0';
205        }
206
207        void expect(char ch) {
208            skipWs();
209            if (pos >= text.length() || text.charAt(pos) != ch) {
210                throw new StructuredSchemaParseException(
211                        "expected '" + ch + "' at position " + pos + ": " + remaining());
212            }
213            pos++;
214        }
215
216        String remaining() {
217            return pos < text.length() ? text.substring(pos) : "<end>";
218        }
219    }
220}