package gudusoft.gsqlparser.common.structured;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Recursive descent parser for vendor-neutral structured type strings.
 * Currently supports {@link StructuredSchemaDialect#SPARK_SQL_TYPE_STRING}:
 *
 * <pre>
 * type       := primitive | array | struct | map
 * array      := ARRAY '<' type '>'
 * struct     := STRUCT '<' field (',' field)* '>'
 * field      := identifier ':' type
 * map        := MAP '<' type ',' type '>'
 * primitive  := STRING | INT | BIGINT | DOUBLE | FLOAT | BOOLEAN | DATE
 *             | TIMESTAMP | (DECIMAL | NUMERIC) ('(' ... ')')? | identifier
 * identifier := backquoted_identifier | bare_identifier
 * </pre>
 *
 * <p>Inputs may include one layer of surrounding single or double quotes
 * (matching the SQL string literal); they are stripped before parsing, and
 * doubled quotes ({@code ''}) inside the literal are unescaped to a single
 * quote.
 *
 * <p>Type keywords are matched case-insensitively using locale-independent
 * case mapping, so the result does not depend on the JVM default locale.
 */
public final class StructuredSchemaParser {

    private StructuredSchemaParser() {}

    /**
     * Parses a structured type string in the given dialect.
     *
     * @param schemaText the type string, optionally wrapped in one layer of
     *                   single or double quotes; surrounding whitespace is trimmed
     * @param dialect    the dialect that defines the grammar to apply
     * @return the parsed type tree
     * @throws StructuredSchemaParseException if either argument is {@code null},
     *         the dialect is not supported, or the text does not match the grammar
     */
    public static StructuredType parse(String schemaText, StructuredSchemaDialect dialect) {
        if (schemaText == null) {
            throw new StructuredSchemaParseException("schema text must not be null");
        }
        if (dialect == null) {
            throw new StructuredSchemaParseException("dialect must not be null");
        }
        String stripped = stripQuotes(schemaText.trim());
        switch (dialect) {
            case SPARK_SQL_TYPE_STRING:
                return new Cursor(stripped).readTypeAndExpectEnd();
            default:
                throw new StructuredSchemaParseException("unsupported dialect: " + dialect);
        }
    }

    /**
     * Removes one layer of matching surrounding quotes ({@code '} or {@code "})
     * and collapses doubled occurrences of that quote character inside the
     * literal to a single one. Input without matching surrounding quotes is
     * returned unchanged.
     */
    private static String stripQuotes(String s) {
        if (s.length() >= 2) {
            char first = s.charAt(0);
            char last = s.charAt(s.length() - 1);
            if ((first == '\'' && last == '\'') || (first == '"' && last == '"')) {
                String inner = s.substring(1, s.length() - 1);
                String doubled = String.valueOf(first) + first;
                return inner.replace(doubled, String.valueOf(first));
            }
        }
        return s;
    }

    /** Mutable read position over the schema text, with one method per grammar rule. */
    private static final class Cursor {
        private final String text;
        private int pos;

        Cursor(String text) {
            this.text = text;
            this.pos = 0;
        }

        /**
         * Reads a single type and verifies that only whitespace follows it.
         *
         * @throws StructuredSchemaParseException if non-whitespace input remains
         */
        StructuredType readTypeAndExpectEnd() {
            StructuredType t = readType();
            skipWs();
            if (pos != text.length()) {
                throw new StructuredSchemaParseException(
                        "unexpected trailing input at position " + pos + ": " + remaining());
            }
            return t;
        }

        /**
         * Reads one type: an identifier that is either a composite keyword
         * (ARRAY, STRUCT, MAP) followed by its angle-bracketed arguments, or a
         * primitive type name. The name is upper-cased before it is matched
         * and before it is stored in the resulting primitive type.
         */
        StructuredType readType() {
            skipWs();
            if (pos >= text.length()) {
                throw new StructuredSchemaParseException("unexpected end of schema");
            }
            String ident = readIdentifier();
            // Locale.ROOT keeps keyword matching stable: with the default-locale
            // overload, a Turkish/Azerbaijani JVM maps 'i' to U+0130 and
            // "decimal" / "numeric" / "int" would no longer match.
            String upper = ident.toUpperCase(Locale.ROOT);
            switch (upper) {
                case "ARRAY":
                    return readArray();
                case "STRUCT":
                    return readStruct();
                case "MAP":
                    return readMap();
                case "DECIMAL":
                case "NUMERIC":
                    // Precision/scale arguments are consumed but not retained.
                    skipOptionalParenArgs();
                    return new StructuredPrimitiveType(upper);
                default:
                    return new StructuredPrimitiveType(upper);
            }
        }

        /** Reads {@code '<' type '>'} following an already-consumed ARRAY keyword. */
        StructuredArrayType readArray() {
            expect('<');
            StructuredType element = readType();
            skipWs();
            expect('>');
            return new StructuredArrayType(element);
        }

        /**
         * Reads {@code '<' (field (',' field)*)? '>'} following an
         * already-consumed STRUCT keyword. An empty field list is accepted.
         */
        StructuredStructType readStruct() {
            expect('<');
            List<StructuredStructField> fields = new ArrayList<>();
            skipWs();
            if (peek() != '>') {
                fields.add(readField());
                skipWs();
                while (peek() == ',') {
                    pos++; // consume the comma
                    fields.add(readField());
                    skipWs();
                }
            }
            expect('>');
            return new StructuredStructType(fields);
        }

        /** Reads one struct field of the form {@code identifier ':' type}. */
        StructuredStructField readField() {
            skipWs();
            String name = readIdentifier();
            skipWs();
            expect(':');
            StructuredType t = readType();
            return new StructuredStructField(name, t);
        }

        /** Reads {@code '<' type ',' type '>'} following an already-consumed MAP keyword. */
        StructuredMapType readMap() {
            expect('<');
            StructuredType k = readType();
            skipWs();
            expect(',');
            StructuredType v = readType();
            skipWs();
            expect('>');
            return new StructuredMapType(k, v);
        }

        /**
         * Reads a backquoted identifier (returned without the backticks) or a
         * bare identifier made of letters, digits and underscores.
         *
         * @throws StructuredSchemaParseException if the input is exhausted, a
         *         backquoted identifier is not closed, or no identifier
         *         character is found at the current position
         */
        String readIdentifier() {
            skipWs();
            if (pos >= text.length()) {
                throw new StructuredSchemaParseException("expected identifier at end of schema");
            }
            char c = text.charAt(pos);
            if (c == '`') {
                int start = ++pos;
                while (pos < text.length() && text.charAt(pos) != '`') {
                    pos++;
                }
                if (pos >= text.length()) {
                    throw new StructuredSchemaParseException("unterminated backquoted identifier");
                }
                String ident = text.substring(start, pos);
                pos++; // consume closing backtick
                return ident;
            }
            int start = pos;
            while (pos < text.length()) {
                char ch = text.charAt(pos);
                if (Character.isLetterOrDigit(ch) || ch == '_') {
                    pos++;
                } else {
                    break;
                }
            }
            if (start == pos) {
                throw new StructuredSchemaParseException(
                        "expected identifier at position " + pos + ": " + remaining());
            }
            return text.substring(start, pos);
        }

        /**
         * If the next non-whitespace character is {@code '('}, skips through
         * the matching {@code ')'}, tracking nesting depth; otherwise consumes
         * only the whitespace.
         *
         * @throws StructuredSchemaParseException if the parenthesis is never closed
         */
        void skipOptionalParenArgs() {
            skipWs();
            if (pos < text.length() && text.charAt(pos) == '(') {
                int depth = 0;
                while (pos < text.length()) {
                    char c = text.charAt(pos++);
                    if (c == '(') {
                        depth++;
                    } else if (c == ')') {
                        depth--;
                        if (depth == 0) {
                            return;
                        }
                    }
                }
                throw new StructuredSchemaParseException("unterminated parenthesized type args");
            }
        }

        /** Advances past any whitespace at the current position. */
        void skipWs() {
            while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
                pos++;
            }
        }

        /**
         * Skips whitespace and returns the next character without consuming
         * it, or {@code '\0'} when the input is exhausted.
         */
        char peek() {
            skipWs();
            return pos < text.length() ? text.charAt(pos) : '\0';
        }

        /**
         * Skips whitespace and consumes {@code ch}.
         *
         * @throws StructuredSchemaParseException if the next character is not {@code ch}
         */
        void expect(char ch) {
            skipWs();
            if (pos >= text.length() || text.charAt(pos) != ch) {
                throw new StructuredSchemaParseException(
                        "expected '" + ch + "' at position " + pos + ": " + remaining());
            }
            pos++;
        }

        /** Returns the unread tail of the input, or {@code "<end>"} when none is left. */
        String remaining() {
            return pos < text.length() ? text.substring(pos) : "<end>";
        }
    }
}