001package gudusoft.gsqlparser.pp2.island; 002 003import gudusoft.gsqlparser.ETokenType; 004import gudusoft.gsqlparser.pp2.island.SqlScopeDetector.SqlScopeResult; 005import gudusoft.gsqlparser.pp2.token.Pp2Token; 006import gudusoft.gsqlparser.pp2.token.Pp2TokenStream; 007import gudusoft.gsqlparser.pp2.token.TokenRole; 008 009import java.util.ArrayList; 010import java.util.Collections; 011import java.util.HashMap; 012import java.util.HashSet; 013import java.util.List; 014import java.util.Locale; 015import java.util.Map; 016import java.util.Set; 017 018/** 019 * Annotates each token with the clause it belongs to ({@link ClausePart}) and 020 * marks master/clause keyword roles, by walking the token stream left-to-right 021 * and tracking the current clause per SQL scope level. 022 * 023 * <h2>SQL-scope aware</h2> 024 * 025 * <p>The annotator consumes {@link SqlScopeResult} (S20) so a subquery's 026 * clauses do not leak into the enclosing query: the "current clause" is tracked 027 * independently per scope level. When the walk descends into a deeper scope, 028 * that level's clause state starts fresh; when it returns, the enclosing level's 029 * clause (e.g. {@code FROM} around a derived-table subquery) is intact. 030 * 031 * <h2>Expression-depth gating</h2> 032 * 033 * <p>A clause keyword only changes the current clause when it appears at the 034 * query block's own paren depth — not inside a nested expression. This stops 035 * window specs ({@code OVER (ORDER BY a)}), aggregate filters 036 * ({@code FILTER (WHERE ...)}), and parenthesised predicates from corrupting 037 * the clause state. The function-form keywords {@code LEFT(} / {@code RIGHT(} 038 * are also recognised as function calls (not JOIN keywords) when immediately 039 * followed by an open paren. 040 * 041 * <h2>Designated annotator</h2> 042 * 043 * <p>S21 is one of the designated role-annotator stages (plan forbidden #6), so 044 * it sets {@link TokenRole#KEYWORD_MASTER} on master keywords 045 * (SELECT/INSERT/UPDATE/DELETE/MERGE) and {@link TokenRole#KEYWORD_CLAUSE} on 046 * recognised clause keywords. It never mutates the wrapped {@code TSourceToken}. 047 * The per-token clause membership is returned as a {@link ClauseScopeResult} 048 * side structure (a token belongs to exactly one clause, so a small enum is 049 * clearer than seven boolean roles). 050 * 051 * <p>Iterative single pass; no recursion. Plan reference: §7.3/S21, §7.4/S21. 052 */ 053public final class ClauseScopeAnnotator { 054 055 private static final Set<String> MASTER_KEYWORDS = setOf( 056 "SELECT", "INSERT", "UPDATE", "DELETE", "MERGE"); 057 058 /** Keyword text -> the clause it starts. */ 059 private static final Map<String, ClausePart> CLAUSE_KEYWORDS = buildClauseKeywords(); 060 061 private static Map<String, ClausePart> buildClauseKeywords() { 062 Map<String, ClausePart> m = new HashMap<String, ClausePart>(); 063 m.put("FROM", ClausePart.FROM); 064 m.put("WHERE", ClausePart.WHERE); 065 m.put("GROUP", ClausePart.GROUP_BY); 066 m.put("ORDER", ClausePart.ORDER_BY); 067 m.put("HAVING", ClausePart.HAVING); 068 m.put("INTO", ClausePart.INTO); 069 m.put("VALUES", ClausePart.INSERT_VALUES); 070 m.put("SET", ClausePart.UPDATE_SET); 071 // JOIN family — all map to the JOIN clause. 072 for (String j : new String[]{"JOIN", "INNER", "LEFT", "RIGHT", "FULL", 073 "CROSS", "NATURAL", "OUTER", "ON", "USING", "APPLY"}) { 074 m.put(j, ClausePart.JOIN); 075 } 076 // Set operators. 077 for (String s : new String[]{"UNION", "INTERSECT", "EXCEPT", "MINUS"}) { 078 m.put(s, ClausePart.SET_OP); 079 } 080 return Collections.unmodifiableMap(m); 081 } 082 083 /** 084 * Annotate {@code stream} clause membership using the SQL scope levels. 085 * 086 * @param stream the token stream; must not be null 087 * @param sqlScope the S20 scope result for the same stream; must not be null 088 * and must cover the same token count 089 * @return a {@link ClauseScopeResult}; never null 090 * @throws NullPointerException if any argument is null 091 * @throws IllegalArgumentException if {@code sqlScope.size()} != stream size 092 */ 093 public ClauseScopeResult annotate(Pp2TokenStream stream, SqlScopeResult sqlScope) { 094 if (stream == null) throw new NullPointerException("stream"); 095 if (sqlScope == null) throw new NullPointerException("sqlScope"); 096 int n = stream.size(); 097 if (sqlScope.size() != n) { 098 throw new IllegalArgumentException( 099 "sqlScope size " + sqlScope.size() + " != stream size " + n); 100 } 101 102 ClausePart[] part = new ClausePart[n]; 103 List<Integer> masterIndices = new ArrayList<Integer>(); 104 105 int levels = Math.max(1, sqlScope.getMaxLevel() + 1); 106 // Current clause per SQL scope level. Index by level. 107 ClausePart[] current = new ClausePart[levels]; 108 java.util.Arrays.fill(current, ClausePart.NONE); 109 // The paren depth at which each level's clauses live (the query block's 110 // base). Clause keywords only transition when paren depth == base[level]. 111 int[] base = new int[levels]; 112 113 int parenDepth = 0; 114 int prevLevel = 0; 115 for (int i = 0; i < n; i++) { 116 Pp2Token t = stream.get(i); 117 int level = sqlScope.levelAt(i); 118 119 // Descending into deeper subquery scope(s) resets each newly entered 120 // level's clause and records its base paren depth (just inside the 121 // query paren). Reset every level in (prevLevel, level], not just the 122 // target, in case S20 levels ever step by more than one. 123 if (level > prevLevel) { 124 for (int l = prevLevel + 1; l <= level; l++) { 125 current[l] = ClausePart.NONE; 126 base[l] = parenDepth; 127 } 128 } 129 prevLevel = level; 130 131 ETokenType type = t.getSourceToken().tokentype; 132 if (type == ETokenType.ttsemicolon) { 133 // Statement boundary: reset clause state so the next statement's 134 // header does not inherit the previous statement's clause. 135 java.util.Arrays.fill(current, ClausePart.NONE); 136 java.util.Arrays.fill(base, 0); 137 parenDepth = 0; 138 part[i] = ClausePart.NONE; 139 } else if (type == ETokenType.ttleftparenthesis) { 140 part[i] = current[level]; 141 parenDepth++; 142 } else if (type == ETokenType.ttrightparenthesis) { 143 if (parenDepth > 0) parenDepth--; 144 part[i] = current[level]; 145 } else if (type == ETokenType.ttkeyword) { 146 String text = t.getText(); 147 String upper = text == null ? "" : text.toUpperCase(Locale.ROOT); 148 boolean atBase = parenDepth == base[level]; 149 if (MASTER_KEYWORDS.contains(upper)) { 150 t.addRole(TokenRole.KEYWORD_MASTER); 151 masterIndices.add(i); 152 if (atBase) { 153 // SELECT opens its projection list; other masters leave the 154 // clause NONE until a recognised sub-clause keyword appears. 155 current[level] = "SELECT".equals(upper) 156 ? ClausePart.SELECT_LIST : ClausePart.NONE; 157 } 158 part[i] = current[level]; 159 } else if (CLAUSE_KEYWORDS.containsKey(upper) 160 && !isFunctionForm(upper, stream, i) 161 && transitionAllowed(CLAUSE_KEYWORDS.get(upper), current[level], 162 atBase)) { 163 // A genuine clause transition. 164 t.addRole(TokenRole.KEYWORD_CLAUSE); 165 current[level] = CLAUSE_KEYWORDS.get(upper); 166 part[i] = current[level]; 167 } else { 168 // Generic keyword, OR a clause keyword inside an expression 169 // (window/filter/func/parenthesised predicate): stays in the 170 // current clause without transitioning. 171 part[i] = current[level]; 172 } 173 } else { 174 part[i] = current[level]; 175 } 176 } 177 178 return new ClauseScopeResult(part, Collections.unmodifiableList(masterIndices)); 179 } 180 181 /** 182 * Whether a clause keyword may transition the current clause given the 183 * paren context. Transitions normally require being at the query block's 184 * base paren depth ({@code atBase}). The exception is JOIN-family keywords 185 * inside a table-expression group such as {@code FROM (t1 LEFT JOIN t2 ON ...)}: 186 * those legitimately appear above base, but only when the current clause is 187 * already FROM or JOIN (the table region). Window/filter/expression keywords 188 * are not JOIN-family, so they remain gated by base depth. 189 */ 190 private static boolean transitionAllowed(ClausePart mapped, ClausePart current, 191 boolean atBase) { 192 if (atBase) return true; 193 return mapped == ClausePart.JOIN 194 && (current == ClausePart.FROM || current == ClausePart.JOIN); 195 } 196 197 /** Keywords that double as function names; treated as functions when followed by '('. */ 198 private static final Set<String> FUNCTION_FORM_KEYWORDS = setOf("LEFT", "RIGHT"); 199 200 /** 201 * True if {@code upper} is a keyword that doubles as a function name (LEFT, 202 * RIGHT) and is immediately followed (skipping comments) by an open paren — 203 * i.e. it is a function call, not a JOIN keyword. Other clause keywords that 204 * legitimately precede a paren (FROM/JOIN/WHERE/ON/APPLY before a subquery or 205 * parenthesised predicate) are not treated as function forms. 206 */ 207 private static boolean isFunctionForm(String upper, Pp2TokenStream stream, int i) { 208 if (!FUNCTION_FORM_KEYWORDS.contains(upper)) return false; 209 for (int j = i + 1; j < stream.size(); j++) { 210 ETokenType type = stream.get(j).getSourceToken().tokentype; 211 if (type == ETokenType.ttsimplecomment 212 || type == ETokenType.ttbracketedcomment 213 || type == ETokenType.ttCPPComment) { 214 continue; 215 } 216 return type == ETokenType.ttleftparenthesis; 217 } 218 return false; 219 } 220 221 private static Set<String> setOf(String... values) { 222 Set<String> s = new HashSet<String>(); 223 Collections.addAll(s, values); 224 return Collections.unmodifiableSet(s); 225 } 226 227 /** Per-token clause membership plus the master-keyword token indices. */ 228 public static final class ClauseScopeResult { 229 private final ClausePart[] part; 230 private final List<Integer> masterIndices; 231 232 ClauseScopeResult(ClausePart[] part, List<Integer> masterIndices) { 233 this.part = part; 234 this.masterIndices = masterIndices; 235 } 236 237 /** Number of tokens covered. */ 238 public int size() { return part.length; } 239 240 /** The clause the token at {@code index} belongs to. */ 241 public ClausePart partAt(int index) { return part[index]; } 242 243 /** Indices of master keyword tokens (SELECT/INSERT/UPDATE/DELETE/MERGE), in order. */ 244 public List<Integer> getMasterIndices() { return masterIndices; } 245 } 246}