001package gudusoft.gsqlparser.pp2.zone; 002 003import gudusoft.gsqlparser.ETokenType; 004import gudusoft.gsqlparser.TSourceToken; 005import gudusoft.gsqlparser.pp2.token.Pp2Token; 006import gudusoft.gsqlparser.pp2.token.Pp2TokenStream; 007import gudusoft.gsqlparser.pp2.token.TokenRole; 008 009/** 010 * Detects protected zones in a {@link Pp2TokenStream} and annotates the 011 * affected tokens with the relevant {@link TokenRole} flags. 012 * 013 * <p>"Protected" here means: pp2 must emit the token's text verbatim. The 014 * layout rules in S24–S29 may re-space <i>around</i> a protected token 015 * but never alter its contents. 016 * 017 * <h2>What this detector recognizes</h2> 018 * 019 * <ul> 020 * <li><b>Line comments</b> ({@code ttsimplecomment}, {@code ttCPPComment}) 021 * → {@link TokenRole#COMMENT_LINE}.</li> 022 * <li><b>Block comments</b> ({@code ttbracketedcomment}) 023 * → {@link TokenRole#COMMENT_BLOCK}.</li> 024 * <li><b>Hint comments</b> (block comment whose text begins with 025 * {@code "/*+"}) → {@link TokenRole#HINT} in addition to 026 * {@code COMMENT_BLOCK}.</li> 027 * <li><b>String literals</b> ({@code ttsqstring}) 028 * → {@link TokenRole#LITERAL}.</li> 029 * <li><b>Quoted identifiers</b> ({@code ttdqstring}, 030 * {@code ttdbstring}, {@code ttbrstring}) 031 * → {@link TokenRole#QUOTED_IDENTIFIER}.</li> 032 * <li><b>No-format blocks</b>: every token whose source position lies 033 * between a {@code --BEGIN_NO_FORMAT} line comment and a matching 034 * {@code --END_NO_FORMAT} line comment is flagged with 035 * {@link TokenRole#NO_FORMAT_ZONE}. The markers themselves carry 036 * the flag too.</li> 037 * <li><b>Template placeholders</b>: tokens whose text matches the 038 * shapes {@code ${name}}, {@code {{name}}}, or {@code #{name}} 039 * are flagged {@link TokenRole#TEMPLATE_PLACEHOLDER} and 040 * {@link TokenRole#NO_FORMAT_ZONE} so layout never reflows 041 * inside them. {@code ${name}} arrives from the lexer as a 042 * single identifier-style token and is matched in the per-token 043 * pass; {@code {{name}}} and {@code #{name}} are tokenized as 044 * multiple tokens (e.g., {@code "{" "{" name "}" "}"}) and a 045 * second pass walks the stream looking for those sequences. The 046 * second-pass body check rejects shapes whose interior does not 047 * look like a placeholder body, so SQL fragments that happen to 048 * contain isolated braces (e.g., JSON path literals) are not 049 * misflagged.</li> 050 * </ul> 051 * 052 * <p>The detector does not consume or produce tokens; it only adds roles 053 * to existing {@link Pp2Token}s. Repeated invocations are idempotent (the 054 * underlying {@link java.util.EnumSet} is set-typed). 055 * 056 * <p>Plan reference: §7.3/S9, §7.4/S9. 057 */ 058public final class ProtectedZoneDetector { 059 060 /** Marker comment text that opens a no-format block. */ 061 public static final String NO_FORMAT_BEGIN = "--BEGIN_NO_FORMAT"; 062 /** Marker comment text that closes a no-format block. */ 063 public static final String NO_FORMAT_END = "--END_NO_FORMAT"; 064 065 /** 066 * Annotate the supplied stream in place. The stream's structural 067 * immutability is not violated — only the per-token role sets are 068 * touched. 069 * 070 * <p>Performed in two passes: 071 * <ol> 072 * <li>Per-token classification (comments, literals, quoted ids, 073 * single-token placeholders) and the no-format block state 074 * machine.</li> 075 * <li>Multi-token template placeholders {@code {{name}}} and 076 * {@code #{name}}, detected by scanning consecutive token text.</li> 077 * </ol> 078 * 079 * <p>Idempotent: repeated invocations don't double-flag because the 080 * underlying role set is set-typed. 081 * 082 * @throws NullPointerException if {@code stream} is null 083 */ 084 public void annotate(Pp2TokenStream stream) { 085 if (stream == null) throw new NullPointerException("stream"); 086 annotatePerToken(stream); 087 annotateMultiTokenPlaceholders(stream); 088 } 089 090 private void annotatePerToken(Pp2TokenStream stream) { 091 boolean inNoFormat = false; 092 for (int i = 0; i < stream.size(); i++) { 093 Pp2Token wrapped = stream.get(i); 094 TSourceToken t = wrapped.getSourceToken(); 095 ETokenType type = t.tokentype; 096 String text = wrapped.getText(); 097 098 // Per-token basic classification. 099 switch (type) { 100 case ttsimplecomment: 101 case ttCPPComment: 102 wrapped.addRole(TokenRole.COMMENT_LINE); 103 break; 104 case ttbracketedcomment: 105 wrapped.addRole(TokenRole.COMMENT_BLOCK); 106 if (isHintText(text)) { 107 wrapped.addRole(TokenRole.HINT); 108 } 109 break; 110 case ttsqstring: 111 wrapped.addRole(TokenRole.LITERAL); 112 break; 113 case ttdqstring: 114 case ttdbstring: 115 case ttbrstring: 116 wrapped.addRole(TokenRole.QUOTED_IDENTIFIER); 117 break; 118 default: 119 // not a protected token type 120 break; 121 } 122 123 // Single-token template placeholders (e.g., "${name}"). 124 if (isTemplatePlaceholder(text)) { 125 wrapped.addRole(TokenRole.TEMPLATE_PLACEHOLDER); 126 wrapped.addRole(TokenRole.NO_FORMAT_ZONE); 127 } 128 129 // No-format block state machine. A BEGIN marker turns the 130 // zone on; the BEGIN comment itself is part of the zone (so 131 // pp2 emits it verbatim and downstream tools can rely on it 132 // being preserved). An END marker turns the zone off after 133 // being marked itself. 134 // 135 // The state machine is non-nesting: a second BEGIN inside an 136 // open block re-opens (no-op). Documented limitation for v2. 137 if (inNoFormat) { 138 wrapped.addRole(TokenRole.NO_FORMAT_ZONE); 139 } 140 if (isNoFormatBeginMarker(type, text)) { 141 wrapped.addRole(TokenRole.NO_FORMAT_ZONE); 142 inNoFormat = true; 143 } else if (isNoFormatEndMarker(type, text)) { 144 wrapped.addRole(TokenRole.NO_FORMAT_ZONE); 145 inNoFormat = false; 146 } 147 } 148 } 149 150 /** 151 * Second pass: detect template placeholder shapes that span multiple 152 * tokens because the lexer split them into pieces. Recognized shapes: 153 * <ul> 154 * <li>{@code "{" "{" body... "}" "}"}: Mustache-style.</li> 155 * <li>{@code "#" "{" body... "}"}: MyBatis-style.</li> 156 * </ul> 157 * Only flags when the open and close markers are present and the 158 * intervening body looks like a placeholder body (idents, digits, 159 * underscores, dots) so SQL fragments that happen to use {@code {{} 160 * (e.g., array-literal syntax in JSON path expressions) are not 161 * misflagged. If no matching close marker is found within a small 162 * search window, the open tokens are left untouched. 163 */ 164 private void annotateMultiTokenPlaceholders(Pp2TokenStream stream) { 165 int n = stream.size(); 166 for (int i = 0; i < n; i++) { 167 String ti = stream.get(i).getText(); 168 // {{ ... }} 169 if ("{".equals(ti) && i + 1 < n && "{".equals(stream.get(i + 1).getText())) { 170 int end = findMustacheClose(stream, i + 2); 171 if (end >= 0) { 172 flagPlaceholderRange(stream, i, end); 173 i = end; 174 continue; 175 } 176 } 177 // # { ... } 178 if ("#".equals(ti) && i + 1 < n && "{".equals(stream.get(i + 1).getText())) { 179 int end = findBraceClose(stream, i + 2); 180 if (end >= 0) { 181 flagPlaceholderRange(stream, i, end); 182 i = end; 183 } 184 } 185 } 186 } 187 188 /** 189 * Find a matching {@code "}" "}"} pair after the {@code {{} opener. 190 * The body must look like a placeholder body. Returns the index of 191 * the second {@code }} or {@code -1} if not found. 192 */ 193 private static int findMustacheClose(Pp2TokenStream stream, int from) { 194 int n = stream.size(); 195 for (int j = from; j + 1 < n; j++) { 196 if ("}".equals(stream.get(j).getText()) 197 && "}".equals(stream.get(j + 1).getText())) { 198 // Verify the body in between looks like a placeholder body. 199 if (looksLikePlaceholderBodyTokens(stream, from, j)) { 200 return j + 1; 201 } 202 return -1; 203 } 204 } 205 return -1; 206 } 207 208 /** Find the matching {@code "}"} after the {@code #{} opener. */ 209 private static int findBraceClose(Pp2TokenStream stream, int from) { 210 int n = stream.size(); 211 for (int j = from; j < n; j++) { 212 if ("}".equals(stream.get(j).getText())) { 213 if (looksLikePlaceholderBodyTokens(stream, from, j)) { 214 return j; 215 } 216 return -1; 217 } 218 } 219 return -1; 220 } 221 222 /** 223 * Check the [from, to) token range: each token's text must look like 224 * placeholder-body content (idents, digits, dots, underscores, dashes). 225 * An empty range is rejected — placeholders must have a body. 226 */ 227 private static boolean looksLikePlaceholderBodyTokens(Pp2TokenStream stream, 228 int from, int to) { 229 if (to <= from) return false; 230 for (int j = from; j < to; j++) { 231 String text = stream.get(j).getText(); 232 if (text == null || text.isEmpty()) return false; 233 for (int k = 0; k < text.length(); k++) { 234 char c = text.charAt(k); 235 if (Character.isLetterOrDigit(c) || c == '_' || c == '.' || c == '-') { 236 continue; 237 } 238 return false; 239 } 240 } 241 return true; 242 } 243 244 private static void flagPlaceholderRange(Pp2TokenStream stream, int start, int end) { 245 for (int j = start; j <= end; j++) { 246 stream.get(j).addRole(TokenRole.TEMPLATE_PLACEHOLDER); 247 stream.get(j).addRole(TokenRole.NO_FORMAT_ZONE); 248 } 249 } 250 251 // --- single-token detectors (public static for testability) --------- 252 253 /** 254 * True if the given text is a hint-style block comment, i.e., starts 255 * with {@code "/*+"}. 256 */ 257 public static boolean isHintText(String text) { 258 return text != null && text.length() >= 3 259 && text.charAt(0) == '/' && text.charAt(1) == '*' 260 && text.charAt(2) == '+'; 261 } 262 263 /** 264 * True if the given text matches a single-token template-placeholder 265 * shape: {@code ${name}}, {@code {{name}}}, or {@code #{name}}. 266 */ 267 public static boolean isTemplatePlaceholder(String text) { 268 if (text == null || text.length() < 4) return false; 269 if (text.charAt(0) == '$' && text.charAt(1) == '{' 270 && text.charAt(text.length() - 1) == '}') { 271 return looksLikePlaceholderBody(text, 2, text.length() - 1); 272 } 273 if (text.charAt(0) == '#' && text.charAt(1) == '{' 274 && text.charAt(text.length() - 1) == '}') { 275 return looksLikePlaceholderBody(text, 2, text.length() - 1); 276 } 277 if (text.length() >= 6 278 && text.charAt(0) == '{' && text.charAt(1) == '{' 279 && text.charAt(text.length() - 2) == '}' 280 && text.charAt(text.length() - 1) == '}') { 281 return looksLikePlaceholderBody(text, 2, text.length() - 2); 282 } 283 return false; 284 } 285 286 private static boolean looksLikePlaceholderBody(String text, int from, int to) { 287 if (to <= from) return false; 288 for (int i = from; i < to; i++) { 289 char c = text.charAt(i); 290 if (Character.isLetterOrDigit(c) || c == '_' || c == '.' || c == '-') { 291 continue; 292 } 293 return false; 294 } 295 return true; 296 } 297 298 /** 299 * True if the given line-comment text opens a no-format block. The 300 * match is case-sensitive and ignores trailing whitespace; the comment 301 * may include trailing characters as long as the prefix matches the 302 * documented marker. 303 */ 304 public static boolean isNoFormatBeginMarker(ETokenType type, String text) { 305 if (text == null) return false; 306 if (type != ETokenType.ttsimplecomment && type != ETokenType.ttCPPComment) { 307 return false; 308 } 309 return text.startsWith(NO_FORMAT_BEGIN); 310 } 311 312 /** True if the given line-comment text closes a no-format block. */ 313 public static boolean isNoFormatEndMarker(ETokenType type, String text) { 314 if (text == null) return false; 315 if (type != ETokenType.ttsimplecomment && type != ETokenType.ttCPPComment) { 316 return false; 317 } 318 return text.startsWith(NO_FORMAT_END); 319 } 320}