Source code

001package gudusoft.gsqlparser.pp2.zone;
002
003import gudusoft.gsqlparser.ETokenType;
004import gudusoft.gsqlparser.TSourceToken;
005import gudusoft.gsqlparser.pp2.token.Pp2Token;
006import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;
007import gudusoft.gsqlparser.pp2.token.TokenRole;
008
009/**
010 * Detects protected zones in a {@link Pp2TokenStream} and annotates the
011 * affected tokens with the relevant {@link TokenRole} flags.
012 *
013 * <p>"Protected" here means: pp2 must emit the token's text verbatim. The
014 * layout rules in S24–S29 may re-space <i>around</i> a protected token
015 * but never alter its contents.
016 *
017 * <h2>What this detector recognizes</h2>
018 *
019 * <ul>
020 *   <li><b>Line comments</b> ({@code ttsimplecomment}, {@code ttCPPComment})
021 *       → {@link TokenRole#COMMENT_LINE}.</li>
022 *   <li><b>Block comments</b> ({@code ttbracketedcomment})
023 *       → {@link TokenRole#COMMENT_BLOCK}.</li>
024 *   <li><b>Hint comments</b> (block comment whose text begins with
025 *       {@code "/*+"}) → {@link TokenRole#HINT} in addition to
026 *       {@code COMMENT_BLOCK}.</li>
027 *   <li><b>String literals</b> ({@code ttsqstring})
028 *       → {@link TokenRole#LITERAL}.</li>
029 *   <li><b>Quoted identifiers</b> ({@code ttdqstring},
030 *       {@code ttdbstring}, {@code ttbrstring})
031 *       → {@link TokenRole#QUOTED_IDENTIFIER}.</li>
032 *   <li><b>No-format blocks</b>: every token whose source position lies
033 *       between a {@code --BEGIN_NO_FORMAT} line comment and a matching
034 *       {@code --END_NO_FORMAT} line comment is flagged with
035 *       {@link TokenRole#NO_FORMAT_ZONE}. The markers themselves carry
036 *       the flag too.</li>
037 *   <li><b>Template placeholders</b>: tokens whose text matches the
038 *       shapes {@code ${name}}, {@code {{name}}}, or {@code #{name}}
039 *       are flagged {@link TokenRole#TEMPLATE_PLACEHOLDER} and
040 *       {@link TokenRole#NO_FORMAT_ZONE} so layout never reflows
041 *       inside them. {@code ${name}} arrives from the lexer as a
042 *       single identifier-style token and is matched in the per-token
043 *       pass; {@code {{name}}} and {@code #{name}} are tokenized as
044 *       multiple tokens (e.g., {@code "{" "{" name "}" "}"}) and a
045 *       second pass walks the stream looking for those sequences. The
046 *       second-pass body check rejects shapes whose interior does not
047 *       look like a placeholder body, so SQL fragments that happen to
048 *       contain isolated braces (e.g., JSON path literals) are not
049 *       misflagged.</li>
050 * </ul>
051 *
052 * <p>The detector does not consume or produce tokens; it only adds roles
053 * to existing {@link Pp2Token}s. Repeated invocations are idempotent (the
054 * underlying {@link java.util.EnumSet} is set-typed).
055 *
056 * <p>Plan reference: §7.3/S9, §7.4/S9.
057 */
058public final class ProtectedZoneDetector {
059
060    /** Marker comment text that opens a no-format block. */
061    public static final String NO_FORMAT_BEGIN = "--BEGIN_NO_FORMAT";
062    /** Marker comment text that closes a no-format block. */
063    public static final String NO_FORMAT_END = "--END_NO_FORMAT";
064
065    /**
066     * Annotate the supplied stream in place. The stream's structural
067     * immutability is not violated — only the per-token role sets are
068     * touched.
069     *
070     * <p>Performed in two passes:
071     * <ol>
072     *   <li>Per-token classification (comments, literals, quoted ids,
073     *       single-token placeholders) and the no-format block state
074     *       machine.</li>
075     *   <li>Multi-token template placeholders {@code {{name}}} and
076     *       {@code #{name}}, detected by scanning consecutive token text.</li>
077     * </ol>
078     *
079     * <p>Idempotent: repeated invocations don't double-flag because the
080     * underlying role set is set-typed.
081     *
082     * @throws NullPointerException if {@code stream} is null
083     */
084    public void annotate(Pp2TokenStream stream) {
085        if (stream == null) throw new NullPointerException("stream");
086        annotatePerToken(stream);
087        annotateMultiTokenPlaceholders(stream);
088    }
089
090    private void annotatePerToken(Pp2TokenStream stream) {
091        boolean inNoFormat = false;
092        for (int i = 0; i < stream.size(); i++) {
093            Pp2Token wrapped = stream.get(i);
094            TSourceToken t = wrapped.getSourceToken();
095            ETokenType type = t.tokentype;
096            String text = wrapped.getText();
097
098            // Per-token basic classification.
099            switch (type) {
100                case ttsimplecomment:
101                case ttCPPComment:
102                    wrapped.addRole(TokenRole.COMMENT_LINE);
103                    break;
104                case ttbracketedcomment:
105                    wrapped.addRole(TokenRole.COMMENT_BLOCK);
106                    if (isHintText(text)) {
107                        wrapped.addRole(TokenRole.HINT);
108                    }
109                    break;
110                case ttsqstring:
111                    wrapped.addRole(TokenRole.LITERAL);
112                    break;
113                case ttdqstring:
114                case ttdbstring:
115                case ttbrstring:
116                    wrapped.addRole(TokenRole.QUOTED_IDENTIFIER);
117                    break;
118                default:
119                    // not a protected token type
120                    break;
121            }
122
123            // Single-token template placeholders (e.g., "${name}").
124            if (isTemplatePlaceholder(text)) {
125                wrapped.addRole(TokenRole.TEMPLATE_PLACEHOLDER);
126                wrapped.addRole(TokenRole.NO_FORMAT_ZONE);
127            }
128
129            // No-format block state machine. A BEGIN marker turns the
130            // zone on; the BEGIN comment itself is part of the zone (so
131            // pp2 emits it verbatim and downstream tools can rely on it
132            // being preserved). An END marker turns the zone off after
133            // being marked itself.
134            //
135            // The state machine is non-nesting: a second BEGIN inside an
136            // open block re-opens (no-op). Documented limitation for v2.
137            if (inNoFormat) {
138                wrapped.addRole(TokenRole.NO_FORMAT_ZONE);
139            }
140            if (isNoFormatBeginMarker(type, text)) {
141                wrapped.addRole(TokenRole.NO_FORMAT_ZONE);
142                inNoFormat = true;
143            } else if (isNoFormatEndMarker(type, text)) {
144                wrapped.addRole(TokenRole.NO_FORMAT_ZONE);
145                inNoFormat = false;
146            }
147        }
148    }
149
150    /**
151     * Second pass: detect template placeholder shapes that span multiple
152     * tokens because the lexer split them into pieces. Recognized shapes:
153     * <ul>
154     *   <li>{@code "{" "{" body... "}" "}"}: Mustache-style.</li>
155     *   <li>{@code "#" "{" body... "}"}: MyBatis-style.</li>
156     * </ul>
157     * Only flags when the open and close markers are present and the
158     * intervening body looks like a placeholder body (idents, digits,
159     * underscores, dots) so SQL fragments that happen to use {@code {{}
160     * (e.g., array-literal syntax in JSON path expressions) are not
161     * misflagged. If no matching close marker is found within a small
162     * search window, the open tokens are left untouched.
163     */
164    private void annotateMultiTokenPlaceholders(Pp2TokenStream stream) {
165        int n = stream.size();
166        for (int i = 0; i < n; i++) {
167            String ti = stream.get(i).getText();
168            // {{ ... }}
169            if ("{".equals(ti) && i + 1 < n && "{".equals(stream.get(i + 1).getText())) {
170                int end = findMustacheClose(stream, i + 2);
171                if (end >= 0) {
172                    flagPlaceholderRange(stream, i, end);
173                    i = end;
174                    continue;
175                }
176            }
177            // # { ... }
178            if ("#".equals(ti) && i + 1 < n && "{".equals(stream.get(i + 1).getText())) {
179                int end = findBraceClose(stream, i + 2);
180                if (end >= 0) {
181                    flagPlaceholderRange(stream, i, end);
182                    i = end;
183                }
184            }
185        }
186    }
187
188    /**
189     * Find a matching {@code "}" "}"} pair after the {@code {{} opener.
190     * The body must look like a placeholder body. Returns the index of
191     * the second {@code }} or {@code -1} if not found.
192     */
193    private static int findMustacheClose(Pp2TokenStream stream, int from) {
194        int n = stream.size();
195        for (int j = from; j + 1 < n; j++) {
196            if ("}".equals(stream.get(j).getText())
197                && "}".equals(stream.get(j + 1).getText())) {
198                // Verify the body in between looks like a placeholder body.
199                if (looksLikePlaceholderBodyTokens(stream, from, j)) {
200                    return j + 1;
201                }
202                return -1;
203            }
204        }
205        return -1;
206    }
207
208    /** Find the matching {@code "}"} after the {@code #{} opener. */
209    private static int findBraceClose(Pp2TokenStream stream, int from) {
210        int n = stream.size();
211        for (int j = from; j < n; j++) {
212            if ("}".equals(stream.get(j).getText())) {
213                if (looksLikePlaceholderBodyTokens(stream, from, j)) {
214                    return j;
215                }
216                return -1;
217            }
218        }
219        return -1;
220    }
221
222    /**
223     * Check the [from, to) token range: each token's text must look like
224     * placeholder-body content (idents, digits, dots, underscores, dashes).
225     * An empty range is rejected — placeholders must have a body.
226     */
227    private static boolean looksLikePlaceholderBodyTokens(Pp2TokenStream stream,
228                                                          int from, int to) {
229        if (to <= from) return false;
230        for (int j = from; j < to; j++) {
231            String text = stream.get(j).getText();
232            if (text == null || text.isEmpty()) return false;
233            for (int k = 0; k < text.length(); k++) {
234                char c = text.charAt(k);
235                if (Character.isLetterOrDigit(c) || c == '_' || c == '.' || c == '-') {
236                    continue;
237                }
238                return false;
239            }
240        }
241        return true;
242    }
243
244    private static void flagPlaceholderRange(Pp2TokenStream stream, int start, int end) {
245        for (int j = start; j <= end; j++) {
246            stream.get(j).addRole(TokenRole.TEMPLATE_PLACEHOLDER);
247            stream.get(j).addRole(TokenRole.NO_FORMAT_ZONE);
248        }
249    }
250
251    // --- single-token detectors (public static for testability) ---------
252
253    /**
254     * True if the given text is a hint-style block comment, i.e., starts
255     * with {@code "/*+"}.
256     */
257    public static boolean isHintText(String text) {
258        return text != null && text.length() >= 3
259            && text.charAt(0) == '/' && text.charAt(1) == '*'
260            && text.charAt(2) == '+';
261    }
262
263    /**
264     * True if the given text matches a single-token template-placeholder
265     * shape: {@code ${name}}, {@code {{name}}}, or {@code #{name}}.
266     */
267    public static boolean isTemplatePlaceholder(String text) {
268        if (text == null || text.length() < 4) return false;
269        if (text.charAt(0) == '$' && text.charAt(1) == '{'
270            && text.charAt(text.length() - 1) == '}') {
271            return looksLikePlaceholderBody(text, 2, text.length() - 1);
272        }
273        if (text.charAt(0) == '#' && text.charAt(1) == '{'
274            && text.charAt(text.length() - 1) == '}') {
275            return looksLikePlaceholderBody(text, 2, text.length() - 1);
276        }
277        if (text.length() >= 6
278            && text.charAt(0) == '{' && text.charAt(1) == '{'
279            && text.charAt(text.length() - 2) == '}'
280            && text.charAt(text.length() - 1) == '}') {
281            return looksLikePlaceholderBody(text, 2, text.length() - 2);
282        }
283        return false;
284    }
285
286    private static boolean looksLikePlaceholderBody(String text, int from, int to) {
287        if (to <= from) return false;
288        for (int i = from; i < to; i++) {
289            char c = text.charAt(i);
290            if (Character.isLetterOrDigit(c) || c == '_' || c == '.' || c == '-') {
291                continue;
292            }
293            return false;
294        }
295        return true;
296    }
297
298    /**
299     * True if the given line-comment text opens a no-format block. The
300     * match is case-sensitive and ignores trailing whitespace; the comment
301     * may include trailing characters as long as the prefix matches the
302     * documented marker.
303     */
304    public static boolean isNoFormatBeginMarker(ETokenType type, String text) {
305        if (text == null) return false;
306        if (type != ETokenType.ttsimplecomment && type != ETokenType.ttCPPComment) {
307            return false;
308        }
309        return text.startsWith(NO_FORMAT_BEGIN);
310    }
311
312    /** True if the given line-comment text closes a no-format block. */
313    public static boolean isNoFormatEndMarker(ETokenType type, String text) {
314        if (text == null) return false;
315        if (type != ETokenType.ttsimplecomment && type != ETokenType.ttCPPComment) {
316            return false;
317        }
318        return text.startsWith(NO_FORMAT_END);
319    }
320}