Source code

001package gudusoft.gsqlparser.pp2.token;
002
003import gudusoft.gsqlparser.EDbVendor;
004import gudusoft.gsqlparser.ETokenType;
005import gudusoft.gsqlparser.TGSqlParser;
006import gudusoft.gsqlparser.TSourceToken;
007import gudusoft.gsqlparser.TSourceTokenList;
008import gudusoft.gsqlparser.pp2.Pp2FormatOptions;
009
010import java.util.ArrayList;
011import java.util.List;
012
013/**
014 * Compares two SQL inputs for token-level equivalence modulo formatting.
015 *
016 * <p>The contract — verified by 12 hand-built cases in
017 * {@code TokenEquivalenceTest} — is that two SQL strings are equivalent
018 * when:
019 *
020 * <ul>
021 *   <li>Their <i>solid + comment</i> token sequences (in order) are equal
022 *       character-by-character.</li>
023 *   <li>Whitespace differences (spaces, tabs, linebreaks, indent) are
024 *       ignored.</li>
025 *   <li>Comment position may shift between the two streams — the comments
026 *       must appear in the same relative order, but their surrounding
027 *       whitespace does not matter.</li>
028 *   <li>Case differences on keywords and unquoted identifiers are tolerated
029 *       when {@code caseInsensitive=true}. Quoted identifiers, string
030 *       literals, and operator/punctuation tokens are always compared
031 *       byte-exact because case changes there alter semantics.</li>
032 * </ul>
033 *
034 * <p>This helper is the safety net for several downstream slices:
035 * <ul>
036 *   <li><b>S13</b> ({@code GuardedAstDelegate}) — verifies that
037 *       {@code FormatterFactory.pp()}'s output preserves every solid
038 *       input token.</li>
039 *   <li><b>S32</b> golden tests — sanity-checks goldens against their
040 *       inputs.</li>
041 *   <li><b>S33</b> overlay annotator — exercises the equivalence helper
042 *       on real corpus inputs.</li>
043 *   <li><b>S34</b> content-preservation — the strongest property test on
044 *       the 50-SQL corpus.</li>
045 *   <li><b>S35</b> per-vendor smoke — confirms every vendor preserves
046 *       content through the pp2 pipeline.</li>
047 * </ul>
048 *
049 * <p>The helper deliberately operates on raw SQL strings (not
050 * {@link Pp2TokenStream} instances) because the comparison is between an
051 * arbitrary input and an arbitrary output — there is no single pp2 stream
052 * shared between the two. The internal tokenization uses the GSP lexer
053 * directly with the same overlap-skip discipline as
054 * {@link Pp2TokenStreamBuilder}.
055 *
056 * <p>Plan reference: §7.3/S10, §7.4/S10, Q8.
057 */
058public final class TokenEquivalence {
059
060    private TokenEquivalence() {
061        // utility class
062    }
063
064    /**
065     * Case-insensitive comparison using the default Oracle dialect. The
066     * "case insensitive" mode tolerates case differences on keywords and
067     * unquoted identifiers; literals and quoted identifiers are always
068     * byte-exact.
069     */
070    public static boolean equalsModuloFormatting(String left, String right,
071                                                 Pp2FormatOptions opts) {
072        return equalsModuloFormatting(left, right, opts,
073            EDbVendor.dbvoracle, true);
074    }
075
076    /**
077     * Full-control comparison.
078     *
079     * @param left  first SQL string; must not be null
080     * @param right second SQL string; must not be null
081     * @param opts  pp2 options (currently unused, reserved for future
082     *              comment-tolerance settings); may be null
083     * @param vendor dialect to tokenize with; must not be null
084     * @param caseInsensitive when {@code true}, keyword and unquoted
085     *                        identifier comparisons are case-insensitive;
086     *                        when {@code false}, all comparisons are
087     *                        byte-exact on text
088     * @return {@code true} iff the two strings have the same token
089     *         sequence under the rules described in the class Javadoc
090     * @throws NullPointerException if {@code left}, {@code right}, or
091     *         {@code vendor} is null
092     */
093    public static boolean equalsModuloFormatting(String left, String right,
094                                                 Pp2FormatOptions opts,
095                                                 EDbVendor vendor,
096                                                 boolean caseInsensitive) {
097        if (left == null) throw new NullPointerException("left");
098        if (right == null) throw new NullPointerException("right");
099        if (vendor == null) throw new NullPointerException("vendor");
100        SplitTokens a = tokenizeSplit(left, vendor);
101        SplitTokens b = tokenizeSplit(right, vendor);
102        // Solid (non-comment) tokens must match in order.
103        if (!compareSequences(a.solids, b.solids, caseInsensitive)) {
104            return false;
105        }
106        // Comments must appear in the same relative order, and each
107        // comment's text must be byte-exact. Position may shift relative
108        // to the solid tokens (plan §16 Q8).
109        return compareSequences(a.comments, b.comments, caseInsensitive);
110    }
111
112    private static boolean compareSequences(List<TokenInfo> a, List<TokenInfo> b,
113                                            boolean caseInsensitive) {
114        if (a.size() != b.size()) return false;
115        for (int i = 0; i < a.size(); i++) {
116            if (!tokensEquivalent(a.get(i), b.get(i), caseInsensitive)) {
117                return false;
118            }
119        }
120        return true;
121    }
122
123    /** Lightweight value carrier for comparable tokens. */
124    private static final class TokenInfo {
125        final ETokenType type;
126        final String text;
127        TokenInfo(ETokenType type, String text) {
128            this.type = type;
129            this.text = text;
130        }
131    }
132
133    /** Tokenization result split into solid tokens and comments. */
134    private static final class SplitTokens {
135        final List<TokenInfo> solids;
136        final List<TokenInfo> comments;
137        SplitTokens(List<TokenInfo> solids, List<TokenInfo> comments) {
138            this.solids = solids;
139            this.comments = comments;
140        }
141    }
142
143    private static SplitTokens tokenizeSplit(String sql, EDbVendor vendor) {
144        TGSqlParser parser = new TGSqlParser(vendor);
145        parser.sqltext = sql;
146        parser.tokenizeSqltext();
147        TSourceTokenList list = parser.getSourcetokenlist();
148        List<TokenInfo> solids = new ArrayList<TokenInfo>(list.size());
149        List<TokenInfo> comments = new ArrayList<TokenInfo>();
150        long lastEmittedEnd = -1L;
151        for (int i = 0; i < list.size(); i++) {
152            TSourceToken t = list.get(i);
153            if (t == null) continue;
154            String text = t.toString();
155            if (text == null || text.isEmpty()) continue;
156            // Skip phantom shadowed tokens (GSP "${name}" overlap quirk —
157            // see TokenCoverage Javadoc and slice S9 resume doc).
158            if (TokenCoverage.isFullyShadowed(t, lastEmittedEnd)) {
159                continue;
160            }
161            ETokenType type = t.tokentype;
162            // Skip whitespace entirely.
163            if (Pp2TokenStreamBuilder.isFoldable(type)) {
164                continue;
165            }
166            TokenInfo info = new TokenInfo(type, text);
167            if (isCommentType(type)) {
168                comments.add(info);
169            } else {
170                solids.add(info);
171            }
172            lastEmittedEnd = TokenCoverage.endOffset(t);
173        }
174        return new SplitTokens(solids, comments);
175    }
176
177    private static boolean isCommentType(ETokenType type) {
178        if (type == null) return false;
179        switch (type) {
180            case ttsimplecomment:
181            case ttbracketedcomment:
182            case ttCPPComment:
183                return true;
184            default:
185                return false;
186        }
187    }
188
189    private static boolean tokensEquivalent(TokenInfo a, TokenInfo b,
190                                            boolean caseInsensitive) {
191        if (a.type != b.type) {
192            return false;
193        }
194        if (caseInsensitive && typeAllowsCaseChange(a.type)) {
195            return a.text.equalsIgnoreCase(b.text);
196        }
197        return a.text.equals(b.text);
198    }
199
200    /**
201     * Token types whose case may legitimately change between input and
202     * output: keywords and unquoted identifiers. Everything else
203     * (literals, quoted identifiers, comments, punctuation, operators)
204     * is compared byte-exact even in case-insensitive mode.
205     */
206    private static boolean typeAllowsCaseChange(ETokenType type) {
207        if (type == null) return false;
208        switch (type) {
209            case ttkeyword:
210            case ttnonreservedkeyword:
211            case ttidentifier:
212                return true;
213            default:
214                return false;
215        }
216    }
217}