package gudusoft.gsqlparser.pp2.token;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.ETokenType;
import gudusoft.gsqlparser.TGSqlParser;
import gudusoft.gsqlparser.TSourceToken;
import gudusoft.gsqlparser.TSourceTokenList;
import gudusoft.gsqlparser.pp2.Pp2FormatOptions;

import java.util.ArrayList;
import java.util.List;

/**
 * Compares two SQL inputs for token-level equivalence modulo formatting.
 *
 * <p>The contract — verified by 12 hand-built cases in
 * {@code TokenEquivalenceTest} — is that two SQL strings are equivalent
 * when:
 *
 * <ul>
 *   <li>Their <i>solid + comment</i> token sequences (in order) are equal
 *       character-by-character.</li>
 *   <li>Whitespace differences (spaces, tabs, linebreaks, indent) are
 *       ignored.</li>
 *   <li>Comment position may shift between the two streams — the comments
 *       must appear in the same relative order, but their surrounding
 *       whitespace does not matter.</li>
 *   <li>Case differences on keywords and unquoted identifiers are tolerated
 *       when {@code caseInsensitive=true}. Quoted identifiers, string
 *       literals, and operator/punctuation tokens are always compared
 *       byte-exact because case changes there alter semantics.</li>
 * </ul>
 *
 * <p>This helper is the safety net for several downstream slices:
 * <ul>
 *   <li><b>S13</b> ({@code GuardedAstDelegate}) — verifies that
 *       {@code FormatterFactory.pp()}'s output preserves every solid
 *       input token.</li>
 *   <li><b>S32</b> golden tests — sanity-checks goldens against their
 *       inputs.</li>
 *   <li><b>S33</b> overlay annotator — exercises the equivalence helper
 *       on real corpus inputs.</li>
 *   <li><b>S34</b> content-preservation — the strongest property test on
 *       the 50-SQL corpus.</li>
 *   <li><b>S35</b> per-vendor smoke — confirms every vendor preserves
 *       content through the pp2 pipeline.</li>
 * </ul>
 *
 * <p>The helper deliberately operates on raw SQL strings (not
 * {@link Pp2TokenStream} instances) because the comparison is between an
 * arbitrary input and an arbitrary output — there is no single pp2 stream
 * shared between the two. The internal tokenization uses the GSP lexer
 * directly with the same overlap-skip discipline as
 * {@link Pp2TokenStreamBuilder}.
 *
 * <p>Plan reference: §7.3/S10, §7.4/S10, Q8.
 */
public final class TokenEquivalence {

    private TokenEquivalence() {
        // utility class
    }

    /**
     * Case-insensitive comparison using the default Oracle dialect. The
     * "case insensitive" mode tolerates case differences on keywords and
     * unquoted identifiers; literals and quoted identifiers are always
     * byte-exact.
     *
     * @param left  first SQL string; must not be null
     * @param right second SQL string; must not be null
     * @param opts  pp2 options, forwarded unchanged to the full-control
     *              overload; may be null
     * @return {@code true} iff the two strings have the same token
     *         sequence under the rules described in the class Javadoc
     * @throws NullPointerException if {@code left} or {@code right} is null
     */
    public static boolean equalsModuloFormatting(String left, String right,
                                                 Pp2FormatOptions opts) {
        return equalsModuloFormatting(left, right, opts,
                EDbVendor.dbvoracle, true);
    }

    /**
     * Full-control comparison.
     *
     * @param left            first SQL string; must not be null
     * @param right           second SQL string; must not be null
     * @param opts            pp2 options (currently unused, reserved for future
     *                        comment-tolerance settings); may be null
     * @param vendor          dialect to tokenize with; must not be null
     * @param caseInsensitive when {@code true}, keyword and unquoted
     *                        identifier comparisons are case-insensitive;
     *                        when {@code false}, all comparisons are
     *                        byte-exact on text
     * @return {@code true} iff the two strings have the same token
     *         sequence under the rules described in the class Javadoc
     * @throws NullPointerException if {@code left}, {@code right}, or
     *                              {@code vendor} is null
     */
    public static boolean equalsModuloFormatting(String left, String right,
                                                 Pp2FormatOptions opts,
                                                 EDbVendor vendor,
                                                 boolean caseInsensitive) {
        if (left == null) {
            throw new NullPointerException("left");
        }
        if (right == null) {
            throw new NullPointerException("right");
        }
        if (vendor == null) {
            throw new NullPointerException("vendor");
        }
        SplitTokens a = tokenizeSplit(left, vendor);
        SplitTokens b = tokenizeSplit(right, vendor);
        // Solid (non-comment) tokens must match in order.
        if (!compareSequences(a.solids, b.solids, caseInsensitive)) {
            return false;
        }
        // Comments must appear in the same relative order, and each
        // comment's text must be byte-exact. Position may shift relative
        // to the solid tokens (plan §16 Q8). Passing caseInsensitive
        // through is harmless: comment types are not listed in
        // typeAllowsCaseChange, so they are always compared with equals().
        return compareSequences(a.comments, b.comments, caseInsensitive);
    }

    /**
     * Compares two token lists pairwise.
     *
     * @return {@code true} iff both lists have the same size and every
     *         pair of tokens at the same index is equivalent
     */
    private static boolean compareSequences(List<TokenInfo> a, List<TokenInfo> b,
                                            boolean caseInsensitive) {
        if (a.size() != b.size()) {
            return false;
        }
        for (int i = 0; i < a.size(); i++) {
            if (!tokensEquivalent(a.get(i), b.get(i), caseInsensitive)) {
                return false;
            }
        }
        return true;
    }

    /** Lightweight value carrier for comparable tokens. */
    private static final class TokenInfo {
        final ETokenType type;
        final String text;

        TokenInfo(ETokenType type, String text) {
            this.type = type;
            this.text = text;
        }
    }

    /** Tokenization result split into solid tokens and comments. */
    private static final class SplitTokens {
        final List<TokenInfo> solids;
        final List<TokenInfo> comments;

        SplitTokens(List<TokenInfo> solids, List<TokenInfo> comments) {
            this.solids = solids;
            this.comments = comments;
        }
    }

    /**
     * Tokenizes {@code sql} with the lexer for {@code vendor} and splits the
     * surviving tokens into solid tokens and comments, each list in source
     * order. Null tokens, tokens with empty text, fully shadowed tokens and
     * foldable (whitespace) tokens are dropped.
     */
    private static SplitTokens tokenizeSplit(String sql, EDbVendor vendor) {
        TGSqlParser parser = new TGSqlParser(vendor);
        parser.sqltext = sql;
        parser.tokenizeSqltext();
        TSourceTokenList list = parser.getSourcetokenlist();
        List<TokenInfo> solids = new ArrayList<TokenInfo>(list.size());
        List<TokenInfo> comments = new ArrayList<TokenInfo>();
        // End offset of the last token that was kept; fed back into the
        // shadow check for the following tokens.
        long lastEmittedEnd = -1L;
        for (int i = 0; i < list.size(); i++) {
            TSourceToken t = list.get(i);
            if (t == null) {
                continue;
            }
            String text = t.toString();
            if (text == null || text.isEmpty()) {
                continue;
            }
            // Skip phantom shadowed tokens (GSP "${name}" overlap quirk —
            // see TokenCoverage Javadoc and slice S9 resume doc).
            if (TokenCoverage.isFullyShadowed(t, lastEmittedEnd)) {
                continue;
            }
            ETokenType type = t.tokentype;
            // Skip whitespace entirely.
            if (Pp2TokenStreamBuilder.isFoldable(type)) {
                continue;
            }
            TokenInfo info = new TokenInfo(type, text);
            if (isCommentType(type)) {
                comments.add(info);
            } else {
                solids.add(info);
            }
            lastEmittedEnd = TokenCoverage.endOffset(t);
        }
        return new SplitTokens(solids, comments);
    }

    /** Returns {@code true} for the three comment token types; {@code false} for null. */
    private static boolean isCommentType(ETokenType type) {
        if (type == null) {
            return false;
        }
        switch (type) {
            case ttsimplecomment:
            case ttbracketedcomment:
            case ttCPPComment:
                return true;
            default:
                return false;
        }
    }

    /**
     * Decides whether two tokens are equivalent. Tokens of different types
     * never match. Text is compared ignoring case only when
     * {@code caseInsensitive} is set, the type allows a case change, and
     * neither text looks like a delimited identifier; otherwise the
     * comparison is exact.
     */
    private static boolean tokensEquivalent(TokenInfo a, TokenInfo b,
                                            boolean caseInsensitive) {
        if (a.type != b.type) {
            return false;
        }
        // The delimiter guard enforces the documented "quoted identifiers
        // are byte-exact" rule even if the lexer reports a delimited
        // identifier under a case-tolerant type.
        // NOTE(review): which vendors, if any, do that is not visible
        // here — confirm against the GSP lexers.
        if (caseInsensitive
                && typeAllowsCaseChange(a.type)
                && !startsWithIdentifierDelimiter(a.text)
                && !startsWithIdentifierDelimiter(b.text)) {
            return a.text.equalsIgnoreCase(b.text);
        }
        return a.text.equals(b.text);
    }

    /**
     * Returns {@code true} when {@code text} begins with a character that
     * opens a delimited identifier: double quote, backtick, or opening
     * square bracket. Unquoted identifiers and keywords cannot start with
     * any of these.
     */
    private static boolean startsWithIdentifierDelimiter(String text) {
        if (text == null || text.isEmpty()) {
            return false;
        }
        char first = text.charAt(0);
        return first == '"' || first == '`' || first == '[';
    }

    /**
     * Token types whose case may legitimately change between input and
     * output: keywords and unquoted identifiers. Everything else
     * (literals, quoted identifiers, comments, punctuation, operators)
     * is compared byte-exact even in case-insensitive mode.
     */
    private static boolean typeAllowsCaseChange(ETokenType type) {
        if (type == null) {
            return false;
        }
        switch (type) {
            case ttkeyword:
            case ttnonreservedkeyword:
            case ttidentifier:
                return true;
            default:
                return false;
        }
    }
}