001package gudusoft.gsqlparser.pp2.island; 002 003import gudusoft.gsqlparser.EDbVendor; 004import gudusoft.gsqlparser.pp.logger.PPLogger; 005 006import java.io.BufferedReader; 007import java.io.IOException; 008import java.io.InputStream; 009import java.io.InputStreamReader; 010import java.nio.charset.Charset; 011import java.util.ArrayList; 012import java.util.Collections; 013import java.util.Comparator; 014import java.util.HashSet; 015import java.util.LinkedHashMap; 016import java.util.List; 017import java.util.Locale; 018import java.util.Map; 019import java.util.Set; 020import java.util.concurrent.ConcurrentHashMap; 021import java.util.concurrent.atomic.AtomicInteger; 022 023/** 024 * Per-vendor table of multi-word SQL keyword phrases that the lexical island 025 * pipeline treats as a single keyword unit (e.g. {@code LEFT OUTER JOIN}, 026 * {@code UNION ALL}, {@code GROUP BY}). 027 * 028 * <h2>Resource format</h2> 029 * 030 * <p>The phrases are declared in {@code /pp2/keywords.properties} on the 031 * classpath. {@code org.json} is not available to {@code gsp_java_core}, so a 032 * line-based format is used instead of {@code keywords.json} (plan §5.5 and 033 * forbidden action #8 — no new third-party dependency). Each rule line is: 034 * 035 * <pre> 036 * <SCOPE>: <phrase> | <phrase> | ... 037 * </pre> 038 * 039 * where {@code SCOPE} is {@code ALL} or an {@link EDbVendor} enum constant name 040 * (e.g. {@code dbvoracle}). Blank lines and {@code #} comments are ignored. 041 * 042 * <h2>Caching</h2> 043 * 044 * <p>The resource is parsed <b>once per JVM</b> into an immutable base model 045 * (eager class-init). {@link #forVendor(EDbVendor)} returns an immutable, 046 * longest-phrase-first {@link Phrase} list, memoised per vendor via 047 * {@link ConcurrentHashMap#computeIfAbsent}. {@link #getLoadCount()} exposes how 048 * many times the resource file itself was parsed (always 1 in a healthy JVM); 049 * the S18 test asserts this. 050 * 051 * <p>Plan reference: §7.3/S18, §7.4/S18. 052 */ 053public final class MultiWordKeywordTable { 054 055 /** Classpath location of the line-based phrase resource. */ 056 static final String RESOURCE_PATH = "/pp2/keywords.properties"; 057 058 /** Scope token meaning "applies to every dialect". */ 059 private static final String SCOPE_ALL = "ALL"; 060 061 /** Valid vendor-scope keys: every {@link EDbVendor} constant name, lower-cased. */ 062 private static final Set<String> VALID_VENDOR_KEYS = buildValidVendorKeys(); 063 064 /** Counts how many times the resource file was parsed (should be 1/JVM). */ 065 private static final AtomicInteger LOAD_COUNT = new AtomicInteger(0); 066 067 /** Phrases that apply to every dialect (longest-first). */ 068 private static final List<Phrase> ALL_PHRASES; 069 070 /** Vendor-scope name (lower-cased enum constant) -> its extra phrases. */ 071 private static final Map<String, List<Phrase>> VENDOR_PHRASES; 072 073 /** Per-vendor memoised merged (ALL + vendor) lists, longest-first. */ 074 private static final ConcurrentHashMap<EDbVendor, List<Phrase>> PER_VENDOR_CACHE = 075 new ConcurrentHashMap<EDbVendor, List<Phrase>>(); 076 077 static { 078 ParsedTable parsed = loadFromResource(); 079 ALL_PHRASES = Collections.unmodifiableList(parsed.allPhrases); 080 VENDOR_PHRASES = Collections.unmodifiableMap(parsed.vendorPhrases); 081 } 082 083 private MultiWordKeywordTable() { 084 // static-only utility 085 } 086 087 /** 088 * Return the immutable, longest-phrase-first list of multi-word keyword 089 * phrases applicable to {@code vendor} (the universal {@code ALL} phrases 090 * plus any vendor-specific ones). Memoised per vendor. 091 * 092 * @throws NullPointerException if {@code vendor} is null 093 */ 094 public static List<Phrase> forVendor(EDbVendor vendor) { 095 if (vendor == null) throw new NullPointerException("vendor"); 096 List<Phrase> cached = PER_VENDOR_CACHE.get(vendor); 097 if (cached != null) return cached; 098 return PER_VENDOR_CACHE.computeIfAbsent(vendor, new java.util.function.Function<EDbVendor, List<Phrase>>() { 099 @Override 100 public List<Phrase> apply(EDbVendor v) { 101 List<Phrase> merged = new ArrayList<Phrase>(ALL_PHRASES); 102 List<Phrase> vendorSpecific = VENDOR_PHRASES.get(v.name().toLowerCase(Locale.ROOT)); 103 if (vendorSpecific != null) merged.addAll(vendorSpecific); 104 Collections.sort(merged, LONGEST_FIRST); 105 return Collections.unmodifiableList(merged); 106 } 107 }); 108 } 109 110 /** Number of times the resource file was parsed (1 in a healthy JVM). */ 111 public static int getLoadCount() { 112 return LOAD_COUNT.get(); 113 } 114 115 /** Sort phrases by descending word count so the merger matches longest-first. */ 116 private static final Comparator<Phrase> LONGEST_FIRST = new Comparator<Phrase>() { 117 @Override 118 public int compare(Phrase a, Phrase b) { 119 return b.getWords().size() - a.getWords().size(); 120 } 121 }; 122 123 // ---- resource parsing ---------------------------------------------- 124 125 private static Set<String> buildValidVendorKeys() { 126 Set<String> keys = new HashSet<String>(); 127 for (EDbVendor v : EDbVendor.values()) { 128 keys.add(v.name().toLowerCase(Locale.ROOT)); 129 } 130 return Collections.unmodifiableSet(keys); 131 } 132 133 private static ParsedTable loadFromResource() { 134 LOAD_COUNT.incrementAndGet(); 135 ParsedTable table = new ParsedTable(); 136 InputStream in = MultiWordKeywordTable.class.getResourceAsStream(RESOURCE_PATH); 137 if (in == null) { 138 PPLogger.info("MultiWordKeywordTable: resource " + RESOURCE_PATH 139 + " not found on classpath; multi-word keyword merging disabled."); 140 return table; 141 } 142 BufferedReader reader = new BufferedReader( 143 new InputStreamReader(in, Charset.forName("UTF-8"))); 144 try { 145 String line; 146 while ((line = reader.readLine()) != null) { 147 parseLine(line, table); 148 } 149 } catch (IOException e) { 150 PPLogger.error(e); 151 PPLogger.info("MultiWordKeywordTable: failed reading " + RESOURCE_PATH 152 + "; using whatever was parsed so far."); 153 } finally { 154 try { reader.close(); } catch (IOException ignored) { /* nothing to do */ } 155 } 156 return table; 157 } 158 159 private static void parseLine(String rawLine, ParsedTable table) { 160 String line = rawLine.trim(); 161 if (line.isEmpty() || line.charAt(0) == '#') return; 162 163 int colon = line.indexOf(':'); 164 if (colon < 0) { 165 PPLogger.info("MultiWordKeywordTable: ignoring malformed line (no scope ':'): " + rawLine); 166 return; 167 } 168 String scope = line.substring(0, colon).trim(); 169 String body = line.substring(colon + 1).trim(); 170 if (scope.isEmpty() || body.isEmpty()) return; 171 172 List<Phrase> target; 173 if (SCOPE_ALL.equalsIgnoreCase(scope)) { 174 target = table.allPhrases; 175 } else { 176 String key = scope.toLowerCase(Locale.ROOT); 177 if (!VALID_VENDOR_KEYS.contains(key)) { 178 // A typo'd vendor scope would silently create a dead phrase 179 // list (it could never be looked up by a real EDbVendor). For 180 // an allowlist whose correctness depends on exact vendor 181 // scoping, surface it instead of swallowing it. 182 PPLogger.info("MultiWordKeywordTable: ignoring rule with unknown vendor scope '" 183 + scope + "' (not an EDbVendor constant): " + rawLine); 184 return; 185 } 186 target = table.vendorPhrases.get(key); 187 if (target == null) { 188 target = new ArrayList<Phrase>(); 189 table.vendorPhrases.put(key, target); 190 } 191 } 192 193 for (String chunk : body.split("\\|")) { 194 Phrase phrase = Phrase.parse(chunk); 195 if (phrase != null) target.add(phrase); 196 } 197 } 198 199 /** Mutable accumulator used only during parsing. */ 200 private static final class ParsedTable { 201 final List<Phrase> allPhrases = new ArrayList<Phrase>(); 202 final Map<String, List<Phrase>> vendorPhrases = new LinkedHashMap<String, List<Phrase>>(); 203 } 204 205 /** 206 * An immutable multi-word keyword phrase: its lower-cased words (for 207 * matching) and an upper-cased canonical rendering (for reporting/merging). 208 */ 209 public static final class Phrase { 210 private final List<String> words; // lower-cased, for matching 211 private final String canonical; // upper-cased, space-joined 212 213 private Phrase(List<String> words, String canonical) { 214 this.words = Collections.unmodifiableList(words); 215 this.canonical = canonical; 216 } 217 218 /** Parse one " WORD WORD ..." chunk; returns null if it has no words. */ 219 static Phrase parse(String chunk) { 220 String trimmed = chunk.trim(); 221 if (trimmed.isEmpty()) return null; 222 String[] parts = trimmed.split("\\s+"); 223 List<String> lower = new ArrayList<String>(parts.length); 224 StringBuilder canon = new StringBuilder(); 225 for (String p : parts) { 226 if (p.isEmpty()) continue; 227 if (canon.length() > 0) canon.append(' '); 228 canon.append(p.toUpperCase(Locale.ROOT)); 229 lower.add(p.toLowerCase(Locale.ROOT)); 230 } 231 if (lower.isEmpty()) return null; 232 return new Phrase(lower, canon.toString()); 233 } 234 235 /** The phrase's lower-cased words, in order. */ 236 public List<String> getWords() { return words; } 237 238 /** Upper-cased, single-space-joined canonical form (e.g. "LEFT OUTER JOIN"). */ 239 public String getCanonical() { return canonical; } 240 241 @Override 242 public String toString() { return "Phrase[" + canonical + "]"; } 243 } 244}