001package gudusoft.gsqlparser.pp2.island;
002
003import gudusoft.gsqlparser.EDbVendor;
004import gudusoft.gsqlparser.pp.logger.PPLogger;
005
006import java.io.BufferedReader;
007import java.io.IOException;
008import java.io.InputStream;
009import java.io.InputStreamReader;
010import java.nio.charset.Charset;
011import java.util.ArrayList;
012import java.util.Collections;
013import java.util.Comparator;
014import java.util.HashSet;
015import java.util.LinkedHashMap;
016import java.util.List;
017import java.util.Locale;
018import java.util.Map;
019import java.util.Set;
020import java.util.concurrent.ConcurrentHashMap;
021import java.util.concurrent.atomic.AtomicInteger;
022
023/**
024 * Per-vendor table of multi-word SQL keyword phrases that the lexical island
025 * pipeline treats as a single keyword unit (e.g. {@code LEFT OUTER JOIN},
026 * {@code UNION ALL}, {@code GROUP BY}).
027 *
028 * <h2>Resource format</h2>
029 *
030 * <p>The phrases are declared in {@code /pp2/keywords.properties} on the
031 * classpath. {@code org.json} is not available to {@code gsp_java_core}, so a
032 * line-based format is used instead of {@code keywords.json} (plan §5.5 and
033 * forbidden action #8 — no new third-party dependency). Each rule line is:
034 *
035 * <pre>
036 *   &lt;SCOPE&gt;: &lt;phrase&gt; | &lt;phrase&gt; | ...
037 * </pre>
038 *
039 * where {@code SCOPE} is {@code ALL} or an {@link EDbVendor} enum constant name
040 * (e.g. {@code dbvoracle}). Blank lines and {@code #} comments are ignored.
041 *
042 * <h2>Caching</h2>
043 *
044 * <p>The resource is parsed <b>once per JVM</b> into an immutable base model
045 * (eager class-init). {@link #forVendor(EDbVendor)} returns an immutable,
046 * longest-phrase-first {@link Phrase} list, memoised per vendor via
047 * {@link ConcurrentHashMap#computeIfAbsent}. {@link #getLoadCount()} exposes how
048 * many times the resource file itself was parsed (always 1 in a healthy JVM);
049 * the S18 test asserts this.
050 *
051 * <p>Plan reference: §7.3/S18, §7.4/S18.
052 */
053public final class MultiWordKeywordTable {
054
055    /** Classpath location of the line-based phrase resource. */
056    static final String RESOURCE_PATH = "/pp2/keywords.properties";
057
058    /** Scope token meaning "applies to every dialect". */
059    private static final String SCOPE_ALL = "ALL";
060
061    /** Valid vendor-scope keys: every {@link EDbVendor} constant name, lower-cased. */
062    private static final Set<String> VALID_VENDOR_KEYS = buildValidVendorKeys();
063
064    /** Counts how many times the resource file was parsed (should be 1/JVM). */
065    private static final AtomicInteger LOAD_COUNT = new AtomicInteger(0);
066
067    /** Phrases that apply to every dialect (longest-first). */
068    private static final List<Phrase> ALL_PHRASES;
069
070    /** Vendor-scope name (lower-cased enum constant) -> its extra phrases. */
071    private static final Map<String, List<Phrase>> VENDOR_PHRASES;
072
073    /** Per-vendor memoised merged (ALL + vendor) lists, longest-first. */
074    private static final ConcurrentHashMap<EDbVendor, List<Phrase>> PER_VENDOR_CACHE =
075        new ConcurrentHashMap<EDbVendor, List<Phrase>>();
076
077    static {
078        ParsedTable parsed = loadFromResource();
079        ALL_PHRASES = Collections.unmodifiableList(parsed.allPhrases);
080        VENDOR_PHRASES = Collections.unmodifiableMap(parsed.vendorPhrases);
081    }
082
083    private MultiWordKeywordTable() {
084        // static-only utility
085    }
086
087    /**
088     * Return the immutable, longest-phrase-first list of multi-word keyword
089     * phrases applicable to {@code vendor} (the universal {@code ALL} phrases
090     * plus any vendor-specific ones). Memoised per vendor.
091     *
092     * @throws NullPointerException if {@code vendor} is null
093     */
094    public static List<Phrase> forVendor(EDbVendor vendor) {
095        if (vendor == null) throw new NullPointerException("vendor");
096        List<Phrase> cached = PER_VENDOR_CACHE.get(vendor);
097        if (cached != null) return cached;
098        return PER_VENDOR_CACHE.computeIfAbsent(vendor, new java.util.function.Function<EDbVendor, List<Phrase>>() {
099            @Override
100            public List<Phrase> apply(EDbVendor v) {
101                List<Phrase> merged = new ArrayList<Phrase>(ALL_PHRASES);
102                List<Phrase> vendorSpecific = VENDOR_PHRASES.get(v.name().toLowerCase(Locale.ROOT));
103                if (vendorSpecific != null) merged.addAll(vendorSpecific);
104                Collections.sort(merged, LONGEST_FIRST);
105                return Collections.unmodifiableList(merged);
106            }
107        });
108    }
109
110    /** Number of times the resource file was parsed (1 in a healthy JVM). */
111    public static int getLoadCount() {
112        return LOAD_COUNT.get();
113    }
114
115    /** Sort phrases by descending word count so the merger matches longest-first. */
116    private static final Comparator<Phrase> LONGEST_FIRST = new Comparator<Phrase>() {
117        @Override
118        public int compare(Phrase a, Phrase b) {
119            return b.getWords().size() - a.getWords().size();
120        }
121    };
122
123    // ---- resource parsing ----------------------------------------------
124
125    private static Set<String> buildValidVendorKeys() {
126        Set<String> keys = new HashSet<String>();
127        for (EDbVendor v : EDbVendor.values()) {
128            keys.add(v.name().toLowerCase(Locale.ROOT));
129        }
130        return Collections.unmodifiableSet(keys);
131    }
132
133    private static ParsedTable loadFromResource() {
134        LOAD_COUNT.incrementAndGet();
135        ParsedTable table = new ParsedTable();
136        InputStream in = MultiWordKeywordTable.class.getResourceAsStream(RESOURCE_PATH);
137        if (in == null) {
138            PPLogger.info("MultiWordKeywordTable: resource " + RESOURCE_PATH
139                + " not found on classpath; multi-word keyword merging disabled.");
140            return table;
141        }
142        BufferedReader reader = new BufferedReader(
143            new InputStreamReader(in, Charset.forName("UTF-8")));
144        try {
145            String line;
146            while ((line = reader.readLine()) != null) {
147                parseLine(line, table);
148            }
149        } catch (IOException e) {
150            PPLogger.error(e);
151            PPLogger.info("MultiWordKeywordTable: failed reading " + RESOURCE_PATH
152                + "; using whatever was parsed so far.");
153        } finally {
154            try { reader.close(); } catch (IOException ignored) { /* nothing to do */ }
155        }
156        return table;
157    }
158
159    private static void parseLine(String rawLine, ParsedTable table) {
160        String line = rawLine.trim();
161        if (line.isEmpty() || line.charAt(0) == '#') return;
162
163        int colon = line.indexOf(':');
164        if (colon < 0) {
165            PPLogger.info("MultiWordKeywordTable: ignoring malformed line (no scope ':'): " + rawLine);
166            return;
167        }
168        String scope = line.substring(0, colon).trim();
169        String body = line.substring(colon + 1).trim();
170        if (scope.isEmpty() || body.isEmpty()) return;
171
172        List<Phrase> target;
173        if (SCOPE_ALL.equalsIgnoreCase(scope)) {
174            target = table.allPhrases;
175        } else {
176            String key = scope.toLowerCase(Locale.ROOT);
177            if (!VALID_VENDOR_KEYS.contains(key)) {
178                // A typo'd vendor scope would silently create a dead phrase
179                // list (it could never be looked up by a real EDbVendor). For
180                // an allowlist whose correctness depends on exact vendor
181                // scoping, surface it instead of swallowing it.
182                PPLogger.info("MultiWordKeywordTable: ignoring rule with unknown vendor scope '"
183                    + scope + "' (not an EDbVendor constant): " + rawLine);
184                return;
185            }
186            target = table.vendorPhrases.get(key);
187            if (target == null) {
188                target = new ArrayList<Phrase>();
189                table.vendorPhrases.put(key, target);
190            }
191        }
192
193        for (String chunk : body.split("\\|")) {
194            Phrase phrase = Phrase.parse(chunk);
195            if (phrase != null) target.add(phrase);
196        }
197    }
198
199    /** Mutable accumulator used only during parsing. */
200    private static final class ParsedTable {
201        final List<Phrase> allPhrases = new ArrayList<Phrase>();
202        final Map<String, List<Phrase>> vendorPhrases = new LinkedHashMap<String, List<Phrase>>();
203    }
204
205    /**
206     * An immutable multi-word keyword phrase: its lower-cased words (for
207     * matching) and an upper-cased canonical rendering (for reporting/merging).
208     */
209    public static final class Phrase {
210        private final List<String> words;       // lower-cased, for matching
211        private final String canonical;         // upper-cased, space-joined
212
213        private Phrase(List<String> words, String canonical) {
214            this.words = Collections.unmodifiableList(words);
215            this.canonical = canonical;
216        }
217
218        /** Parse one " WORD WORD ..." chunk; returns null if it has no words. */
219        static Phrase parse(String chunk) {
220            String trimmed = chunk.trim();
221            if (trimmed.isEmpty()) return null;
222            String[] parts = trimmed.split("\\s+");
223            List<String> lower = new ArrayList<String>(parts.length);
224            StringBuilder canon = new StringBuilder();
225            for (String p : parts) {
226                if (p.isEmpty()) continue;
227                if (canon.length() > 0) canon.append(' ');
228                canon.append(p.toUpperCase(Locale.ROOT));
229                lower.add(p.toLowerCase(Locale.ROOT));
230            }
231            if (lower.isEmpty()) return null;
232            return new Phrase(lower, canon.toString());
233        }
234
235        /** The phrase's lower-cased words, in order. */
236        public List<String> getWords() { return words; }
237
238        /** Upper-cased, single-space-joined canonical form (e.g. "LEFT OUTER JOIN"). */
239        public String getCanonical() { return canonical; }
240
241        @Override
242        public String toString() { return "Phrase[" + canonical + "]"; }
243    }
244}