Source code

001package gudusoft.gsqlparser.pp2.island;
002
003import gudusoft.gsqlparser.EDbVendor;
004import gudusoft.gsqlparser.pp2.island.MultiWordKeywordTable.Phrase;
005import gudusoft.gsqlparser.pp2.token.Pp2Token;
006import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;
007
008import java.util.ArrayList;
009import java.util.Collections;
010import java.util.List;
011
012/**
013 * Identifies runs of consecutive tokens that form a multi-word SQL keyword
014 * (e.g. {@code LEFT OUTER JOIN}, {@code UNION ALL}, {@code GROUP BY}) so the
015 * lexical island pipeline can treat each run as a single keyword unit.
016 *
017 * <h2>What it does and does NOT do</h2>
018 *
019 * <p>The merger is <b>read-only</b>: it returns a list of {@link Match}
020 * spans and does not mutate any {@link Pp2Token} or its roles. (Role mutation
021 * is reserved for the designated annotator stages — S9 zone detector, S21
022 * clause annotator, S33 overlay — not this slice.) Later pipeline stages
023 * consume the match list to know which token indices belong to one keyword.
024 *
025 * <h2>Matching rules</h2>
026 *
027 * <ul>
028 *   <li><b>Allowlist, not word-count.</b> Only phrases present in
029 *       {@link MultiWordKeywordTable} merge. {@code GROUP BY} merges;
030 *       {@code LEFT JOIN} does not (only the three-word {@code LEFT OUTER JOIN}
031 *       form is in the table).</li>
032 *   <li><b>Left-to-right, longest-match-at-each-position, non-overlapping.</b>
033 *       The scan walks tokens in source order; at each position the longest
034 *       applicable phrase wins and its tokens are skipped. This is a local
035 *       (per-position) greedy match, not a global longest-set optimisation —
036 *       it mirrors the Delphi {@code TSQLion} lexical scan. The curated
037 *       {@link MultiWordKeywordTable} allowlist contains no phrases where an
038 *       earlier shorter match would preclude a later longer one, so the two
039 *       agree here. If a future table introduces such overlaps, revisit this
040 *       to a candidate-generate-then-select-by-length strategy.</li>
041 *   <li><b>Directly consecutive tokens, case-insensitive text.</b> A comment
042 *       or any other token between two phrase words breaks the match (its text
043 *       will not equal the expected keyword word, so it naturally fails). Quoted
044 *       identifiers and string literals never match because their text carries
045 *       the surrounding quotes.</li>
046 * </ul>
047 *
048 * <p>Plan reference: §7.3/S18, §7.4/S18.
049 */
050public final class MultiWordKeywordMerger {
051
052    /**
053     * Find all multi-word keyword spans in {@code stream} for {@code vendor}.
054     *
055     * @param stream the token stream to scan; must not be null
056     * @param vendor the dialect whose phrase table to use; must not be null
057     * @return an immutable, source-ordered, non-overlapping list of matches
058     *         (possibly empty); never null
059     * @throws NullPointerException if {@code stream} or {@code vendor} is null
060     */
061    public List<Match> findMatches(Pp2TokenStream stream, EDbVendor vendor) {
062        if (stream == null) throw new NullPointerException("stream");
063        if (vendor == null) throw new NullPointerException("vendor");
064
065        List<Phrase> phrases = MultiWordKeywordTable.forVendor(vendor); // longest-first
066        if (phrases.isEmpty() || stream.isEmpty()) {
067            return Collections.emptyList();
068        }
069
070        List<Match> matches = new ArrayList<Match>();
071        int n = stream.size();
072        int i = 0;
073        while (i < n) {
074            Phrase hit = longestPhraseAt(stream, i, phrases);
075            if (hit != null) {
076                int end = i + hit.getWords().size() - 1;
077                matches.add(new Match(i, end, hit.getCanonical()));
078                i = end + 1; // non-overlapping: skip the consumed run
079            } else {
080                i++;
081            }
082        }
083        return Collections.unmodifiableList(matches);
084    }
085
086    /**
087     * The longest phrase that matches starting exactly at {@code start}, or
088     * {@code null} if none does. {@code phrases} is assumed longest-first, so
089     * the first match wins.
090     */
091    private static Phrase longestPhraseAt(Pp2TokenStream stream, int start,
092                                          List<Phrase> phrases) {
093        for (Phrase p : phrases) {
094            if (matchesAt(stream, start, p)) return p;
095        }
096        return null;
097    }
098
099    /** True iff each phrase word equals (case-insensitive) the consecutive token text. */
100    private static boolean matchesAt(Pp2TokenStream stream, int start, Phrase phrase) {
101        List<String> words = phrase.getWords();
102        int end = start + words.size();
103        if (end > stream.size()) return false;
104        for (int k = 0; k < words.size(); k++) {
105            Pp2Token t = stream.get(start + k);
106            String text = t.getText();
107            if (text == null || !text.equalsIgnoreCase(words.get(k))) {
108                return false;
109            }
110        }
111        return true;
112    }
113
114    /**
115     * An immutable multi-word keyword span over a {@link Pp2TokenStream}:
116     * the inclusive token-index range and the canonical upper-cased phrase.
117     */
118    public static final class Match {
119        private final int startIndex;       // inclusive
120        private final int endIndex;         // inclusive
121        private final String canonical;
122
123        public Match(int startIndex, int endIndex, String canonical) {
124            if (startIndex < 0) {
125                throw new IllegalArgumentException("startIndex < 0: " + startIndex);
126            }
127            if (endIndex < startIndex) {
128                throw new IllegalArgumentException(
129                    "endIndex < startIndex: " + endIndex + " < " + startIndex);
130            }
131            if (canonical == null) throw new NullPointerException("canonical");
132            this.startIndex = startIndex;
133            this.endIndex = endIndex;
134            this.canonical = canonical;
135        }
136
137        /** First token index of the phrase (inclusive). */
138        public int getStartIndex() { return startIndex; }
139
140        /** Last token index of the phrase (inclusive). */
141        public int getEndIndex() { return endIndex; }
142
143        /** Number of tokens the phrase spans. */
144        public int getLength() { return endIndex - startIndex + 1; }
145
146        /** Upper-cased canonical phrase, e.g. {@code "LEFT OUTER JOIN"}. */
147        public String getCanonical() { return canonical; }
148
149        @Override
150        public String toString() {
151            return "Match[" + startIndex + ".." + endIndex + " " + canonical + "]";
152        }
153    }
154}