package gudusoft.gsqlparser.pp2.island;

import gudusoft.gsqlparser.EDbVendor;
import gudusoft.gsqlparser.pp2.island.MultiWordKeywordTable.Phrase;
import gudusoft.gsqlparser.pp2.token.Pp2Token;
import gudusoft.gsqlparser.pp2.token.Pp2TokenStream;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Identifies runs of consecutive tokens that form a multi-word SQL keyword
 * (e.g. {@code LEFT OUTER JOIN}, {@code UNION ALL}, {@code GROUP BY}) so the
 * lexical island pipeline can treat each run as a single keyword unit.
 *
 * <h2>What it does and does NOT do</h2>
 *
 * <p>The merger is <b>read-only</b>: it returns a list of {@link Match}
 * spans and does not mutate any {@link Pp2Token} or its roles. (Role mutation
 * is reserved for the designated annotator stages — S9 zone detector, S21
 * clause annotator, S33 overlay — not this slice.) Later pipeline stages
 * consume the match list to know which token indices belong to one keyword.
 *
 * <h2>Matching rules</h2>
 *
 * <ul>
 *   <li><b>Allowlist, not word-count.</b> Only phrases present in
 *       {@link MultiWordKeywordTable} merge. {@code GROUP BY} merges;
 *       {@code LEFT JOIN} does not (only the three-word {@code LEFT OUTER JOIN}
 *       form is in the table).</li>
 *   <li><b>Left-to-right, longest-match-at-each-position, non-overlapping.</b>
 *       The scan walks tokens in source order; at each position the longest
 *       applicable phrase wins and its tokens are skipped. This is a local
 *       (per-position) greedy match, not a global longest-set optimisation —
 *       it mirrors the Delphi {@code TSQLion} lexical scan. The curated
 *       {@link MultiWordKeywordTable} allowlist contains no phrases where an
 *       earlier shorter match would preclude a later longer one, so the two
 *       agree here. If a future table introduces such overlaps, switch this
 *       to a candidate-generate-then-select-by-length strategy.</li>
 *   <li><b>Directly consecutive tokens, case-insensitive text.</b> A comment
 *       or any other token between two phrase words breaks the match (its text
 *       will not equal the expected keyword word, so it naturally fails). Quoted
 *       identifiers and string literals never match because their text carries
 *       the surrounding quotes.</li>
 * </ul>
 *
 * <p>Plan reference: §7.3/S18, §7.4/S18.
 */
public final class MultiWordKeywordMerger {

    /**
     * Find all multi-word keyword spans in {@code stream} for {@code vendor}.
     *
     * @param stream the token stream to scan; must not be null
     * @param vendor the dialect whose phrase table to use; must not be null
     * @return an immutable, source-ordered, non-overlapping list of matches
     *         (possibly empty); never null
     * @throws NullPointerException if {@code stream} or {@code vendor} is null
     */
    public List<Match> findMatches(Pp2TokenStream stream, EDbVendor vendor) {
        if (stream == null) {
            throw new NullPointerException("stream");
        }
        if (vendor == null) {
            throw new NullPointerException("vendor");
        }

        // The table is expected to hand phrases back longest-first; longestPhraseAt
        // relies on that ordering to make "first hit" mean "longest hit".
        List<Phrase> phrases = MultiWordKeywordTable.forVendor(vendor);
        if (phrases.isEmpty() || stream.isEmpty()) {
            return Collections.emptyList();
        }

        List<Match> matches = new ArrayList<Match>();
        int n = stream.size();
        int i = 0;
        while (i < n) {
            Phrase hit = longestPhraseAt(stream, i, phrases);
            if (hit != null) {
                // matchesAt never accepts an empty phrase, so the span covers at
                // least one token and end >= i always holds here.
                int end = i + hit.getWords().size() - 1;
                matches.add(new Match(i, end, hit.getCanonical()));
                i = end + 1; // non-overlapping: skip the consumed run
            } else {
                i++;
            }
        }
        return Collections.unmodifiableList(matches);
    }

    /**
     * The longest phrase that matches starting exactly at {@code start}, or
     * {@code null} if none does. {@code phrases} is assumed longest-first, so
     * the first match wins.
     */
    private static Phrase longestPhraseAt(Pp2TokenStream stream, int start,
                                          List<Phrase> phrases) {
        for (Phrase p : phrases) {
            if (matchesAt(stream, start, p)) {
                return p;
            }
        }
        return null;
    }

    /**
     * True iff each phrase word equals (case-insensitive) the consecutive token text.
     *
     * <p>A phrase with no words never matches. Without that guard the comparison
     * loop would run zero times and report a vacuous match, which the caller
     * would then turn into a zero-length span that {@link Match} rejects.
     */
    private static boolean matchesAt(Pp2TokenStream stream, int start, Phrase phrase) {
        List<String> words = phrase.getWords();
        int wordCount = words.size();
        if (wordCount == 0) {
            return false; // defensive: an empty phrase must not match vacuously
        }
        if (start + wordCount > stream.size()) {
            return false; // not enough tokens left for the whole phrase
        }
        for (int k = 0; k < wordCount; k++) {
            Pp2Token t = stream.get(start + k);
            String text = t.getText();
            if (text == null || !text.equalsIgnoreCase(words.get(k))) {
                return false;
            }
        }
        return true;
    }

    /**
     * An immutable multi-word keyword span over a {@link Pp2TokenStream}:
     * the inclusive token-index range and the canonical upper-cased phrase.
     */
    public static final class Match {
        private final int startIndex; // inclusive
        private final int endIndex;   // inclusive
        private final String canonical;

        /**
         * Create a span.
         *
         * @param startIndex first token index (inclusive); must be {@code >= 0}
         * @param endIndex   last token index (inclusive); must be {@code >= startIndex}
         * @param canonical  canonical phrase text; must not be null
         * @throws IllegalArgumentException if {@code startIndex} is negative or
         *                                  {@code endIndex < startIndex}
         * @throws NullPointerException     if {@code canonical} is null
         */
        public Match(int startIndex, int endIndex, String canonical) {
            if (startIndex < 0) {
                throw new IllegalArgumentException("startIndex < 0: " + startIndex);
            }
            if (endIndex < startIndex) {
                throw new IllegalArgumentException(
                        "endIndex < startIndex: " + endIndex + " < " + startIndex);
            }
            if (canonical == null) {
                throw new NullPointerException("canonical");
            }
            this.startIndex = startIndex;
            this.endIndex = endIndex;
            this.canonical = canonical;
        }

        /** First token index of the phrase (inclusive). */
        public int getStartIndex() { return startIndex; }

        /** Last token index of the phrase (inclusive). */
        public int getEndIndex() { return endIndex; }

        /** Number of tokens the phrase spans. */
        public int getLength() { return endIndex - startIndex + 1; }

        /** Upper-cased canonical phrase, e.g. {@code "LEFT OUTER JOIN"}. */
        public String getCanonical() { return canonical; }

        @Override
        public String toString() {
            return "Match[" + startIndex + ".." + endIndex + " " + canonical + "]";
        }
    }
}