Source code

001package gudusoft.gsqlparser.catalog.runtime;
002
003import gudusoft.gsqlparser.EDbVendor;
004import gudusoft.gsqlparser.catalog.input.model.IdentifierConfig;
005import gudusoft.gsqlparser.sqlenv.CollatorProvider;
006import gudusoft.gsqlparser.sqlenv.ESQLDataObjectType;
007import gudusoft.gsqlparser.sqlenv.IdentifierProfile;
008import gudusoft.gsqlparser.sqlenv.IdentifierService;
009
010import java.util.ArrayList;
011import java.util.BitSet;
012import java.util.List;
013import java.util.Objects;
014
015/**
016 * Thin wrapper over {@link IdentifierService} that keeps the {@code catalog/**} dependency
017 * direction one-way: the package depends on {@code IdentifierService} and
018 * {@link IdentifierConfig} but on no other {@code sqlenv} type.
019 *
020 * <p>Plan §9.2 / §9.5. Every case-folding, normalization, qualified-name comparison, and
021 * map key inside {@code catalog/**} routes through this class (or directly through
022 * {@code IdentifierService}); the forbidden-apis Maven plugin enforces this mechanically.</p>
023 *
024 * <p>{@link IdentifierConfig} fields that map to {@code IdentifierProfile.VendorFlags}
025 * (currently {@code mysqlLowerCaseTableNames} and {@code mssqlCollation}) are honored by
026 * constructing a per-call {@code IdentifierService} whenever a config is supplied.</p>
027 *
028 * <p><strong>Round 1 scope cuts (tracked for P1B Round 2):</strong>
029 * <ul>
030 *   <li>{@code foldUnquotedToUpper} / {@code foldUnquotedToLower} on
031 *       {@link IdentifierConfig} are recorded but not yet projected onto
032 *       {@code IdentifierProfile} — the per-call profile is still constructed via
033 *       {@code IdentifierProfile.forVendor(vendor, flags)} which uses vendor-default fold
034 *       rules. Callers who declare a non-default fold get vendor-default behavior.</li>
035 *   <li>{@code tableCaseSensitive} / {@code columnCaseSensitive} (BigQuery split policy)
036 *       are accepted but not honored: extending {@code IdentifierProfile} with per-object-
037 *       group sensitivity flags is part of Round 2.</li>
038 *   <li>{@link #areEqual} Tier-2 fallback for COLLATION_BASED dialects routes through
039 *       {@link IdentifierService#areEqualStatic} (cached vendor defaults) and therefore
040 *       does not honor a custom {@code mssqlCollation} or MySQL
041 *       {@code lower_case_table_names=2}; Tier 1 (cfg-aware normalized comparison)
042 *       handles the lctn=1 / lctn=2 case-insensitive cases since their normalize folds
043 *       to a single canonical form.</li>
044 *   <li>{@link #keyForMap} can produce different keys for two names that
045 *       {@link #areEqual} considers equal in COLLATION_BASED dialects — same contract as
046 *       {@code IdentifierService.canUseCompositeKey}. Snapshot lookups must fall back to
047 *       {@code areEqual} on a key miss.</li>
048 * </ul>
049 *
050 * <p>The defaults-driven path (the primary use of this class in P1B/P1C) is fully covered
051 * by tests and is the one used by every Phase 1 reader and provider.</p>
052 */
053public final class CatalogIdentifierPolicy {
054
    /**
     * Control character (U+0001) that {@code keyForMap} appends after every component
     * of a generated map key.
     */
    private static final char KEY_SEPARATOR = '\u0001';

    /** Not instantiable: every member of this class is static. */
    private CatalogIdentifierPolicy() {
        // Static utility — no instances.
    }
060
061    /**
062     * Parse a possibly-quoted, possibly-multi-segment qualified name into a
063     * {@link CatalogQualifiedName} with both raw and normalized segments tracked.
064     *
065     * <p>Each segment is normalized using the {@link ESQLDataObjectType} appropriate for its
066     * position in the qualified name (catalog → schema → table → column), so vendor-specific
067     * dual policies like BigQuery's case-sensitive tables vs case-insensitive columns are
068     * applied per-segment rather than en bloc.</p>
069     */
070    public static CatalogQualifiedName parse(String raw, CatalogObjectKind kind,
071                                             IdentifierConfig cfg, EDbVendor vendor) {
072        if (raw == null || raw.isEmpty()) {
073            throw new IllegalArgumentException("CatalogIdentifierPolicy.parse: raw must be non-empty");
074        }
075        if (kind == null) {
076            throw new IllegalArgumentException("CatalogIdentifierPolicy.parse: kind is required");
077        }
078        if (vendor == null) {
079            throw new IllegalArgumentException("CatalogIdentifierPolicy.parse: vendor is required");
080        }
081        IdentifierService service = (cfg != null) ? serviceFor(cfg, vendor) : null;
082        SegmentSplit split = splitSegments(raw, vendor);
083        BitSet quoted = split.quoted;
084        List<String> rawSegments = split.segments;
085        List<String> normalized = new ArrayList<String>(rawSegments.size());
086        for (int i = 0; i < rawSegments.size(); i++) {
087            String segment = rawSegments.get(i);
088            ESQLDataObjectType type = kindForSegment(kind, i, rawSegments.size());
089            if (service != null) {
090                normalized.add(service.normalize(segment, type));
091            } else {
092                normalized.add(IdentifierService.normalizeStatic(vendor, type, segment));
093            }
094        }
095        return new CatalogQualifiedName(rawSegments, normalized, quoted, kind, vendor);
096    }
097
098    /**
099     * Build a {@link CatalogQualifiedName} from segments that have already been normalized
100     * by {@link IdentifierService}. Used by the lazy {@code TSQLEnv} bridge, where
101     * {@code TSQLEnv.doSearchSchemaObject} hands over already-normalized catalog/schema/
102     * object segments and rejoining + re-parsing would mis-handle quoted-embedded-dot
103     * names like Oracle {@code "a.b"."c"}.
104     *
105     * <p>Each segment is taken verbatim — no further normalization is applied — and the
106     * resulting name carries identical raw and normalized segments. Quoted-flag bits are
107     * empty by default; callers that have lost quoting information at this layer
108     * (the legacy bridge path) accept the limitation that quoted-vs-unquoted distinction
109     * is no longer recoverable. Snapshot lookups on this name still match because the
110     * snapshot keys also derive from the same {@link IdentifierService} normalization.</p>
111     */
112    public static CatalogQualifiedName fromAlreadyNormalizedSegments(List<String> segments,
113                                                                     CatalogObjectKind kind,
114                                                                     EDbVendor vendor) {
115        if (segments == null || segments.isEmpty()) {
116            throw new IllegalArgumentException(
117                "CatalogIdentifierPolicy.fromAlreadyNormalizedSegments: segments must be non-empty");
118        }
119        if (kind == null) {
120            throw new IllegalArgumentException(
121                "CatalogIdentifierPolicy.fromAlreadyNormalizedSegments: kind is required");
122        }
123        if (vendor == null) {
124            throw new IllegalArgumentException(
125                "CatalogIdentifierPolicy.fromAlreadyNormalizedSegments: vendor is required");
126        }
127        // Defensive copy — don't trust the caller to keep their list stable.
128        List<String> normalized = new ArrayList<String>(segments);
129        for (int i = 0; i < normalized.size(); i++) {
130            String s = normalized.get(i);
131            if (s == null) {
132                throw new IllegalArgumentException(
133                    "CatalogIdentifierPolicy.fromAlreadyNormalizedSegments: segment " + i + " is null");
134            }
135        }
136        return new CatalogQualifiedName(normalized, normalized, new BitSet(), kind, vendor);
137    }
138
139    /**
140     * Map-friendly stable key for a {@link CatalogQualifiedName}.
141     *
142     * <p>Length-prefixes each normalized segment so quoted identifiers that contain dots —
143     * e.g., Oracle {@code "a.b"."c"} vs {@code "a"."b.c"} — produce distinct keys despite
144     * having the same flat join.</p>
145     *
146     * <p><strong>COLLATION_BASED limitation:</strong> for SQL Server / Azure SQL under a
147     * case-insensitive collation, the legacy {@code IdentifierService.normalize} preserves
148     * case (the legacy code-path uses Collator-bucketed lookup for those dialects).
149     * Consequently {@code keyForMap("dbo.Orders")} and {@code keyForMap("dbo.orders")} can
150     * differ even though {@link #areEqual} considers them equal. Snapshot/index lookups
151     * that key by {@code keyForMap} on those dialects must therefore fall back to
152     * {@link #areEqual} on a key miss; this is the same contract that
153     * {@code IdentifierService.canUseCompositeKey()} exposes for the legacy code-path.
154     * P1B/P1C may add a Collator-aware key bucket; tracked as a separate task.</p>
155     */
156    public static String keyForMap(CatalogQualifiedName name) {
157        if (name == null) {
158            throw new IllegalArgumentException("CatalogIdentifierPolicy.keyForMap: name is required");
159        }
160        StringBuilder sb = new StringBuilder();
161        sb.append(name.kind().name()).append(KEY_SEPARATOR)
162          .append(name.vendor().name()).append(KEY_SEPARATOR);
163        List<String> segments = name.normalized();
164        for (int i = 0; i < segments.size(); i++) {
165            String segment = segments.get(i);
166            sb.append(segment.length()).append(':').append(segment).append(KEY_SEPARATOR);
167        }
168        return sb.toString();
169    }
170
171    /**
172     * Vendor-aware equality test for two qualified names. Per-segment comparisons use the
173     * {@link ESQLDataObjectType} that matches each segment's role (catalog/schema/table/
174     * column) so dual-policy dialects compare correctly.
175     *
176     * <p>Comparison is two-tier: first the cfg-aware {@code normalized} segments are
177     * compared directly (which handles every dialect whose {@code normalize} captures
178     * equality, including non-default {@code IdentifierConfig}s like MySQL
179     * {@code lower_case_table_names=1/2}); on a mismatch we fall through to
180     * {@link IdentifierService#areEqualStatic} so Collator-based dialects (SQL Server / Azure
181     * SQL with case-insensitive collations) still report equality correctly.</p>
182     */
183    public static boolean areEqual(CatalogQualifiedName a, CatalogQualifiedName b) {
184        if (a == null || b == null) {
185            return a == b;
186        }
187        if (a.kind() != b.kind() || a.vendor() != b.vendor() || a.size() != b.size()) {
188            return false;
189        }
190        // Tier 1: compare normalized forms (honors IdentifierConfig used at parse time).
191        if (a.normalized().equals(b.normalized())) {
192            return true;
193        }
194        // Tier 2: route through the legacy Collator-aware static path for COLLATION_BASED dialects.
195        for (int i = 0; i < a.size(); i++) {
196            ESQLDataObjectType type = kindForSegment(a.kind(), i, a.size());
197            String aSeg = a.raw().get(i);
198            String bSeg = b.raw().get(i);
199            if (!IdentifierService.areEqualStatic(a.vendor(), type, aSeg, bSeg)) {
200                return false;
201            }
202        }
203        return true;
204    }
205
206    /** Normalize a single identifier segment per {@link IdentifierConfig}'s vendor. */
207    public static String normalize(String raw, IdentifierConfig cfg) {
208        if (raw == null) return null;
209        if (cfg == null) {
210            throw new IllegalArgumentException("CatalogIdentifierPolicy.normalize: cfg is required");
211        }
212        return serviceFor(cfg, cfg.vendor()).normalize(raw, ESQLDataObjectType.dotUnknown);
213    }
214
215    /** Normalize a segment with explicit quoted-ness (advisory). */
216    public static String normalizeSegment(String raw, boolean wasQuoted, IdentifierConfig cfg) {
217        Objects.requireNonNull(cfg, "cfg");
218        // The IdentifierService.normalize(...) method inspects raw for quote characters
219        // itself, so wasQuoted is treated as advisory metadata. Retained for symmetry with
220        // the plan §9.2 API and to surface intent at call-sites.
221        return serviceFor(cfg, cfg.vendor()).normalize(raw, ESQLDataObjectType.dotUnknown);
222    }
223
224    /**
225     * Public façade over {@link #serviceFor(IdentifierConfig, EDbVendor)} for callers
226     * outside this package (validator, readers, providers) that need an
227     * {@link IdentifierService} matching a {@link IdentifierConfig}. Vendor defaults
228     * to {@code cfg.vendor()} when {@code vendor} is {@code null}.
229     */
230    public static IdentifierService identifierServiceFor(IdentifierConfig cfg, EDbVendor vendor) {
231        if (cfg == null) {
232            throw new IllegalArgumentException(
233                "CatalogIdentifierPolicy.identifierServiceFor: cfg is required");
234        }
235        return serviceFor(cfg, vendor != null ? vendor : cfg.vendor());
236    }
237
238    // ---------- internals ----------
239
240    /**
241     * Build an {@link IdentifierService} that honors the supplied {@link IdentifierConfig}.
242     * Falls back to the cached default service when the config matches vendor defaults
243     * (avoids re-allocating on every parse for the common case).
244     */
245    static IdentifierService serviceFor(IdentifierConfig cfg, EDbVendor vendorOverride) {
246        EDbVendor vendor = vendorOverride != null ? vendorOverride : cfg.vendor();
247        IdentifierProfile.VendorFlags flags = vendorFlagsFrom(cfg);
248        IdentifierProfile profile = IdentifierProfile.forVendor(vendor, flags);
249        CollatorProvider collatorProvider =
250            (vendor == EDbVendor.dbvmssql || vendor == EDbVendor.dbvazuresql)
251                ? new CollatorProvider()
252                : null;
253        return new IdentifierService(profile, collatorProvider);
254    }
255
256    static IdentifierProfile.VendorFlags vendorFlagsFrom(IdentifierConfig cfg) {
257        int mysqlLctn = cfg.mysqlLowerCaseTableNames() != null
258            ? cfg.mysqlLowerCaseTableNames() : 0;
259        String collation = cfg.mssqlCollation() != null
260            ? cfg.mssqlCollation() : "SQL_Latin1_General_CP1_CI_AS";
261        return new IdentifierProfile.VendorFlags(mysqlLctn, collation, false, false);
262    }
263
264    /**
265     * For a qualified name with overall {@code kind}, return the {@link ESQLDataObjectType}
266     * appropriate for the segment at {@code segmentIndex} within a name of {@code total}
267     * segments.
268     *
269     * <p>Layouts assumed:</p>
270     * <ul>
271     *   <li>4 segments → server.catalog.schema.object</li>
272     *   <li>3 segments → catalog.schema.object</li>
273     *   <li>2 segments → schema.object (or catalog.object on dialects without schemas)</li>
274     *   <li>1 segment  → object</li>
275     * </ul>
276     */
277    static ESQLDataObjectType kindForSegment(CatalogObjectKind kind, int segmentIndex, int total) {
278        int distanceFromEnd = total - 1 - segmentIndex;
279        if (distanceFromEnd == 0) {
280            return mapKind(kind);
281        }
282        if (distanceFromEnd == 1) {
283            // Parent of the leaf:
284            // * for COLUMN / INDEX / CONSTRAINT, the parent is a TABLE
285            // * for TABLE / VIEW / MATERIALIZED_VIEW / ROUTINE / FUNCTION / PROCEDURE /
286            //   PACKAGE / SYNONYM / SEQUENCE / TRIGGER / TYPE, the parent is a SCHEMA
287            switch (kind) {
288                case COLUMN:
289                case INDEX:
290                case CONSTRAINT:
291                    return ESQLDataObjectType.dotTable;
292                default:
293                    return ESQLDataObjectType.dotSchema;
294            }
295        }
296        if (distanceFromEnd == 2) {
297            // Grandparent:
298            // * for COLUMN, grandparent is SCHEMA
299            // * else CATALOG
300            switch (kind) {
301                case COLUMN:
302                case INDEX:
303                case CONSTRAINT:
304                    return ESQLDataObjectType.dotSchema;
305                default:
306                    return ESQLDataObjectType.dotCatalog;
307            }
308        }
309        // distanceFromEnd >= 3: server / catalog level
310        if (distanceFromEnd == 3 && (kind == CatalogObjectKind.COLUMN
311                || kind == CatalogObjectKind.INDEX || kind == CatalogObjectKind.CONSTRAINT)) {
312            return ESQLDataObjectType.dotCatalog;
313        }
314        return ESQLDataObjectType.dotUnknown;
315    }
316
317    /**
318     * Map a {@link CatalogObjectKind} to its leaf {@link ESQLDataObjectType}. Internal —
319     * use {@link #kindForSegment} when handling multi-segment names.
320     */
321    static ESQLDataObjectType mapKind(CatalogObjectKind kind) {
322        switch (kind) {
323            case CATALOG: return ESQLDataObjectType.dotCatalog;
324            case SCHEMA: return ESQLDataObjectType.dotSchema;
325            case TABLE:
326            case VIEW:
327            case MATERIALIZED_VIEW: return ESQLDataObjectType.dotTable;
328            case COLUMN: return ESQLDataObjectType.dotColumn;
329            case ROUTINE: return ESQLDataObjectType.dotRoutine;
330            case FUNCTION: return ESQLDataObjectType.dotFunction;
331            case PROCEDURE: return ESQLDataObjectType.dotProcedure;
332            case PACKAGE: return ESQLDataObjectType.dotOraclePackage;
333            case SYNONYM: return ESQLDataObjectType.dotSynonyms;
334            case TYPE: return ESQLDataObjectType.dotDataType;
335            case TRIGGER: return ESQLDataObjectType.dotTrigger;
336            case SEQUENCE:
337            case INDEX:
338            case CONSTRAINT:
339            default: return ESQLDataObjectType.dotUnknown;
340        }
341    }
342
343    /** Output of {@link #splitSegments}: parallel raw-segment list plus quoted-flag bits. */
344    static final class SegmentSplit {
345        final List<String> segments;
346        final BitSet quoted;
347
348        SegmentSplit(List<String> segments, BitSet quoted) {
349            this.segments = segments;
350            this.quoted = quoted;
351        }
352    }
353
354    /**
355     * Split a qualified name into raw segments preserving any surrounding quote characters
356     * so {@link CatalogQualifiedName} can record per-segment quoted flags faithfully.
357     *
358     * <p>BigQuery special-case: a single backtick-delimited path that contains dots
359     * (e.g., {@code `project.dataset.table`}) is split internally, mirroring the
360     * convention also used by {@code SQLUtil.parseNames}.</p>
361     */
362    static SegmentSplit splitSegments(String raw, EDbVendor vendor) {
363        // BigQuery wraps the whole path in backticks: split internally, mark unquoted.
364        if (vendor == EDbVendor.dbvbigquery
365                && raw.length() >= 2
366                && raw.charAt(0) == '`'
367                && raw.charAt(raw.length() - 1) == '`'
368                && raw.indexOf('.') > 0
369                && countUnescaped(raw, '`') == 2) {
370            String inner = raw.substring(1, raw.length() - 1);
371            return splitOnDot(inner, false);
372        }
373        List<String> out = new ArrayList<String>();
374        BitSet quotedFlags = new BitSet();
375        StringBuilder current = new StringBuilder();
376        boolean currentQuoted = false;
377        char quoteChar = 0;
378        boolean inBracket = false;
379        for (int i = 0; i < raw.length(); i++) {
380            char c = raw.charAt(i);
381            if (quoteChar != 0) {
382                current.append(c);
383                if (c == quoteChar) {
384                    if (i + 1 < raw.length() && raw.charAt(i + 1) == quoteChar) {
385                        current.append(raw.charAt(i + 1));
386                        i++;
387                    } else {
388                        quoteChar = 0;
389                    }
390                }
391            } else if (inBracket) {
392                current.append(c);
393                if (c == ']') {
394                    inBracket = false;
395                }
396            } else if (c == '"' || c == '`') {
397                current.append(c);
398                quoteChar = c;
399                currentQuoted = true;
400            } else if (c == '[' && (vendor == EDbVendor.dbvmssql || vendor == EDbVendor.dbvazuresql)) {
401                current.append(c);
402                inBracket = true;
403                currentQuoted = true;
404            } else if (c == '.') {
405                if (currentQuoted) quotedFlags.set(out.size());
406                out.add(current.toString());
407                current.setLength(0);
408                currentQuoted = false;
409            } else {
410                current.append(c);
411            }
412        }
413        if (currentQuoted) quotedFlags.set(out.size());
414        out.add(current.toString());
415        return new SegmentSplit(out, quotedFlags);
416    }
417
418    private static SegmentSplit splitOnDot(String inner, boolean markQuoted) {
419        List<String> out = new ArrayList<String>();
420        BitSet quoted = new BitSet();
421        int start = 0;
422        for (int i = 0; i < inner.length(); i++) {
423            if (inner.charAt(i) == '.') {
424                String seg = inner.substring(start, i);
425                if (markQuoted) quoted.set(out.size());
426                out.add(seg);
427                start = i + 1;
428            }
429        }
430        String last = inner.substring(start);
431        if (markQuoted) quoted.set(out.size());
432        out.add(last);
433        return new SegmentSplit(out, quoted);
434    }
435
436    private static int countUnescaped(String s, char ch) {
437        int n = 0;
438        for (int i = 0; i < s.length(); i++) {
439            if (s.charAt(i) == ch) {
440                if (i + 1 < s.length() && s.charAt(i + 1) == ch) {
441                    i++;
442                } else {
443                    n++;
444                }
445            }
446        }
447        return n;
448    }
449}