001package gudusoft.gsqlparser.catalog.runtime; 002 003import gudusoft.gsqlparser.EDbVendor; 004import gudusoft.gsqlparser.catalog.input.model.IdentifierConfig; 005import gudusoft.gsqlparser.sqlenv.CollatorProvider; 006import gudusoft.gsqlparser.sqlenv.ESQLDataObjectType; 007import gudusoft.gsqlparser.sqlenv.IdentifierProfile; 008import gudusoft.gsqlparser.sqlenv.IdentifierService; 009 010import java.util.ArrayList; 011import java.util.BitSet; 012import java.util.List; 013import java.util.Objects; 014 015/** 016 * Thin wrapper over {@link IdentifierService} that keeps the {@code catalog/**} dependency 017 * direction one-way: the package depends on {@code IdentifierService} and 018 * {@link IdentifierConfig} but on no other {@code sqlenv} type. 019 * 020 * <p>Plan §9.2 / §9.5. Every case-folding, normalization, qualified-name comparison, and 021 * map key inside {@code catalog/**} routes through this class (or directly through 022 * {@code IdentifierService}); the forbidden-apis Maven plugin enforces this mechanically.</p> 023 * 024 * <p>{@link IdentifierConfig} fields that map to {@code IdentifierProfile.VendorFlags} 025 * (currently {@code mysqlLowerCaseTableNames} and {@code mssqlCollation}) are honored by 026 * constructing a per-call {@code IdentifierService} when they differ from defaults.</p> 027 * 028 * <p><strong>Round 1 scope cuts (tracked for P1B Round 2):</strong> 029 * <ul> 030 * <li>{@code foldUnquotedToUpper} / {@code foldUnquotedToLower} on 031 * {@link IdentifierConfig} are recorded but not yet projected onto 032 * {@code IdentifierProfile} — the per-call profile is still constructed via 033 * {@code IdentifierProfile.forVendor(vendor, flags)} which uses vendor-default fold 034 * rules. Callers who declare a non-default fold get vendor-default behavior.</li> 035 * <li>{@code tableCaseSensitive} / {@code columnCaseSensitive} (BigQuery split policy) 036 * are accepted but not honored: extending {@code IdentifierProfile} with per-object- 037 * group sensitivity flags is part of Round 2.</li> 038 * <li>{@link #areEqual} Tier-2 fallback for COLLATION_BASED dialects routes through 039 * {@link IdentifierService#areEqualStatic} (cached vendor defaults) and therefore 040 * does not honor a custom {@code mssqlCollation} or MySQL 041 * {@code lower_case_table_names=2}; Tier 1 (cfg-aware normalized comparison) 042 * handles the lctn=1 / lctn=2 case-insensitive cases since their normalize folds 043 * to a single canonical form.</li> 044 * <li>{@link #keyForMap} can produce different keys for two names that 045 * {@link #areEqual} considers equal in COLLATION_BASED dialects — same contract as 046 * {@code IdentifierService.canUseCompositeKey}. Snapshot lookups must fall back to 047 * {@code areEqual} on a key miss.</li> 048 * </ul> 049 * 050 * <p>The defaults-driven path (the primary use of this class in P1B/P1C) is fully covered 051 * by tests and is the one used by every Phase 1 reader and provider.</p> 052 */ 053public final class CatalogIdentifierPolicy { 054 055 private static final char KEY_SEPARATOR = '\u0001'; 056 057 private CatalogIdentifierPolicy() { 058 // Static utility — no instances. 059 } 060 061 /** 062 * Parse a possibly-quoted, possibly-multi-segment qualified name into a 063 * {@link CatalogQualifiedName} with both raw and normalized segments tracked. 064 * 065 * <p>Each segment is normalized using the {@link ESQLDataObjectType} appropriate for its 066 * position in the qualified name (catalog → schema → table → column), so vendor-specific 067 * dual policies like BigQuery's case-sensitive tables vs case-insensitive columns are 068 * applied per-segment rather than en bloc.</p> 069 */ 070 public static CatalogQualifiedName parse(String raw, CatalogObjectKind kind, 071 IdentifierConfig cfg, EDbVendor vendor) { 072 if (raw == null || raw.isEmpty()) { 073 throw new IllegalArgumentException("CatalogIdentifierPolicy.parse: raw must be non-empty"); 074 } 075 if (kind == null) { 076 throw new IllegalArgumentException("CatalogIdentifierPolicy.parse: kind is required"); 077 } 078 if (vendor == null) { 079 throw new IllegalArgumentException("CatalogIdentifierPolicy.parse: vendor is required"); 080 } 081 IdentifierService service = (cfg != null) ? serviceFor(cfg, vendor) : null; 082 SegmentSplit split = splitSegments(raw, vendor); 083 BitSet quoted = split.quoted; 084 List<String> rawSegments = split.segments; 085 List<String> normalized = new ArrayList<String>(rawSegments.size()); 086 for (int i = 0; i < rawSegments.size(); i++) { 087 String segment = rawSegments.get(i); 088 ESQLDataObjectType type = kindForSegment(kind, i, rawSegments.size()); 089 if (service != null) { 090 normalized.add(service.normalize(segment, type)); 091 } else { 092 normalized.add(IdentifierService.normalizeStatic(vendor, type, segment)); 093 } 094 } 095 return new CatalogQualifiedName(rawSegments, normalized, quoted, kind, vendor); 096 } 097 098 /** 099 * Build a {@link CatalogQualifiedName} from segments that have already been normalized 100 * by {@link IdentifierService}. Used by the lazy {@code TSQLEnv} bridge, where 101 * {@code TSQLEnv.doSearchSchemaObject} hands over already-normalized catalog/schema/ 102 * object segments and rejoining + re-parsing would mis-handle quoted-embedded-dot 103 * names like Oracle {@code "a.b"."c"}. 104 * 105 * <p>Each segment is taken verbatim — no further normalization is applied — and the 106 * resulting name carries identical raw and normalized segments. Quoted-flag bits are 107 * empty by default; callers that have lost quoting information at this layer 108 * (the legacy bridge path) accept the limitation that quoted-vs-unquoted distinction 109 * is no longer recoverable. Snapshot lookups on this name still match because the 110 * snapshot keys also derive from the same {@link IdentifierService} normalization.</p> 111 */ 112 public static CatalogQualifiedName fromAlreadyNormalizedSegments(List<String> segments, 113 CatalogObjectKind kind, 114 EDbVendor vendor) { 115 if (segments == null || segments.isEmpty()) { 116 throw new IllegalArgumentException( 117 "CatalogIdentifierPolicy.fromAlreadyNormalizedSegments: segments must be non-empty"); 118 } 119 if (kind == null) { 120 throw new IllegalArgumentException( 121 "CatalogIdentifierPolicy.fromAlreadyNormalizedSegments: kind is required"); 122 } 123 if (vendor == null) { 124 throw new IllegalArgumentException( 125 "CatalogIdentifierPolicy.fromAlreadyNormalizedSegments: vendor is required"); 126 } 127 // Defensive copy — don't trust the caller to keep their list stable. 128 List<String> normalized = new ArrayList<String>(segments); 129 for (int i = 0; i < normalized.size(); i++) { 130 String s = normalized.get(i); 131 if (s == null) { 132 throw new IllegalArgumentException( 133 "CatalogIdentifierPolicy.fromAlreadyNormalizedSegments: segment " + i + " is null"); 134 } 135 } 136 return new CatalogQualifiedName(normalized, normalized, new BitSet(), kind, vendor); 137 } 138 139 /** 140 * Map-friendly stable key for a {@link CatalogQualifiedName}. 141 * 142 * <p>Length-prefixes each normalized segment so quoted identifiers that contain dots — 143 * e.g., Oracle {@code "a.b"."c"} vs {@code "a"."b.c"} — produce distinct keys despite 144 * having the same flat join.</p> 145 * 146 * <p><strong>COLLATION_BASED limitation:</strong> for SQL Server / Azure SQL under a 147 * case-insensitive collation, the legacy {@code IdentifierService.normalize} preserves 148 * case (the legacy code-path uses Collator-bucketed lookup for those dialects). 149 * Consequently {@code keyForMap("dbo.Orders")} and {@code keyForMap("dbo.orders")} can 150 * differ even though {@link #areEqual} considers them equal. Snapshot/index lookups 151 * that key by {@code keyForMap} on those dialects must therefore fall back to 152 * {@link #areEqual} on a key miss; this is the same contract that 153 * {@code IdentifierService.canUseCompositeKey()} exposes for the legacy code-path. 154 * P1B/P1C may add a Collator-aware key bucket; tracked as a separate task.</p> 155 */ 156 public static String keyForMap(CatalogQualifiedName name) { 157 if (name == null) { 158 throw new IllegalArgumentException("CatalogIdentifierPolicy.keyForMap: name is required"); 159 } 160 StringBuilder sb = new StringBuilder(); 161 sb.append(name.kind().name()).append(KEY_SEPARATOR) 162 .append(name.vendor().name()).append(KEY_SEPARATOR); 163 List<String> segments = name.normalized(); 164 for (int i = 0; i < segments.size(); i++) { 165 String segment = segments.get(i); 166 sb.append(segment.length()).append(':').append(segment).append(KEY_SEPARATOR); 167 } 168 return sb.toString(); 169 } 170 171 /** 172 * Vendor-aware equality test for two qualified names. Per-segment comparisons use the 173 * {@link ESQLDataObjectType} that matches each segment's role (catalog/schema/table/ 174 * column) so dual-policy dialects compare correctly. 175 * 176 * <p>Comparison is two-tier: first the cfg-aware {@code normalized} segments are 177 * compared directly (which handles every dialect whose {@code normalize} captures 178 * equality, including non-default {@code IdentifierConfig}s like MySQL 179 * {@code lower_case_table_names=1/2}); on a mismatch we fall through to 180 * {@link IdentifierService#areEqualStatic} so Collator-based dialects (SQL Server / Azure 181 * SQL with case-insensitive collations) still report equality correctly.</p> 182 */ 183 public static boolean areEqual(CatalogQualifiedName a, CatalogQualifiedName b) { 184 if (a == null || b == null) { 185 return a == b; 186 } 187 if (a.kind() != b.kind() || a.vendor() != b.vendor() || a.size() != b.size()) { 188 return false; 189 } 190 // Tier 1: compare normalized forms (honors IdentifierConfig used at parse time). 191 if (a.normalized().equals(b.normalized())) { 192 return true; 193 } 194 // Tier 2: route through the legacy Collator-aware static path for COLLATION_BASED dialects. 195 for (int i = 0; i < a.size(); i++) { 196 ESQLDataObjectType type = kindForSegment(a.kind(), i, a.size()); 197 String aSeg = a.raw().get(i); 198 String bSeg = b.raw().get(i); 199 if (!IdentifierService.areEqualStatic(a.vendor(), type, aSeg, bSeg)) { 200 return false; 201 } 202 } 203 return true; 204 } 205 206 /** Normalize a single identifier segment per {@link IdentifierConfig}'s vendor. */ 207 public static String normalize(String raw, IdentifierConfig cfg) { 208 if (raw == null) return null; 209 if (cfg == null) { 210 throw new IllegalArgumentException("CatalogIdentifierPolicy.normalize: cfg is required"); 211 } 212 return serviceFor(cfg, cfg.vendor()).normalize(raw, ESQLDataObjectType.dotUnknown); 213 } 214 215 /** Normalize a segment with explicit quoted-ness (advisory). */ 216 public static String normalizeSegment(String raw, boolean wasQuoted, IdentifierConfig cfg) { 217 Objects.requireNonNull(cfg, "cfg"); 218 // The IdentifierService.normalize(...) method inspects raw for quote characters 219 // itself, so wasQuoted is treated as advisory metadata. Retained for symmetry with 220 // the plan §9.2 API and to surface intent at call-sites. 221 return serviceFor(cfg, cfg.vendor()).normalize(raw, ESQLDataObjectType.dotUnknown); 222 } 223 224 /** 225 * Public façade over {@link #serviceFor(IdentifierConfig, EDbVendor)} for callers 226 * outside this package (validator, readers, providers) that need an 227 * {@link IdentifierService} matching a {@link IdentifierConfig}. Vendor defaults 228 * to {@code cfg.vendor()} when {@code vendor} is {@code null}. 229 */ 230 public static IdentifierService identifierServiceFor(IdentifierConfig cfg, EDbVendor vendor) { 231 if (cfg == null) { 232 throw new IllegalArgumentException( 233 "CatalogIdentifierPolicy.identifierServiceFor: cfg is required"); 234 } 235 return serviceFor(cfg, vendor != null ? vendor : cfg.vendor()); 236 } 237 238 // ---------- internals ---------- 239 240 /** 241 * Build an {@link IdentifierService} that honors the supplied {@link IdentifierConfig}. 242 * Falls back to the cached default service when the config matches vendor defaults 243 * (avoids re-allocating on every parse for the common case). 244 */ 245 static IdentifierService serviceFor(IdentifierConfig cfg, EDbVendor vendorOverride) { 246 EDbVendor vendor = vendorOverride != null ? vendorOverride : cfg.vendor(); 247 IdentifierProfile.VendorFlags flags = vendorFlagsFrom(cfg); 248 IdentifierProfile profile = IdentifierProfile.forVendor(vendor, flags); 249 CollatorProvider collatorProvider = 250 (vendor == EDbVendor.dbvmssql || vendor == EDbVendor.dbvazuresql) 251 ? new CollatorProvider() 252 : null; 253 return new IdentifierService(profile, collatorProvider); 254 } 255 256 static IdentifierProfile.VendorFlags vendorFlagsFrom(IdentifierConfig cfg) { 257 int mysqlLctn = cfg.mysqlLowerCaseTableNames() != null 258 ? cfg.mysqlLowerCaseTableNames() : 0; 259 String collation = cfg.mssqlCollation() != null 260 ? cfg.mssqlCollation() : "SQL_Latin1_General_CP1_CI_AS"; 261 return new IdentifierProfile.VendorFlags(mysqlLctn, collation, false, false); 262 } 263 264 /** 265 * For a qualified name with overall {@code kind}, return the {@link ESQLDataObjectType} 266 * appropriate for the segment at {@code segmentIndex} within a name of {@code total} 267 * segments. 268 * 269 * <p>Layouts assumed:</p> 270 * <ul> 271 * <li>4 segments → server.catalog.schema.object</li> 272 * <li>3 segments → catalog.schema.object</li> 273 * <li>2 segments → schema.object (or catalog.object on dialects without schemas)</li> 274 * <li>1 segment → object</li> 275 * </ul> 276 */ 277 static ESQLDataObjectType kindForSegment(CatalogObjectKind kind, int segmentIndex, int total) { 278 int distanceFromEnd = total - 1 - segmentIndex; 279 if (distanceFromEnd == 0) { 280 return mapKind(kind); 281 } 282 if (distanceFromEnd == 1) { 283 // Parent of the leaf: 284 // * for COLUMN / INDEX / CONSTRAINT, the parent is a TABLE 285 // * for TABLE / VIEW / MATERIALIZED_VIEW / ROUTINE / FUNCTION / PROCEDURE / 286 // PACKAGE / SYNONYM / SEQUENCE / TRIGGER / TYPE, the parent is a SCHEMA 287 switch (kind) { 288 case COLUMN: 289 case INDEX: 290 case CONSTRAINT: 291 return ESQLDataObjectType.dotTable; 292 default: 293 return ESQLDataObjectType.dotSchema; 294 } 295 } 296 if (distanceFromEnd == 2) { 297 // Grandparent: 298 // * for COLUMN, grandparent is SCHEMA 299 // * else CATALOG 300 switch (kind) { 301 case COLUMN: 302 case INDEX: 303 case CONSTRAINT: 304 return ESQLDataObjectType.dotSchema; 305 default: 306 return ESQLDataObjectType.dotCatalog; 307 } 308 } 309 // distanceFromEnd >= 3: server / catalog level 310 if (distanceFromEnd == 3 && (kind == CatalogObjectKind.COLUMN 311 || kind == CatalogObjectKind.INDEX || kind == CatalogObjectKind.CONSTRAINT)) { 312 return ESQLDataObjectType.dotCatalog; 313 } 314 return ESQLDataObjectType.dotUnknown; 315 } 316 317 /** 318 * Map a {@link CatalogObjectKind} to its leaf {@link ESQLDataObjectType}. Internal — 319 * use {@link #kindForSegment} when handling multi-segment names. 320 */ 321 static ESQLDataObjectType mapKind(CatalogObjectKind kind) { 322 switch (kind) { 323 case CATALOG: return ESQLDataObjectType.dotCatalog; 324 case SCHEMA: return ESQLDataObjectType.dotSchema; 325 case TABLE: 326 case VIEW: 327 case MATERIALIZED_VIEW: return ESQLDataObjectType.dotTable; 328 case COLUMN: return ESQLDataObjectType.dotColumn; 329 case ROUTINE: return ESQLDataObjectType.dotRoutine; 330 case FUNCTION: return ESQLDataObjectType.dotFunction; 331 case PROCEDURE: return ESQLDataObjectType.dotProcedure; 332 case PACKAGE: return ESQLDataObjectType.dotOraclePackage; 333 case SYNONYM: return ESQLDataObjectType.dotSynonyms; 334 case TYPE: return ESQLDataObjectType.dotDataType; 335 case TRIGGER: return ESQLDataObjectType.dotTrigger; 336 case SEQUENCE: 337 case INDEX: 338 case CONSTRAINT: 339 default: return ESQLDataObjectType.dotUnknown; 340 } 341 } 342 343 /** Output of {@link #splitSegments}: parallel raw-segment list plus quoted-flag bits. */ 344 static final class SegmentSplit { 345 final List<String> segments; 346 final BitSet quoted; 347 348 SegmentSplit(List<String> segments, BitSet quoted) { 349 this.segments = segments; 350 this.quoted = quoted; 351 } 352 } 353 354 /** 355 * Split a qualified name into raw segments preserving any surrounding quote characters 356 * so {@link CatalogQualifiedName} can record per-segment quoted flags faithfully. 357 * 358 * <p>BigQuery special-case: a single backtick-delimited path that contains dots 359 * (e.g., {@code `project.dataset.table`}) is split internally, mirroring the 360 * convention also used by {@code SQLUtil.parseNames}.</p> 361 */ 362 static SegmentSplit splitSegments(String raw, EDbVendor vendor) { 363 // BigQuery wraps the whole path in backticks: split internally, mark unquoted. 364 if (vendor == EDbVendor.dbvbigquery 365 && raw.length() >= 2 366 && raw.charAt(0) == '`' 367 && raw.charAt(raw.length() - 1) == '`' 368 && raw.indexOf('.') > 0 369 && countUnescaped(raw, '`') == 2) { 370 String inner = raw.substring(1, raw.length() - 1); 371 return splitOnDot(inner, false); 372 } 373 List<String> out = new ArrayList<String>(); 374 BitSet quotedFlags = new BitSet(); 375 StringBuilder current = new StringBuilder(); 376 boolean currentQuoted = false; 377 char quoteChar = 0; 378 boolean inBracket = false; 379 for (int i = 0; i < raw.length(); i++) { 380 char c = raw.charAt(i); 381 if (quoteChar != 0) { 382 current.append(c); 383 if (c == quoteChar) { 384 if (i + 1 < raw.length() && raw.charAt(i + 1) == quoteChar) { 385 current.append(raw.charAt(i + 1)); 386 i++; 387 } else { 388 quoteChar = 0; 389 } 390 } 391 } else if (inBracket) { 392 current.append(c); 393 if (c == ']') { 394 inBracket = false; 395 } 396 } else if (c == '"' || c == '`') { 397 current.append(c); 398 quoteChar = c; 399 currentQuoted = true; 400 } else if (c == '[' && (vendor == EDbVendor.dbvmssql || vendor == EDbVendor.dbvazuresql)) { 401 current.append(c); 402 inBracket = true; 403 currentQuoted = true; 404 } else if (c == '.') { 405 if (currentQuoted) quotedFlags.set(out.size()); 406 out.add(current.toString()); 407 current.setLength(0); 408 currentQuoted = false; 409 } else { 410 current.append(c); 411 } 412 } 413 if (currentQuoted) quotedFlags.set(out.size()); 414 out.add(current.toString()); 415 return new SegmentSplit(out, quotedFlags); 416 } 417 418 private static SegmentSplit splitOnDot(String inner, boolean markQuoted) { 419 List<String> out = new ArrayList<String>(); 420 BitSet quoted = new BitSet(); 421 int start = 0; 422 for (int i = 0; i < inner.length(); i++) { 423 if (inner.charAt(i) == '.') { 424 String seg = inner.substring(start, i); 425 if (markQuoted) quoted.set(out.size()); 426 out.add(seg); 427 start = i + 1; 428 } 429 } 430 String last = inner.substring(start); 431 if (markQuoted) quoted.set(out.size()); 432 out.add(last); 433 return new SegmentSplit(out, quoted); 434 } 435 436 private static int countUnescaped(String s, char ch) { 437 int n = 0; 438 for (int i = 0; i < s.length(); i++) { 439 if (s.charAt(i) == ch) { 440 if (i + 1 < s.length() && s.charAt(i + 1) == ch) { 441 i++; 442 } else { 443 n++; 444 } 445 } 446 } 447 return n; 448 } 449}