001package gudusoft.gsqlparser.resolver2.namespace; 002 003import gudusoft.gsqlparser.nodes.TResultColumn; 004import gudusoft.gsqlparser.nodes.TResultColumnList; 005import gudusoft.gsqlparser.nodes.TTable; 006import gudusoft.gsqlparser.resolver2.ColumnLevel; 007import gudusoft.gsqlparser.resolver2.matcher.INameMatcher; 008import gudusoft.gsqlparser.resolver2.matcher.VendorNameMatcher; 009import gudusoft.gsqlparser.resolver2.model.ColumnSource; 010import gudusoft.gsqlparser.sqlenv.ESQLDataObjectType; 011import gudusoft.gsqlparser.stmt.TSelectSqlStatement; 012 013import java.util.*; 014 015/** 016 * Namespace representing a UNION/INTERSECT/EXCEPT query. 017 * 018 * Key characteristics: 019 * - Schema is defined by the FIRST branch (SQL standard) 020 * - Columns must be pushed down to ALL branches 021 * - hasStarColumn() returns true if ANY branch has SELECT * 022 * - addInferredColumn() propagates to ALL branches 023 * 024 * Example: 025 * FROM ( 026 * SELECT * FROM table_1 027 * UNION ALL 028 * SELECT * FROM table_2 029 * UNION ALL 030 * SELECT * FROM table_3 031 * ) Combined 032 * 033 * When outer query references "col_1", it should be pushed to ALL branches. 034 */ 035public class UnionNamespace extends AbstractNamespace { 036 037 private final TSelectSqlStatement unionQuery; 038 private final String alias; 039 040 /** All branches of the UNION (flattened) */ 041 private final List<TSelectSqlStatement> allBranches; 042 043 /** Namespace for each branch */ 044 private final List<SubqueryNamespace> branchNamespaces; 045 046 /** Inferred columns from star push-down */ 047 private Map<String, ColumnSource> inferredColumns; 048 049 /** Track inferred column names */ 050 private Set<String> inferredColumnNames; 051 052 public UnionNamespace(TSelectSqlStatement unionQuery, 053 String alias, 054 INameMatcher nameMatcher) { 055 super(unionQuery, nameMatcher); 056 this.unionQuery = unionQuery; 057 this.alias = alias; 058 059 // Flatten all UNION branches 060 this.allBranches = new ArrayList<>(); 061 flattenUnionBranches(unionQuery, allBranches); 062 063 // Create namespace for each branch 064 this.branchNamespaces = new ArrayList<>(); 065 for (int i = 0; i < allBranches.size(); i++) { 066 TSelectSqlStatement branch = allBranches.get(i); 067 SubqueryNamespace branchNs = new SubqueryNamespace(branch, "branch_" + i, nameMatcher); 068 branchNamespaces.add(branchNs); 069 } 070 } 071 072 /** 073 * Iteratively flatten UNION branches into a list. 074 * Handles nested UNION structures like: (A UNION B) UNION C 075 * Uses explicit stack to avoid StackOverflow on deeply nested chains. 076 */ 077 private void flattenUnionBranches(TSelectSqlStatement stmt, List<TSelectSqlStatement> branches) { 078 if (stmt == null) { 079 return; 080 } 081 082 Deque<TSelectSqlStatement> stack = new ArrayDeque<>(); 083 stack.push(stmt); 084 085 while (!stack.isEmpty()) { 086 TSelectSqlStatement current = stack.pop(); 087 if (current == null) { 088 continue; 089 } 090 if (current.isCombinedQuery()) { 091 // Push right first so left is processed first (LIFO) 092 stack.push(current.getRightStmt()); 093 stack.push(current.getLeftStmt()); 094 } else { 095 branches.add(current); 096 } 097 } 098 } 099 100 @Override 101 public String getDisplayName() { 102 return alias != null ? alias : "<union>"; 103 } 104 105 @Override 106 public TTable getFinalTable() { 107 // For UNION, return the first branch's final table 108 // This is used for single-table resolution 109 if (!branchNamespaces.isEmpty()) { 110 return branchNamespaces.get(0).getFinalTable(); 111 } 112 return null; 113 } 114 115 @Override 116 public List<TTable> getAllFinalTables() { 117 // Return tables from ALL branches 118 List<TTable> allTables = new ArrayList<>(); 119 for (SubqueryNamespace branchNs : branchNamespaces) { 120 branchNs.validate(); 121 List<TTable> branchTables = branchNs.getAllFinalTables(); 122 allTables.addAll(branchTables); 123 } 124 return allTables; 125 } 126 127 @Override 128 protected void doValidate() { 129 // Extract columns from first branch's SELECT list (SQL standard) 130 columnSources = new LinkedHashMap<>(); 131 132 if (allBranches.isEmpty()) { 133 return; 134 } 135 136 // Validate all branch namespaces 137 for (SubqueryNamespace branchNs : branchNamespaces) { 138 branchNs.validate(); 139 } 140 141 // Get columns from first branch (defines schema) 142 TSelectSqlStatement firstBranch = allBranches.get(0); 143 TResultColumnList selectList = firstBranch.getResultColumnList(); 144 if (selectList == null) { 145 return; 146 } 147 148 for (int i = 0; i < selectList.size(); i++) { 149 TResultColumn resultCol = selectList.getResultColumn(i); 150 String colName = getColumnName(resultCol); 151 if (colName == null) { 152 colName = "col_" + (i + 1); 153 } 154 155 // For each column position, collect tables from ALL branches. 156 // UNION/MINUS columns are matched by POSITION, not by name. 157 // For each branch, check if the result column at that position is a simple column reference. 158 // If so, include all tables from that branch as potential sources. 159 List<TTable> columnTables = new ArrayList<>(); 160 161 for (int branchIdx = 0; branchIdx < allBranches.size(); branchIdx++) { 162 TSelectSqlStatement branch = allBranches.get(branchIdx); 163 TResultColumnList branchSelectList = branch.getResultColumnList(); 164 if (branchSelectList == null || i >= branchSelectList.size()) { 165 continue; 166 } 167 168 TResultColumn branchResultCol = branchSelectList.getResultColumn(i); 169 if (branchResultCol == null || branchResultCol.getExpr() == null) { 170 continue; 171 } 172 173 // Check if this result column is a simple column reference (not NULL or expression) 174 gudusoft.gsqlparser.nodes.TExpression expr = branchResultCol.getExpr(); 175 if (expr.getExpressionType() == gudusoft.gsqlparser.EExpressionType.simple_object_name_t) { 176 gudusoft.gsqlparser.nodes.TObjectName objName = expr.getObjectOperand(); 177 if (objName != null) { 178 // Get the column name at this position in the branch 179 String branchColName = objName.getColumnNameOnly(); 180 181 // For UNION data lineage, only include tables where the column name 182 // in this branch matches the column name from the first branch. 183 // This prevents incorrect associations like CDS_APP.bankcode when 184 // CDS_APP branch actually has c_mandant at that position. 185 boolean columnNameMatches = nameMatcher.matches(branchColName, colName); 186 187 // First try to get the source table from the column reference itself 188 TTable sourceTable = objName.getSourceTable(); 189 if (sourceTable != null && !columnTables.contains(sourceTable) && columnNameMatches) { 190 // Phase 1 resolved this column - add if column name matches 191 columnTables.add(sourceTable); 192 } else if (sourceTable == null && columnNameMatches) { 193 // If sourceTable is not set (Phase 1 resolution hasn't happened), 194 // check if there's a qualified reference (e.g., t.col) 195 String tableQualifier = objName.getTableString(); 196 if (tableQualifier != null && !tableQualifier.isEmpty() && branch.tables != null) { 197 // Qualified column - try to find the table by alias or name. 198 // Cleanup follow-up to S15: route through the per-dialect 199 // {@link gudusoft.gsqlparser.resolver2.matcher.INameMatcher}, 200 // and when the matcher is a {@link VendorNameMatcher}, route 201 // through {@link ESQLDataObjectType#dotTable} explicitly. The 202 // 2-arg {@code matches(...)} default binds to {@code dotColumn} 203 // inside {@code VendorNameMatcher}, which produces wrong 204 // results for BigQuery / MySQL on table-qualifier compares 205 // (BigQuery: tables SENSITIVE / columns INSENSITIVE; MySQL: 206 // tables depend on lower_case_table_names, columns always 207 // INSENSITIVE). Same routing pattern as ListBasedScope and 208 // CTEScope below. 209 for (int ti = 0; ti < branch.tables.size(); ti++) { 210 TTable t = branch.tables.getTable(ti); 211 if (t != null) { 212 String alias = t.getAliasName(); 213 String name = t.getTableName() != null ? t.getTableName().toString() : null; 214 if ((alias != null && tableNameMatches(alias, tableQualifier)) || 215 (name != null && tableNameMatches(name, tableQualifier))) { 216 if (!columnTables.contains(t)) { 217 columnTables.add(t); 218 } 219 break; 220 } 221 } 222 } 223 } else if (branch.tables != null) { 224 // Unqualified column reference - add only the first non-subquery/join table 225 for (int ti = 0; ti < branch.tables.size(); ti++) { 226 TTable t = branch.tables.getTable(ti); 227 if (t != null && !columnTables.contains(t)) { 228 // Skip subqueries and joins - they're not final tables 229 if (t.getTableType() != gudusoft.gsqlparser.ETableSource.subquery && 230 t.getTableType() != gudusoft.gsqlparser.ETableSource.join) { 231 columnTables.add(t); 232 break; // Only add the first table for unqualified columns 233 } 234 } 235 } 236 } 237 } 238 } 239 } 240 // For non-column expressions (NULL, functions, etc.), don't add any tables 241 } 242 243 // Create column source with candidateTables from branches that have the column 244 // Always pass the list (even if empty) so getAllFinalTables() knows we explicitly 245 // determined the candidate tables rather than needing to delegate to namespace 246 ColumnSource source = new ColumnSource( 247 this, 248 colName, 249 resultCol, 250 1.0, 251 "union_column", 252 null, // overrideTable 253 columnTables // Pass empty list when no tables match 254 ); 255 256 columnSources.put(colName, source); 257 } 258 } 259 260 /** 261 * Extract column name from TResultColumn. 262 */ 263 private String getColumnName(TResultColumn resultCol) { 264 // Check for alias 265 if (resultCol.getAliasClause() != null && 266 resultCol.getAliasClause().getAliasName() != null) { 267 return resultCol.getAliasClause().getAliasName().toString(); 268 } 269 270 // Check for simple column reference 271 if (resultCol.getExpr() != null) { 272 gudusoft.gsqlparser.nodes.TExpression expr = resultCol.getExpr(); 273 if (expr.getExpressionType() == gudusoft.gsqlparser.EExpressionType.simple_object_name_t) { 274 gudusoft.gsqlparser.nodes.TObjectName objName = expr.getObjectOperand(); 275 if (objName != null) { 276 return objName.getColumnNameOnly(); 277 } 278 } 279 } 280 281 return null; 282 } 283 284 @Override 285 public ColumnLevel hasColumn(String columnName) { 286 ensureValidated(); 287 288 // Check in explicit columns from first branch (raw-keyed map; matcher-aware lookup 289 // routes through INameMatcher / IdentifierService so per-dialect rules apply). 290 if (containsColumnByMatcher(columnSources, columnName)) { 291 return ColumnLevel.EXISTS; 292 } 293 294 // Check in inferred columns. The map is raw-keyed (= ColumnSource. 295 // exposedName) — see addInferredColumn for why round-2 reverted from 296 // normalized keys. The matcher-aware helper handles per-dialect case 297 // rules including SQL Server COLLATION_BASED. 298 if (containsColumnByMatcher(inferredColumns, columnName)) { 299 return ColumnLevel.EXISTS; 300 } 301 302 // If any branch has SELECT *, unknown columns MAYBE exist 303 if (hasStarColumn()) { 304 return ColumnLevel.MAYBE; 305 } 306 307 return ColumnLevel.NOT_EXISTS; 308 } 309 310 /** 311 * Binding-diagnostic view of the set-operation output schema. 312 * 313 * <p>SQL exposes set-operation columns using the first branch's output 314 * names. Branch-local names from later operands are not visible to an 315 * outer query. When any branch contains {@code SELECT *}, the output 316 * shape is not authoritative for missing-output diagnostics.</p> 317 */ 318 public ColumnLevel hasAuthoritativeOutputColumn(String columnName) { 319 ensureValidated(); 320 321 if (columnName == null || columnName.isEmpty()) { 322 return ColumnLevel.MAYBE; 323 } 324 325 if (containsColumnByMatcher(columnSources, columnName)) { 326 return ColumnLevel.EXISTS; 327 } 328 329 if (containsColumnByMatcher(inferredColumns, columnName)) { 330 return ColumnLevel.EXISTS; 331 } 332 333 if (hasStarColumn()) { 334 return ColumnLevel.MAYBE; 335 } 336 337 if (columnSources != null && !columnSources.isEmpty()) { 338 return ColumnLevel.NOT_EXISTS; 339 } 340 341 return ColumnLevel.MAYBE; 342 } 343 344 /** 345 * Cleanup follow-up to S15: route table-qualifier compares through 346 * {@link ESQLDataObjectType#dotTable} when the matcher is vendor-aware. 347 * Without this, the 2-arg {@link INameMatcher#matches(String, String)} 348 * defaults to {@code dotColumn} semantics inside {@link VendorNameMatcher}, 349 * producing wrong results on BigQuery / MySQL. 350 */ 351 private boolean tableNameMatches(String storedName, String tableQualifier) { 352 if (nameMatcher instanceof VendorNameMatcher) { 353 return ((VendorNameMatcher) nameMatcher).matches(storedName, tableQualifier, ESQLDataObjectType.dotTable); 354 } 355 return nameMatcher.matches(storedName, tableQualifier); 356 } 357 358 @Override 359 public ColumnSource resolveColumn(String columnName) { 360 ensureValidated(); 361 362 // First check explicit columns from first branch 363 ColumnSource source = super.resolveColumn(columnName); 364 if (source != null) { 365 return source; 366 } 367 368 // Then check inferred columns. Slice S1 + codex round 2: the map is 369 // raw-keyed (= ColumnSource.exposedName), so the exact-match probe 370 // is O(1) for the common case where the same identifier is queried 371 // again. For case-only-different references, the matcher loop walks 372 // values via getExposedName() so quote state is preserved. 373 if (inferredColumns != null) { 374 ColumnSource exact = inferredColumns.get(columnName); 375 if (exact != null) { 376 return exact; 377 } 378 for (ColumnSource entry : inferredColumns.values()) { 379 String exposed = entry != null ? entry.getExposedName() : null; 380 if (exposed != null && nameMatcher.matches(exposed, columnName)) { 381 return entry; 382 } 383 } 384 } 385 386 // If has star column, auto-infer this column 387 if (hasStarColumn()) { 388 boolean added = addInferredColumn(columnName, 0.8, "auto_inferred_from_outer_reference"); 389 if (added && inferredColumns != null) { 390 ColumnSource inferredSource = inferredColumns.get(columnName); 391 if (inferredSource != null) { 392 return inferredSource; 393 } 394 } 395 } 396 397 return null; 398 } 399 400 @Override 401 public TSelectSqlStatement getSelectStatement() { 402 return unionQuery; 403 } 404 405 @Override 406 public boolean hasStarColumn() { 407 // Returns true if ANY branch has SELECT * 408 for (SubqueryNamespace branchNs : branchNamespaces) { 409 if (branchNs.hasStarColumn()) { 410 return true; 411 } 412 } 413 return false; 414 } 415 416 @Override 417 public boolean supportsDynamicInference() { 418 return hasStarColumn(); 419 } 420 421 /** 422 * Slice S4 (plan §5.5): a set-operation's output schema is defined by the 423 * first branch (SQL standard, see class javadoc). The schema is 424 * authoritative once validated AND the first branch projects named 425 * columns; if any branch contains an unresolved {@code SELECT *} we treat 426 * the output as METADATA_UNAVAILABLE because position-based column 427 * matching across branches has not yet completed. S11 will refine this 428 * once arity-mismatch handling lands. 429 */ 430 @Override 431 public MetadataState getMetadataState() { 432 ensureValidated(); 433 if (columnSources == null || columnSources.isEmpty() || hasStarColumn()) { 434 return MetadataState.METADATA_UNAVAILABLE; 435 } 436 return MetadataState.FOUND; 437 } 438 439 @Override 440 public boolean addInferredColumn(String columnName, double confidence, String evidence) { 441 if (columnName == null || columnName.isEmpty()) { 442 return false; 443 } 444 445 // Initialize maps if needed 446 if (inferredColumns == null) { 447 inferredColumns = new LinkedHashMap<>(); 448 } 449 if (inferredColumnNames == null) { 450 inferredColumnNames = new HashSet<>(); 451 } 452 453 // Slice S1: dedupe checks must respect per-vendor identifier rules so 454 // that case-only-different inputs do not leave the map with two entries 455 // (one of which is then non-deterministically picked by the matcher 456 // loops in hasColumn / resolveColumn). 457 // 458 // Codex round 2 caveat: the dedupe gate must use {@link 459 // #containsColumnByMatcher} (which walks ColumnSource.exposedName for 460 // value compares) and the storage key must be the original-cased 461 // identifier — NOT the normalized form. Two matcher-distinct 462 // identifiers can normalize to the same key (e.g. Postgres {@code 463 // "mycol"} and unquoted {@code MYCOL} both normalize to {@code 464 // mycol}), so a normalized-keyed map cannot represent both. Raw 465 // (exposedName) keys avoid this collision. 466 if (containsColumnByMatcher(columnSources, columnName)) { 467 return false; 468 } 469 if (containsColumnByMatcher(inferredColumns, columnName)) { 470 return false; 471 } 472 473 // Collect final tables from ALL branches that support dynamic inference (have SELECT *) 474 // For data lineage, we need to track that columns could come from any UNION branch 475 // The formatter will output all candidates for UNION columns when isCandidatesFromUnion is true 476 java.util.List<TTable> candidateTables = new java.util.ArrayList<>(); 477 for (SubqueryNamespace branchNs : branchNamespaces) { 478 // Only collect from branches that could have inferred columns 479 if (!branchNs.supportsDynamicInference()) { 480 continue; 481 } 482 branchNs.validate(); 483 java.util.List<TTable> branchTables = branchNs.getAllFinalTables(); 484 for (TTable table : branchTables) { 485 if (table != null && !candidateTables.contains(table)) { 486 candidateTables.add(table); 487 } 488 } 489 } 490 491 // Create inferred column source for this union namespace WITH candidate tables 492 ColumnSource source = new ColumnSource( 493 this, 494 columnName, 495 null, 496 confidence, 497 evidence, 498 null, // overrideTable is null for UNION columns 499 candidateTables.isEmpty() ? null : candidateTables 500 ); 501 502 inferredColumns.put(columnName, source); 503 inferredColumnNames.add(columnName); 504 505 if (gudusoft.gsqlparser.TBaseType.DUMP_RESOLVER_LOG_TO_CONSOLE) { 506 System.out.println("[UnionNamespace] Added '" + columnName + "' to " + alias + 507 ", propagating to " + branchNamespaces.size() + " branches"); 508 } 509 510 // CRITICAL: Propagate to branches that support dynamic inference (have SELECT *) 511 // Only propagate to branches with star columns - branches with explicit column lists 512 // either have the column explicitly or don't have it at all. 513 for (SubqueryNamespace branchNs : branchNamespaces) { 514 // Only propagate to branches that can accept inferred columns 515 if (!branchNs.supportsDynamicInference()) { 516 if (gudusoft.gsqlparser.TBaseType.DUMP_RESOLVER_LOG_TO_CONSOLE) { 517 System.out.println("[UnionNamespace] Skipping branch " + branchNs.getDisplayName() + 518 " (no star column)"); 519 } 520 continue; 521 } 522 if (gudusoft.gsqlparser.TBaseType.DUMP_RESOLVER_LOG_TO_CONSOLE) { 523 System.out.println("[UnionNamespace] Propagating '" + columnName + "' to branch " + branchNs.getDisplayName()); 524 } 525 branchNs.addInferredColumn(columnName, confidence, evidence + "_union_propagate"); 526 } 527 528 return true; 529 } 530 531 @Override 532 public Set<String> getInferredColumns() { 533 if (inferredColumnNames == null) { 534 return Collections.emptySet(); 535 } 536 return Collections.unmodifiableSet(inferredColumnNames); 537 } 538 539 /** 540 * Get all branch namespaces. 541 * Useful for external code that needs to iterate over branches. 542 */ 543 public List<SubqueryNamespace> getBranchNamespaces() { 544 return Collections.unmodifiableList(branchNamespaces); 545 } 546 547 /** 548 * Get all branch SELECT statements. 549 */ 550 public List<TSelectSqlStatement> getAllBranches() { 551 return Collections.unmodifiableList(allBranches); 552 } 553 554 /** 555 * Get the number of UNION branches. 556 */ 557 public int getBranchCount() { 558 return allBranches.size(); 559 } 560 561 @Override 562 public String toString() { 563 int totalColumns = (columnSources != null ? columnSources.size() : 0) + 564 (inferredColumns != null ? inferredColumns.size() : 0); 565 return "UnionNamespace(" + getDisplayName() + 566 ", branches=" + allBranches.size() + 567 ", columns=" + totalColumns + 568 ", inferred=" + (inferredColumns != null ? inferredColumns.size() : 0) + ")"; 569 } 570}