001package gudusoft.gsqlparser.resolver2.inference; 002 003import gudusoft.gsqlparser.nodes.TTable; 004import gudusoft.gsqlparser.resolver2.model.ColumnSource; 005 006import java.util.*; 007 008/** 009 * Engine for inferring column-to-table relationships without metadata. 010 * 011 * <p>The inference engine collects evidence from various sources in the SQL 012 * statement and uses it to infer which columns belong to which tables. 013 * This is particularly useful when: 014 * - Database metadata is not available 015 * - Dealing with SELECT * without schema information 016 * - Analyzing SQL from unknown sources 017 * 018 * <p>Inference process: 019 * 1. Collect evidence from SQL statement (WHERE, JOIN, SELECT, etc.) 020 * 2. Aggregate evidence by table and column 021 * 3. Calculate confidence scores 022 * 4. Generate inferred column sources 023 * 024 * <p>Example: 025 * <pre> 026 * SELECT * FROM employees e 027 * WHERE e.department_id = 10 028 * AND e.salary > 50000 029 * 030 * Inference: 031 * - "department_id" column exists in "employees" (confidence: 0.95) 032 * - "salary" column exists in "employees" (confidence: 0.95) 033 * </pre> 034 */ 035public class InferenceEngine { 036 037 /** Evidence collected for inference */ 038 private final List<InferenceEvidence> evidenceList = new ArrayList<>(); 039 040 /** Inferred columns by table */ 041 private final Map<String, Set<String>> inferredColumnsByTable = new HashMap<>(); 042 043 /** Evidence aggregated by table.column */ 044 private final Map<String, List<InferenceEvidence>> evidenceByColumn = new HashMap<>(); 045 046 /** 047 * Add a piece of evidence for inference. 048 * 049 * @param evidence the evidence to add 050 */ 051 public void addEvidence(InferenceEvidence evidence) { 052 if (evidence == null) { 053 return; 054 } 055 056 evidenceList.add(evidence); 057 058 // Index by table.column 059 String key = makeKey(evidence.getTableName(), evidence.getColumnName()); 060 evidenceByColumn.computeIfAbsent(key, k -> new ArrayList<>()).add(evidence); 061 062 // Track inferred columns 063 inferredColumnsByTable 064 .computeIfAbsent(evidence.getTableName(), k -> new HashSet<>()) 065 .add(evidence.getColumnName()); 066 } 067 068 /** 069 * Add multiple pieces of evidence. 070 * 071 * @param evidences the evidence to add 072 */ 073 public void addAllEvidence(Collection<InferenceEvidence> evidences) { 074 if (evidences != null) { 075 for (InferenceEvidence evidence : evidences) { 076 addEvidence(evidence); 077 } 078 } 079 } 080 081 /** 082 * Get all inferred columns for a table. 083 * 084 * @param tableName the table name 085 * @return set of inferred column names, or empty set if none 086 */ 087 public Set<String> getInferredColumns(String tableName) { 088 Set<String> columns = inferredColumnsByTable.get(tableName); 089 return columns != null ? new HashSet<>(columns) : Collections.emptySet(); 090 } 091 092 /** 093 * Get all evidence for a specific table.column. 094 * 095 * @param tableName the table name 096 * @param columnName the column name 097 * @return list of evidence, or empty list if none 098 */ 099 public List<InferenceEvidence> getEvidence(String tableName, String columnName) { 100 String key = makeKey(tableName, columnName); 101 List<InferenceEvidence> evidence = evidenceByColumn.get(key); 102 return evidence != null ? new ArrayList<>(evidence) : Collections.emptyList(); 103 } 104 105 /** 106 * Calculate the combined confidence for a table.column based on all evidence. 107 * 108 * <p>Combines multiple pieces of evidence using formula: 109 * <pre> 110 * combined = 1 - ∏(1 - conf_i) 111 * </pre> 112 * 113 * This means: 114 * - Multiple pieces of evidence increase confidence 115 * - Evidence is independent (multiplicative combination) 116 * - Result is always in [0, 1] 117 * 118 * @param tableName the table name 119 * @param columnName the column name 120 * @return combined confidence [0.0, 1.0], or 0.0 if no evidence 121 */ 122 public double calculateConfidence(String tableName, String columnName) { 123 List<InferenceEvidence> evidence = getEvidence(tableName, columnName); 124 125 if (evidence.isEmpty()) { 126 return 0.0; 127 } 128 129 // Combine confidence using complementary probability 130 double complementaryProduct = 1.0; 131 for (InferenceEvidence ev : evidence) { 132 complementaryProduct *= (1.0 - ev.getConfidence()); 133 } 134 135 return 1.0 - complementaryProduct; 136 } 137 138 /** 139 * Create an inferred ColumnSource for a table.column. 140 * 141 * @param tableName the table name 142 * @param columnName the column name 143 * @param table the TTable object (may be null if not available) 144 * @return ColumnSource with inferred confidence, or null if no evidence 145 */ 146 public ColumnSource createInferredColumnSource( 147 String tableName, 148 String columnName, 149 TTable table) { 150 151 List<InferenceEvidence> evidence = getEvidence(tableName, columnName); 152 if (evidence.isEmpty()) { 153 return null; 154 } 155 156 double confidence = calculateConfidence(tableName, columnName); 157 158 // Build evidence description 159 StringBuilder evidenceDesc = new StringBuilder(); 160 evidenceDesc.append("inferred from: "); 161 for (int i = 0; i < evidence.size(); i++) { 162 if (i > 0) evidenceDesc.append(", "); 163 evidenceDesc.append(evidence.get(i).getEvidenceType()); 164 } 165 166 // Create a namespace-less ColumnSource (we don't have actual metadata) 167 // This is a marker that indicates the column was inferred 168 return new ColumnSource( 169 null, // namespace not available for inferred columns 170 columnName, 171 evidence.get(0).getSourceNode(), // Use first evidence as source 172 confidence, 173 evidenceDesc.toString() 174 ) { 175 // Override to return the table we inferred for 176 @Override 177 public TTable getFinalTable() { 178 return table; 179 } 180 }; 181 } 182 183 /** 184 * Get all tables that have inferred columns. 185 * 186 * @return set of table names with inferred columns 187 */ 188 public Set<String> getTablesWithInferences() { 189 return new HashSet<>(inferredColumnsByTable.keySet()); 190 } 191 192 /** 193 * Get total number of pieces of evidence collected. 194 * 195 * @return evidence count 196 */ 197 public int getEvidenceCount() { 198 return evidenceList.size(); 199 } 200 201 /** 202 * Get total number of inferred columns across all tables. 203 * 204 * @return inferred column count 205 */ 206 public int getInferredColumnCount() { 207 int count = 0; 208 for (Set<String> columns : inferredColumnsByTable.values()) { 209 count += columns.size(); 210 } 211 return count; 212 } 213 214 /** 215 * Clear all evidence and inferred columns. 216 */ 217 public void clear() { 218 evidenceList.clear(); 219 inferredColumnsByTable.clear(); 220 evidenceByColumn.clear(); 221 } 222 223 /** 224 * Get statistics about the inference engine state. 225 * 226 * @return summary string 227 */ 228 public String getStatistics() { 229 return String.format( 230 "InferenceEngine[tables=%d, columns=%d, evidence=%d]", 231 inferredColumnsByTable.size(), 232 getInferredColumnCount(), 233 evidenceList.size() 234 ); 235 } 236 237 /** 238 * Make a key for indexing table.column evidence. 239 */ 240 private String makeKey(String tableName, String columnName) { 241 return tableName + "." + columnName; 242 } 243 244 @Override 245 public String toString() { 246 return getStatistics(); 247 } 248}