001package gudusoft.gsqlparser.dlineage.impl.powerquery; 002 003import gudusoft.gsqlparser.EDbVendor; 004import gudusoft.gsqlparser.TGSqlParser; 005import gudusoft.gsqlparser.nodes.TParseTreeNode; 006import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryConnectorCall; 007import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryIdentifierRef; 008import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryLetExpr; 009import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryNativeQuery; 010import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryNavChain; 011import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryNavSegment; 012import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryStep; 013import gudusoft.gsqlparser.powerquery.ConnectorCatalog; 014import gudusoft.gsqlparser.stmt.powerquery.TPowerQueryDocumentStmt; 015 016import java.util.ArrayDeque; 017import java.util.ArrayList; 018import java.util.Deque; 019import java.util.HashSet; 020import java.util.List; 021import java.util.Set; 022 023/** 024 * Tier 1 + Tier 2 lineage analyzer for Power Query M documents. 025 * 026 * <p>Given a parsed {@link TPowerQueryDocumentStmt}, walks the step 027 * graph and produces upstream lineage references. Two patterns are 028 * handled: 029 * 030 * <ul> 031 * <li>{@code Value.NativeQuery()} steps — decoded SQL is delegated to 032 * the inner vendor's {@link TGSqlParser} instance and the resulting 033 * lineage is surfaced via 034 * {@link PowerQueryLineageResult#getNativeQueryReferences()}.</li> 035 * <li>Navigation chains {@code source{[Name,Kind]}[Data]} — segments are 036 * resolved against the connector's expected hierarchy 037 * ({@link ConnectorCatalog#expectedHierarchy(EDbVendor)}) and a 038 * synthetic {@code db.schema.table} reference is emitted.</li> 039 * </ul> 040 * 041 * <p>The analyzer is intentionally independent of the full 042 * {@code DataFlowAnalyzer} pipeline — it can be run standalone from 043 * sidecar code or from inside a higher-level lineage aggregator. 044 * 045 * <p><b>Emit policy for navigation refs:</b> every navigation-chain step 046 * produces a {@link PowerQueryLineageResult.NavigationRef}, even when the 047 * chain is only consumed as the context argument to a sibling 048 * {@code Value.NativeQuery} (in which case the true upstream tables are 049 * inside the decoded SQL, and the context-only nav refs are redundant). 050 * The analyzer stays dumb on purpose — callers that care to distinguish 051 * "real" upstreams from context-only narrowing should deduplicate using 052 * the sibling {@link PowerQueryLineageResult#getNativeQueryReferences()}. 053 */ 054public class TPowerQueryAnalyzer { 055 056 private final TPowerQueryDocumentStmt stmt; 057 private EDbVendor innerVendorOverride; 058 private EDbVendor explicitInnerVendor; 059 060 public TPowerQueryAnalyzer(TPowerQueryDocumentStmt stmt) { 061 this.stmt = stmt; 062 } 063 064 /** Fallback vendor used only when no connector call is visible in the 065 * document and no {@linkplain #withExplicitInnerVendor(EDbVendor) 066 * explicit vendor} was supplied. */ 067 public TPowerQueryAnalyzer withInnerVendorOverride(EDbVendor vendor) { 068 this.innerVendorOverride = vendor; 069 return this; 070 } 071 072 /** 073 * Force the SQL dialect used to parse inner SQL — both the decoded 074 * {@code Value.NativeQuery()} text and navigation-chain synthetic 075 * SELECTs. When set, this takes precedence over connector-based 076 * inference; when {@code null} (default), inference is used. 077 */ 078 public TPowerQueryAnalyzer withExplicitInnerVendor(EDbVendor vendor) { 079 this.explicitInnerVendor = vendor; 080 return this; 081 } 082 083 public PowerQueryLineageResult analyze() { 084 PowerQueryLineageResult out = new PowerQueryLineageResult(); 085 if (stmt == null) return out; 086 TPowerQueryLetExpr doc = stmt.getLetExpression(); 087 if (doc == null) return out; 088 089 for (TPowerQueryStep step : doc.getSteps()) { 090 TParseTreeNode expr = step.getExpression(); 091 if (expr instanceof TPowerQueryNativeQuery) { 092 handleNativeQuery(doc, step, (TPowerQueryNativeQuery) expr, out); 093 } else if (expr instanceof TPowerQueryNavChain) { 094 handleNavigationChain(doc, step, (TPowerQueryNavChain) expr, out); 095 } 096 } 097 098 // Propagate warnings emitted during parsing 099 for (String w : stmt.getWarnings()) { 100 out.addWarning(w); 101 } 102 103 return out; 104 } 105 106 // ---------- NativeQuery → inner SQL parser delegation ---------- 107 108 private void handleNativeQuery(TPowerQueryLetExpr doc, 109 TPowerQueryStep step, 110 TPowerQueryNativeQuery nq, 111 PowerQueryLineageResult out) { 112 EDbVendor vendor; 113 if (explicitInnerVendor != null) { 114 vendor = explicitInnerVendor; 115 } else { 116 vendor = nq.getInferredInnerVendor(); 117 if (vendor == null) { 118 vendor = resolveVendorForStep(doc, nq.getSourceStepName()); 119 } 120 if (vendor == null) { 121 vendor = innerVendorOverride; 122 } 123 } 124 125 PowerQueryLineageResult.NativeQueryRef ref = 126 new PowerQueryLineageResult.NativeQueryRef(); 127 ref.stepName = step.getName(); 128 ref.decodedSql = nq.getDecodedSql(); 129 ref.resolvedVendor = vendor; 130 131 if (vendor == null) { 132 out.addWarning("Could not infer vendor for NativeQuery step '" 133 + step.getName() + "'; SQL left unparsed."); 134 out.addNativeQuery(ref); 135 return; 136 } 137 138 try { 139 TGSqlParser inner = new TGSqlParser(vendor); 140 inner.sqltext = nq.getDecodedSql(); 141 int rc = inner.parse(); 142 ref.innerParseReturnCode = rc; 143 ref.innerParser = inner; 144 } catch (RuntimeException re) { 145 out.addWarning("Inner SQL parse for step '" + step.getName() 146 + "' threw: " + re.getMessage()); 147 } 148 out.addNativeQuery(ref); 149 } 150 151 // ---------- Navigation chain → synthetic table reference ---------- 152 153 private void handleNavigationChain(TPowerQueryLetExpr doc, 154 TPowerQueryStep step, 155 TPowerQueryNavChain chain, 156 PowerQueryLineageResult out) { 157 EDbVendor vendor; 158 if (explicitInnerVendor != null) { 159 vendor = explicitInnerVendor; 160 } else { 161 vendor = resolveVendorForStep(doc, chain.getSourceStepName()); 162 if (vendor == null) vendor = innerVendorOverride; 163 } 164 165 PowerQueryLineageResult.NavigationRef ref = 166 new PowerQueryLineageResult.NavigationRef(); 167 ref.stepName = step.getName(); 168 ref.resolvedVendor = vendor; 169 170 collectChainSegmentsInOrder(doc, chain, ref); 171 172 if (vendor == null) { 173 out.addWarning("Could not infer vendor for navigation step '" 174 + step.getName() + "'; table reference is raw " 175 + ref.segments.toString()); 176 } else { 177 ref.syntheticSelect = buildSyntheticSelect(vendor, ref); 178 } 179 out.addNavigation(ref); 180 } 181 182 /** 183 * Walks back through prior nav-chain steps and appends their segments 184 * to {@code ref} in connector-hierarchy order (oldest first). Without 185 * this, each nav step only knows its own one segment, so the leaf's 186 * synthetic SELECT would only reference the final name (e.g. just the 187 * view) instead of the full {@code db.schema.view} path implied by 188 * the source chain. 189 */ 190 private void collectChainSegmentsInOrder(TPowerQueryLetExpr doc, 191 TPowerQueryNavChain leaf, 192 PowerQueryLineageResult.NavigationRef ref) { 193 Deque<TPowerQueryNavChain> chainStack = new ArrayDeque<>(); 194 Set<String> guard = new HashSet<>(); 195 TPowerQueryNavChain current = leaf; 196 while (current != null) { 197 chainStack.push(current); 198 String sourceName = current.getSourceStepName(); 199 if (sourceName == null || !guard.add(sourceName) || doc == null) break; 200 TPowerQueryStep sourceStep = doc.findStep(sourceName); 201 if (sourceStep == null) break; 202 TParseTreeNode expr = sourceStep.getExpression(); 203 if (expr instanceof TPowerQueryNavChain) { 204 current = (TPowerQueryNavChain) expr; 205 } else { 206 break; 207 } 208 } 209 while (!chainStack.isEmpty()) { 210 TPowerQueryNavChain c = chainStack.pop(); 211 for (TPowerQueryNavSegment seg : c.getSegments()) { 212 if (seg == null) continue; 213 ref.addSegment(seg.getKind(), seg.getName()); 214 } 215 } 216 } 217 218 private String buildSyntheticSelect(EDbVendor vendor, 219 PowerQueryLineageResult.NavigationRef ref) { 220 List<String> hierarchy = ConnectorCatalog.expectedHierarchy(vendor); 221 if (hierarchy.isEmpty() || ref.segments.isEmpty()) return null; 222 223 List<String> parts = new ArrayList<>(); 224 for (PowerQueryLineageResult.NamedPart p : ref.segments) { 225 if (p == null || p.name == null) continue; 226 parts.add(p.name); 227 } 228 if (parts.isEmpty()) return null; 229 230 String quoted = quoteIdentifiers(vendor, parts); 231 return "SELECT * FROM " + quoted; 232 } 233 234 private String quoteIdentifiers(EDbVendor vendor, List<String> parts) { 235 switch (vendor) { 236 case dbvmssql: { 237 StringBuilder sb = new StringBuilder(); 238 for (int i = 0; i < parts.size(); i++) { 239 if (i > 0) sb.append('.'); 240 sb.append('[').append(parts.get(i)).append(']'); 241 } 242 return sb.toString(); 243 } 244 case dbvmysql: { 245 StringBuilder sb = new StringBuilder(); 246 for (int i = 0; i < parts.size(); i++) { 247 if (i > 0) sb.append('.'); 248 sb.append('`').append(parts.get(i)).append('`'); 249 } 250 return sb.toString(); 251 } 252 case dbvbigquery: { 253 StringBuilder sb = new StringBuilder(); 254 sb.append('`'); 255 for (int i = 0; i < parts.size(); i++) { 256 if (i > 0) sb.append('.'); 257 sb.append(parts.get(i)); 258 } 259 sb.append('`'); 260 return sb.toString(); 261 } 262 default: { 263 StringBuilder sb = new StringBuilder(); 264 for (int i = 0; i < parts.size(); i++) { 265 if (i > 0) sb.append('.'); 266 sb.append('"').append(parts.get(i)).append('"'); 267 } 268 return sb.toString(); 269 } 270 } 271 } 272 273 // ---------- vendor inference ---------- 274 275 private EDbVendor resolveVendorForStep(TPowerQueryLetExpr doc, String stepName) { 276 if (stepName == null || doc == null) return null; 277 String current = stepName; 278 java.util.Set<String> guard = new java.util.HashSet<>(); 279 while (current != null && guard.add(current)) { 280 TPowerQueryStep step = doc.findStep(current); 281 if (step == null) return null; 282 TParseTreeNode expr = step.getExpression(); 283 if (expr instanceof TPowerQueryConnectorCall) { 284 return ((TPowerQueryConnectorCall) expr).getResolvedVendor(); 285 } 286 if (expr instanceof TPowerQueryIdentifierRef) { 287 current = ((TPowerQueryIdentifierRef) expr).getName(); 288 continue; 289 } 290 if (expr instanceof TPowerQueryNavChain) { 291 current = ((TPowerQueryNavChain) expr).getSourceStepName(); 292 continue; 293 } 294 // Opaque expression — try connector detection on raw text fallback. 295 return null; 296 } 297 return null; 298 } 299}