001package gudusoft.gsqlparser.dlineage.impl.powerquery;
002
003import gudusoft.gsqlparser.EDbVendor;
004import gudusoft.gsqlparser.TGSqlParser;
005import gudusoft.gsqlparser.nodes.TParseTreeNode;
006import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryConnectorCall;
007import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryIdentifierRef;
008import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryLetExpr;
009import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryNativeQuery;
010import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryNavChain;
011import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryNavSegment;
012import gudusoft.gsqlparser.nodes.powerquery.TPowerQueryStep;
013import gudusoft.gsqlparser.powerquery.ConnectorCatalog;
014import gudusoft.gsqlparser.stmt.powerquery.TPowerQueryDocumentStmt;
015
016import java.util.ArrayDeque;
017import java.util.ArrayList;
018import java.util.Deque;
019import java.util.HashSet;
020import java.util.List;
021import java.util.Set;
022
023/**
024 * Tier 1 + Tier 2 lineage analyzer for Power Query M documents.
025 *
026 * <p>Given a parsed {@link TPowerQueryDocumentStmt}, walks the step
027 * graph and produces upstream lineage references.  Two patterns are
028 * handled:
029 *
030 * <ul>
031 *   <li>{@code Value.NativeQuery()} steps — decoded SQL is delegated to
032 *       the inner vendor's {@link TGSqlParser} instance and the resulting
033 *       lineage is surfaced via
034 *       {@link PowerQueryLineageResult#getNativeQueryReferences()}.</li>
035 *   <li>Navigation chains {@code source{[Name,Kind]}[Data]} — segments are
036 *       resolved against the connector's expected hierarchy
037 *       ({@link ConnectorCatalog#expectedHierarchy(EDbVendor)}) and a
038 *       synthetic {@code db.schema.table} reference is emitted.</li>
039 * </ul>
040 *
041 * <p>The analyzer is intentionally independent of the full
042 * {@code DataFlowAnalyzer} pipeline — it can be run standalone from
043 * sidecar code or from inside a higher-level lineage aggregator.
044 *
045 * <p><b>Emit policy for navigation refs:</b> every navigation-chain step
046 * produces a {@link PowerQueryLineageResult.NavigationRef}, even when the
047 * chain is only consumed as the context argument to a sibling
048 * {@code Value.NativeQuery} (in which case the true upstream tables are
049 * inside the decoded SQL, and the context-only nav refs are redundant).
050 * The analyzer stays dumb on purpose — callers that care to distinguish
051 * "real" upstreams from context-only narrowing should deduplicate using
052 * the sibling {@link PowerQueryLineageResult#getNativeQueryReferences()}.
053 */
054public class TPowerQueryAnalyzer {
055
056    private final TPowerQueryDocumentStmt stmt;
057    private EDbVendor innerVendorOverride;
058    private EDbVendor explicitInnerVendor;
059
060    public TPowerQueryAnalyzer(TPowerQueryDocumentStmt stmt) {
061        this.stmt = stmt;
062    }
063
064    /** Fallback vendor used only when no connector call is visible in the
065     * document and no {@linkplain #withExplicitInnerVendor(EDbVendor)
066     * explicit vendor} was supplied. */
067    public TPowerQueryAnalyzer withInnerVendorOverride(EDbVendor vendor) {
068        this.innerVendorOverride = vendor;
069        return this;
070    }
071
072    /**
073     * Force the SQL dialect used to parse inner SQL — both the decoded
074     * {@code Value.NativeQuery()} text and navigation-chain synthetic
075     * SELECTs. When set, this takes precedence over connector-based
076     * inference; when {@code null} (default), inference is used.
077     */
078    public TPowerQueryAnalyzer withExplicitInnerVendor(EDbVendor vendor) {
079        this.explicitInnerVendor = vendor;
080        return this;
081    }
082
083    public PowerQueryLineageResult analyze() {
084        PowerQueryLineageResult out = new PowerQueryLineageResult();
085        if (stmt == null) return out;
086        TPowerQueryLetExpr doc = stmt.getLetExpression();
087        if (doc == null) return out;
088
089        for (TPowerQueryStep step : doc.getSteps()) {
090            TParseTreeNode expr = step.getExpression();
091            if (expr instanceof TPowerQueryNativeQuery) {
092                handleNativeQuery(doc, step, (TPowerQueryNativeQuery) expr, out);
093            } else if (expr instanceof TPowerQueryNavChain) {
094                handleNavigationChain(doc, step, (TPowerQueryNavChain) expr, out);
095            }
096        }
097
098        // Propagate warnings emitted during parsing
099        for (String w : stmt.getWarnings()) {
100            out.addWarning(w);
101        }
102
103        return out;
104    }
105
106    // ---------- NativeQuery → inner SQL parser delegation ----------
107
108    private void handleNativeQuery(TPowerQueryLetExpr doc,
109                                   TPowerQueryStep step,
110                                   TPowerQueryNativeQuery nq,
111                                   PowerQueryLineageResult out) {
112        EDbVendor vendor;
113        if (explicitInnerVendor != null) {
114            vendor = explicitInnerVendor;
115        } else {
116            vendor = nq.getInferredInnerVendor();
117            if (vendor == null) {
118                vendor = resolveVendorForStep(doc, nq.getSourceStepName());
119            }
120            if (vendor == null) {
121                vendor = innerVendorOverride;
122            }
123        }
124
125        PowerQueryLineageResult.NativeQueryRef ref =
126                new PowerQueryLineageResult.NativeQueryRef();
127        ref.stepName = step.getName();
128        ref.decodedSql = nq.getDecodedSql();
129        ref.resolvedVendor = vendor;
130
131        if (vendor == null) {
132            out.addWarning("Could not infer vendor for NativeQuery step '"
133                    + step.getName() + "'; SQL left unparsed.");
134            out.addNativeQuery(ref);
135            return;
136        }
137
138        try {
139            TGSqlParser inner = new TGSqlParser(vendor);
140            inner.sqltext = nq.getDecodedSql();
141            int rc = inner.parse();
142            ref.innerParseReturnCode = rc;
143            ref.innerParser = inner;
144        } catch (RuntimeException re) {
145            out.addWarning("Inner SQL parse for step '" + step.getName()
146                    + "' threw: " + re.getMessage());
147        }
148        out.addNativeQuery(ref);
149    }
150
151    // ---------- Navigation chain → synthetic table reference ----------
152
153    private void handleNavigationChain(TPowerQueryLetExpr doc,
154                                       TPowerQueryStep step,
155                                       TPowerQueryNavChain chain,
156                                       PowerQueryLineageResult out) {
157        EDbVendor vendor;
158        if (explicitInnerVendor != null) {
159            vendor = explicitInnerVendor;
160        } else {
161            vendor = resolveVendorForStep(doc, chain.getSourceStepName());
162            if (vendor == null) vendor = innerVendorOverride;
163        }
164
165        PowerQueryLineageResult.NavigationRef ref =
166                new PowerQueryLineageResult.NavigationRef();
167        ref.stepName = step.getName();
168        ref.resolvedVendor = vendor;
169
170        collectChainSegmentsInOrder(doc, chain, ref);
171
172        if (vendor == null) {
173            out.addWarning("Could not infer vendor for navigation step '"
174                    + step.getName() + "'; table reference is raw "
175                    + ref.segments.toString());
176        } else {
177            ref.syntheticSelect = buildSyntheticSelect(vendor, ref);
178        }
179        out.addNavigation(ref);
180    }
181
182    /**
183     * Walks back through prior nav-chain steps and appends their segments
184     * to {@code ref} in connector-hierarchy order (oldest first). Without
185     * this, each nav step only knows its own one segment, so the leaf's
186     * synthetic SELECT would only reference the final name (e.g. just the
187     * view) instead of the full {@code db.schema.view} path implied by
188     * the source chain.
189     */
190    private void collectChainSegmentsInOrder(TPowerQueryLetExpr doc,
191                                             TPowerQueryNavChain leaf,
192                                             PowerQueryLineageResult.NavigationRef ref) {
193        Deque<TPowerQueryNavChain> chainStack = new ArrayDeque<>();
194        Set<String> guard = new HashSet<>();
195        TPowerQueryNavChain current = leaf;
196        while (current != null) {
197            chainStack.push(current);
198            String sourceName = current.getSourceStepName();
199            if (sourceName == null || !guard.add(sourceName) || doc == null) break;
200            TPowerQueryStep sourceStep = doc.findStep(sourceName);
201            if (sourceStep == null) break;
202            TParseTreeNode expr = sourceStep.getExpression();
203            if (expr instanceof TPowerQueryNavChain) {
204                current = (TPowerQueryNavChain) expr;
205            } else {
206                break;
207            }
208        }
209        while (!chainStack.isEmpty()) {
210            TPowerQueryNavChain c = chainStack.pop();
211            for (TPowerQueryNavSegment seg : c.getSegments()) {
212                if (seg == null) continue;
213                ref.addSegment(seg.getKind(), seg.getName());
214            }
215        }
216    }
217
218    private String buildSyntheticSelect(EDbVendor vendor,
219                                        PowerQueryLineageResult.NavigationRef ref) {
220        List<String> hierarchy = ConnectorCatalog.expectedHierarchy(vendor);
221        if (hierarchy.isEmpty() || ref.segments.isEmpty()) return null;
222
223        List<String> parts = new ArrayList<>();
224        for (PowerQueryLineageResult.NamedPart p : ref.segments) {
225            if (p == null || p.name == null) continue;
226            parts.add(p.name);
227        }
228        if (parts.isEmpty()) return null;
229
230        String quoted = quoteIdentifiers(vendor, parts);
231        return "SELECT * FROM " + quoted;
232    }
233
234    private String quoteIdentifiers(EDbVendor vendor, List<String> parts) {
235        switch (vendor) {
236            case dbvmssql: {
237                StringBuilder sb = new StringBuilder();
238                for (int i = 0; i < parts.size(); i++) {
239                    if (i > 0) sb.append('.');
240                    sb.append('[').append(parts.get(i)).append(']');
241                }
242                return sb.toString();
243            }
244            case dbvmysql: {
245                StringBuilder sb = new StringBuilder();
246                for (int i = 0; i < parts.size(); i++) {
247                    if (i > 0) sb.append('.');
248                    sb.append('`').append(parts.get(i)).append('`');
249                }
250                return sb.toString();
251            }
252            case dbvbigquery: {
253                StringBuilder sb = new StringBuilder();
254                sb.append('`');
255                for (int i = 0; i < parts.size(); i++) {
256                    if (i > 0) sb.append('.');
257                    sb.append(parts.get(i));
258                }
259                sb.append('`');
260                return sb.toString();
261            }
262            default: {
263                StringBuilder sb = new StringBuilder();
264                for (int i = 0; i < parts.size(); i++) {
265                    if (i > 0) sb.append('.');
266                    sb.append('"').append(parts.get(i)).append('"');
267                }
268                return sb.toString();
269            }
270        }
271    }
272
273    // ---------- vendor inference ----------
274
275    private EDbVendor resolveVendorForStep(TPowerQueryLetExpr doc, String stepName) {
276        if (stepName == null || doc == null) return null;
277        String current = stepName;
278        java.util.Set<String> guard = new java.util.HashSet<>();
279        while (current != null && guard.add(current)) {
280            TPowerQueryStep step = doc.findStep(current);
281            if (step == null) return null;
282            TParseTreeNode expr = step.getExpression();
283            if (expr instanceof TPowerQueryConnectorCall) {
284                return ((TPowerQueryConnectorCall) expr).getResolvedVendor();
285            }
286            if (expr instanceof TPowerQueryIdentifierRef) {
287                current = ((TPowerQueryIdentifierRef) expr).getName();
288                continue;
289            }
290            if (expr instanceof TPowerQueryNavChain) {
291                current = ((TPowerQueryNavChain) expr).getSourceStepName();
292                continue;
293            }
294            // Opaque expression — try connector detection on raw text fallback.
295            return null;
296        }
297        return null;
298    }
299}