Source code

001package gudusoft.gsqlparser.pp2.token;
002
003import gudusoft.gsqlparser.ETokenType;
004import gudusoft.gsqlparser.TSourceToken;
005import gudusoft.gsqlparser.TSourceTokenList;
006import gudusoft.gsqlparser.pp2.FormatDiagnostic;
007
008import java.util.ArrayList;
009import java.util.Collections;
010import java.util.List;
011
012/**
013 * Byte-authority for a pp2 format call. Every character of the input is
014 * owned by exactly one {@link SourceSpan}.
015 *
016 * <p>This is the Codex Round-3 insight that makes "no byte ever lost" a
017 * concrete guarantee. {@link Pp2TokenStreamBuilder} (S7) is allowed to
018 * normalize whitespace and CRLF because the ledger preserves the raw
019 * bytes; the assembler (S15) can restore them when emitting output.
020 *
021 * <h2>Coverage invariant</h2>
022 *
023 * <p>For every ledger {@code l} produced by {@link #build(String, TSourceTokenList)}:
024 * <pre>
025 *   l.getSpans().get(0).getStartOffset() == 0
026 *   l.getSpans().get(N-1).getEndOffset() == input.length()
027 *   span[i].getEndOffset() == span[i+1].getStartOffset()  // no gaps, no overlaps
028 *   Σ span.getText() == original input                    // bytes recovered
029 * </pre>
030 *
031 * <h2>RAW_FALLBACK spans</h2>
032 *
033 * <p>Any range the tokenizer did not cover — usually a vendor-parser
034 * tokenization gap, occasionally an unclosed string the lexer truncated —
035 * is recorded as a {@link SourceSpan.Kind#RAW_FALLBACK} span with the
036 * verbatim source text. The engine emits a {@code FormatDiagnostic} for
037 * each such span so the result advertises the recovery. Legal SQL on the
038 * five major vendors produces zero {@code RAW_FALLBACK} spans (verified
039 * by {@code SourceSpanLedgerTest}).
040 *
041 * <p>Plan reference: §7.3/S8, §7.4/S8, §10.3 (the construction sketch).
042 */
043public final class SourceSpanLedger {
044
045    private final String source;
046    private final List<SourceSpan> spans;
047    private final int rawFallbackCount;
048    private final List<FormatDiagnostic> diagnostics;
049
050    private SourceSpanLedger(String source, List<SourceSpan> spans,
051                             int rawFallbackCount,
052                             List<FormatDiagnostic> diagnostics) {
053        this.source = source;
054        this.spans = spans;
055        this.rawFallbackCount = rawFallbackCount;
056        this.diagnostics = diagnostics;
057    }
058
059    /**
060     * Build a ledger from the original SQL string and the parser's token
061     * list. Tokens are visited in {@code offset} order. Malformed tokenizer
062     * output — negative offsets, tokens past the input boundary, or
063     * overlapping/out-of-order spans (observed with CRLF line endings after an
064     * unbalanced construct on Windows / git autocrlf) — is <b>tolerated</b>,
065     * not fatal: out-of-range tokens are skipped or clamped, overlaps are
066     * clamped to the running cursor, and a {@link FormatDiagnostic} records
067     * each recovery. This keeps the fault-tolerant contract (no throw; every
068     * input byte still owned by exactly one span via RAW_FALLBACK gap filling).
069     *
070     * @throws NullPointerException if {@code source} or {@code tokens} is null
071     */
072    public static SourceSpanLedger build(String source, TSourceTokenList tokens) {
073        if (source == null) throw new NullPointerException("source");
074        if (tokens == null) throw new NullPointerException("tokens");
075
076        // Diagnostics are collected across both passes (the bounds pass below
077        // and the coverage walk) so every tolerated malformation is observable.
078        List<FormatDiagnostic> diags = new ArrayList<FormatDiagnostic>();
079
080        // Collect (offset, length, kind, text) tuples sorted by offset so
081        // the coverage walk is deterministic even if the lexer produced
082        // tokens out of source order (it shouldn't, but).
083        //
084        // A fault-tolerant formatter must never crash on malformed tokenizer
085        // output. The GSP lexer can emit tokens with offsets/lengths that
086        // violate the TSourceTokenList contract on adverse input — notably
087        // CRLF line endings (Windows / git autocrlf) after an unbalanced or
088        // truncated construct, which produce out-of-order / overlapping spans.
089        // Rather than throw (which would abort the whole format via Pp2Engine's
090        // top-level catch and surface as FAILED), we tolerate these: skip a
091        // token whose offset is negative or past the input, and clamp one that
092        // runs past the input boundary, recording a diagnostic each time. Any
093        // bytes a skipped token would have covered are picked up by the
094        // RAW_FALLBACK gap/tail filling below, so no input byte is lost. All
095        // bounds arithmetic uses long to avoid integer overflow on a
096        // pathologically large offset.
097        List<int[]> idx = new ArrayList<int[]>(tokens.size());
098        for (int i = 0; i < tokens.size(); i++) {
099            TSourceToken t = tokens.get(i);
100            if (t == null) {
101                throw new NullPointerException("tokens[" + i + "]");
102            }
103            String text = t.toString();
104            if (text == null || text.isEmpty()) {
105                // Empty tokens (Removed sentinels, etc.) contribute no bytes;
106                // skip rather than attempt to anchor them.
107                continue;
108            }
109            long startL = t.offset;
110            long endL = startL + text.length();
111            if (startL < 0 || startL >= source.length()) {
112                diags.add(new FormatDiagnostic(
113                    FormatDiagnostic.Severity.WARNING, 0, source.length(),
114                    "token at index " + i + " has out-of-range offset "
115                        + startL + " (input length " + source.length()
116                        + ") — skipped, bytes recovered as RAW_FALLBACK"));
117                continue;
118            }
119            int start = (int) startL;
120            int end;
121            if (endL > source.length()) {
122                end = source.length();
123                diags.add(new FormatDiagnostic(
124                    FormatDiagnostic.Severity.WARNING, start, source.length(),
125                    "token at index " + i + " (offset " + start + ", length "
126                        + text.length() + ") extends past input length "
127                        + source.length() + " — clamped"));
128            } else {
129                end = (int) endL;
130            }
131            idx.add(new int[] {start, end, i});
132        }
133        // Sort by start offset (stable, ties broken by tokenIndex).
134        idx.sort((a, b) -> {
135            if (a[0] != b[0]) return Integer.compare(a[0], b[0]);
136            return Integer.compare(a[2], b[2]);
137        });
138
139        List<SourceSpan> spans = new ArrayList<SourceSpan>(idx.size() * 2 + 2);
140        int cursor = 0;
141        int rawCount = 0;
142        for (int[] entry : idx) {
143            int start = entry[0];
144            int end = entry[1];
145            TSourceToken t = tokens.get(entry[2]);
146
147            if (start < cursor) {
148                // Fully contained in an already-emitted span is the GSP
149                // lexer "${name}" quirk — see TokenCoverage Javadoc.
150                if (TokenCoverage.isFullyShadowed(t, cursor)) {
151                    continue;
152                }
153                // Adds no new bytes (ends at or before the cursor): skip; the
154                // bytes are already owned by an earlier span.
155                if (end <= cursor) {
156                    continue;
157                }
158                // Partial overlap (start < cursor < end). On adverse input the
159                // GSP lexer can emit overlapping spans (e.g. CRLF after an
160                // unbalanced construct). A fault-tolerant formatter must not
161                // throw: clamp this token's start to the cursor so the coverage
162                // invariant (no overlaps) holds, emit only its non-overlapping
163                // tail [cursor, end], and record a diagnostic. The overlapping
164                // prefix [start, cursor] is already covered by the previous
165                // span, so reconstruction (Σ span.text == source) still holds.
166                diags.add(new FormatDiagnostic(
167                    FormatDiagnostic.Severity.WARNING,
168                    start, end,
169                    "overlapping token at index " + entry[2] + " (offset "
170                        + start + ", end " + end + ") clamped to [" + cursor
171                        + ", " + end + "] — tokenizer span overlap tolerated"));
172                SourceSpan.Kind clampedKind = classifyToken(t.tokentype);
173                spans.add(new SourceSpan(cursor, end, clampedKind,
174                    source.substring(cursor, end)));
175                cursor = end;
176                continue;
177            }
178            if (start > cursor) {
179                // Tokenizer gap: cover with a RAW_FALLBACK span and emit a
180                // diagnostic so the result advertises the recovery.
181                spans.add(new SourceSpan(cursor, start,
182                    SourceSpan.Kind.RAW_FALLBACK,
183                    source.substring(cursor, start)));
184                diags.add(new FormatDiagnostic(
185                    FormatDiagnostic.Severity.WARNING,
186                    cursor, start,
187                    "tokenizer gap recovered as RAW_FALLBACK ("
188                        + (start - cursor) + " chars)"));
189                rawCount++;
190            }
191            SourceSpan.Kind kind = classifyToken(t.tokentype);
192            // Normally a token's text equals its source bytes, but if its end
193            // was clamped (out-of-range, above) the token text is longer than
194            // its span; use the source slice so reconstruction (Σ span.text ==
195            // source) holds exactly.
196            String spanText = t.toString();
197            if (spanText.length() != end - start) {
198                spanText = source.substring(start, end);
199            }
200            spans.add(new SourceSpan(start, end, kind, spanText));
201            cursor = end;
202        }
203        if (cursor < source.length()) {
204            // Trailing un-tokenized bytes: trailing whitespace usually shows
205            // up as ttwhitespace tokens, but if anything was missed we
206            // record it as RAW_FALLBACK rather than truncate.
207            String tail = source.substring(cursor);
208            // Classify trailing whitespace correctly so it doesn't trip the
209            // "zero RAW_FALLBACK on legal SQL" smoke. A trailing newline
210            // outside any tokenized span is TRIVIA, not RAW_FALLBACK.
211            SourceSpan.Kind kind = isPureWhitespace(tail)
212                ? SourceSpan.Kind.TRIVIA
213                : SourceSpan.Kind.RAW_FALLBACK;
214            spans.add(new SourceSpan(cursor, source.length(), kind, tail));
215            if (kind == SourceSpan.Kind.RAW_FALLBACK) {
216                diags.add(new FormatDiagnostic(
217                    FormatDiagnostic.Severity.WARNING,
218                    cursor, source.length(),
219                    "trailing tokenizer gap recovered as RAW_FALLBACK ("
220                        + (source.length() - cursor) + " chars)"));
221                rawCount++;
222            }
223        }
224
225        return new SourceSpanLedger(source,
226            Collections.unmodifiableList(spans),
227            rawCount,
228            Collections.unmodifiableList(diags));
229    }
230
231    /**
232     * Classify a token type into a {@link SourceSpan.Kind}. Whitespace is
233     * {@link SourceSpan.Kind#TRIVIA}; comments, string literals, and
234     * quoted identifiers are {@link SourceSpan.Kind#PROTECTED}; everything
235     * else is {@link SourceSpan.Kind#TOKEN}. Slice S9 extends this with
236     * finer-grained protected-zone detection (hints, NO_FORMAT blocks).
237     */
238    public static SourceSpan.Kind classifyToken(ETokenType type) {
239        if (type == null) return SourceSpan.Kind.TOKEN;
240        switch (type) {
241            case ttwhitespace:
242            case ttreturn:
243                return SourceSpan.Kind.TRIVIA;
244            case ttsimplecomment:
245            case ttbracketedcomment:
246            case ttCPPComment:
247            case ttsqstring:
248            case ttdqstring:
249            case ttdbstring:
250            case ttbrstring:
251                return SourceSpan.Kind.PROTECTED;
252            default:
253                return SourceSpan.Kind.TOKEN;
254        }
255    }
256
257    private static boolean isPureWhitespace(String s) {
258        for (int i = 0; i < s.length(); i++) {
259            char c = s.charAt(i);
260            if (c != ' ' && c != '\t' && c != '\n' && c != '\r'
261                && c != '\f' && c != 0x0B) {
262                return false;
263            }
264        }
265        return true;
266    }
267
268    /** The original input the ledger was built from. */
269    public String getSource() {
270        return source;
271    }
272
273    /** Unmodifiable, byte-order-stable list of every span. */
274    public List<SourceSpan> getSpans() {
275        return spans;
276    }
277
278    /** Number of {@link SourceSpan.Kind#RAW_FALLBACK} spans in the ledger. */
279    public int getRawFallbackCount() {
280        return rawFallbackCount;
281    }
282
283    /**
284     * Diagnostics emitted while building the ledger. Currently this list
285     * has one {@link FormatDiagnostic.Severity#WARNING} entry per
286     * {@link SourceSpan.Kind#RAW_FALLBACK} span; the engine appends these
287     * to the per-call {@code Pp2FormatResult.diagnostics} list (S16).
288     * Returns an unmodifiable view.
289     */
290    public List<FormatDiagnostic> getDiagnostics() {
291        return diagnostics;
292    }
293
294    /** Convenience: rebuild the source by concatenating every span's text. */
295    public String reconstruct() {
296        StringBuilder sb = new StringBuilder(source.length());
297        for (SourceSpan s : spans) sb.append(s.getText());
298        return sb.toString();
299    }
300
301    @Override
302    public String toString() {
303        return "SourceSpanLedger{spans=" + spans.size()
304            + " rawFallback=" + rawFallbackCount
305            + " sourceLen=" + source.length() + "}";
306    }
307}