001package gudusoft.gsqlparser.pp2.token; 002 003import gudusoft.gsqlparser.ETokenType; 004import gudusoft.gsqlparser.TSourceToken; 005import gudusoft.gsqlparser.TSourceTokenList; 006import gudusoft.gsqlparser.pp2.FormatDiagnostic; 007 008import java.util.ArrayList; 009import java.util.Collections; 010import java.util.List; 011 012/** 013 * Byte-authority for a pp2 format call. Every character of the input is 014 * owned by exactly one {@link SourceSpan}. 015 * 016 * <p>This is the Codex Round-3 insight that makes "no byte ever lost" a 017 * concrete guarantee. {@link Pp2TokenStreamBuilder} (S7) is allowed to 018 * normalize whitespace and CRLF because the ledger preserves the raw 019 * bytes; the assembler (S15) can restore them when emitting output. 020 * 021 * <h2>Coverage invariant</h2> 022 * 023 * <p>For every ledger {@code l} produced by {@link #build(String, TSourceTokenList)}: 024 * <pre> 025 * l.getSpans().get(0).getStartOffset() == 0 026 * l.getSpans().get(N-1).getEndOffset() == input.length() 027 * span[i].getEndOffset() == span[i+1].getStartOffset() // no gaps, no overlaps 028 * Σ span.getText() == original input // bytes recovered 029 * </pre> 030 * 031 * <h2>RAW_FALLBACK spans</h2> 032 * 033 * <p>Any range the tokenizer did not cover — usually a vendor-parser 034 * tokenization gap, occasionally an unclosed string the lexer truncated — 035 * is recorded as a {@link SourceSpan.Kind#RAW_FALLBACK} span with the 036 * verbatim source text. The engine emits a {@code FormatDiagnostic} for 037 * each such span so the result advertises the recovery. Legal SQL on the 038 * five major vendors produces zero {@code RAW_FALLBACK} spans (verified 039 * by {@code SourceSpanLedgerTest}). 040 * 041 * <p>Plan reference: §7.3/S8, §7.4/S8, §10.3 (the construction sketch). 042 */ 043public final class SourceSpanLedger { 044 045 private final String source; 046 private final List<SourceSpan> spans; 047 private final int rawFallbackCount; 048 private final List<FormatDiagnostic> diagnostics; 049 050 private SourceSpanLedger(String source, List<SourceSpan> spans, 051 int rawFallbackCount, 052 List<FormatDiagnostic> diagnostics) { 053 this.source = source; 054 this.spans = spans; 055 this.rawFallbackCount = rawFallbackCount; 056 this.diagnostics = diagnostics; 057 } 058 059 /** 060 * Build a ledger from the original SQL string and the parser's token 061 * list. Tokens are visited in {@code offset} order. Malformed tokenizer 062 * output — negative offsets, tokens past the input boundary, or 063 * overlapping/out-of-order spans (observed with CRLF line endings after an 064 * unbalanced construct on Windows / git autocrlf) — is <b>tolerated</b>, 065 * not fatal: out-of-range tokens are skipped or clamped, overlaps are 066 * clamped to the running cursor, and a {@link FormatDiagnostic} records 067 * each recovery. This keeps the fault-tolerant contract (no throw; every 068 * input byte still owned by exactly one span via RAW_FALLBACK gap filling). 069 * 070 * @throws NullPointerException if {@code source} or {@code tokens} is null 071 */ 072 public static SourceSpanLedger build(String source, TSourceTokenList tokens) { 073 if (source == null) throw new NullPointerException("source"); 074 if (tokens == null) throw new NullPointerException("tokens"); 075 076 // Diagnostics are collected across both passes (the bounds pass below 077 // and the coverage walk) so every tolerated malformation is observable. 078 List<FormatDiagnostic> diags = new ArrayList<FormatDiagnostic>(); 079 080 // Collect (offset, length, kind, text) tuples sorted by offset so 081 // the coverage walk is deterministic even if the lexer produced 082 // tokens out of source order (it shouldn't, but). 083 // 084 // A fault-tolerant formatter must never crash on malformed tokenizer 085 // output. The GSP lexer can emit tokens with offsets/lengths that 086 // violate the TSourceTokenList contract on adverse input — notably 087 // CRLF line endings (Windows / git autocrlf) after an unbalanced or 088 // truncated construct, which produce out-of-order / overlapping spans. 089 // Rather than throw (which would abort the whole format via Pp2Engine's 090 // top-level catch and surface as FAILED), we tolerate these: skip a 091 // token whose offset is negative or past the input, and clamp one that 092 // runs past the input boundary, recording a diagnostic each time. Any 093 // bytes a skipped token would have covered are picked up by the 094 // RAW_FALLBACK gap/tail filling below, so no input byte is lost. All 095 // bounds arithmetic uses long to avoid integer overflow on a 096 // pathologically large offset. 097 List<int[]> idx = new ArrayList<int[]>(tokens.size()); 098 for (int i = 0; i < tokens.size(); i++) { 099 TSourceToken t = tokens.get(i); 100 if (t == null) { 101 throw new NullPointerException("tokens[" + i + "]"); 102 } 103 String text = t.toString(); 104 if (text == null || text.isEmpty()) { 105 // Empty tokens (Removed sentinels, etc.) contribute no bytes; 106 // skip rather than attempt to anchor them. 107 continue; 108 } 109 long startL = t.offset; 110 long endL = startL + text.length(); 111 if (startL < 0 || startL >= source.length()) { 112 diags.add(new FormatDiagnostic( 113 FormatDiagnostic.Severity.WARNING, 0, source.length(), 114 "token at index " + i + " has out-of-range offset " 115 + startL + " (input length " + source.length() 116 + ") — skipped, bytes recovered as RAW_FALLBACK")); 117 continue; 118 } 119 int start = (int) startL; 120 int end; 121 if (endL > source.length()) { 122 end = source.length(); 123 diags.add(new FormatDiagnostic( 124 FormatDiagnostic.Severity.WARNING, start, source.length(), 125 "token at index " + i + " (offset " + start + ", length " 126 + text.length() + ") extends past input length " 127 + source.length() + " — clamped")); 128 } else { 129 end = (int) endL; 130 } 131 idx.add(new int[] {start, end, i}); 132 } 133 // Sort by start offset (stable, ties broken by tokenIndex). 134 idx.sort((a, b) -> { 135 if (a[0] != b[0]) return Integer.compare(a[0], b[0]); 136 return Integer.compare(a[2], b[2]); 137 }); 138 139 List<SourceSpan> spans = new ArrayList<SourceSpan>(idx.size() * 2 + 2); 140 int cursor = 0; 141 int rawCount = 0; 142 for (int[] entry : idx) { 143 int start = entry[0]; 144 int end = entry[1]; 145 TSourceToken t = tokens.get(entry[2]); 146 147 if (start < cursor) { 148 // Fully contained in an already-emitted span is the GSP 149 // lexer "${name}" quirk — see TokenCoverage Javadoc. 150 if (TokenCoverage.isFullyShadowed(t, cursor)) { 151 continue; 152 } 153 // Adds no new bytes (ends at or before the cursor): skip; the 154 // bytes are already owned by an earlier span. 155 if (end <= cursor) { 156 continue; 157 } 158 // Partial overlap (start < cursor < end). On adverse input the 159 // GSP lexer can emit overlapping spans (e.g. CRLF after an 160 // unbalanced construct). A fault-tolerant formatter must not 161 // throw: clamp this token's start to the cursor so the coverage 162 // invariant (no overlaps) holds, emit only its non-overlapping 163 // tail [cursor, end], and record a diagnostic. The overlapping 164 // prefix [start, cursor] is already covered by the previous 165 // span, so reconstruction (Σ span.text == source) still holds. 166 diags.add(new FormatDiagnostic( 167 FormatDiagnostic.Severity.WARNING, 168 start, end, 169 "overlapping token at index " + entry[2] + " (offset " 170 + start + ", end " + end + ") clamped to [" + cursor 171 + ", " + end + "] — tokenizer span overlap tolerated")); 172 SourceSpan.Kind clampedKind = classifyToken(t.tokentype); 173 spans.add(new SourceSpan(cursor, end, clampedKind, 174 source.substring(cursor, end))); 175 cursor = end; 176 continue; 177 } 178 if (start > cursor) { 179 // Tokenizer gap: cover with a RAW_FALLBACK span and emit a 180 // diagnostic so the result advertises the recovery. 181 spans.add(new SourceSpan(cursor, start, 182 SourceSpan.Kind.RAW_FALLBACK, 183 source.substring(cursor, start))); 184 diags.add(new FormatDiagnostic( 185 FormatDiagnostic.Severity.WARNING, 186 cursor, start, 187 "tokenizer gap recovered as RAW_FALLBACK (" 188 + (start - cursor) + " chars)")); 189 rawCount++; 190 } 191 SourceSpan.Kind kind = classifyToken(t.tokentype); 192 // Normally a token's text equals its source bytes, but if its end 193 // was clamped (out-of-range, above) the token text is longer than 194 // its span; use the source slice so reconstruction (Σ span.text == 195 // source) holds exactly. 196 String spanText = t.toString(); 197 if (spanText.length() != end - start) { 198 spanText = source.substring(start, end); 199 } 200 spans.add(new SourceSpan(start, end, kind, spanText)); 201 cursor = end; 202 } 203 if (cursor < source.length()) { 204 // Trailing un-tokenized bytes: trailing whitespace usually shows 205 // up as ttwhitespace tokens, but if anything was missed we 206 // record it as RAW_FALLBACK rather than truncate. 207 String tail = source.substring(cursor); 208 // Classify trailing whitespace correctly so it doesn't trip the 209 // "zero RAW_FALLBACK on legal SQL" smoke. A trailing newline 210 // outside any tokenized span is TRIVIA, not RAW_FALLBACK. 211 SourceSpan.Kind kind = isPureWhitespace(tail) 212 ? SourceSpan.Kind.TRIVIA 213 : SourceSpan.Kind.RAW_FALLBACK; 214 spans.add(new SourceSpan(cursor, source.length(), kind, tail)); 215 if (kind == SourceSpan.Kind.RAW_FALLBACK) { 216 diags.add(new FormatDiagnostic( 217 FormatDiagnostic.Severity.WARNING, 218 cursor, source.length(), 219 "trailing tokenizer gap recovered as RAW_FALLBACK (" 220 + (source.length() - cursor) + " chars)")); 221 rawCount++; 222 } 223 } 224 225 return new SourceSpanLedger(source, 226 Collections.unmodifiableList(spans), 227 rawCount, 228 Collections.unmodifiableList(diags)); 229 } 230 231 /** 232 * Classify a token type into a {@link SourceSpan.Kind}. Whitespace is 233 * {@link SourceSpan.Kind#TRIVIA}; comments, string literals, and 234 * quoted identifiers are {@link SourceSpan.Kind#PROTECTED}; everything 235 * else is {@link SourceSpan.Kind#TOKEN}. Slice S9 extends this with 236 * finer-grained protected-zone detection (hints, NO_FORMAT blocks). 237 */ 238 public static SourceSpan.Kind classifyToken(ETokenType type) { 239 if (type == null) return SourceSpan.Kind.TOKEN; 240 switch (type) { 241 case ttwhitespace: 242 case ttreturn: 243 return SourceSpan.Kind.TRIVIA; 244 case ttsimplecomment: 245 case ttbracketedcomment: 246 case ttCPPComment: 247 case ttsqstring: 248 case ttdqstring: 249 case ttdbstring: 250 case ttbrstring: 251 return SourceSpan.Kind.PROTECTED; 252 default: 253 return SourceSpan.Kind.TOKEN; 254 } 255 } 256 257 private static boolean isPureWhitespace(String s) { 258 for (int i = 0; i < s.length(); i++) { 259 char c = s.charAt(i); 260 if (c != ' ' && c != '\t' && c != '\n' && c != '\r' 261 && c != '\f' && c != 0x0B) { 262 return false; 263 } 264 } 265 return true; 266 } 267 268 /** The original input the ledger was built from. */ 269 public String getSource() { 270 return source; 271 } 272 273 /** Unmodifiable, byte-order-stable list of every span. */ 274 public List<SourceSpan> getSpans() { 275 return spans; 276 } 277 278 /** Number of {@link SourceSpan.Kind#RAW_FALLBACK} spans in the ledger. */ 279 public int getRawFallbackCount() { 280 return rawFallbackCount; 281 } 282 283 /** 284 * Diagnostics emitted while building the ledger. Currently this list 285 * has one {@link FormatDiagnostic.Severity#WARNING} entry per 286 * {@link SourceSpan.Kind#RAW_FALLBACK} span; the engine appends these 287 * to the per-call {@code Pp2FormatResult.diagnostics} list (S16). 288 * Returns an unmodifiable view. 289 */ 290 public List<FormatDiagnostic> getDiagnostics() { 291 return diagnostics; 292 } 293 294 /** Convenience: rebuild the source by concatenating every span's text. */ 295 public String reconstruct() { 296 StringBuilder sb = new StringBuilder(source.length()); 297 for (SourceSpan s : spans) sb.append(s.getText()); 298 return sb.toString(); 299 } 300 301 @Override 302 public String toString() { 303 return "SourceSpanLedger{spans=" + spans.size() 304 + " rawFallback=" + rawFallbackCount 305 + " sourceLen=" + source.length() + "}"; 306 } 307}