package gudusoft.gsqlparser.pp2.token;

import gudusoft.gsqlparser.ETokenType;
import gudusoft.gsqlparser.TSourceToken;
import gudusoft.gsqlparser.TSourceTokenList;

import java.util.ArrayList;
import java.util.List;

/**
 * Lossless adapter from {@link TSourceTokenList} to {@link Pp2TokenStream}.
 *
 * <p>Implements the Delphi {@code initTokenArray} semantics from
 * {@code gsp_vcl/pp/sqlion.pas}: walk the token list, fold every
 * {@code ttwhitespace} / {@code ttreturn} token into the
 * {@code precedingBlanks} and {@code precedingLinebreaks} counts of the
 * next non-whitespace token. Comments
 * ({@code ttsimplecomment}, {@code ttbracketedcomment}) are <b>not</b>
 * folded — they are first-class tokens in pp2 so downstream stages can
 * preserve, reanchor, or reflow them per {@code CommentPolicy}.
 *
 * <h2>Logical-line semantics</h2>
 *
 * <p>{@code precedingLinebreaks} is a count of <i>logical</i> new lines,
 * not raw characters. {@code "\r\n"} counts as <b>one</b> linebreak (not
 * two), as does a lone {@code "\n"} or a lone {@code "\r"}. Downstream
 * layout rules in S25/S28 treat each logical linebreak as a new visual
 * line; counting CRLF as two would surface as a spurious blank line on
 * Windows-encoded scripts.
 *
 * <p>Byte-exact recovery of mixed line endings is not the spine's job —
 * the {@code SourceSpanLedger} (S8) records every byte of the original
 * input, including the precise CR/LF/CRLF sequence, so the region
 * assembler (S15) can restore them when emitting output.
 *
 * <h2>Reconstruction property (CRLF-normalized)</h2>
 *
 * <p>After normalizing {@code "\r\n"} to {@code "\n"} in the original
 * input, summing across the produced stream:
 * <pre>
 *   Σ ( precedingLinebreaks + precedingBlanks + token.text.length() )
 *     + trailingBlanks + trailingLinebreaks
 *   == normalized input length
 * </pre>
 * The trailing trivia is returned alongside the stream so the source-span
 * ledger (S8) and the region assembler (S15) can place it back into the
 * output.
 *
 * <h2>Counting rules</h2>
 *
 * <ul>
 *   <li>{@code "\r\n"} consumed together → 1 linebreak.</li>
 *   <li>{@code "\r"} ending one folded token immediately followed by a
 *       folded token starting with {@code "\n"} → still 1 linebreak; the
 *       pairing is tracked across token boundaries by
 *       {@link #build(TSourceTokenList)}.</li>
 *   <li>Lone {@code "\n"} → 1 linebreak.</li>
 *   <li>Lone {@code "\r"} → 1 linebreak.</li>
 *   <li>Any other character (space, tab, form-feed, vertical tab, NBSP)
 *       → 1 blank. {@code countWhitespace()} only sees lexer-classified
 *       whitespace tokens, so non-whitespace UTF-8 characters never reach
 *       it.</li>
 * </ul>
 */
public final class Pp2TokenStreamBuilder {

    /**
     * Result of a build: the stream of solid + comment tokens, plus any
     * trailing trivia characters that appeared after the last such token.
     */
    public static final class BuildResult {
        private final Pp2TokenStream stream;
        private final int trailingBlanks;
        private final int trailingLinebreaks;
        private final int totalSourceLength;

        BuildResult(Pp2TokenStream stream, int trailingBlanks,
                    int trailingLinebreaks, int totalSourceLength) {
            this.stream = stream;
            this.trailingBlanks = trailingBlanks;
            this.trailingLinebreaks = trailingLinebreaks;
            this.totalSourceLength = totalSourceLength;
        }

        /** @return the stream of emitted (non-folded) tokens */
        public Pp2TokenStream getStream() { return stream; }

        /** @return number of blank characters folded after the last emitted token */
        public int getTrailingBlanks() { return trailingBlanks; }

        /** @return number of logical linebreaks folded after the last emitted token */
        public int getTrailingLinebreaks() { return trailingLinebreaks; }

        /**
         * Total character count of the source token text the builder
         * processed: the sum of {@code TSourceToken.toString().length()}
         * over every token that was either emitted or folded, whitespace
         * tokens included. Tokens skipped as fully shadowed by an earlier
         * emitted token do not contribute. Reported for diagnostics; the
         * reconstruction property compares the per-token sum against the
         * <i>CRLF-normalized</i> input length, which differs from this raw
         * total whenever the input contains {@code "\r\n"}.
         */
        public int getTotalSourceLength() { return totalSourceLength; }
    }

    /**
     * Build a stream from the given source token list.
     *
     * <p>Whitespace / return tokens are folded into the blank and
     * linebreak counters of the next emitted token; whatever is still
     * pending after the last emitted token is reported as trailing trivia
     * on the result.
     *
     * @param source the lexer output to adapt
     * @return the emitted tokens together with trailing trivia counts and
     *         the raw processed text length
     * @throws NullPointerException if {@code source} is null or contains a
     *                              null element
     */
    public BuildResult build(TSourceTokenList source) {
        if (source == null) throw new NullPointerException("source");
        List<Pp2Token> tokens = new ArrayList<Pp2Token>(source.size());

        int pendingBlanks = 0;
        int pendingLinebreaks = 0;
        int totalSourceLength = 0;
        // True while the most recently folded character was '\r'. If the
        // next folded token begins with '\n', the pair is one logical
        // CRLF whose linebreak has already been counted for the '\r'.
        boolean pendingEndsWithCr = false;
        // GSP lexer quirk (probed for "${name}"): the lexer sometimes emits
        // an outer token covering a region AND phantom inner tokens for
        // the same bytes. Track the last emitted token's source-range end
        // so the shared TokenCoverage helper can skip phantom inner tokens.
        long lastEmittedEnd = -1L;

        for (int i = 0; i < source.size(); i++) {
            TSourceToken t = source.get(i);
            if (t == null) {
                throw new NullPointerException("source[" + i + "] — null entry in TSourceTokenList");
            }
            String text = t.toString();
            int len = text == null ? 0 : text.length();

            // Skip tokens FULLY shadowed by an earlier emitted token.
            // Partial overlap (start shadowed, end extending past) is
            // never silently dropped here — SourceSpanLedger (S8) is the
            // authority that detects partial overlap loudly.
            if (TokenCoverage.isFullyShadowed(t, lastEmittedEnd)) {
                continue;
            }
            totalSourceLength += len;

            if (isFoldable(t.tokentype)) {
                // Accumulate into pending counters; do not emit a Pp2Token.
                int[] counts = countWhitespace(text);
                int breaks = counts[1];
                if (pendingEndsWithCr && len > 0 && text.charAt(0) == '\n') {
                    // The leading '\n' completes a CRLF split across two
                    // tokens; countWhitespace() counted it on its own, so
                    // take that one back to keep CRLF == 1 linebreak.
                    breaks--;
                }
                pendingBlanks += counts[0];
                pendingLinebreaks += breaks;
                if (len > 0) {
                    // Empty tokens leave the CR state untouched.
                    pendingEndsWithCr = text.charAt(len - 1) == '\r';
                }
            } else {
                // Solid or comment token — emit, attaching pending trivia.
                tokens.add(new Pp2Token(t, pendingBlanks, pendingLinebreaks, null));
                pendingBlanks = 0;
                pendingLinebreaks = 0;
                pendingEndsWithCr = false;
                lastEmittedEnd = TokenCoverage.endOffset(t);
            }
        }

        Pp2TokenStream stream = Pp2TokenStream.ofTokens(tokens);
        return new BuildResult(stream, pendingBlanks, pendingLinebreaks,
                totalSourceLength);
    }

    /**
     * Is this token type rolled into preceding-whitespace counters rather
     * than emitted as its own {@code Pp2Token}? Only true whitespace
     * tokens — comments are first-class in pp2.
     *
     * @param type the lexer token type to classify
     * @return {@code true} for {@code ttwhitespace} and {@code ttreturn},
     *         {@code false} for everything else (including {@code null})
     */
    public static boolean isFoldable(ETokenType type) {
        return type == ETokenType.ttwhitespace || type == ETokenType.ttreturn;
    }

    /**
     * Classify each character in a whitespace-token's text as blank vs
     * linebreak. {@code '\n'} and {@code '\r'} are linebreaks, with an
     * adjacent {@code "\r\n"} pair counted once; everything else (space,
     * tab, form-feed, vertical tab, NBSP, ...) is a blank.
     *
     * <p>This method sees a single token's text only; it cannot pair a
     * trailing {@code '\r'} with a {@code '\n'} that begins the following
     * token.
     *
     * @param text the token text; {@code null} or empty yields zero counts
     * @return {@code int[2]} with {@code [0]=blanks, [1]=linebreaks}
     */
    public static int[] countWhitespace(String text) {
        if (text == null || text.isEmpty()) return new int[] {0, 0};
        int blanks = 0;
        int breaks = 0;
        int i = 0;
        int n = text.length();
        while (i < n) {
            char c = text.charAt(i);
            if (c == '\r') {
                breaks++;
                // Consume "\r\n" as a single logical linebreak.
                if (i + 1 < n && text.charAt(i + 1) == '\n') {
                    i += 2;
                } else {
                    i++;
                }
            } else if (c == '\n') {
                breaks++;
                i++;
            } else {
                blanks++;
                i++;
            }
        }
        return new int[] {blanks, breaks};
    }

}