Source code

001package gudusoft.gsqlparser.pp2.token;
002
003import gudusoft.gsqlparser.ETokenType;
004import gudusoft.gsqlparser.TSourceToken;
005import gudusoft.gsqlparser.TSourceTokenList;
006
007import java.util.ArrayList;
008import java.util.List;
009
010/**
011 * Lossless adapter from {@link TSourceTokenList} to {@link Pp2TokenStream}.
012 *
013 * <p>Implements the Delphi {@code initTokenArray} semantics from
014 * {@code gsp_vcl/pp/sqlion.pas}: walk the token list, fold every
015 * {@code ttwhitespace} / {@code ttreturn} token into the
016 * {@code precedingBlanks} and {@code precedingLinebreaks} counts of the
017 * next non-whitespace token. Comments
018 * ({@code ttsimplecomment}, {@code ttbracketedcomment}) are <b>not</b>
019 * folded — they are first-class tokens in pp2 so downstream stages can
020 * preserve, reanchor, or reflow them per {@code CommentPolicy}.
021 *
022 * <h2>Logical-line semantics</h2>
023 *
024 * <p>{@code precedingLinebreaks} is a count of <i>logical</i> new lines,
025 * not raw characters. {@code "\r\n"} counts as <b>one</b> linebreak (not
026 * two), as does a lone {@code "\n"} or a lone {@code "\r"}. Downstream
027 * layout rules in S25/S28 treat each logical linebreak as a new visual
028 * line; counting CRLF as two would surface as a spurious blank line on
029 * Windows-encoded scripts.
030 *
031 * <p>Byte-exact recovery of mixed line endings is not the spine's job —
032 * the {@code SourceSpanLedger} (S8) records every byte of the original
033 * input, including the precise CR/LF/CRLF sequence, so the region
034 * assembler (S15) can restore them when emitting output.
035 *
036 * <h2>Reconstruction property (CRLF-normalized)</h2>
037 *
038 * <p>After normalizing {@code "\r\n"} to {@code "\n"} in the original
039 * input, summing across the produced stream:
040 * <pre>
041 *   Σ ( precedingLinebreaks + precedingBlanks + token.text.length() )
042 *     + trailingBlanks + trailingLinebreaks
043 *   == normalized input length
044 * </pre>
045 * The trailing trivia is returned alongside the stream so the source-span
046 * ledger (S8) and the region assembler (S15) can place it back into the
047 * output.
048 *
049 * <h2>Counting rules</h2>
050 *
051 * <ul>
052 *   <li>{@code "\r\n"} consumed together → 1 linebreak.</li>
053 *   <li>Lone {@code "\n"} → 1 linebreak.</li>
054 *   <li>Lone {@code "\r"} → 1 linebreak.</li>
055 *   <li>Any other character (space, tab, form-feed, vertical tab, NBSP)
056 *       → 1 blank. {@code countWhitespace()} only sees lexer-classified
057 *       whitespace tokens, so non-whitespace UTF-8 characters never reach
058 *       it.</li>
059 * </ul>
060 */
061public final class Pp2TokenStreamBuilder {
062
063    /**
064     * Result of a build: the stream of solid + comment tokens, plus any
065     * trailing trivia characters that appeared after the last such token.
066     */
067    public static final class BuildResult {
068        private final Pp2TokenStream stream;
069        private final int trailingBlanks;
070        private final int trailingLinebreaks;
071        private final int totalSourceLength;
072
073        BuildResult(Pp2TokenStream stream, int trailingBlanks,
074                    int trailingLinebreaks, int totalSourceLength) {
075            this.stream = stream;
076            this.trailingBlanks = trailingBlanks;
077            this.trailingLinebreaks = trailingLinebreaks;
078            this.totalSourceLength = totalSourceLength;
079        }
080
081        public Pp2TokenStream getStream() { return stream; }
082        public int getTrailingBlanks() { return trailingBlanks; }
083        public int getTrailingLinebreaks() { return trailingLinebreaks; }
084
085        /**
086         * Total character count of the source token text the builder saw
087         * (sum of every {@code TSourceToken.toString().length()} including
088         * whitespace tokens). Reported for diagnostics; the reconstruction
089         * property compares the per-token sum against the
090         * <i>CRLF-normalized</i> input length, which differs from this raw
091         * total whenever the input contains {@code "\r\n"}.
092         */
093        public int getTotalSourceLength() { return totalSourceLength; }
094    }
095
096    /**
097     * Build a stream from the given source token list.
098     *
099     * @throws NullPointerException if {@code source} is null or contains a
100     *     null element
101     */
102    public BuildResult build(TSourceTokenList source) {
103        if (source == null) throw new NullPointerException("source");
104        List<Pp2Token> tokens = new ArrayList<Pp2Token>(source.size());
105
106        int pendingBlanks = 0;
107        int pendingLinebreaks = 0;
108        int totalSourceLength = 0;
109        // GSP lexer quirk (probed for "${name}"): the lexer sometimes emits
110        // an outer token covering a region AND phantom inner tokens for
111        // the same bytes. Track the last emitted token's source-range end
112        // so the shared TokenCoverage helper can skip phantom inner tokens.
113        long lastEmittedEnd = -1L;
114
115        for (int i = 0; i < source.size(); i++) {
116            TSourceToken t = source.get(i);
117            if (t == null) {
118                throw new NullPointerException("source[" + i + "] — null entry in TSourceTokenList");
119            }
120            String text = t.toString();
121            int len = text == null ? 0 : text.length();
122
123            // Skip tokens FULLY shadowed by an earlier emitted token.
124            // Partial overlap (start shadowed, end extending past) is
125            // never silently dropped here — SourceSpanLedger (S8) is the
126            // authority that detects partial overlap loudly.
127            if (TokenCoverage.isFullyShadowed(t, lastEmittedEnd)) {
128                continue;
129            }
130            totalSourceLength += len;
131
132            if (isFoldable(t.tokentype)) {
133                // Accumulate into pending counters; do not emit a Pp2Token.
134                int[] counts = countWhitespace(text);
135                pendingBlanks += counts[0];
136                pendingLinebreaks += counts[1];
137            } else {
138                // Solid or comment token — emit, attaching pending trivia.
139                tokens.add(new Pp2Token(t, pendingBlanks, pendingLinebreaks, null));
140                pendingBlanks = 0;
141                pendingLinebreaks = 0;
142                lastEmittedEnd = TokenCoverage.endOffset(t);
143            }
144        }
145
146        Pp2TokenStream stream = Pp2TokenStream.ofTokens(tokens);
147        return new BuildResult(stream, pendingBlanks, pendingLinebreaks,
148            totalSourceLength);
149    }
150
151    /**
152     * Is this token type rolled into preceding-whitespace counters rather
153     * than emitted as its own {@code Pp2Token}? Only true whitespace
154     * tokens — comments are first-class in pp2.
155     */
156    public static boolean isFoldable(ETokenType type) {
157        return type == ETokenType.ttwhitespace || type == ETokenType.ttreturn;
158    }
159
160    /**
161     * Classify each character in a whitespace-token's text as blank vs
162     * linebreak. {@code '\n'} and {@code '\r'} are linebreaks; everything
163     * else (space, tab, form-feed, vertical tab, NBSP, ...) is a blank.
164     *
165     * @return {@code int[2]} with {@code [0]=blanks, [1]=linebreaks}
166     */
167    public static int[] countWhitespace(String text) {
168        if (text == null || text.isEmpty()) return new int[] {0, 0};
169        int blanks = 0;
170        int breaks = 0;
171        int i = 0;
172        int n = text.length();
173        while (i < n) {
174            char c = text.charAt(i);
175            if (c == '\r') {
176                breaks++;
177                // Consume "\r\n" as a single logical linebreak.
178                if (i + 1 < n && text.charAt(i + 1) == '\n') {
179                    i += 2;
180                } else {
181                    i++;
182                }
183            } else if (c == '\n') {
184                breaks++;
185                i++;
186            } else {
187                blanks++;
188                i++;
189            }
190        }
191        return new int[] {blanks, breaks};
192    }
193
194}