001package gudusoft.gsqlparser.parser.powerquery; 002 003/** 004 * Decodes Power Query M-language string escape sequences. 005 * 006 * <p>M uses a mix of embedded-quote doubling and {@code #(...)} character 007 * escapes. The standard forms we must support: 008 * 009 * <pre> 010 * "" → literal " 011 * #(lf) → \n 012 * #(cr) → \r 013 * #(cr,lf) → \r\n 014 * #(tab) → \t 015 * #(XXXX) → Unicode codepoint (4 hex digits) 016 * #(XXXXXXXX) → Unicode codepoint (8 hex digits) 017 * </pre> 018 * 019 * <p>Any unrecognised {@code #(…)} sequence is preserved verbatim in the 020 * output — the plan's "never silent wrong lineage" rule applies: we'd 021 * rather leave a suspicious literal alone than silently produce a wrong 022 * SQL string. Callers can detect such cases via {@link Result#hasWarnings()}. 023 */ 024public final class PowerQueryEscapeDecoder { 025 026 private PowerQueryEscapeDecoder() {} 027 028 public static final class Result { 029 private final String decoded; 030 private final boolean hasWarnings; 031 private final String warning; 032 033 Result(String decoded, String warning) { 034 this.decoded = decoded; 035 this.hasWarnings = warning != null; 036 this.warning = warning; 037 } 038 039 public String getDecoded() { return decoded; } 040 public boolean hasWarnings() { return hasWarnings; } 041 public String getWarning() { return warning; } 042 } 043 044 /** 045 * Decode the raw text <em>inside</em> the surrounding double quotes 046 * of an M string literal. Do not pass the surrounding quotes. 047 */ 048 public static Result decode(String raw) { 049 if (raw == null) return new Result(null, null); 050 StringBuilder out = new StringBuilder(raw.length()); 051 StringBuilder firstWarning = null; 052 053 int i = 0; 054 while (i < raw.length()) { 055 char c = raw.charAt(i); 056 057 if (c == '"' && i + 1 < raw.length() && raw.charAt(i + 1) == '"') { 058 out.append('"'); 059 i += 2; 060 continue; 061 } 062 063 if (c == '#' && i + 1 < raw.length() && raw.charAt(i + 1) == '(') { 064 int close = raw.indexOf(')', i + 2); 065 if (close < 0) { 066 out.append(c); 067 i++; 068 continue; 069 } 070 String body = raw.substring(i + 2, close).trim(); 071 String expanded = expandEscapeBody(body); 072 if (expanded != null) { 073 out.append(expanded); 074 } else { 075 out.append(raw, i, close + 1); 076 if (firstWarning == null) { 077 firstWarning = new StringBuilder(); 078 firstWarning.append("Unrecognised M escape #(").append(body).append(")"); 079 } 080 } 081 i = close + 1; 082 continue; 083 } 084 085 out.append(c); 086 i++; 087 } 088 return new Result(out.toString(), 089 firstWarning == null ? null : firstWarning.toString()); 090 } 091 092 private static String expandEscapeBody(String body) { 093 if (body == null || body.isEmpty()) return null; 094 String lower = body.toLowerCase(java.util.Locale.ROOT); 095 096 StringBuilder sb = new StringBuilder(); 097 for (String part : splitTopLevelCommas(lower)) { 098 String trimmed = part.trim(); 099 switch (trimmed) { 100 case "lf": sb.append('\n'); break; 101 case "cr": sb.append('\r'); break; 102 case "tab": sb.append('\t'); break; 103 default: 104 if (looksLikeHex(trimmed)) { 105 try { 106 int cp = Integer.parseInt(trimmed, 16); 107 sb.appendCodePoint(cp); 108 } catch (NumberFormatException nfe) { 109 return null; 110 } 111 } else { 112 return null; 113 } 114 } 115 } 116 return sb.toString(); 117 } 118 119 private static boolean looksLikeHex(String s) { 120 if (s.isEmpty() || s.length() > 8) return false; 121 for (int i = 0; i < s.length(); i++) { 122 char c = s.charAt(i); 123 if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f'))) return false; 124 } 125 return true; 126 } 127 128 private static java.util.List<String> splitTopLevelCommas(String s) { 129 java.util.List<String> out = new java.util.ArrayList<>(); 130 int start = 0; 131 for (int i = 0; i < s.length(); i++) { 132 if (s.charAt(i) == ',') { 133 out.add(s.substring(start, i)); 134 start = i + 1; 135 } 136 } 137 out.add(s.substring(start)); 138 return out; 139 } 140}