@@ -2,6 +2,7 @@ package parser_test
22
33import (
44 "context"
5+ "encoding/hex"
56 "encoding/json"
67 "flag"
78 "os"
@@ -14,6 +15,29 @@ import (
1415 "github.com/sqlc-dev/doubleclick/parser"
1516)
1617
// hexEscapeRegex matches a run of one or more consecutive \xNN escape
// sequences, where NN is exactly two hexadecimal digits.
var hexEscapeRegex = regexp.MustCompile(`(\\x[0-9A-Fa-f]{2})+`)

// decodeHexEscapes replaces every \xNN escape sequence in s with the raw byte
// it denotes, so text written with hex escapes compares equal to its decoded
// form. Consecutive escapes are decoded together as one run, which keeps
// multi-byte sequences (e.g. UTF-8 encoded characters) intact. Anything that
// is not a well-formed \xNN escape (such as \xZZ or a truncated \x4) is left
// untouched.
//
// The function works on the text as a whole and does not track string
// literals or preceding backslashes.
func decodeHexEscapes(s string) string {
	return hexEscapeRegex.ReplaceAllStringFunc(s, func(match string) string {
		// A match is a sequence of \xNN groups, so dropping every \x prefix
		// leaves an even-length string of hex digits that decodes in one call.
		decoded, err := hex.DecodeString(strings.ReplaceAll(match, `\x`, ""))
		if err != nil {
			// Not expected given the pattern; keep the original text on error.
			return match
		}
		return string(decoded)
	})
}
40+
1741// whitespaceRegex matches sequences of whitespace characters
1842var whitespaceRegex = regexp .MustCompile (`\s+` )
1943
@@ -33,6 +57,105 @@ var numericUnderscoreRegex = regexp.MustCompile(`(\d)_(\d)`)
3357// backtickIdentRegex normalizes backtick identifiers to unquoted
3458var backtickIdentRegex = regexp .MustCompile ("`([^`]+)`" )
3559
// normalizeEscapesInStrings rewrites escape sequences that appear inside
// single-quoted string literals so that differently escaped spellings of the
// same literal compare equal:
//   - \'  becomes ''  (backslash-escaped quote to the SQL-standard doubled quote)
//   - \\  becomes \   (escaped backslash to a single backslash)
//
// A doubled quote ('') inside a literal is kept and does not end the literal.
// Text outside single-quoted literals is copied through unchanged, and so is
// any other backslash sequence inside a literal.
func normalizeEscapesInStrings(s string) string {
	var out strings.Builder
	out.Grow(len(s))
	n := len(s)
	pos := 0
	for pos < n {
		// Every byte seen at this level is outside a literal and is copied
		// as-is; an opening quote additionally switches to literal scanning.
		opening := s[pos]
		out.WriteByte(opening)
		pos++
		if opening != '\'' {
			continue
		}
	literal:
		for pos < n {
			c := s[pos]
			hasNext := pos+1 < n
			switch {
			case c == '\\' && hasNext && s[pos+1] == '\'':
				// \' -> ''
				out.WriteString("''")
				pos += 2
			case c == '\\' && hasNext && s[pos+1] == '\\':
				// \\ -> \
				out.WriteByte('\\')
				pos += 2
			case c == '\'' && hasNext && s[pos+1] == '\'':
				// Doubled quote: an escaped quote, so the literal continues.
				out.WriteString("''")
				pos += 2
			case c == '\'':
				// Lone quote: the literal ends here.
				out.WriteByte(c)
				pos++
				break literal
			default:
				out.WriteByte(c)
				pos++
			}
		}
	}
	return out.String()
}
108+
// normalizeCommasOutsideStrings removes all space characters that directly
// follow a comma, as long as the comma is outside a quoted region. A quoted
// region starts at ' or " and ends at the next occurrence of the same quote
// character that is neither doubled ('' or "") nor preceded by a backslash.
// Everything inside a quoted region, including commas and the spaces after
// them, is copied through unchanged. An unterminated quoted region extends to
// the end of s.
func normalizeCommasOutsideStrings(s string) string {
	var result strings.Builder
	result.Grow(len(s))
	// quote is 0 while outside a quoted region; otherwise it holds the quote
	// character that opened the current region.
	var quote byte
	for i := 0; i < len(s); {
		ch := s[i]
		switch {
		case quote == 0 && ch == ',':
			result.WriteByte(ch)
			i++
			// Drop the whole run of spaces after the comma, not just one.
			for i < len(s) && s[i] == ' ' {
				i++
			}
		case quote == 0:
			if ch == '\'' || ch == '"' {
				quote = ch
			}
			result.WriteByte(ch)
			i++
		case ch == quote && i+1 < len(s) && s[i+1] == quote:
			// Doubled quote ('' or "") is an escaped quote; stay in the region.
			result.WriteByte(ch)
			result.WriteByte(s[i+1])
			i += 2
		case ch == quote:
			// Closing quote.
			quote = 0
			result.WriteByte(ch)
			i++
		case ch == '\\' && i+1 < len(s):
			// Backslash escape: keep both bytes so an escaped quote cannot
			// close the region.
			result.WriteByte(ch)
			result.WriteByte(s[i+1])
			i += 2
		default:
			result.WriteByte(ch)
			i++
		}
	}
	return result.String()
}
158+
36159// normalizeForFormat normalizes SQL for format comparison by collapsing
37160// whitespace, normalizing spaces around operators, and stripping trailing
38161// semicolons. This allows comparing formatted output regardless of whitespace
@@ -41,12 +164,32 @@ func normalizeForFormat(s string) string {
41164 normalized := normalizeWhitespace (s )
42165 // Normalize spaces around operators (remove spaces)
43166 normalized = operatorSpaceRegex .ReplaceAllString (normalized , "$1" )
167+ // Normalize commas: remove spaces after commas outside of strings
168+ normalized = normalizeCommasOutsideStrings (normalized )
169+ // Normalize backslash-escaped quotes to SQL-standard (\' -> '')
170+ normalized = normalizeEscapesInStrings (normalized )
44171 // Remove underscores from numeric literals (100_000 -> 100000)
45172 for numericUnderscoreRegex .MatchString (normalized ) {
46173 normalized = numericUnderscoreRegex .ReplaceAllString (normalized , "$1$2" )
47174 }
48175 // Normalize backtick identifiers to unquoted
49176 normalized = backtickIdentRegex .ReplaceAllString (normalized , "$1" )
177+ // Normalize double-quoted identifiers to unquoted (but not in strings)
178+ // This handles "identifier" -> identifier (e.g., 2 "union" -> 2 union)
179+ normalized = regexp .MustCompile (`(\s)"([^"]+)"` ).ReplaceAllString (normalized , "$1$2" )
180+ // Normalize AS keyword case: as -> AS
181+ normalized = regexp .MustCompile (`\bas\b` ).ReplaceAllString (normalized , "AS" )
182+ // Remove leading zeros from integer literals (077 -> 77)
183+ normalized = regexp .MustCompile (`\b0+(\d+)\b` ).ReplaceAllString (normalized , "$1" )
184+ // Normalize heredocs ($$...$$ -> '...')
185+ normalized = regexp .MustCompile (`\$\$([^$]*)\$\$` ).ReplaceAllString (normalized , "'$1'" )
186+ // Normalize empty tuple () to tuple()
187+ normalized = regexp .MustCompile (`\(\)` ).ReplaceAllString (normalized , "tuple()" )
188+ // Normalize hex string literals x'...' to just '...' (decoded form)
189+ // The formatter outputs the decoded string, so we need to normalize for comparison
190+ normalized = regexp .MustCompile (`[xX]'([^']*)'` ).ReplaceAllString (normalized , "'$1'" )
191+ // Decode hex escape sequences (\xNN -> actual character)
192+ normalized = decodeHexEscapes (normalized )
50193 // Normalize "INNER JOIN" to "JOIN" (they're equivalent) - case insensitive
51194 normalized = regexp .MustCompile (`(?i)\bINNER\s+JOIN\b` ).ReplaceAllString (normalized , "JOIN" )
52195 // Normalize "LEFT OUTER JOIN" to "LEFT JOIN"
@@ -57,8 +200,6 @@ func normalizeForFormat(s string) string {
57200 normalized = regexp .MustCompile (`\bASC\b` ).ReplaceAllString (normalized , "" )
58201 // Normalize "OFFSET n ROWS" to "OFFSET n"
59202 normalized = regexp .MustCompile (`\bOFFSET\s+(\S+)\s+ROWS?\b` ).ReplaceAllString (normalized , "OFFSET $1" )
60- // Normalize escaped backslashes in strings (\\x -> \x)
61- normalized = strings .ReplaceAll (normalized , `\\` , `\` )
62203 // Normalize CROSS JOIN to comma
63204 normalized = strings .ReplaceAll (normalized , "CROSS JOIN" , "," )
64205 // Normalize ENGINE = X to ENGINE X (and engine X to ENGINE X)
@@ -67,6 +208,9 @@ func normalizeForFormat(s string) string {
67208 normalized = regexp .MustCompile (`(?i)\bINSERT\s+INTO\s+TABLE\b` ).ReplaceAllString (normalized , "INSERT INTO" )
68209 // Normalize UNION DISTINCT to UNION (DISTINCT is default)
69210 normalized = regexp .MustCompile (`(?i)\bUNION\s+DISTINCT\b` ).ReplaceAllString (normalized , "UNION" )
211+ // Normalize REGEXP operator to match() function (they're equivalent)
212+ // 'x' REGEXP 'y' -> match('x','y')
213+ normalized = regexp .MustCompile (`('[^']*')\s+REGEXP\s+('[^']*')` ).ReplaceAllString (normalized , "match($1,$2)" )
70214 // Normalize PARTITION BY () to PARTITION BY (for empty ORDER BY)
71215 normalized = regexp .MustCompile (`\bORDER BY \(\)\b` ).ReplaceAllString (normalized , "ORDER BY tuple()" )
72216 // Normalize INSERT INTO table (cols) to have no space before ( (or consistent spacing)
@@ -76,16 +220,18 @@ func normalizeForFormat(s string) string {
76220 normalized = regexp .MustCompile (`(?i)\bWITH\s+TIES\b` ).ReplaceAllString (normalized , "TIES" )
77221 // Normalize parentheses around simple column references in WHERE: (database=...) to database=...
78222 normalized = regexp .MustCompile (`\((\w+)=` ).ReplaceAllString (normalized , "$1=" )
79- // Normalize parentheses around lambda bodies: (x -> (expr)) to (x -> expr)
80- normalized = regexp .MustCompile (`->\s*\(` ).ReplaceAllString (normalized , "-> " )
81- // Now we need to remove extra closing parens, but this is tricky
82- // Let's try a simpler approach: remove redundant parens around IS NULL, IS NOT NULL
83- normalized = regexp .MustCompile (`\((\w+\s+IS\s+NOT\s+NULL)\)` ).ReplaceAllString (normalized , "$1" )
84- normalized = regexp .MustCompile (`\((\w+\s+IS\s+NULL)\)` ).ReplaceAllString (normalized , "$1" )
223+ // Normalize parentheses around single values after operators like NOT
224+ normalized = regexp .MustCompile (`\bNOT\s*\((\d+)\)` ).ReplaceAllString (normalized , "NOT $1" )
225+ normalized = regexp .MustCompile (`\bnot\s*\((\d+)\)` ).ReplaceAllString (normalized , "not $1" )
226+ // Normalize parentheses around IS NULL and IS NOT NULL expressions
227+ // This handles both standalone (x IS NULL) and inside lambdas x -> (x IS NULL)
228+ normalized = regexp .MustCompile (`\((\w+)\s+IS\s+NOT\s+NULL\)` ).ReplaceAllString (normalized , "$1 IS NOT NULL" )
229+ normalized = regexp .MustCompile (`\((\w+)\s+IS\s+NULL\)` ).ReplaceAllString (normalized , "$1 IS NULL" )
85230 // Re-normalize whitespace after replacements
86231 normalized = normalizeWhitespace (normalized )
87- // Strip trailing semicolon if present
88- return strings .TrimSuffix (normalized , ";" )
232+ // Strip trailing semicolon and any spaces before it
233+ normalized = strings .TrimSuffix (strings .TrimSpace (normalized ), ";" )
234+ return strings .TrimSpace (normalized )
89235}
90236
91237// stripComments removes SQL comments from a query string.
0 commit comments