Skip to content

Commit 5219cf5

Browse files
committed
Move SQL normalization functions to internal/normalize package
Extract normalization code from parser_test.go into a dedicated package:
- DecodeHexEscapes: decode \xNN sequences
- Whitespace: collapse whitespace
- EscapesInStrings: normalize \' to '' and \\ to \
- CommasOutsideStrings: normalize comma spacing
- ForFormat: comprehensive SQL normalization for format comparison
- StripComments: remove SQL comments

Pre-compile regexes for better performance.
1 parent 64df371 commit 5219cf5

File tree

2 files changed

+317
-291
lines changed

2 files changed

+317
-291
lines changed

internal/normalize/normalize.go

Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
// Package normalize provides SQL normalization functions for comparing
2+
// semantically equivalent SQL statements that may differ syntactically.
3+
package normalize
4+
5+
import (
6+
"encoding/hex"
7+
"regexp"
8+
"strings"
9+
)
10+
11+
// Pre-compiled regexes. They are built once at package initialization so the
// normalization functions never compile a pattern per call.
var (
	// whitespaceRegex matches any run of whitespace characters.
	whitespaceRegex = regexp.MustCompile(`\s+`)
	// operatorSpaceRegex captures an operator (runs of = < > !, or :: -> || &&)
	// together with the optional whitespace on both sides of it.
	operatorSpaceRegex = regexp.MustCompile(`\s*([=<>!]+|::|->|\|\||&&)\s*`)
	// numericUnderscoreRegex matches an underscore sitting between two digits.
	numericUnderscoreRegex = regexp.MustCompile(`(\d)_(\d)`)
	// backtickIdentRegex captures the text inside a backtick-quoted identifier.
	backtickIdentRegex = regexp.MustCompile("`([^`]+)`")
	// hexEscapeRegex matches one or more consecutive \xNN escapes.
	hexEscapeRegex = regexp.MustCompile(`(\\x[0-9A-Fa-f]{2})+`)
	// doubleQuotedIdentRegex captures a double-quoted identifier that is
	// preceded by a whitespace character (also captured).
	doubleQuotedIdentRegex = regexp.MustCompile(`(\s)"([^"]+)"`)
	// asKeywordRegex matches the lowercase keyword "as" as a whole word.
	asKeywordRegex = regexp.MustCompile(`\bas\b`)
	// leadingZerosRegex captures the digits following leading zeros of a
	// standalone number.
	leadingZerosRegex = regexp.MustCompile(`\b0+(\d+)\b`)
	// heredocRegex captures the body of a $$...$$ heredoc that contains no '$'.
	heredocRegex = regexp.MustCompile(`\$\$([^$]*)\$\$`)
	// emptyTupleRegex matches an empty pair of parentheses.
	emptyTupleRegex = regexp.MustCompile(`\(\)`)
	// hexStringRegex captures the contents of an x'...' / X'...' literal.
	hexStringRegex = regexp.MustCompile(`[xX]'([^']*)'`)
	// innerJoinRegex matches "INNER JOIN" in any letter case.
	innerJoinRegex = regexp.MustCompile(`(?i)\bINNER\s+JOIN\b`)
	// leftOuterJoinRegex matches "LEFT OUTER JOIN" in any letter case.
	leftOuterJoinRegex = regexp.MustCompile(`(?i)\bLEFT\s+OUTER\s+JOIN\b`)
	// rightOuterJoinRegex matches "RIGHT OUTER JOIN" in any letter case.
	rightOuterJoinRegex = regexp.MustCompile(`(?i)\bRIGHT\s+OUTER\s+JOIN\b`)
	// ascRegex matches the uppercase keyword "ASC" as a whole word.
	ascRegex = regexp.MustCompile(`\bASC\b`)
	// offsetRowsRegex captures n in "OFFSET n ROW" / "OFFSET n ROWS".
	offsetRowsRegex = regexp.MustCompile(`\bOFFSET\s+(\S+)\s+ROWS?\b`)
	// engineEqualsRegex matches "ENGINE =" (any case) with surrounding spaces.
	engineEqualsRegex = regexp.MustCompile(`(?i)\bENGINE\s*=\s*`)
	// insertIntoTableRegex matches "INSERT INTO TABLE" in any letter case.
	insertIntoTableRegex = regexp.MustCompile(`(?i)\bINSERT\s+INTO\s+TABLE\b`)
	// unionDistinctRegex matches "UNION DISTINCT" in any letter case.
	unionDistinctRegex = regexp.MustCompile(`(?i)\bUNION\s+DISTINCT\b`)
	// regexpOperatorRegex captures both single-quoted operands of
	// 'x' REGEXP 'y'.
	regexpOperatorRegex = regexp.MustCompile(`('[^']*')\s+REGEXP\s+('[^']*')`)
	// orderByEmptyRegex matches "ORDER BY ()". There is deliberately no
	// trailing \b: a word boundary after ')' would require a word character
	// to follow, so the pattern could never match a plain "ORDER BY ()".
	orderByEmptyRegex = regexp.MustCompile(`\bORDER BY \(\)`)
	// spaceBeforeParenRegex captures a word and the first word character
	// inside a following parenthesis when whitespace separates them.
	spaceBeforeParenRegex = regexp.MustCompile(`(\w+)\s+\((\w)`)
	// withTiesRegex matches "WITH TIES" in any letter case.
	withTiesRegex = regexp.MustCompile(`(?i)\bWITH\s+TIES\b`)
	// parenColumnEqualsRegex captures the word in "(word=".
	parenColumnEqualsRegex = regexp.MustCompile(`\((\w+)=`)
	// notParenDigitRegex captures the digits in "NOT (123)".
	notParenDigitRegex = regexp.MustCompile(`\bNOT\s*\((\d+)\)`)
	// notLowerParenRegex captures the digits in "not (123)".
	notLowerParenRegex = regexp.MustCompile(`\bnot\s*\((\d+)\)`)
	// isNotNullParenRegex captures the word in "(word IS NOT NULL)".
	isNotNullParenRegex = regexp.MustCompile(`\((\w+)\s+IS\s+NOT\s+NULL\)`)
	// isNullParenRegex captures the word in "(word IS NULL)".
	isNullParenRegex = regexp.MustCompile(`\((\w+)\s+IS\s+NULL\)`)
)
42+
43+
// DecodeHexEscapes decodes \xNN escape sequences in a string to raw bytes.
44+
// This allows comparing strings with hex escapes to decoded strings.
45+
func DecodeHexEscapes(s string) string {
46+
return hexEscapeRegex.ReplaceAllStringFunc(s, func(match string) string {
47+
// Decode all consecutive hex escapes together
48+
var result []byte
49+
for i := 0; i < len(match); i += 4 {
50+
// Each \xNN is 4 characters
51+
if i+4 > len(match) {
52+
break
53+
}
54+
hexStr := match[i+2 : i+4] // Skip \x prefix
55+
b, err := hex.DecodeString(hexStr)
56+
if err != nil || len(b) != 1 {
57+
return match // Return original on error
58+
}
59+
result = append(result, b[0])
60+
}
61+
return string(result)
62+
})
63+
}
64+
65+
// Whitespace collapses all whitespace sequences to a single space
66+
// and trims leading/trailing whitespace.
67+
func Whitespace(s string) string {
68+
return strings.TrimSpace(whitespaceRegex.ReplaceAllString(s, " "))
69+
}
70+
71+
// EscapesInStrings normalizes escape sequences inside single-quoted string
// literals so that differently escaped but equivalent literals compare equal:
//   - \'  becomes ''  (backslash-escaped quote to the SQL-standard form)
//   - \\  becomes \   (escaped backslash to a single backslash)
//
// An existing '' pair is kept as is and does not end the literal. Any other
// backslash sequence, and all text outside string literals, is copied through
// unchanged. An unterminated literal simply runs to the end of the input.
func EscapesInStrings(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	inString := false
	for i := 0; i < len(s); {
		ch := s[i]
		hasNext := i+1 < len(s)
		switch {
		case !inString:
			// Outside a literal only an opening quote changes state.
			if ch == '\'' {
				inString = true
			}
			out.WriteByte(ch)
			i++
		case ch == '\\' && hasNext && s[i+1] == '\'':
			// \' -> ''
			out.WriteString("''")
			i += 2
		case ch == '\\' && hasNext && s[i+1] == '\\':
			// \\ -> \
			out.WriteByte('\\')
			i += 2
		case ch == '\'' && hasNext && s[i+1] == '\'':
			// Already in SQL-standard form; the literal continues.
			out.WriteString("''")
			i += 2
		case ch == '\'':
			// Closing quote.
			inString = false
			out.WriteByte(ch)
			i++
		default:
			out.WriteByte(ch)
			i++
		}
	}
	return out.String()
}
120+
121+
// CommasOutsideStrings drops the single space that directly follows a comma
// whenever that comma lies outside a quoted region. Only one space is
// removed per comma, so callers that want ", " fully normalized should
// collapse whitespace first.
//
// Both '...' and "..." count as quoted regions. Inside a region a doubled
// quote character ('' or "") and any backslash-escaped character are copied
// through without ending the region.
func CommasOutsideStrings(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	var quote byte // active quote character; 0 while outside a quoted region
	for i := 0; i < len(s); {
		ch := s[i]
		hasNext := i+1 < len(s)
		switch {
		case quote == 0 && (ch == '\'' || ch == '"'):
			// Opening quote.
			quote = ch
			out.WriteByte(ch)
			i++
		case quote == 0 && ch == ',' && hasNext && s[i+1] == ' ':
			// Keep the comma, swallow the one space after it.
			out.WriteByte(ch)
			i += 2
		case quote == 0:
			out.WriteByte(ch)
			i++
		case ch == quote && hasNext && s[i+1] == quote:
			// Doubled quote inside the region: an escaped quote.
			out.WriteByte(ch)
			out.WriteByte(s[i+1])
			i += 2
		case ch == quote:
			// Closing quote.
			quote = 0
			out.WriteByte(ch)
			i++
		case ch == '\\' && hasNext:
			// Backslash escape: keep both bytes so an escaped quote does
			// not close the region.
			out.WriteByte(ch)
			out.WriteByte(s[i+1])
			i += 2
		default:
			out.WriteByte(ch)
			i++
		}
	}
	return out.String()
}
170+
171+
// ForFormat normalizes SQL for format comparison by applying various
172+
// normalizations that make semantically equivalent SQL statements match.
173+
// This includes whitespace normalization, operator spacing, escape sequences,
174+
// and various SQL syntax equivalences.
175+
func ForFormat(s string) string {
176+
normalized := Whitespace(s)
177+
// Normalize spaces around operators (remove spaces)
178+
normalized = operatorSpaceRegex.ReplaceAllString(normalized, "$1")
179+
// Normalize commas: remove spaces after commas outside of strings
180+
normalized = CommasOutsideStrings(normalized)
181+
// Normalize backslash-escaped quotes to SQL-standard (\' -> '')
182+
normalized = EscapesInStrings(normalized)
183+
// Remove underscores from numeric literals (100_000 -> 100000)
184+
for numericUnderscoreRegex.MatchString(normalized) {
185+
normalized = numericUnderscoreRegex.ReplaceAllString(normalized, "$1$2")
186+
}
187+
// Normalize backtick identifiers to unquoted
188+
normalized = backtickIdentRegex.ReplaceAllString(normalized, "$1")
189+
// Normalize double-quoted identifiers to unquoted (but not in strings)
190+
// This handles "identifier" -> identifier (e.g., 2 "union" -> 2 union)
191+
normalized = doubleQuotedIdentRegex.ReplaceAllString(normalized, "$1$2")
192+
// Normalize AS keyword case: as -> AS
193+
normalized = asKeywordRegex.ReplaceAllString(normalized, "AS")
194+
// Remove leading zeros from integer literals (077 -> 77)
195+
normalized = leadingZerosRegex.ReplaceAllString(normalized, "$1")
196+
// Normalize heredocs ($$...$$ -> '...')
197+
normalized = heredocRegex.ReplaceAllString(normalized, "'$1'")
198+
// Normalize empty tuple () to tuple()
199+
normalized = emptyTupleRegex.ReplaceAllString(normalized, "tuple()")
200+
// Normalize hex string literals x'...' to just '...' (decoded form)
201+
// The formatter outputs the decoded string, so we need to normalize for comparison
202+
normalized = hexStringRegex.ReplaceAllString(normalized, "'$1'")
203+
// Decode hex escape sequences (\xNN -> actual character)
204+
normalized = DecodeHexEscapes(normalized)
205+
// Normalize "INNER JOIN" to "JOIN" (they're equivalent) - case insensitive
206+
normalized = innerJoinRegex.ReplaceAllString(normalized, "JOIN")
207+
// Normalize "LEFT OUTER JOIN" to "LEFT JOIN"
208+
normalized = leftOuterJoinRegex.ReplaceAllString(normalized, "LEFT JOIN")
209+
// Normalize "RIGHT OUTER JOIN" to "RIGHT JOIN"
210+
normalized = rightOuterJoinRegex.ReplaceAllString(normalized, "RIGHT JOIN")
211+
// Normalize "ORDER BY x ASC" to "ORDER BY x" (ASC is default)
212+
normalized = ascRegex.ReplaceAllString(normalized, "")
213+
// Normalize "OFFSET n ROWS" to "OFFSET n"
214+
normalized = offsetRowsRegex.ReplaceAllString(normalized, "OFFSET $1")
215+
// Normalize CROSS JOIN to comma
216+
normalized = strings.ReplaceAll(normalized, "CROSS JOIN", ",")
217+
// Normalize ENGINE = X to ENGINE X (and engine X to ENGINE X)
218+
normalized = engineEqualsRegex.ReplaceAllString(normalized, "ENGINE ")
219+
// Normalize INSERT INTO TABLE to INSERT INTO
220+
normalized = insertIntoTableRegex.ReplaceAllString(normalized, "INSERT INTO")
221+
// Normalize UNION DISTINCT to UNION (DISTINCT is default)
222+
normalized = unionDistinctRegex.ReplaceAllString(normalized, "UNION")
223+
// Normalize REGEXP operator to match() function (they're equivalent)
224+
// 'x' REGEXP 'y' -> match('x','y')
225+
normalized = regexpOperatorRegex.ReplaceAllString(normalized, "match($1,$2)")
226+
// Normalize ORDER BY () to ORDER BY tuple()
227+
normalized = orderByEmptyRegex.ReplaceAllString(normalized, "ORDER BY tuple()")
228+
// Normalize INSERT INTO table (cols) to have no space before ( (or consistent spacing)
229+
// This matches "tablename (" and removes the space: "tablename("
230+
normalized = spaceBeforeParenRegex.ReplaceAllString(normalized, "$1($2")
231+
// Normalize WITH TIES to TIES (for LIMIT)
232+
normalized = withTiesRegex.ReplaceAllString(normalized, "TIES")
233+
// Normalize parentheses around simple column references in WHERE: (database=...) to database=...
234+
normalized = parenColumnEqualsRegex.ReplaceAllString(normalized, "$1=")
235+
// Normalize parentheses around single values after operators like NOT
236+
normalized = notParenDigitRegex.ReplaceAllString(normalized, "NOT $1")
237+
normalized = notLowerParenRegex.ReplaceAllString(normalized, "not $1")
238+
// Normalize parentheses around IS NULL and IS NOT NULL expressions
239+
// This handles both standalone (x IS NULL) and inside lambdas x -> (x IS NULL)
240+
normalized = isNotNullParenRegex.ReplaceAllString(normalized, "$1 IS NOT NULL")
241+
normalized = isNullParenRegex.ReplaceAllString(normalized, "$1 IS NULL")
242+
// Re-normalize whitespace after replacements
243+
normalized = Whitespace(normalized)
244+
// Strip trailing semicolon and any spaces before it
245+
normalized = strings.TrimSuffix(strings.TrimSpace(normalized), ";")
246+
return strings.TrimSpace(normalized)
247+
}
248+
249+
// StripComments removes SQL comments from a query string.
// It handles:
//   - Line comments: -- up to, but not including, the end of the line
//   - Block comments: /* ... */ with nesting support; an unterminated block
//     comment swallows the rest of the input
//
// Comment markers inside single-quoted string literals are left alone.
// Within a literal both the doubled quote ” and backslash escapes such as
// \' are understood, matching EscapesInStrings and CommasOutsideStrings, so
// an escaped quote does not end the literal early. Double-quoted and
// backtick-quoted identifiers are not treated specially.
func StripComments(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	i := 0
	for i < len(s) {
		hasNext := i+1 < len(s)
		switch {
		case hasNext && s[i] == '-' && s[i+1] == '-':
			// Line comment: skip to the newline, which is kept.
			for i < len(s) && s[i] != '\n' {
				i++
			}

		case hasNext && s[i] == '/' && s[i+1] == '*':
			// Block comment: track nesting depth until it returns to zero.
			depth := 1
			i += 2
			for i < len(s) && depth > 0 {
				switch {
				case strings.HasPrefix(s[i:], "/*"):
					depth++
					i += 2
				case strings.HasPrefix(s[i:], "*/"):
					depth--
					i += 2
				default:
					i++
				}
			}

		case s[i] == '\'':
			// String literal: copy it verbatim, comment markers included.
			start := i
			i++ // opening quote
		literal:
			for i < len(s) {
				switch {
				case s[i] == '\\' && i+1 < len(s):
					// Backslash escape: the next byte never ends the literal.
					i += 2
				case s[i] == '\'' && i+1 < len(s) && s[i+1] == '\'':
					// Doubled quote: an escaped quote.
					i += 2
				case s[i] == '\'':
					// Closing quote.
					i++
					break literal
				default:
					i++
				}
			}
			out.WriteString(s[start:i])

		default:
			out.WriteByte(s[i])
			i++
		}
	}

	return out.String()
}

0 commit comments

Comments
 (0)