|
| 1 | +// Package normalize provides SQL normalization functions for comparing |
| 2 | +// semantically equivalent SQL statements that may differ syntactically. |
| 3 | +package normalize |
| 4 | + |
| 5 | +import ( |
| 6 | + "encoding/hex" |
| 7 | + "regexp" |
| 8 | + "strings" |
| 9 | +) |
| 10 | + |
// Regular expressions used by the normalization functions in this package.
// They are compiled once, at package initialization, so that no call has to
// pay the compilation cost again.
var (
	// Whitespace and operator spacing.
	whitespaceRegex       = regexp.MustCompile(`\s+`)
	operatorSpaceRegex    = regexp.MustCompile(`\s*([=<>!]+|::|->|\|\||&&)\s*`)
	spaceBeforeParenRegex = regexp.MustCompile(`(\w+)\s+\((\w)`)

	// Numeric literals.
	numericUnderscoreRegex = regexp.MustCompile(`(\d)_(\d)`)
	leadingZerosRegex      = regexp.MustCompile(`\b0+(\d+)\b`)

	// Quoted identifiers.
	backtickIdentRegex     = regexp.MustCompile("`([^`]+)`")
	doubleQuotedIdentRegex = regexp.MustCompile(`(\s)"([^"]+)"`)

	// String-like literals: \xNN escape runs, $$...$$ heredocs, x'...' hex strings.
	hexEscapeRegex = regexp.MustCompile(`(\\x[0-9A-Fa-f]{2})+`)
	heredocRegex   = regexp.MustCompile(`\$\$([^$]*)\$\$`)
	hexStringRegex = regexp.MustCompile(`[xX]'([^']*)'`)

	// Keywords and clauses. Patterns without (?i) are case-sensitive.
	asKeywordRegex       = regexp.MustCompile(`\bas\b`)
	ascRegex             = regexp.MustCompile(`\bASC\b`)
	innerJoinRegex       = regexp.MustCompile(`(?i)\bINNER\s+JOIN\b`)
	leftOuterJoinRegex   = regexp.MustCompile(`(?i)\bLEFT\s+OUTER\s+JOIN\b`)
	rightOuterJoinRegex  = regexp.MustCompile(`(?i)\bRIGHT\s+OUTER\s+JOIN\b`)
	offsetRowsRegex      = regexp.MustCompile(`\bOFFSET\s+(\S+)\s+ROWS?\b`)
	engineEqualsRegex    = regexp.MustCompile(`(?i)\bENGINE\s*=\s*`)
	insertIntoTableRegex = regexp.MustCompile(`(?i)\bINSERT\s+INTO\s+TABLE\b`)
	unionDistinctRegex   = regexp.MustCompile(`(?i)\bUNION\s+DISTINCT\b`)
	withTiesRegex        = regexp.MustCompile(`(?i)\bWITH\s+TIES\b`)
	regexpOperatorRegex  = regexp.MustCompile(`('[^']*')\s+REGEXP\s+('[^']*')`)

	// Empty tuples.
	emptyTupleRegex   = regexp.MustCompile(`\(\)`)
	orderByEmptyRegex = regexp.MustCompile(`\bORDER BY \(\)\b`)

	// Redundant parentheses around simple expressions.
	parenColumnEqualsRegex = regexp.MustCompile(`\((\w+)=`)
	notParenDigitRegex     = regexp.MustCompile(`\bNOT\s*\((\d+)\)`)
	notLowerParenRegex     = regexp.MustCompile(`\bnot\s*\((\d+)\)`)
	isNotNullParenRegex    = regexp.MustCompile(`\((\w+)\s+IS\s+NOT\s+NULL\)`)
	isNullParenRegex       = regexp.MustCompile(`\((\w+)\s+IS\s+NULL\)`)
)
| 42 | + |
| 43 | +// DecodeHexEscapes decodes \xNN escape sequences in a string to raw bytes. |
| 44 | +// This allows comparing strings with hex escapes to decoded strings. |
| 45 | +func DecodeHexEscapes(s string) string { |
| 46 | + return hexEscapeRegex.ReplaceAllStringFunc(s, func(match string) string { |
| 47 | + // Decode all consecutive hex escapes together |
| 48 | + var result []byte |
| 49 | + for i := 0; i < len(match); i += 4 { |
| 50 | + // Each \xNN is 4 characters |
| 51 | + if i+4 > len(match) { |
| 52 | + break |
| 53 | + } |
| 54 | + hexStr := match[i+2 : i+4] // Skip \x prefix |
| 55 | + b, err := hex.DecodeString(hexStr) |
| 56 | + if err != nil || len(b) != 1 { |
| 57 | + return match // Return original on error |
| 58 | + } |
| 59 | + result = append(result, b[0]) |
| 60 | + } |
| 61 | + return string(result) |
| 62 | + }) |
| 63 | +} |
| 64 | + |
| 65 | +// Whitespace collapses all whitespace sequences to a single space |
| 66 | +// and trims leading/trailing whitespace. |
| 67 | +func Whitespace(s string) string { |
| 68 | + return strings.TrimSpace(whitespaceRegex.ReplaceAllString(s, " ")) |
| 69 | +} |
| 70 | + |
// EscapesInStrings rewrites backslash escapes found inside single-quoted
// string literals so that differently escaped spellings compare equal:
//   - \' becomes '' (the SQL-standard doubled quote)
//   - \\ becomes \  (a single backslash)
//
// Text outside single-quoted literals is copied through unchanged, and so is
// any other backslash sequence inside a literal. A literal ends at the first
// quote that is not immediately followed by another quote.
func EscapesInStrings(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	n := len(s)
	for pos := 0; pos < n; {
		c := s[pos]
		out.WriteByte(c)
		pos++
		if c != '\'' {
			continue
		}

		// The opening quote has been emitted; scan the body of the literal.
	literal:
		for pos < n {
			c = s[pos]
			hasNext := pos+1 < n
			switch {
			case c == '\\' && hasNext && s[pos+1] == '\'':
				// Backslash-escaped quote: emit the doubled-quote form.
				out.WriteString("''")
				pos += 2
			case c == '\\' && hasNext && s[pos+1] == '\\':
				// Escaped backslash: emit one backslash.
				out.WriteByte('\\')
				pos += 2
			case c == '\'':
				out.WriteByte(c)
				pos++
				if pos >= n || s[pos] != '\'' {
					// A lone quote closes the literal.
					break literal
				}
				// A doubled quote is an escaped quote; keep both halves.
				out.WriteByte('\'')
				pos++
			default:
				out.WriteByte(c)
				pos++
			}
		}
	}
	return out.String()
}
| 120 | + |
// CommasOutsideStrings drops the single space that directly follows a comma
// whenever that comma sits outside a quoted region. Regions delimited by '
// or " are copied verbatim; inside them a doubled delimiter and a
// backslash-escaped character are each treated as content rather than as
// the end of the region. Only one space is removed per comma.
func CommasOutsideStrings(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	// quote holds the delimiter of the region being scanned, or 0 when the
	// scanner is outside any quoted region.
	var quote byte
	for pos, n := 0, len(s); pos < n; {
		c := s[pos]
		hasNext := pos+1 < n
		switch {
		case quote == 0 && (c == '\'' || c == '"'):
			// Entering a quoted region.
			quote = c
			out.WriteByte(c)
			pos++
		case quote == 0 && c == ',' && hasNext && s[pos+1] == ' ':
			// Keep the comma, swallow the space after it.
			out.WriteByte(c)
			pos += 2
		case quote == 0:
			out.WriteByte(c)
			pos++
		case c == quote && hasNext && s[pos+1] == quote:
			// Doubled delimiter: an escaped quote, still inside the region.
			out.WriteByte(c)
			out.WriteByte(s[pos+1])
			pos += 2
		case c == quote:
			// Closing delimiter.
			quote = 0
			out.WriteByte(c)
			pos++
		case c == '\\' && hasNext:
			// Backslash escape: copy the backslash and the escaped byte.
			out.WriteByte(c)
			out.WriteByte(s[pos+1])
			pos += 2
		default:
			out.WriteByte(c)
			pos++
		}
	}
	return out.String()
}
| 170 | + |
| 171 | +// ForFormat normalizes SQL for format comparison by applying various |
| 172 | +// normalizations that make semantically equivalent SQL statements match. |
| 173 | +// This includes whitespace normalization, operator spacing, escape sequences, |
| 174 | +// and various SQL syntax equivalences. |
| 175 | +func ForFormat(s string) string { |
| 176 | + normalized := Whitespace(s) |
| 177 | + // Normalize spaces around operators (remove spaces) |
| 178 | + normalized = operatorSpaceRegex.ReplaceAllString(normalized, "$1") |
| 179 | + // Normalize commas: remove spaces after commas outside of strings |
| 180 | + normalized = CommasOutsideStrings(normalized) |
| 181 | + // Normalize backslash-escaped quotes to SQL-standard (\' -> '') |
| 182 | + normalized = EscapesInStrings(normalized) |
| 183 | + // Remove underscores from numeric literals (100_000 -> 100000) |
| 184 | + for numericUnderscoreRegex.MatchString(normalized) { |
| 185 | + normalized = numericUnderscoreRegex.ReplaceAllString(normalized, "$1$2") |
| 186 | + } |
| 187 | + // Normalize backtick identifiers to unquoted |
| 188 | + normalized = backtickIdentRegex.ReplaceAllString(normalized, "$1") |
| 189 | + // Normalize double-quoted identifiers to unquoted (but not in strings) |
| 190 | + // This handles "identifier" -> identifier (e.g., 2 "union" -> 2 union) |
| 191 | + normalized = doubleQuotedIdentRegex.ReplaceAllString(normalized, "$1$2") |
| 192 | + // Normalize AS keyword case: as -> AS |
| 193 | + normalized = asKeywordRegex.ReplaceAllString(normalized, "AS") |
| 194 | + // Remove leading zeros from integer literals (077 -> 77) |
| 195 | + normalized = leadingZerosRegex.ReplaceAllString(normalized, "$1") |
| 196 | + // Normalize heredocs ($$...$$ -> '...') |
| 197 | + normalized = heredocRegex.ReplaceAllString(normalized, "'$1'") |
| 198 | + // Normalize empty tuple () to tuple() |
| 199 | + normalized = emptyTupleRegex.ReplaceAllString(normalized, "tuple()") |
| 200 | + // Normalize hex string literals x'...' to just '...' (decoded form) |
| 201 | + // The formatter outputs the decoded string, so we need to normalize for comparison |
| 202 | + normalized = hexStringRegex.ReplaceAllString(normalized, "'$1'") |
| 203 | + // Decode hex escape sequences (\xNN -> actual character) |
| 204 | + normalized = DecodeHexEscapes(normalized) |
| 205 | + // Normalize "INNER JOIN" to "JOIN" (they're equivalent) - case insensitive |
| 206 | + normalized = innerJoinRegex.ReplaceAllString(normalized, "JOIN") |
| 207 | + // Normalize "LEFT OUTER JOIN" to "LEFT JOIN" |
| 208 | + normalized = leftOuterJoinRegex.ReplaceAllString(normalized, "LEFT JOIN") |
| 209 | + // Normalize "RIGHT OUTER JOIN" to "RIGHT JOIN" |
| 210 | + normalized = rightOuterJoinRegex.ReplaceAllString(normalized, "RIGHT JOIN") |
| 211 | + // Normalize "ORDER BY x ASC" to "ORDER BY x" (ASC is default) |
| 212 | + normalized = ascRegex.ReplaceAllString(normalized, "") |
| 213 | + // Normalize "OFFSET n ROWS" to "OFFSET n" |
| 214 | + normalized = offsetRowsRegex.ReplaceAllString(normalized, "OFFSET $1") |
| 215 | + // Normalize CROSS JOIN to comma |
| 216 | + normalized = strings.ReplaceAll(normalized, "CROSS JOIN", ",") |
| 217 | + // Normalize ENGINE = X to ENGINE X (and engine X to ENGINE X) |
| 218 | + normalized = engineEqualsRegex.ReplaceAllString(normalized, "ENGINE ") |
| 219 | + // Normalize INSERT INTO TABLE to INSERT INTO |
| 220 | + normalized = insertIntoTableRegex.ReplaceAllString(normalized, "INSERT INTO") |
| 221 | + // Normalize UNION DISTINCT to UNION (DISTINCT is default) |
| 222 | + normalized = unionDistinctRegex.ReplaceAllString(normalized, "UNION") |
| 223 | + // Normalize REGEXP operator to match() function (they're equivalent) |
| 224 | + // 'x' REGEXP 'y' -> match('x','y') |
| 225 | + normalized = regexpOperatorRegex.ReplaceAllString(normalized, "match($1,$2)") |
| 226 | + // Normalize ORDER BY () to ORDER BY tuple() |
| 227 | + normalized = orderByEmptyRegex.ReplaceAllString(normalized, "ORDER BY tuple()") |
| 228 | + // Normalize INSERT INTO table (cols) to have no space before ( (or consistent spacing) |
| 229 | + // This matches "tablename (" and removes the space: "tablename(" |
| 230 | + normalized = spaceBeforeParenRegex.ReplaceAllString(normalized, "$1($2") |
| 231 | + // Normalize WITH TIES to TIES (for LIMIT) |
| 232 | + normalized = withTiesRegex.ReplaceAllString(normalized, "TIES") |
| 233 | + // Normalize parentheses around simple column references in WHERE: (database=...) to database=... |
| 234 | + normalized = parenColumnEqualsRegex.ReplaceAllString(normalized, "$1=") |
| 235 | + // Normalize parentheses around single values after operators like NOT |
| 236 | + normalized = notParenDigitRegex.ReplaceAllString(normalized, "NOT $1") |
| 237 | + normalized = notLowerParenRegex.ReplaceAllString(normalized, "not $1") |
| 238 | + // Normalize parentheses around IS NULL and IS NOT NULL expressions |
| 239 | + // This handles both standalone (x IS NULL) and inside lambdas x -> (x IS NULL) |
| 240 | + normalized = isNotNullParenRegex.ReplaceAllString(normalized, "$1 IS NOT NULL") |
| 241 | + normalized = isNullParenRegex.ReplaceAllString(normalized, "$1 IS NULL") |
| 242 | + // Re-normalize whitespace after replacements |
| 243 | + normalized = Whitespace(normalized) |
| 244 | + // Strip trailing semicolon and any spaces before it |
| 245 | + normalized = strings.TrimSuffix(strings.TrimSpace(normalized), ";") |
| 246 | + return strings.TrimSpace(normalized) |
| 247 | +} |
| 248 | + |
// StripComments returns s with SQL comments removed.
//
// It handles:
//   - Line comments: from -- up to, but not including, the next newline.
//   - Block comments: /* ... */, with nesting. An unterminated block comment
//     consumes the rest of the input.
//
// Comments are removed without inserting any replacement text. Comment
// markers inside single-quoted string literals are left alone. Within a
// literal, both a doubled quote ('') and a backslash-escaped character
// (such as \') are treated as content, matching how EscapesInStrings and
// CommasOutsideStrings read literals.
func StripComments(s string) string {
	var result strings.Builder
	result.Grow(len(s))

	i := 0
	for i < len(s) {
		rest := s[i:]

		// Line comment: skip to the end of the line, keeping the newline.
		if strings.HasPrefix(rest, "--") {
			for i < len(s) && s[i] != '\n' {
				i++
			}
			continue
		}

		// Block comment: track nesting depth until the outermost */ is seen.
		if strings.HasPrefix(rest, "/*") {
			depth := 1
			i += 2
			for i < len(s) && depth > 0 {
				switch {
				case strings.HasPrefix(s[i:], "/*"):
					depth++
					i += 2
				case strings.HasPrefix(s[i:], "*/"):
					depth--
					i += 2
				default:
					i++
				}
			}
			continue
		}

		// String literal: copy it through verbatim so that comment markers
		// inside it are preserved.
		if s[i] == '\'' {
			result.WriteByte(s[i])
			i++
			for i < len(s) {
				c := s[i]
				if c == '\\' && i+1 < len(s) {
					// Copy a backslash escape as a unit so that \' does not
					// end the literal and expose the rest of it to comment
					// stripping.
					result.WriteByte(c)
					result.WriteByte(s[i+1])
					i += 2
					continue
				}
				result.WriteByte(c)
				i++
				if c != '\'' {
					continue
				}
				// A quote followed by another quote is an escaped quote.
				if i < len(s) && s[i] == '\'' {
					result.WriteByte(s[i])
					i++
					continue
				}
				// Otherwise the quote closes the literal.
				break
			}
			continue
		}

		result.WriteByte(s[i])
		i++
	}

	return result.String()
}
0 commit comments