Skip to content

Commit 7110956

Browse files
ajitpratap0Ajit Pratap Singhclaude
authored
feat(fingerprint): add SQL query fingerprinting and normalization (#444) (#463)
- New pkg/fingerprint/ package with Normalize() and Fingerprint() APIs - Normalize replaces all literal values (strings, numbers, booleans, NULLs) with '?' via an AST visitor, returning re-formatted SQL - Fingerprint returns a stable SHA-256 hex digest of the normalized form - Both exported at gosqlx package level as convenience functions - 19 tests covering: literal replacement, placeholder preservation, IN lists, determinism, invalid SQL error, 64-char hash format, race safety - No race conditions (verified with -race -count=3) Co-authored-by: Ajit Pratap Singh <ajitpratapsingh@Ajits-Mac-mini-2655.local> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent a721a16 commit 7110956

File tree

4 files changed

+439
-0
lines changed

4 files changed

+439
-0
lines changed

pkg/fingerprint/fingerprint.go

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
// Copyright 2026 GoSQLX Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Package fingerprint provides SQL query normalization and fingerprinting.
16+
//
17+
// Normalize replaces all literal values (strings, numbers, booleans, NULLs)
18+
// with "?" placeholders and returns the re-formatted SQL. Two queries that are
19+
// structurally identical but differ only in literal values will produce the
20+
// same normalized output.
21+
//
22+
// Fingerprint returns the SHA-256 hex digest of the normalized form, providing
23+
// a stable 64-character key for query deduplication, caching, and slow-query
24+
// grouping.
25+
//
26+
// Existing parameter placeholders ($1, ?, :name) are always preserved
27+
// unchanged.
28+
//
29+
// Example:
30+
//
31+
// n, err := fingerprint.Normalize("SELECT * FROM users WHERE id = 42")
32+
// // n == "SELECT * FROM users WHERE id = ?"
33+
//
34+
// fp, err := fingerprint.Fingerprint("SELECT * FROM users WHERE id = 42")
35+
// // fp == "a3f1..." (64-char SHA-256 hex)
36+
// fp2, _ := fingerprint.Fingerprint("SELECT * FROM users WHERE id = 999")
37+
// // fp == fp2 (same structure, different literal)
38+
package fingerprint
39+
40+
import (
41+
"crypto/sha256"
42+
"fmt"
43+
"strings"
44+
45+
"github.com/ajitpratap0/GoSQLX/pkg/formatter"
46+
"github.com/ajitpratap0/GoSQLX/pkg/sql/ast"
47+
"github.com/ajitpratap0/GoSQLX/pkg/sql/parser"
48+
"github.com/ajitpratap0/GoSQLX/pkg/sql/tokenizer"
49+
)
50+
51+
// literalNormalizer is an AST visitor that replaces all non-placeholder literal
52+
// values with "?". It mutates the AST in-place; callers must not reuse the AST
53+
// after normalization.
54+
type literalNormalizer struct{}
55+
56+
// Visit implements ast.Visitor. It replaces literal values with "?" by mutating
57+
// each LiteralValue node encountered. Placeholder nodes ($1, ?, :name) are
58+
// left untouched.
59+
func (n *literalNormalizer) Visit(node ast.Node) (ast.Visitor, error) {
60+
if node == nil {
61+
return nil, nil
62+
}
63+
if lit, ok := node.(*ast.LiteralValue); ok {
64+
// Skip existing parameter placeholders — they must be preserved.
65+
if strings.EqualFold(lit.Type, "placeholder") {
66+
return n, nil
67+
}
68+
// Replace the literal value with a bare "?" marker. Setting Type to ""
69+
// causes LiteralValue.SQL() to fall through to the default case which
70+
// returns fmt.Sprintf("%v", l.Value) == "?".
71+
lit.Value = "?"
72+
lit.Type = ""
73+
}
74+
return n, nil
75+
}
76+
77+
// Normalize parses the SQL, replaces all literal values (strings, numbers,
78+
// booleans, NULLs) with "?" placeholders, and returns the re-formatted SQL.
79+
//
80+
// Two queries that are structurally identical but use different literal values
81+
// (e.g., WHERE id = 1 vs WHERE id = 42) will produce the same normalized output.
82+
// Existing parameter placeholders ($1, ?, :name) are preserved unchanged.
83+
//
84+
// Returns an error if the SQL cannot be parsed.
85+
//
86+
// Example:
87+
//
88+
// n, err := fingerprint.Normalize("SELECT * FROM users WHERE id = 42 AND name = 'alice'")
89+
// // n == "SELECT * FROM users WHERE id = ? AND name = ?"
90+
func Normalize(sql string) (string, error) {
91+
tkz := tokenizer.GetTokenizer()
92+
defer tokenizer.PutTokenizer(tkz)
93+
94+
tokens, err := tkz.Tokenize([]byte(sql))
95+
if err != nil {
96+
return "", fmt.Errorf("fingerprint: tokenization failed: %w", err)
97+
}
98+
99+
p := parser.GetParser()
100+
defer parser.PutParser(p)
101+
102+
astObj, err := p.ParseFromModelTokens(tokens)
103+
if err != nil {
104+
return "", fmt.Errorf("fingerprint: parsing failed: %w", err)
105+
}
106+
defer ast.ReleaseAST(astObj)
107+
108+
// Walk the AST and replace all non-placeholder literals with "?".
109+
v := &literalNormalizer{}
110+
for _, stmt := range astObj.Statements {
111+
if err := ast.Walk(v, stmt); err != nil {
112+
return "", fmt.Errorf("fingerprint: AST walk failed: %w", err)
113+
}
114+
}
115+
116+
// Format the mutated AST back to SQL using compact (single-line) style.
117+
opts := ast.CompactStyle()
118+
var parts []string
119+
for _, stmt := range astObj.Statements {
120+
parts = append(parts, formatter.FormatStatement(stmt, opts))
121+
}
122+
123+
return strings.Join(parts, "; "), nil
124+
}
125+
126+
// Fingerprint parses the SQL, normalizes all literals to "?", and returns the
127+
// SHA-256 hex digest of the normalized form. Two structurally identical queries
128+
// with different literal values will produce the same fingerprint.
129+
//
130+
// The fingerprint is stable across GoSQLX versions as long as the formatter
131+
// output for a given AST structure does not change.
132+
//
133+
// Returns a 64-character lowercase hex string, or an error if SQL is invalid.
134+
//
135+
// Example:
136+
//
137+
// fp, err := fingerprint.Fingerprint("SELECT * FROM users WHERE id = 42")
138+
// fp2, _ := fingerprint.Fingerprint("SELECT * FROM users WHERE id = 999")
139+
// // fp == fp2 (same structure, different literal)
140+
func Fingerprint(sql string) (string, error) {
141+
normalized, err := Normalize(sql)
142+
if err != nil {
143+
return "", err
144+
}
145+
h := sha256.Sum256([]byte(normalized))
146+
return fmt.Sprintf("%x", h), nil
147+
}
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
// Copyright 2026 GoSQLX Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package fingerprint_test
16+
17+
import (
18+
"strings"
19+
"testing"
20+
21+
"github.com/ajitpratap0/GoSQLX/pkg/fingerprint"
22+
)
23+
24+
func TestNormalize_ReplacesStringLiterals(t *testing.T) {
25+
sql := "SELECT * FROM users WHERE name = 'alice'"
26+
got, err := fingerprint.Normalize(sql)
27+
if err != nil {
28+
t.Fatalf("Normalize() error: %v", err)
29+
}
30+
if strings.Contains(got, "'alice'") {
31+
t.Errorf("Normalize() did not replace string literal; got: %s", got)
32+
}
33+
if !strings.Contains(got, "?") {
34+
t.Errorf("Normalize() missing ? placeholder; got: %s", got)
35+
}
36+
}
37+
38+
func TestNormalize_ReplacesNumericLiterals(t *testing.T) {
39+
sql := "SELECT * FROM orders WHERE amount > 100"
40+
got, err := fingerprint.Normalize(sql)
41+
if err != nil {
42+
t.Fatalf("Normalize() error: %v", err)
43+
}
44+
if strings.Contains(got, "100") {
45+
t.Errorf("Normalize() did not replace numeric literals; got: %s", got)
46+
}
47+
if !strings.Contains(got, "?") {
48+
t.Errorf("Normalize() missing ? placeholder; got: %s", got)
49+
}
50+
}
51+
52+
func TestNormalize_IdenticalQueries_SameResult(t *testing.T) {
53+
q1 := "SELECT * FROM users WHERE id = 1"
54+
q2 := "SELECT * FROM users WHERE id = 999"
55+
n1, err := fingerprint.Normalize(q1)
56+
if err != nil {
57+
t.Fatalf("Normalize(q1) error: %v", err)
58+
}
59+
n2, err := fingerprint.Normalize(q2)
60+
if err != nil {
61+
t.Fatalf("Normalize(q2) error: %v", err)
62+
}
63+
if n1 != n2 {
64+
t.Errorf("structurally identical queries should normalize to same string:\n q1 → %s\n q2 → %s", n1, n2)
65+
}
66+
}
67+
68+
func TestNormalize_PreservesParameterPlaceholders(t *testing.T) {
69+
sql := "SELECT * FROM users WHERE id = $1"
70+
got, err := fingerprint.Normalize(sql)
71+
if err != nil {
72+
t.Fatalf("Normalize() error: %v", err)
73+
}
74+
if !strings.Contains(got, "$1") {
75+
t.Errorf("Normalize() must preserve existing placeholders; got: %s", got)
76+
}
77+
}
78+
79+
func TestNormalize_InListLiterals(t *testing.T) {
80+
sql := "SELECT * FROM users WHERE id IN (1, 2, 3)"
81+
got, err := fingerprint.Normalize(sql)
82+
if err != nil {
83+
t.Fatalf("Normalize() error: %v", err)
84+
}
85+
// After normalization, the numeric literals should be replaced with ?
86+
if strings.Contains(got, " 1,") || strings.Contains(got, ", 1,") {
87+
t.Errorf("Normalize() did not replace IN list literals; got: %s", got)
88+
}
89+
if !strings.Contains(got, "?") {
90+
t.Errorf("Normalize() missing ? placeholder; got: %s", got)
91+
}
92+
}
93+
94+
func TestFingerprint_SameStructure_SameHash(t *testing.T) {
95+
q1 := "SELECT * FROM users WHERE id = 1"
96+
q2 := "SELECT * FROM users WHERE id = 42"
97+
fp1, err := fingerprint.Fingerprint(q1)
98+
if err != nil {
99+
t.Fatalf("Fingerprint(q1) error: %v", err)
100+
}
101+
fp2, err := fingerprint.Fingerprint(q2)
102+
if err != nil {
103+
t.Fatalf("Fingerprint(q2) error: %v", err)
104+
}
105+
if fp1 != fp2 {
106+
t.Errorf("same structure different literals must yield same fingerprint:\n fp1=%s\n fp2=%s", fp1, fp2)
107+
}
108+
}
109+
110+
func TestFingerprint_DifferentStructure_DifferentHash(t *testing.T) {
111+
q1 := "SELECT id FROM users WHERE status = 1"
112+
q2 := "SELECT name FROM users WHERE status = 1"
113+
fp1, _ := fingerprint.Fingerprint(q1)
114+
fp2, _ := fingerprint.Fingerprint(q2)
115+
if fp1 == fp2 {
116+
t.Errorf("different query structures must yield different fingerprints")
117+
}
118+
}
119+
120+
func TestFingerprint_IsHex64Chars(t *testing.T) {
121+
sql := "SELECT 1"
122+
fp, err := fingerprint.Fingerprint(sql)
123+
if err != nil {
124+
t.Fatalf("Fingerprint() error: %v", err)
125+
}
126+
if len(fp) != 64 {
127+
t.Errorf("SHA-256 hex fingerprint should be 64 chars, got %d: %s", len(fp), fp)
128+
}
129+
}
130+
131+
func TestNormalize_InvalidSQL_ReturnsError(t *testing.T) {
132+
_, err := fingerprint.Normalize("SELECT FROM WHERE")
133+
if err == nil {
134+
t.Error("Normalize() should return error for invalid SQL")
135+
}
136+
}
137+
138+
func TestFingerprint_Deterministic(t *testing.T) {
139+
sql := "SELECT u.id, u.name FROM users u WHERE u.active = true"
140+
fp1, err := fingerprint.Fingerprint(sql)
141+
if err != nil {
142+
t.Fatalf("Fingerprint() error: %v", err)
143+
}
144+
fp2, err := fingerprint.Fingerprint(sql)
145+
if err != nil {
146+
t.Fatalf("Fingerprint() error: %v", err)
147+
}
148+
if fp1 != fp2 {
149+
t.Error("Fingerprint() must be deterministic for the same input")
150+
}
151+
}
152+
153+
func TestNormalize_ReplacesBooleanLiterals(t *testing.T) {
154+
sql := "SELECT * FROM users WHERE active = true"
155+
got, err := fingerprint.Normalize(sql)
156+
if err != nil {
157+
t.Fatalf("Normalize() error: %v", err)
158+
}
159+
// Boolean literals should be replaced with ?
160+
if strings.Contains(got, "true") || strings.Contains(got, "TRUE") {
161+
t.Errorf("Normalize() did not replace boolean literal; got: %s", got)
162+
}
163+
if !strings.Contains(got, "?") {
164+
t.Errorf("Normalize() missing ? placeholder; got: %s", got)
165+
}
166+
}
167+
168+
func TestNormalize_ReplacesNullLiterals(t *testing.T) {
169+
sql := "SELECT * FROM users WHERE deleted_at IS NOT NULL"
170+
got, err := fingerprint.Normalize(sql)
171+
if err != nil {
172+
t.Fatalf("Normalize() error: %v", err)
173+
}
174+
// IS NOT NULL should still be preserved as-is — NULL here is a keyword, not a literal
175+
// This test verifies the query parses correctly
176+
if got == "" {
177+
t.Errorf("Normalize() returned empty string for valid SQL")
178+
}
179+
}
180+
181+
func TestNormalize_StringLiteralSameAsNumericNormalized(t *testing.T) {
182+
q1 := "SELECT * FROM t WHERE x = 'hello'"
183+
q2 := "SELECT * FROM t WHERE x = 'world'"
184+
n1, err := fingerprint.Normalize(q1)
185+
if err != nil {
186+
t.Fatalf("Normalize(q1) error: %v", err)
187+
}
188+
n2, err := fingerprint.Normalize(q2)
189+
if err != nil {
190+
t.Fatalf("Normalize(q2) error: %v", err)
191+
}
192+
if n1 != n2 {
193+
t.Errorf("same-structure string literal queries should normalize identically:\n n1=%s\n n2=%s", n1, n2)
194+
}
195+
}
196+
197+
func TestFingerprint_StringVsNumericLiteralDifferentStructure(t *testing.T) {
198+
// Even though both use ?, they have different column names => different structure
199+
q1 := "SELECT id FROM users WHERE id = 1"
200+
q2 := "SELECT name FROM users WHERE id = 1"
201+
fp1, _ := fingerprint.Fingerprint(q1)
202+
fp2, _ := fingerprint.Fingerprint(q2)
203+
if fp1 == fp2 {
204+
t.Errorf("queries with different selected columns must have different fingerprints")
205+
}
206+
}

pkg/gosqlx/gosqlx.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"strings"
2121
"time"
2222

23+
"github.com/ajitpratap0/GoSQLX/pkg/fingerprint"
2324
"github.com/ajitpratap0/GoSQLX/pkg/formatter"
2425
"github.com/ajitpratap0/GoSQLX/pkg/sql/ast"
2526
"github.com/ajitpratap0/GoSQLX/pkg/sql/keywords"
@@ -627,3 +628,32 @@ func ParseWithRecovery(sql string) ([]ast.Statement, []error) {
627628
func ParseWithDialect(sql string, dialect keywords.SQLDialect) (*ast.AST, error) {
628629
return parser.ParseWithDialect(sql, dialect)
629630
}
631+
632+
// Normalize parses sql, replaces all literal values (strings, numbers, booleans,
633+
// NULLs) with "?" placeholders, and returns the re-formatted SQL.
634+
//
635+
// Two queries that differ only in literal values (e.g., WHERE id = 1 vs WHERE id = 42)
636+
// produce identical output. Existing parameter placeholders ($1, ?, :name) are preserved.
637+
//
638+
// Returns an error if the SQL cannot be parsed.
639+
//
640+
// Example:
641+
//
642+
// norm, err := gosqlx.Normalize("SELECT * FROM users WHERE id = 42")
643+
// // norm == "SELECT * FROM users WHERE id = ?"
644+
func Normalize(sql string) (string, error) {
645+
return fingerprint.Normalize(sql)
646+
}
647+
648+
// Fingerprint returns a stable 64-character SHA-256 hex digest for the given SQL.
649+
// Structurally identical queries with different literal values produce the same fingerprint,
650+
// making this suitable for query deduplication, caching, and slow-query grouping.
651+
//
652+
// Example:
653+
//
654+
// fp1, _ := gosqlx.Fingerprint("SELECT * FROM users WHERE id = 1")
655+
// fp2, _ := gosqlx.Fingerprint("SELECT * FROM users WHERE id = 999")
656+
// // fp1 == fp2
657+
func Fingerprint(sql string) (string, error) {
658+
return fingerprint.Fingerprint(sql)
659+
}

0 commit comments

Comments
 (0)