Skip to content

Commit 4020efc

Browse files
suryaiyer95claude
andcommitted
feat: enhance data-diff tool — per-table WHERE clauses, improved output formatting
- Add `source_where_clause` and `target_where_clause` params to bridge protocol - Update `run_data_diff` to pass per-table WHERE to reladiff engine - Enhance tool output formatting with column-level match rates and sample mismatches - Expand system prompt with progressive validation guidance Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ce11eea commit 4020efc

5 files changed

Lines changed: 126 additions & 9 deletions

File tree

packages/altimate-engine/src/altimate_engine/server.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,10 @@ def dispatch(request: JsonRpcRequest) -> JsonRpcResponse:
965965
extra_columns=params.get("extra_columns"),
966966
algorithm=params.get("algorithm", "auto"),
967967
where_clause=params.get("where_clause"),
968+
source_where_clause=params.get("source_where_clause"),
969+
target_where_clause=params.get("target_where_clause"),
970+
numeric_tolerance=params.get("numeric_tolerance"),
971+
timestamp_tolerance_ms=params.get("timestamp_tolerance_ms"),
968972
source_database=params.get("source_database"),
969973
source_schema=params.get("source_schema"),
970974
target_database=params.get("target_database"),

packages/altimate-engine/src/altimate_engine/sql/data_diff.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ def run_data_diff(
7575
extra_columns: list[str] | None = None,
7676
algorithm: str = "auto",
7777
where_clause: str | None = None,
78+
source_where_clause: str | None = None,
79+
target_where_clause: str | None = None,
80+
numeric_tolerance: float | None = None,
81+
timestamp_tolerance_ms: int | None = None,
7882
source_database: str | None = None,
7983
source_schema: str | None = None,
8084
target_database: str | None = None,
@@ -109,21 +113,31 @@ def run_data_diff(
109113
if target_schema:
110114
table2["schema"] = target_schema
111115

116+
config: dict[str, Any] = {
117+
"algorithm": algorithm,
118+
"key_columns": key_columns,
119+
"extra_columns": extra_columns or [],
120+
}
121+
122+
if where_clause:
123+
config["where_clause"] = where_clause
124+
if source_where_clause:
125+
config["where_clause_table1"] = source_where_clause
126+
if target_where_clause:
127+
config["where_clause_table2"] = target_where_clause
128+
if numeric_tolerance is not None:
129+
config["numeric_tolerance"] = numeric_tolerance
130+
if timestamp_tolerance_ms is not None:
131+
config["timestamp_tolerance_ms"] = timestamp_tolerance_ms
132+
112133
spec = {
113134
"table1": table1,
114135
"table2": table2,
115136
"dialect1": dialect1,
116137
"dialect2": dialect2,
117-
"config": {
118-
"algorithm": algorithm,
119-
"key_columns": key_columns,
120-
"extra_columns": extra_columns or [],
121-
},
138+
"config": config,
122139
}
123140

124-
if where_clause:
125-
spec["config"]["where_clause"] = where_clause
126-
127141
logger.info("Starting reladiff session: %s", json.dumps(spec, indent=2))
128142

129143
# Create session and run the state machine loop

packages/opencode/src/altimate/bridge/protocol.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -886,6 +886,10 @@ export interface DataDiffRunParams {
886886
extra_columns?: string[]
887887
algorithm?: "auto" | "hashdiff" | "joindiff" | "profile" | "recon" | "cascade"
888888
where_clause?: string
889+
source_where_clause?: string
890+
target_where_clause?: string
891+
numeric_tolerance?: number
892+
timestamp_tolerance_ms?: number
889893
source_database?: string
890894
source_schema?: string
891895
target_database?: string

packages/opencode/src/altimate/prompts/data-diff.txt

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ Your purpose is to compare data between two tables (same database or different w
1919
- `extra_columns`: (optional) additional columns to compare beyond keys
2020
- `algorithm`: (optional) "auto" (default), "hashdiff", "joindiff", "profile", "recon", or "cascade"
2121
- `where_clause`: (optional) WHERE filter applied to both tables
22+
- `source_where_clause`: (optional) WHERE filter applied only to the source table
23+
- `target_where_clause`: (optional) WHERE filter applied only to the target table
24+
- `numeric_tolerance`: (optional) absolute tolerance for numeric comparisons (e.g., 0.01)
25+
- `timestamp_tolerance_ms`: (optional) tolerance for timestamp comparisons in milliseconds
2226
- `source_database`, `source_schema`, `target_database`, `target_schema`: (optional) fully qualify tables
2327

2428
### Algorithm selection
@@ -46,7 +50,44 @@ data_diff(
4650
)
4751
```
4852

49-
The tool returns a structured report with row counts, diff statistics, and any mismatched rows.
53+
The tool returns a structured report with:
54+
- Row counts and diff statistics
55+
- Per-column match rates (e.g., "amount: 95.2% match")
56+
- Categorized mismatches (null in source, null in target, value differs)
57+
- Sample mismatching rows with key values and differing values
58+
59+
### Filtering subsets of data
60+
61+
Use per-table WHERE clauses to validate specific date ranges or partitions:
62+
63+
```
64+
data_diff(
65+
source_table: "orders",
66+
target_table: "orders_migrated",
67+
source_warehouse: "old_db",
68+
target_warehouse: "new_db",
69+
key_columns: ["order_id"],
70+
extra_columns: ["amount", "status"],
71+
source_where_clause: "created_at >= '2024-01-01'",
72+
target_where_clause: "created_at >= '2024-01-01' AND region = 'US'"
73+
)
74+
```
75+
76+
### Tolerance-based comparison
77+
78+
For migrations where small numeric/timestamp differences are acceptable:
79+
80+
```
81+
data_diff(
82+
source_table: "metrics",
83+
target_table: "metrics_v2",
84+
source_warehouse: "snowflake_wh",
85+
key_columns: ["metric_id"],
86+
extra_columns: ["value", "recorded_at"],
87+
numeric_tolerance: 0.001,
88+
timestamp_tolerance_ms: 1000
89+
)
90+
```
5091

5192
## FALLBACK: Manual SQL Validation
5293

packages/opencode/src/altimate/tools/data-diff-run.ts

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,23 @@ export const DataDiffRunTool = Tool.define("data_diff", {
2929
"profile=column statistics only. cascade=count→profile→content.",
3030
),
3131
where_clause: z.string().optional().describe("Optional WHERE filter applied to both tables"),
32+
source_where_clause: z
33+
.string()
34+
.optional()
35+
.describe("WHERE filter applied only to the source table (e.g., date range filter)"),
36+
target_where_clause: z
37+
.string()
38+
.optional()
39+
.describe("WHERE filter applied only to the target table"),
40+
numeric_tolerance: z
41+
.number()
42+
.optional()
43+
.describe("Absolute tolerance for numeric comparisons (e.g., 0.01). Values within this threshold are treated as equal."),
44+
timestamp_tolerance_ms: z
45+
.number()
46+
.int()
47+
.optional()
48+
.describe("Tolerance for timestamp comparisons in milliseconds (e.g., 1000 for 1 second)"),
3249
source_database: z.string().optional().describe("Source database/catalog name"),
3350
source_schema: z.string().optional().describe("Source schema name"),
3451
target_database: z.string().optional().describe("Target database/catalog name"),
@@ -45,6 +62,10 @@ export const DataDiffRunTool = Tool.define("data_diff", {
4562
extra_columns: args.extra_columns,
4663
algorithm: args.algorithm,
4764
where_clause: args.where_clause,
65+
source_where_clause: args.source_where_clause,
66+
target_where_clause: args.target_where_clause,
67+
numeric_tolerance: args.numeric_tolerance,
68+
timestamp_tolerance_ms: args.timestamp_tolerance_ms,
4869
source_database: args.source_database,
4970
source_schema: args.source_schema,
5071
target_database: args.target_database,
@@ -103,6 +124,39 @@ function formatOutcome(outcome: Record<string, unknown>, args: Record<string, un
103124
lines.push(`Exclusive to table2: ${stats.exclusive_table2 ?? 0}`)
104125
lines.push(`Updated: ${stats.updated ?? 0}`)
105126
lines.push(`Diff %: ${((stats.diff_percent as number) * 100).toFixed(2)}%`)
127+
128+
// Per-column match rates
129+
const matchRates = (stats.column_match_rates ?? []) as Record<string, unknown>[]
130+
if (matchRates.length > 0) {
131+
lines.push("")
132+
lines.push("Column Match Rates:")
133+
for (const col of matchRates) {
134+
const pct = (col.match_percent as number).toFixed(1)
135+
lines.push(` ${col.column}: ${pct}% (${col.matched}/${col.total})`)
136+
}
137+
}
138+
139+
// Mismatch samples
140+
const samples = (stats.mismatch_samples ?? []) as Record<string, unknown>[]
141+
if (samples.length > 0) {
142+
lines.push("")
143+
lines.push("Sample Mismatches:")
144+
for (const s of samples) {
145+
const key = (s.key_values as string[] | undefined)?.join(", ") ?? "?"
146+
const cat = s.category as string
147+
if (cat === "exclusive_table1") {
148+
lines.push(` [${key}] only in source`)
149+
} else if (cat === "exclusive_table2") {
150+
lines.push(` [${key}] only in target`)
151+
} else if (cat === "null_in_source") {
152+
lines.push(` [${key}] NULL in source, "${s.value_table2}" in target`)
153+
} else if (cat === "null_in_target") {
154+
lines.push(` [${key}] "${s.value_table1}" in source, NULL in target`)
155+
} else {
156+
lines.push(` [${key}] "${s.value_table1}" vs "${s.value_table2}"`)
157+
}
158+
}
159+
}
106160
} else {
107161
lines.push(`Unchanged: ${stats.unchanged ?? stats.rows_table1}`)
108162
}

0 commit comments

Comments
 (0)