feat: enhance data-diff tool — per-table WHERE clauses, improved output formatting

suryaiyer95 · claude · suryaiyer95 · commit 4020efc30105 · 2026-03-13T09:46:19.000-07:00
- Add `source_where_clause`, `target_where_clause`, `numeric_tolerance`, and `timestamp_tolerance_ms` params to bridge protocol
- Update `run_data_diff` to pass per-table WHERE clauses and comparison tolerances to the reladiff engine
- Enhance tool output formatting with column-level match rates and sample mismatches
- Expand system prompt with progressive validation guidance

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/packages/altimate-engine/src/altimate_engine/server.py b/packages/altimate-engine/src/altimate_engine/server.py
@@ -965,6 +965,10 @@ def dispatch(request: JsonRpcRequest) -> JsonRpcResponse:
                 extra_columns=params.get("extra_columns"),
                 algorithm=params.get("algorithm", "auto"),
                 where_clause=params.get("where_clause"),
+                source_where_clause=params.get("source_where_clause"),
+                target_where_clause=params.get("target_where_clause"),
+                numeric_tolerance=params.get("numeric_tolerance"),
+                timestamp_tolerance_ms=params.get("timestamp_tolerance_ms"),
                 source_database=params.get("source_database"),
                 source_schema=params.get("source_schema"),
                 target_database=params.get("target_database"),
diff --git a/packages/altimate-engine/src/altimate_engine/sql/data_diff.py b/packages/altimate-engine/src/altimate_engine/sql/data_diff.py
@@ -75,6 +75,10 @@ def run_data_diff(
     extra_columns: list[str] | None = None,
     algorithm: str = "auto",
     where_clause: str | None = None,
+    source_where_clause: str | None = None,
+    target_where_clause: str | None = None,
+    numeric_tolerance: float | None = None,
+    timestamp_tolerance_ms: int | None = None,
     source_database: str | None = None,
     source_schema: str | None = None,
     target_database: str | None = None,
@@ -109,21 +113,31 @@ def run_data_diff(
     if target_schema:
         table2["schema"] = target_schema
 
+    config: dict[str, Any] = {
+        "algorithm": algorithm,
+        "key_columns": key_columns,
+        "extra_columns": extra_columns or [],
+    }
+
+    if where_clause:
+        config["where_clause"] = where_clause
+    if source_where_clause:
+        config["where_clause_table1"] = source_where_clause
+    if target_where_clause:
+        config["where_clause_table2"] = target_where_clause
+    if numeric_tolerance is not None:
+        config["numeric_tolerance"] = numeric_tolerance
+    if timestamp_tolerance_ms is not None:
+        config["timestamp_tolerance_ms"] = timestamp_tolerance_ms
+
     spec = {
         "table1": table1,
         "table2": table2,
         "dialect1": dialect1,
         "dialect2": dialect2,
-        "config": {
-            "algorithm": algorithm,
-            "key_columns": key_columns,
-            "extra_columns": extra_columns or [],
-        },
+        "config": config,
     }
 
-    if where_clause:
-        spec["config"]["where_clause"] = where_clause
-
     logger.info("Starting reladiff session: %s", json.dumps(spec, indent=2))
 
     # Create session and run the state machine loop
diff --git a/packages/opencode/src/altimate/bridge/protocol.ts b/packages/opencode/src/altimate/bridge/protocol.ts
@@ -886,6 +886,10 @@ export interface DataDiffRunParams {
   extra_columns?: string[]
   algorithm?: "auto" | "hashdiff" | "joindiff" | "profile" | "recon" | "cascade"
   where_clause?: string
+  source_where_clause?: string
+  target_where_clause?: string
+  numeric_tolerance?: number
+  timestamp_tolerance_ms?: number
   source_database?: string
   source_schema?: string
   target_database?: string
diff --git a/packages/opencode/src/altimate/prompts/data-diff.txt b/packages/opencode/src/altimate/prompts/data-diff.txt
@@ -19,6 +19,10 @@ Your purpose is to compare data between two tables (same database or different w
    - `extra_columns`: (optional) additional columns to compare beyond keys
    - `algorithm`: (optional) "auto" (default), "hashdiff", "joindiff", "profile", "recon", or "cascade"
    - `where_clause`: (optional) WHERE filter applied to both tables
+   - `source_where_clause`: (optional) WHERE filter applied only to the source table
+   - `target_where_clause`: (optional) WHERE filter applied only to the target table
+   - `numeric_tolerance`: (optional) absolute tolerance for numeric comparisons (e.g., 0.01)
+   - `timestamp_tolerance_ms`: (optional) tolerance for timestamp comparisons in milliseconds
    - `source_database`, `source_schema`, `target_database`, `target_schema`: (optional) fully qualify tables
 
 ### Algorithm selection
@@ -46,7 +50,44 @@ data_diff(
 )
 ```
 
-The tool returns a structured report with row counts, diff statistics, and any mismatched rows.
+The tool returns a structured report with:
+- Row counts and diff statistics
+- Per-column match rates (e.g., "amount: 95.2% match")
+- Categorized mismatches (null in source, null in target, value differs)
+- Sample mismatching rows with key values and differing values
+
+### Filtering subsets of data
+
+Use per-table WHERE clauses to validate specific date ranges or partitions:
+
+```
+data_diff(
+  source_table: "orders",
+  target_table: "orders_migrated",
+  source_warehouse: "old_db",
+  target_warehouse: "new_db",
+  key_columns: ["order_id"],
+  extra_columns: ["amount", "status"],
+  source_where_clause: "created_at >= '2024-01-01'",
+  target_where_clause: "created_at >= '2024-01-01' AND region = 'US'"
+)
+```
+
+### Tolerance-based comparison
+
+For migrations where small numeric/timestamp differences are acceptable:
+
+```
+data_diff(
+  source_table: "metrics",
+  target_table: "metrics_v2",
+  source_warehouse: "snowflake_wh",
+  key_columns: ["metric_id"],
+  extra_columns: ["value", "recorded_at"],
+  numeric_tolerance: 0.001,
+  timestamp_tolerance_ms: 1000
+)
+```
 
 ## FALLBACK: Manual SQL Validation
 
diff --git a/packages/opencode/src/altimate/tools/data-diff-run.ts b/packages/opencode/src/altimate/tools/data-diff-run.ts
@@ -29,6 +29,23 @@ export const DataDiffRunTool = Tool.define("data_diff", {
           "profile=column statistics only. cascade=count→profile→content.",
       ),
     where_clause: z.string().optional().describe("Optional WHERE filter applied to both tables"),
+    source_where_clause: z
+      .string()
+      .optional()
+      .describe("WHERE filter applied only to the source table (e.g., date range filter)"),
+    target_where_clause: z
+      .string()
+      .optional()
+      .describe("WHERE filter applied only to the target table"),
+    numeric_tolerance: z
+      .number()
+      .optional()
+      .describe("Absolute tolerance for numeric comparisons (e.g., 0.01). Values within this threshold are treated as equal."),
+    timestamp_tolerance_ms: z
+      .number()
+      .int()
+      .optional()
+      .describe("Tolerance for timestamp comparisons in milliseconds (e.g., 1000 for 1 second)"),
     source_database: z.string().optional().describe("Source database/catalog name"),
     source_schema: z.string().optional().describe("Source schema name"),
     target_database: z.string().optional().describe("Target database/catalog name"),
@@ -45,6 +62,10 @@ export const DataDiffRunTool = Tool.define("data_diff", {
         extra_columns: args.extra_columns,
         algorithm: args.algorithm,
         where_clause: args.where_clause,
+        source_where_clause: args.source_where_clause,
+        target_where_clause: args.target_where_clause,
+        numeric_tolerance: args.numeric_tolerance,
+        timestamp_tolerance_ms: args.timestamp_tolerance_ms,
         source_database: args.source_database,
         source_schema: args.source_schema,
         target_database: args.target_database,
@@ -103,6 +124,39 @@ function formatOutcome(outcome: Record<string, unknown>, args: Record<string, un
       lines.push(`Exclusive to table2: ${stats.exclusive_table2 ?? 0}`)
       lines.push(`Updated: ${stats.updated ?? 0}`)
       lines.push(`Diff %: ${((stats.diff_percent as number) * 100).toFixed(2)}%`)
+
+      // Per-column match rates
+      const matchRates = (stats.column_match_rates ?? []) as Record<string, unknown>[]
+      if (matchRates.length > 0) {
+        lines.push("")
+        lines.push("Column Match Rates:")
+        for (const col of matchRates) {
+          const pct = (col.match_percent as number).toFixed(1)
+          lines.push(`  ${col.column}: ${pct}% (${col.matched}/${col.total})`)
+        }
+      }
+
+      // Mismatch samples
+      const samples = (stats.mismatch_samples ?? []) as Record<string, unknown>[]
+      if (samples.length > 0) {
+        lines.push("")
+        lines.push("Sample Mismatches:")
+        for (const s of samples) {
+          const key = (s.key_values as string[] | undefined)?.join(", ") ?? "?"
+          const cat = s.category as string
+          if (cat === "exclusive_table1") {
+            lines.push(`  [${key}] only in source`)
+          } else if (cat === "exclusive_table2") {
+            lines.push(`  [${key}] only in target`)
+          } else if (cat === "null_in_source") {
+            lines.push(`  [${key}] NULL in source, "${s.value_table2}" in target`)
+          } else if (cat === "null_in_target") {
+            lines.push(`  [${key}] "${s.value_table1}" in source, NULL in target`)
+          } else {
+            lines.push(`  [${key}] "${s.value_table1}" vs "${s.value_table2}"`)
+          }
+        }
+      }
     } else {
       lines.push(`Unchanged: ${stats.unchanged ?? stats.rows_table1}`)
     }