
Commit fcfd122

feat: add partition support to data_diff

Split large tables by a date or numeric column before diffing. Each partition is diffed independently, then results are aggregated.

New params:
- partition_column: column to split on (date or numeric)
- partition_granularity: day | week | month | year (for dates)
- partition_bucket_size: bucket width for numeric columns

New output field:
- partition_results: per-partition breakdown (identical / differ / error)

Dialect-aware SQL: Postgres, Snowflake, BigQuery, ClickHouse, MySQL. Skill updated with partition guidance and examples.

1 parent 71d91ee · commit fcfd122
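The numeric-bucketing rule the commit message describes can be sketched as follows. This is an illustrative TypeScript helper, not code from the commit: a key k lands in the bucket starting at floor(k / size) * size, and each bucket is the half-open range [start, start + size).

```typescript
// Hypothetical helpers mirroring the partition_bucket_size rule described
// in the commit message (not the tool's actual internals).
function bucketStart(key: number, bucketSize: number): number {
  // FLOOR(key / size) * size — the start of the bucket containing `key`
  return Math.floor(key / bucketSize) * bucketSize
}

function bucketRange(key: number, bucketSize: number): [number, number] {
  // Half-open range [lo, hi) used to scope one partition's WHERE clause
  const lo = bucketStart(key, bucketSize)
  return [lo, lo + bucketSize]
}

console.log(bucketStart(123456, 100000)) // 100000
console.log(bucketRange(99999, 100000)) // [0, 100000]
```

With partition_bucket_size=100000, key 123456 is diffed inside the [100000, 200000) partition.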

File tree

4 files changed: +348 −4 lines changed

.opencode/skills/data-parity/SKILL.md

Lines changed: 28 additions & 2 deletions

@@ -44,6 +44,9 @@ description: Validate that two tables or query results are identical — or diag
 - `extra_columns` — columns to compare beyond keys (omit = compare all)
 - `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade`
 - `where_clause` — filter applied to both tables
+- `partition_column` — split the table by this column and diff each group independently (recommended for large tables)
+- `partition_granularity` — `day` | `week` | `month` | `year` for date columns (default: `month`)
+- `partition_bucket_size` — for numeric columns: bucket width (e.g. `100000` splits by ranges of 100K)

 > **CRITICAL — Algorithm choice:**
 > - If `source_warehouse` ≠ `target_warehouse` — **always use `hashdiff`** (or `auto`).
@@ -117,8 +120,31 @@ SELECT COUNT(*) FROM orders

 Use this to choose the algorithm:
 - **< 1M rows**: `joindiff` (same DB) or `hashdiff` (cross-DB) — either is fine
-- **1M–100M rows**: `hashdiff` or `cascade`
-- **> 100M rows**: `hashdiff` with a `where_clause` date filter to validate a recent window first
+- **1M–100M rows**: `hashdiff` with `partition_column` for faster, more precise results
+- **> 100M rows**: `hashdiff` + `partition_column` — required; bisection alone may miss rows at this scale
+
+**When to use `partition_column`:**
+- Table has a natural time or key column (e.g. `created_at`, `order_id`, `event_date`)
+- Table has > 500K rows and bisection is slow or returning incomplete results
+- You need per-partition visibility (which month/range has the problem)
+
+```
+// Date column — partition by month
+data_diff(source="lineitem", target="lineitem",
+          key_columns=["l_orderkey", "l_linenumber"],
+          source_warehouse="pg_source", target_warehouse="pg_target",
+          partition_column="l_shipdate", partition_granularity="month",
+          algorithm="hashdiff")
+
+// Numeric column — partition by key ranges of 100K
+data_diff(source="orders", target="orders",
+          key_columns=["o_orderkey"],
+          source_warehouse="pg_source", target_warehouse="pg_target",
+          partition_column="o_orderkey", partition_bucket_size=100000,
+          algorithm="hashdiff")
+```
+
+Output includes an aggregate diff plus a per-partition table showing exactly which ranges differ.

 ### Step 4: Profile first for unknown tables
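The date-partition guidance above amounts to decomposing one big diff into independent, month-scoped diffs. A minimal TypeScript sketch of that decomposition, assuming Postgres-style `DATE_TRUNC` syntax (the helper name and shape here are illustrative, not from the skill):

```typescript
// Illustrative only: turn a list of month-start values into per-partition
// WHERE clauses, one scoped diff per clause. Assumes Postgres DATE_TRUNC syntax.
function partitionWhereClauses(column: string, monthStarts: string[]): string[] {
  return monthStarts.map((m) => `DATE_TRUNC('month', ${column}) = '${m}'`)
}

const clauses = partitionWhereClauses("l_shipdate", ["1995-01-01", "1995-02-01"])
console.log(clauses[0]) // DATE_TRUNC('month', l_shipdate) = '1995-01-01'
```

Each clause is then combined with any user-supplied `where_clause`, so every partition's diff sees only its own rows.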

packages/opencode/src/altimate/native/connections/data-diff.ts

Lines changed: 232 additions & 1 deletion

@@ -7,7 +7,7 @@
  * This file is the bridge between that engine and altimate-code's drivers.
  */

-import type { DataDiffParams, DataDiffResult } from "../types"
+import type { DataDiffParams, DataDiffResult, PartitionDiffResult } from "../types"
 import * as Registry from "./registry"

 // ---------------------------------------------------------------------------
@@ -119,7 +119,238 @@ async function executeQuery(sql: string, warehouseName: string | undefined): Pro

 const MAX_STEPS = 200

+// ---------------------------------------------------------------------------
+// Partition support
+// ---------------------------------------------------------------------------
+
+/**
+ * Build a DATE_TRUNC expression appropriate for the warehouse dialect.
+ */
+function dateTruncExpr(granularity: string, column: string, dialect: string): string {
+  const g = granularity.toLowerCase()
+  switch (dialect) {
+    case "bigquery":
+      return `DATE_TRUNC(${column}, ${g.toUpperCase()})`
+    case "clickhouse":
+      return `toStartOf${g.charAt(0).toUpperCase() + g.slice(1)}(${column})`
+    case "mysql":
+    case "mariadb": {
+      const fmt = { day: "%Y-%m-%d", week: "%Y-%u", month: "%Y-%m-01", year: "%Y-01-01" }[g] ?? "%Y-%m-01"
+      return `DATE_FORMAT(${column}, '${fmt}')`
+    }
+    default:
+      // Postgres, Snowflake, Redshift, DuckDB, etc.
+      return `DATE_TRUNC('${g}', ${column})`
+  }
+}
+
+/**
+ * Build SQL to discover distinct partition values from the source table.
+ */
+function buildPartitionDiscoverySQL(
+  table: string,
+  partitionColumn: string,
+  granularity: string | undefined,
+  bucketSize: number | undefined,
+  dialect: string,
+  whereClause?: string,
+): string {
+  const isNumeric = bucketSize != null
+
+  let expr: string
+  if (isNumeric) {
+    expr = `FLOOR(${partitionColumn} / ${bucketSize}) * ${bucketSize}`
+  } else {
+    expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect)
+  }
+
+  const where = whereClause ? `WHERE ${whereClause}` : ""
+  return `SELECT DISTINCT ${expr} AS _p FROM ${table} ${where} ORDER BY _p`
+}
+
+/**
+ * Build a WHERE clause that scopes to a single partition.
+ */
+function buildPartitionWhereClause(
+  partitionColumn: string,
+  partitionValue: string,
+  granularity: string | undefined,
+  bucketSize: number | undefined,
+  dialect: string,
+): string {
+  if (bucketSize != null) {
+    const lo = Number(partitionValue)
+    const hi = lo + bucketSize
+    return `${partitionColumn} >= ${lo} AND ${partitionColumn} < ${hi}`
+  }
+
+  const expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect)
+
+  // Cast the literal appropriately per dialect
+  switch (dialect) {
+    case "bigquery":
+      return `${expr} = '${partitionValue}'`
+    case "clickhouse":
+      return `${expr} = toDate('${partitionValue}')`
+    case "mysql":
+    case "mariadb":
+      return `${expr} = '${partitionValue}'`
+    default:
+      return `${expr} = '${partitionValue}'`
+  }
+}
+
+/**
+ * Extract DiffStats from a successful outcome (if present).
+ */
+function extractStats(outcome: unknown): {
+  rows_source: number
+  rows_target: number
+  differences: number
+  status: "identical" | "differ"
+} {
+  const o = outcome as any
+  if (!o) return { rows_source: 0, rows_target: 0, differences: 0, status: "identical" }
+
+  if (o.Match) {
+    return {
+      rows_source: o.Match.row_count ?? 0,
+      rows_target: o.Match.row_count ?? 0,
+      differences: 0,
+      status: "identical",
+    }
+  }
+
+  if (o.Diff) {
+    const d = o.Diff
+    return {
+      rows_source: d.total_source_rows ?? 0,
+      rows_target: d.total_target_rows ?? 0,
+      differences: (d.rows_only_in_source ?? 0) + (d.rows_only_in_target ?? 0) + (d.rows_updated ?? 0),
+      status: "differ",
+    }
+  }
+
+  return { rows_source: 0, rows_target: 0, differences: 0, status: "identical" }
+}
+
+/**
+ * Merge two Diff outcomes into one aggregated Diff outcome.
+ */
+function mergeOutcomes(accumulated: unknown, next: unknown): unknown {
+  const a = accumulated as any
+  const n = next as any
+
+  const aD = a?.Diff ?? (a?.Match ? { total_source_rows: a.Match.row_count, total_target_rows: a.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: a.Match.row_count, sample_diffs: [] } : null)
+  const nD = n?.Diff ?? (n?.Match ? { total_source_rows: n.Match.row_count, total_target_rows: n.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: n.Match.row_count, sample_diffs: [] } : null)
+
+  if (!aD && !nD) return { Match: { row_count: 0 } }
+  if (!aD) return next
+  if (!nD) return accumulated
+
+  const merged = {
+    total_source_rows: (aD.total_source_rows ?? 0) + (nD.total_source_rows ?? 0),
+    total_target_rows: (aD.total_target_rows ?? 0) + (nD.total_target_rows ?? 0),
+    rows_only_in_source: (aD.rows_only_in_source ?? 0) + (nD.rows_only_in_source ?? 0),
+    rows_only_in_target: (aD.rows_only_in_target ?? 0) + (nD.rows_only_in_target ?? 0),
+    rows_updated: (aD.rows_updated ?? 0) + (nD.rows_updated ?? 0),
+    rows_identical: (aD.rows_identical ?? 0) + (nD.rows_identical ?? 0),
+    sample_diffs: [...(aD.sample_diffs ?? []), ...(nD.sample_diffs ?? [])].slice(0, 20),
+  }
+
+  const totalDiff = merged.rows_only_in_source + merged.rows_only_in_target + merged.rows_updated
+  if (totalDiff === 0) {
+    return { Match: { row_count: merged.total_source_rows, algorithm: "partitioned" } }
+  }
+  return { Diff: merged }
+}
+
+/**
+ * Run a partitioned diff: discover partition values, diff each partition independently,
+ * then aggregate results.
+ */
+async function runPartitionedDiff(params: DataDiffParams): Promise<DataDiffResult> {
+  const resolveDialect = (warehouse: string | undefined): string => {
+    if (warehouse) {
+      const cfg = Registry.getConfig(warehouse)
+      return cfg?.type ?? "generic"
+    }
+    const warehouses = Registry.list().warehouses
+    return warehouses[0]?.type ?? "generic"
+  }
+
+  const sourceDialect = resolveDialect(params.source_warehouse)
+  const { table1Name } = resolveTableSources(params.source, params.target)
+
+  // Discover partition values from source
+  const discoverySql = buildPartitionDiscoverySQL(
+    table1Name,
+    params.partition_column!,
+    params.partition_granularity,
+    params.partition_bucket_size,
+    sourceDialect,
+    params.where_clause,
+  )
+
+  let partitionValues: string[]
+  try {
+    const rows = await executeQuery(discoverySql, params.source_warehouse)
+    partitionValues = rows.map((r) => String(r[0] ?? "")).filter(Boolean)
+  } catch (e) {
+    return { success: false, error: `Partition discovery failed: ${e}`, steps: 0 }
+  }
+
+  if (partitionValues.length === 0) {
+    return { success: true, steps: 1, outcome: { Match: { row_count: 0, algorithm: "partitioned" } }, partition_results: [] }
+  }
+
+  // Diff each partition
+  const partitionResults: PartitionDiffResult[] = []
+  let aggregatedOutcome: unknown = null
+  let totalSteps = 1
+
+  for (const pVal of partitionValues) {
+    const partWhere = buildPartitionWhereClause(
+      params.partition_column!,
+      pVal,
+      params.partition_granularity,
+      params.partition_bucket_size,
+      sourceDialect,
+    )
+    const fullWhere = params.where_clause ? `(${params.where_clause}) AND (${partWhere})` : partWhere
+
+    const result = await runDataDiff({
+      ...params,
+      where_clause: fullWhere,
+      partition_column: undefined, // prevent recursion
+    })
+
+    totalSteps += result.steps
+
+    if (!result.success) {
+      partitionResults.push({ partition: pVal, rows_source: 0, rows_target: 0, differences: 0, status: "error", error: result.error })
+      continue
+    }
+
+    const stats = extractStats(result.outcome)
+    partitionResults.push({ partition: pVal, ...stats })
+    aggregatedOutcome = aggregatedOutcome == null ? result.outcome : mergeOutcomes(aggregatedOutcome, result.outcome)
+  }
+
+  return {
+    success: true,
+    steps: totalSteps,
+    outcome: aggregatedOutcome ?? { Match: { row_count: 0, algorithm: "partitioned" } },
+    partition_results: partitionResults,
+  }
+}
+
 export async function runDataDiff(params: DataDiffParams): Promise<DataDiffResult> {
+  // Dispatch to partitioned diff if partition_column is set
+  if (params.partition_column) {
+    return runPartitionedDiff(params)
+  }
+
   // Dynamically import NAPI module (not available in test environments without the binary)
   let DataParitySession: new (specJson: string) => {
     start(): string
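The aggregation idea behind the mergeOutcomes function in this file can be shown on a simplified stats shape. This is a reduced sketch, not the commit's code: the field names mirror the Diff payload above, and the collapse-to-Match rule is the same (zero total differences after merging means the tables match overall).

```typescript
// Simplified restatement of the merge-and-collapse logic (illustrative only).
interface DiffStats {
  rows_only_in_source: number
  rows_only_in_target: number
  rows_updated: number
  rows_identical: number
}

function mergeStats(a: DiffStats, b: DiffStats): DiffStats {
  // Per-partition counters are independent, so aggregation is field-wise addition.
  return {
    rows_only_in_source: a.rows_only_in_source + b.rows_only_in_source,
    rows_only_in_target: a.rows_only_in_target + b.rows_only_in_target,
    rows_updated: a.rows_updated + b.rows_updated,
    rows_identical: a.rows_identical + b.rows_identical,
  }
}

function isMatch(s: DiffStats): boolean {
  // Collapse to a Match outcome only when no partition contributed a difference.
  return s.rows_only_in_source + s.rows_only_in_target + s.rows_updated === 0
}

const merged = mergeStats(
  { rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 2, rows_identical: 98 },
  { rows_only_in_source: 1, rows_only_in_target: 0, rows_updated: 0, rows_identical: 50 },
)
console.log(merged.rows_updated, isMatch(merged)) // 2 false
```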

packages/opencode/src/altimate/native/types.ts

Lines changed: 35 additions & 0 deletions

@@ -985,13 +985,48 @@ export interface DataDiffParams {
   numeric_tolerance?: number
   /** Timestamp tolerance in milliseconds */
   timestamp_tolerance_ms?: number
+  /**
+   * Column to partition on before diffing. The table is split into groups by
+   * this column and each group is diffed independently. Results are aggregated.
+   * Use for large tables where bisection alone is too slow or imprecise.
+   *
+   * Examples: "l_shipdate" (date column), "l_orderkey" (numeric column)
+   */
+  partition_column?: string
+  /**
+   * Granularity for date partition columns: "day" | "week" | "month" | "year".
+   * For numeric columns, ignored — use partition_bucket_size instead.
+   * Defaults to "month".
+   */
+  partition_granularity?: "day" | "week" | "month" | "year"
+  /**
+   * For numeric partition columns: size of each bucket.
+   * E.g. 100000 splits l_orderkey into [0, 100000), [100000, 200000), …
+   */
+  partition_bucket_size?: number
+}
+
+export interface PartitionDiffResult {
+  /** The partition value (date string or numeric bucket start) */
+  partition: string
+  /** Source row count in this partition */
+  rows_source: number
+  /** Target row count in this partition */
+  rows_target: number
+  /** Total differences found (exclusive + updated) */
+  differences: number
+  /** "identical" | "differ" | "error" */
+  status: "identical" | "differ" | "error"
+  error?: string
 }

 export interface DataDiffResult {
   success: boolean
   steps: number
   outcome?: unknown
   error?: string
+  /** Per-partition breakdown when partition_column is used */
+  partition_results?: PartitionDiffResult[]
 }

 // --- Method registry ---
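A hypothetical consumer of the new partition_results field might summarize which partitions need attention. The interface below restates the PartitionDiffResult shape from this commit; the reporting helper itself is an assumption for illustration:

```typescript
// PartitionDiffResult as declared in the commit's types.ts.
interface PartitionDiffResult {
  partition: string
  rows_source: number
  rows_target: number
  differences: number
  status: "identical" | "differ" | "error"
  error?: string
}

// Hypothetical report helper: list only partitions that differ or errored.
function problemPartitions(results: PartitionDiffResult[]): string[] {
  return results
    .filter((r) => r.status !== "identical")
    .map((r) => `${r.partition}: ${r.status} (${r.differences} differences)`)
}

const report = problemPartitions([
  { partition: "1995-01-01", rows_source: 10, rows_target: 10, differences: 0, status: "identical" },
  { partition: "1995-02-01", rows_source: 12, rows_target: 11, differences: 3, status: "differ" },
])
console.log(report) // ["1995-02-01: differ (3 differences)"]
```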
