fix: auto-discover extra_columns and exclude audit/timestamp columns from data diff

aidtya · suryaiyer95 · commit 9ed95bbf24b1 · 2026-03-30T19:00:31.000-07:00
The Rust engine only compares columns explicitly listed in extra_columns.
When omitted, it was silently reporting all key-matched rows as 'identical'
even when non-key values differed — a false positive bug.

Changes:
- Auto-discover columns from the source table's schema (information_schema;
  DESCRIBE TABLE on ClickHouse, SHOW COLUMNS on Snowflake) when extra_columns
  is omitted and source is a plain table name (not a SQL query)
- Exclude audit/timestamp columns (updated_at, created_at, inserted_at,
  modified_at, _fivetran_*, _airbyte_*, publisher_last_updated_*, etc.)
  from comparison by default since they typically differ due to ETL timing
- Report excluded columns in tool output so users know what was skipped
- Fix misleading tool description that said 'Omit to compare all columns'
- Update SKILL.md with critical guidance on extra_columns behavior
diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md
@@ -256,6 +256,18 @@ Output includes aggregate diff + per-partition breakdown showing which group has
 
 ---
 
+## CRITICAL: `extra_columns` Behavior
+
+The Rust engine **only compares columns listed in `extra_columns`**. If the list is empty, it compares key existence only — rows that match on key but differ in values will be silently reported as "identical". This is the most common source of false positives.
+
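+**Auto-discovery (default for table names):** When `extra_columns` is omitted (or empty) and the source is a plain table name, `data_diff` auto-discovers all non-key columns from the source table's schema (`information_schema.columns`; `DESCRIBE TABLE` on ClickHouse, `SHOW COLUMNS` on Snowflake) and excludes audit/timestamp columns (like `updated_at`, `created_at`, `inserted_at`, `modified_at`, `publisher_last_updated_epoch_ms`, ETL metadata columns like `_fivetran_synced`, etc.). The output will list which columns were auto-excluded. If discovery fails or finds no columns, the diff falls back to key-only comparison with no warning; pass `extra_columns` explicitly when a value-level comparison must be guaranteed.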
+
+**SQL queries:** When source is a SQL query (not a table name), auto-discovery cannot work. You **must** provide `extra_columns` explicitly. If you don't, only key-level matching occurs.
+
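+**When to override auto-exclusion:** If the user specifically wants to compare audit columns (e.g., verifying that `created_at` was preserved during migration), pass those columns explicitly in `extra_columns`. An explicit list disables auto-discovery entirely, so include every column that should be compared, not just the audit columns.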
+
+---
+
 ## Common Mistakes
 
 **Writing manual diff SQL instead of calling data_diff**
@@ -272,3 +284,6 @@ Output includes aggregate diff + per-partition breakdown showing which group has
 
 **Running full diff on a billion-row table without asking**
 → Always ask the user before expensive operations. Offer filtering and partition options.
+
+**Omitting extra_columns when source is a SQL query**
+→ Auto-discovery only works for table names. For SQL queries, always list the columns to compare explicitly.
diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts
@@ -113,6 +113,138 @@ async function executeQuery(sql: string, warehouseName: string | undefined): Pro
   )
 }
 
+// ---------------------------------------------------------------------------
+// Column auto-discovery and audit column exclusion
+// ---------------------------------------------------------------------------
+
+/**
+ * Patterns that match audit/timestamp columns which should be excluded from
+ * value comparison by default. These columns typically differ between source
+ * and target due to ETL timing, sync metadata, or pipeline bookkeeping —
+ * not because of actual data discrepancies.
+ *
+ * Every pattern carries the `i` flag, so matching is case-insensitive. A
+ * column counts as an audit column when ANY one of these patterns matches
+ * (see `isAuditColumn`).
+ *
+ * NOTE(review): several patterns are anchored on one side only, or not at
+ * all (for example the `epoch_ms$` and `_(created|updated|...)$` patterns),
+ * so names such as `account_created` or `event_epoch_ms` are excluded too.
+ * Confirm that this breadth is intended.
+ */
+const AUDIT_COLUMN_PATTERNS = [
+  // Exact <verb>_<suffix> names, e.g. created_at, updated_on, synced_ts
+  /^(created|updated|modified|inserted|deleted|synced|published|ingested|loaded|extracted|refreshed)_(at|on|date|time|timestamp|ts|dt|epoch)$/i,
+  // Names ending in a bare audit verb: *_created, *_updated, *_synced, ...
+  /_(created|updated|modified|inserted|deleted|synced|published|ingested|loaded|extracted|refreshed)$/i,
+  // ETL metadata columns (prefix match only; anything may follow the prefix)
+  /^(etl|elt|dbt|pipeline|batch|sync|publish|ingest)_(created|updated|modified|loaded|run|timestamp|ts|time|at|epoch)/i,
+  /^(_sdc_|_airbyte_|_fivetran_|_stitch_|__hevo_)/i,
+  // Generic timestamp metadata (first is an exact match, second a prefix match)
+  /^(last_updated|last_modified|date_updated|date_modified|date_created|row_updated|row_created)$/i,
+  /^(publisher_last_updated|publisher_updated)/i,
+  // Epoch variants: "<verb>_epoch" anywhere in the name, or any name ending in "epoch_ms"
+  /(updated|modified|created|inserted|published|loaded|synced)_epoch/i,
+  /epoch_ms$/i,
+]
+
+/**
+ * Check whether a column name matches known audit/timestamp patterns.
+ *
+ * @param columnName Column name to classify; it is tested as-is (no trimming
+ *   or normalisation is done here).
+ * @returns `true` if at least one regex in `AUDIT_COLUMN_PATTERNS` matches,
+ *   `false` otherwise.
+ */
+function isAuditColumn(columnName: string): boolean {
+  return AUDIT_COLUMN_PATTERNS.some((pattern) => pattern.test(columnName))
+}
+
+/**
+ * Build a query to discover column names for a table, appropriate for the dialect.
+ *
+ * `tableName` is split on "." to derive the filters:
+ *   - 3 parts (`db.schema.table`): schema = 2nd part, table = 3rd part; the
+ *     first part is not used in any filter.
+ *   - 2 parts (`schema.table`): schema = 1st part, table = 2nd part.
+ *   - any other count: only the first part is used, as the table name, and
+ *     no schema filter is applied.
+ *
+ * Dialect handling:
+ *   - "clickhouse": `DESCRIBE TABLE <tableName>`
+ *   - "snowflake":  `SHOW COLUMNS IN TABLE <tableName>`
+ *   - anything else: `SELECT column_name FROM information_schema.columns`
+ *     filtered by the derived table/schema names, ordered by `ordinal_position`.
+ * The derived filters are only used by the information_schema branch.
+ *
+ * NOTE(review): the name parts are interpolated verbatim. There is no escaping
+ * of single quotes, no handling of quoted identifiers that contain dots, and
+ * no case folding. Confirm callers only pass trusted, simple identifiers.
+ * With an unqualified name the information_schema query has no schema filter,
+ * so it may also return columns of same-named tables in other schemas.
+ *
+ * @param tableName Table reference: `table`, `schema.table` or `db.schema.table`.
+ * @param dialect Dialect string; compared against "clickhouse" and "snowflake".
+ * @returns The SQL text of the discovery query.
+ */
+function buildColumnDiscoverySQL(tableName: string, dialect: string): string {
+  // Parse schema.table or db.schema.table
+  const parts = tableName.split(".")
+  let schemaFilter = ""
+  let tableFilter = ""
+
+  if (parts.length === 3) {
+    schemaFilter = `table_schema = '${parts[1]}'`
+    tableFilter = `table_name = '${parts[2]}'`
+  } else if (parts.length === 2) {
+    schemaFilter = `table_schema = '${parts[0]}'`
+    tableFilter = `table_name = '${parts[1]}'`
+  } else {
+    tableFilter = `table_name = '${parts[0]}'`
+  }
+
+  switch (dialect) {
+    case "clickhouse":
+      return `DESCRIBE TABLE ${tableName}`
+    case "snowflake":
+      return `SHOW COLUMNS IN TABLE ${tableName}`
+    default: {
+      // Postgres, MySQL, Redshift, DuckDB, etc. — use information_schema
+      const conditions = [tableFilter]
+      if (schemaFilter) conditions.push(schemaFilter)
+      return `SELECT column_name FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position`
+    }
+  }
+}
+
+/**
+ * Parse column names from the discovery query result, handling dialect differences.
+ *
+ * Picks one cell per row (index 2 for "snowflake", index 0 for "clickhouse"
+ * and every other dialect) and drops rows where that cell is null, missing
+ * or an empty string. Row order is preserved.
+ *
+ * @param rows Result rows of the discovery query, each an array of cell values.
+ * @param dialect Dialect string; selects which cell holds the column name.
+ * @returns Column names in result order.
+ */
+function parseColumnNames(rows: (string | null)[][], dialect: string): string[] {
+  switch (dialect) {
+    case "clickhouse":
+      // DESCRIBE returns: name, type, default_type, default_expression, ...
+      return rows.map((r) => r[0] ?? "").filter(Boolean)
+    case "snowflake":
+      // SHOW COLUMNS returns: table_name, schema_name, column_name, data_type, ...
+      // column_name is at index 2
+      return rows.map((r) => r[2] ?? "").filter(Boolean)
+    default:
+      // information_schema returns: column_name
+      return rows.map((r) => r[0] ?? "").filter(Boolean)
+  }
+}
+
+/**
+ * Auto-discover non-key, non-audit columns for a table.
+ *
+ * When the caller omits `extra_columns`, we query the source table's schema to
+ * find all columns, then exclude:
+ *   1. Key columns (already used for matching)
+ *   2. Audit/timestamp columns (updated_at, created_at, etc.) that typically
+ *      differ between source and target due to ETL timing
+ *
+ * Returns the list of columns to compare, or undefined if discovery fails
+ * (in which case the engine falls back to key-only comparison).
+ *
+ * Details:
+ *   - Returns undefined without querying when `SQL_KEYWORDS` matches
+ *     `tableName` (presumably meaning the value is a SQL query rather than a
+ *     table name; `SQL_KEYWORDS` is defined outside this block).
+ *   - Returns undefined when the discovery query yields no column names, or
+ *     when anything inside the try block throws; the error is discarded.
+ *   - Key columns are matched case-insensitively; audit detection is
+ *     delegated to `isAuditColumn`.
+ *   - `columns` may be an empty array when every non-key column is an audit
+ *     column.
+ *   - Both returned lists keep the order of the discovery result.
+ *
+ * @param tableName Table to inspect; only this one table is queried.
+ * @param keyColumns Key column names to leave out of the result.
+ * @param dialect Dialect string used to build and parse the discovery query.
+ * @param warehouseName Passed through to `executeQuery`; may be undefined.
+ * @returns `{ columns, excludedAudit }`, or undefined as described above.
+ */
+async function discoverExtraColumns(
+  tableName: string,
+  keyColumns: string[],
+  dialect: string,
+  warehouseName: string | undefined,
+): Promise<{ columns: string[]; excludedAudit: string[] } | undefined> {
+  // Only works for plain table names, not SQL queries
+  if (SQL_KEYWORDS.test(tableName)) return undefined
+
+  try {
+    const sql = buildColumnDiscoverySQL(tableName, dialect)
+    const rows = await executeQuery(sql, warehouseName)
+    const allColumns = parseColumnNames(rows, dialect)
+
+    if (allColumns.length === 0) return undefined
+
+    const keySet = new Set(keyColumns.map((k) => k.toLowerCase()))
+    const extraColumns: string[] = []
+    const excludedAudit: string[] = []
+
+    for (const col of allColumns) {
+      if (keySet.has(col.toLowerCase())) continue
+      if (isAuditColumn(col)) {
+        excludedAudit.push(col)
+      } else {
+        extraColumns.push(col)
+      }
+    }
+
+    return { columns: extraColumns, excludedAudit }
+  } catch {
+    // Schema discovery failed — fall back to engine default (key-only)
+    return undefined
+  }
+}
+
 // ---------------------------------------------------------------------------
 // Main orchestrator
 // ---------------------------------------------------------------------------
@@ -426,6 +558,26 @@ export async function runDataDiff(params: DataDiffParams): Promise<DataDiffResul
   const dialect1 = resolveDialect(params.source_warehouse)
   const dialect2 = resolveDialect(params.target_warehouse ?? params.source_warehouse)
 
+  // Auto-discover extra_columns when not explicitly provided.
+  // The Rust engine only compares columns listed in extra_columns — if the list is
+  // empty, it compares key existence only and reports all matched rows as "identical"
+  // even when non-key values differ. This auto-discovery prevents that silent bug.
+  let extraColumns = params.extra_columns
+  let excludedAuditColumns: string[] = []
+
+  if (!extraColumns || extraColumns.length === 0) {
+    const discovered = await discoverExtraColumns(
+      params.source,
+      params.key_columns,
+      dialect1,
+      params.source_warehouse,
+    )
+    if (discovered) {
+      extraColumns = discovered.columns
+      excludedAuditColumns = discovered.excludedAudit
+    }
+  }
+
   // Build session spec
   const spec = {
     table1: table1Ref,
@@ -435,7 +587,7 @@ export async function runDataDiff(params: DataDiffParams): Promise<DataDiffResul
     config: {
       algorithm: params.algorithm ?? "auto",
       key_columns: params.key_columns,
-      extra_columns: params.extra_columns ?? [],
+      extra_columns: extraColumns ?? [],
       ...(params.where_clause ? { where_clause: params.where_clause } : {}),
       ...(params.numeric_tolerance != null ? { numeric_tolerance: params.numeric_tolerance } : {}),
       ...(params.timestamp_tolerance_ms != null
@@ -477,6 +629,7 @@ export async function runDataDiff(params: DataDiffParams): Promise<DataDiffResul
         success: true,
         steps: stepCount,
         outcome: action.outcome,
+        ...(excludedAuditColumns.length > 0 ? { excluded_audit_columns: excludedAuditColumns } : {}),
       }
     }
 
diff --git a/packages/opencode/src/altimate/native/types.ts b/packages/opencode/src/altimate/native/types.ts
@@ -1027,6 +1027,8 @@ export interface DataDiffResult {
   error?: string
   /** Per-partition breakdown when partition_column is used */
   partition_results?: PartitionDiffResult[]
+  /** Columns auto-excluded from comparison (audit/timestamp columns like updated_at, created_at) */
+  excluded_audit_columns?: string[]
 }
 
 // --- Method registry ---
diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts
@@ -35,7 +35,13 @@ export const DataDiffTool = Tool.define("data_diff", {
     extra_columns: z
       .array(z.string())
       .optional()
-      .describe("Additional columns to compare beyond the key columns. Omit to compare all columns"),
+      .describe(
+        "Columns to compare beyond the key columns. " +
+        "IMPORTANT: If omitted AND source is a plain table name, columns are auto-discovered from the schema " +
+        "(excluding key columns and audit/timestamp columns like updated_at, created_at, inserted_at, modified_at). " +
+        "If omitted AND source is a SQL query, ONLY key columns are compared — value changes in non-key columns will NOT be detected. " +
+        "Always provide explicit extra_columns when comparing SQL queries to ensure value-level comparison."
+      ),
     algorithm: z
       .enum(["auto", "joindiff", "hashdiff", "profile", "cascade"])
       .optional()
@@ -111,6 +117,12 @@ export const DataDiffTool = Tool.define("data_diff", {
         output += formatPartitionResults(result.partition_results, args.partition_column!)
       }
 
+      // Report auto-excluded audit columns so the LLM and user know what was skipped
+      const excluded = (result as any).excluded_audit_columns as string[] | undefined
+      if (excluded && excluded.length > 0) {
+        output += `\n\n  Note: ${excluded.length} audit/timestamp column${excluded.length === 1 ? "" : "s"} auto-excluded from comparison: ${excluded.join(", ")}`
+      }
+
       return {
         title: `Data diff: ${summarize(outcome)}`,
         metadata: { success: true, steps: result.steps },

Original file line number	Diff line number	Diff line change
`@@ -1027,6 +1027,8 @@ export interface DataDiffResult {`
`1027`	`1027`	`error?: string`
`1028`	`1028`	`/** Per-partition breakdown when partition_column is used */`
`1029`	`1029`	`partition_results?: PartitionDiffResult[]`
	`1030`	`+ /** Columns auto-excluded from comparison (audit/timestamp columns like updated_at, created_at) */`
	`1031`	`+ excluded_audit_columns?: string[]`
`1030`	`1032`	`}`
`1031`	`1033`
`1032`	`1034`	`// --- Method registry ---`