Skip to content

Commit 9ed95bb

Browse files
aidtyasuryaiyer95
authored andcommitted
fix: auto-discover extra_columns and exclude audit/timestamp columns from data diff
The Rust engine only compares columns explicitly listed in extra_columns. When omitted, it was silently reporting all key-matched rows as 'identical' even when non-key values differed — a false positive bug. Changes: - Auto-discover columns from information_schema when extra_columns is omitted and source is a plain table name (not a SQL query) - Exclude audit/timestamp columns (updated_at, created_at, inserted_at, modified_at, _fivetran_*, _airbyte_*, publisher_last_updated_*, etc.) from comparison by default since they typically differ due to ETL timing - Report excluded columns in tool output so users know what was skipped - Fix misleading tool description that said 'Omit to compare all columns' - Update SKILL.md with critical guidance on extra_columns behavior
1 parent 49686ef commit 9ed95bb

File tree

4 files changed

+184
-2
lines changed

4 files changed

+184
-2
lines changed

.opencode/skills/data-parity/SKILL.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,18 @@ Output includes aggregate diff + per-partition breakdown showing which group has
256256

257257
---
258258

259+
## CRITICAL: `extra_columns` Behavior
260+
261+
The Rust engine **only compares columns listed in `extra_columns`**. If the list is empty, it compares key existence only — rows that match on key but differ in values will be silently reported as "identical". This is the most common source of false positives.
262+
263+
**Auto-discovery (default for table names):** When `extra_columns` is omitted and the source is a plain table name, `data_diff` auto-discovers all non-key columns from `information_schema` and excludes audit/timestamp columns (like `updated_at`, `created_at`, `inserted_at`, `modified_at`, `publisher_last_updated_epoch_ms`, ETL metadata columns like `_fivetran_synced`, etc.). The output will list which columns were auto-excluded.
264+
265+
**SQL queries:** When source is a SQL query (not a table name), auto-discovery cannot work. You **must** provide `extra_columns` explicitly. If you don't, only key-level matching occurs.
266+
267+
**When to override auto-exclusion:** If the user specifically wants to compare audit columns (e.g., verifying that `created_at` was preserved during migration), pass those columns explicitly in `extra_columns`.
268+
269+
---
270+
259271
## Common Mistakes
260272

261273
**Writing manual diff SQL instead of calling data_diff**
@@ -272,3 +284,6 @@ Output includes aggregate diff + per-partition breakdown showing which group has
272284

273285
**Running full diff on a billion-row table without asking**
274286
→ Always ask the user before expensive operations. Offer filtering and partition options.
287+
288+
**Omitting extra_columns when source is a SQL query**
289+
→ Auto-discovery only works for table names. For SQL queries, always list the columns to compare explicitly.

packages/opencode/src/altimate/native/connections/data-diff.ts

Lines changed: 154 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,138 @@ async function executeQuery(sql: string, warehouseName: string | undefined): Pro
113113
)
114114
}
115115

116+
// ---------------------------------------------------------------------------
117+
// Column auto-discovery and audit column exclusion
118+
// ---------------------------------------------------------------------------
119+
120+
/**
121+
* Patterns that match audit/timestamp columns which should be excluded from
122+
* value comparison by default. These columns typically differ between source
123+
* and target due to ETL timing, sync metadata, or pipeline bookkeeping —
124+
* not because of actual data discrepancies.
125+
*/
126+
const AUDIT_COLUMN_PATTERNS = [
127+
// Exact common names
128+
/^(created|updated|modified|inserted|deleted|synced|published|ingested|loaded|extracted|refreshed)_(at|on|date|time|timestamp|ts|dt|epoch)$/i,
129+
// Suffix patterns: *_at, *_on with temporal prefix
130+
/_(created|updated|modified|inserted|deleted|synced|published|ingested|loaded|extracted|refreshed)$/i,
131+
// ETL metadata columns
132+
/^(etl|elt|dbt|pipeline|batch|sync|publish|ingest)_(created|updated|modified|loaded|run|timestamp|ts|time|at|epoch)/i,
133+
/^(_sdc_|_airbyte_|_fivetran_|_stitch_|__hevo_)/i,
134+
// Generic timestamp metadata
135+
/^(last_updated|last_modified|date_updated|date_modified|date_created|row_updated|row_created)$/i,
136+
/^(publisher_last_updated|publisher_updated)/i,
137+
// Epoch variants
138+
/(updated|modified|created|inserted|published|loaded|synced)_epoch/i,
139+
/epoch_ms$/i,
140+
]
141+
142+
/**
143+
* Check whether a column name matches known audit/timestamp patterns.
144+
*/
145+
function isAuditColumn(columnName: string): boolean {
146+
return AUDIT_COLUMN_PATTERNS.some((pattern) => pattern.test(columnName))
147+
}
148+
149+
/**
150+
* Build a query to discover column names for a table, appropriate for the dialect.
151+
*/
152+
function buildColumnDiscoverySQL(tableName: string, dialect: string): string {
153+
// Parse schema.table or db.schema.table
154+
const parts = tableName.split(".")
155+
let schemaFilter = ""
156+
let tableFilter = ""
157+
158+
if (parts.length === 3) {
159+
schemaFilter = `table_schema = '${parts[1]}'`
160+
tableFilter = `table_name = '${parts[2]}'`
161+
} else if (parts.length === 2) {
162+
schemaFilter = `table_schema = '${parts[0]}'`
163+
tableFilter = `table_name = '${parts[1]}'`
164+
} else {
165+
tableFilter = `table_name = '${parts[0]}'`
166+
}
167+
168+
switch (dialect) {
169+
case "clickhouse":
170+
return `DESCRIBE TABLE ${tableName}`
171+
case "snowflake":
172+
return `SHOW COLUMNS IN TABLE ${tableName}`
173+
default: {
174+
// Postgres, MySQL, Redshift, DuckDB, etc. — use information_schema
175+
const conditions = [tableFilter]
176+
if (schemaFilter) conditions.push(schemaFilter)
177+
return `SELECT column_name FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position`
178+
}
179+
}
180+
}
181+
182+
/**
183+
* Parse column names from the discovery query result, handling dialect differences.
184+
*/
185+
function parseColumnNames(rows: (string | null)[][], dialect: string): string[] {
186+
switch (dialect) {
187+
case "clickhouse":
188+
// DESCRIBE returns: name, type, default_type, default_expression, ...
189+
return rows.map((r) => r[0] ?? "").filter(Boolean)
190+
case "snowflake":
191+
// SHOW COLUMNS returns: table_name, schema_name, column_name, data_type, ...
192+
// column_name is at index 2
193+
return rows.map((r) => r[2] ?? "").filter(Boolean)
194+
default:
195+
// information_schema returns: column_name
196+
return rows.map((r) => r[0] ?? "").filter(Boolean)
197+
}
198+
}
199+
200+
/**
201+
* Auto-discover non-key, non-audit columns for a table.
202+
*
203+
* When the caller omits `extra_columns`, we query the source table's schema to
204+
* find all columns, then exclude:
205+
* 1. Key columns (already used for matching)
206+
* 2. Audit/timestamp columns (updated_at, created_at, etc.) that typically
207+
* differ between source and target due to ETL timing
208+
*
209+
* Returns the list of columns to compare, or undefined if discovery fails
210+
* (in which case the engine falls back to key-only comparison).
211+
*/
212+
async function discoverExtraColumns(
213+
tableName: string,
214+
keyColumns: string[],
215+
dialect: string,
216+
warehouseName: string | undefined,
217+
): Promise<{ columns: string[]; excludedAudit: string[] } | undefined> {
218+
// Only works for plain table names, not SQL queries
219+
if (SQL_KEYWORDS.test(tableName)) return undefined
220+
221+
try {
222+
const sql = buildColumnDiscoverySQL(tableName, dialect)
223+
const rows = await executeQuery(sql, warehouseName)
224+
const allColumns = parseColumnNames(rows, dialect)
225+
226+
if (allColumns.length === 0) return undefined
227+
228+
const keySet = new Set(keyColumns.map((k) => k.toLowerCase()))
229+
const extraColumns: string[] = []
230+
const excludedAudit: string[] = []
231+
232+
for (const col of allColumns) {
233+
if (keySet.has(col.toLowerCase())) continue
234+
if (isAuditColumn(col)) {
235+
excludedAudit.push(col)
236+
} else {
237+
extraColumns.push(col)
238+
}
239+
}
240+
241+
return { columns: extraColumns, excludedAudit }
242+
} catch {
243+
// Schema discovery failed — fall back to engine default (key-only)
244+
return undefined
245+
}
246+
}
247+
116248
// ---------------------------------------------------------------------------
117249
// Main orchestrator
118250
// ---------------------------------------------------------------------------
@@ -426,6 +558,26 @@ export async function runDataDiff(params: DataDiffParams): Promise<DataDiffResul
426558
const dialect1 = resolveDialect(params.source_warehouse)
427559
const dialect2 = resolveDialect(params.target_warehouse ?? params.source_warehouse)
428560

561+
// Auto-discover extra_columns when not explicitly provided.
562+
// The Rust engine only compares columns listed in extra_columns — if the list is
563+
// empty, it compares key existence only and reports all matched rows as "identical"
564+
// even when non-key values differ. This auto-discovery prevents that silent bug.
565+
let extraColumns = params.extra_columns
566+
let excludedAuditColumns: string[] = []
567+
568+
if (!extraColumns || extraColumns.length === 0) {
569+
const discovered = await discoverExtraColumns(
570+
params.source,
571+
params.key_columns,
572+
dialect1,
573+
params.source_warehouse,
574+
)
575+
if (discovered) {
576+
extraColumns = discovered.columns
577+
excludedAuditColumns = discovered.excludedAudit
578+
}
579+
}
580+
429581
// Build session spec
430582
const spec = {
431583
table1: table1Ref,
@@ -435,7 +587,7 @@ export async function runDataDiff(params: DataDiffParams): Promise<DataDiffResul
435587
config: {
436588
algorithm: params.algorithm ?? "auto",
437589
key_columns: params.key_columns,
438-
extra_columns: params.extra_columns ?? [],
590+
extra_columns: extraColumns ?? [],
439591
...(params.where_clause ? { where_clause: params.where_clause } : {}),
440592
...(params.numeric_tolerance != null ? { numeric_tolerance: params.numeric_tolerance } : {}),
441593
...(params.timestamp_tolerance_ms != null
@@ -477,6 +629,7 @@ export async function runDataDiff(params: DataDiffParams): Promise<DataDiffResul
477629
success: true,
478630
steps: stepCount,
479631
outcome: action.outcome,
632+
...(excludedAuditColumns.length > 0 ? { excluded_audit_columns: excludedAuditColumns } : {}),
480633
}
481634
}
482635

packages/opencode/src/altimate/native/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1027,6 +1027,8 @@ export interface DataDiffResult {
10271027
error?: string
10281028
/** Per-partition breakdown when partition_column is used */
10291029
partition_results?: PartitionDiffResult[]
1030+
/** Columns auto-excluded from comparison (audit/timestamp columns like updated_at, created_at) */
1031+
excluded_audit_columns?: string[]
10301032
}
10311033

10321034
// --- Method registry ---

packages/opencode/src/altimate/tools/data-diff.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,13 @@ export const DataDiffTool = Tool.define("data_diff", {
3535
extra_columns: z
3636
.array(z.string())
3737
.optional()
38-
.describe("Additional columns to compare beyond the key columns. Omit to compare all columns"),
38+
.describe(
39+
"Columns to compare beyond the key columns. " +
40+
"IMPORTANT: If omitted AND source is a plain table name, columns are auto-discovered from the schema " +
41+
"(excluding key columns and audit/timestamp columns like updated_at, created_at, inserted_at, modified_at). " +
42+
"If omitted AND source is a SQL query, ONLY key columns are compared — value changes in non-key columns will NOT be detected. " +
43+
"Always provide explicit extra_columns when comparing SQL queries to ensure value-level comparison."
44+
),
3945
algorithm: z
4046
.enum(["auto", "joindiff", "hashdiff", "profile", "cascade"])
4147
.optional()
@@ -111,6 +117,12 @@ export const DataDiffTool = Tool.define("data_diff", {
111117
output += formatPartitionResults(result.partition_results, args.partition_column!)
112118
}
113119

120+
// Report auto-excluded audit columns so the LLM and user know what was skipped
121+
const excluded = (result as any).excluded_audit_columns as string[] | undefined
122+
if (excluded && excluded.length > 0) {
123+
output += `\n\n Note: ${excluded.length} audit/timestamp column${excluded.length === 1 ? "" : "s"} auto-excluded from comparison: ${excluded.join(", ")}`
124+
}
125+
114126
return {
115127
title: `Data diff: ${summarize(outcome)}`,
116128
metadata: { success: true, steps: result.steps },

0 commit comments

Comments
 (0)