Skip to content

Commit fb9cb19

Browse files
committed
feat: add categorical partition mode (string, enum, boolean)
When partition_column is set without partition_granularity or partition_bucket_size, groups by raw DISTINCT values. Works for any non-date, non-numeric column: status, region, country, etc. WHERE clause uses equality: col = 'value' with proper escaping.
1 parent fcfd122 commit fb9cb19

File tree

3 files changed

+53
-14
lines changed

3 files changed

+53
-14
lines changed

.opencode/skills/data-parity/SKILL.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,12 @@ description: Validate that two tables or query results are identical — or diag
4444
- `extra_columns` — columns to compare beyond keys (omit = compare all)
4545
- `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade`
4646
- `where_clause` — filter applied to both tables
47-
- `partition_column` — split the table by this column and diff each group independently (recommended for large tables)
48-
- `partition_granularity` — `day` | `week` | `month` | `year` for date columns (default: `month`)
49-
- `partition_bucket_size` — for numeric columns: bucket width (e.g. `100000` splits by ranges of 100K)
47+
- `partition_column` — split the table by this column and diff each group independently (recommended for large tables); three modes:
48+
- **Date column**: set `partition_granularity` → groups by truncated date periods
49+
- **Numeric column**: set `partition_bucket_size` → groups by equal-width key ranges
50+
- **Categorical column**: set neither → groups by distinct values (strings, enums, booleans like `status`, `region`, `country`)
51+
- `partition_granularity` — `day` | `week` | `month` | `year` — only for date columns
52+
- `partition_bucket_size` — bucket width for numeric columns (e.g. `100000`)
5053

5154
> **CRITICAL — Algorithm choice:**
5255
> - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`).
@@ -142,6 +145,13 @@ data_diff(source="orders", target="orders",
142145
source_warehouse="pg_source", target_warehouse="pg_target",
143146
partition_column="o_orderkey", partition_bucket_size=100000,
144147
algorithm="hashdiff")
148+
149+
// Categorical column — partition by distinct status values ('O', 'F', 'P')
150+
data_diff(source="orders", target="orders",
151+
key_columns=["o_orderkey"],
152+
source_warehouse="pg_source", target_warehouse="pg_target",
153+
partition_column="o_orderstatus", // no granularity or bucket_size needed
154+
algorithm="hashdiff")
145155
```
146156

147157
Output includes an aggregate diff plus a per-partition table showing exactly which ranges differ.

packages/opencode/src/altimate/native/connections/data-diff.ts

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,21 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st
144144
}
145145
}
146146

147+
/**
148+
* Determine the partition mode based on which params are provided.
149+
* - "date" → partition_granularity is set (or column looks like a date)
150+
* - "numeric" → partition_bucket_size is set
151+
* - "categorical" → neither — use DISTINCT values directly (string, enum, boolean)
152+
*/
153+
function partitionMode(
154+
granularity: string | undefined,
155+
bucketSize: number | undefined,
156+
): "date" | "numeric" | "categorical" {
157+
if (bucketSize != null) return "numeric"
158+
if (granularity != null) return "date"
159+
return "categorical"
160+
}
161+
147162
/**
148163
* Build SQL to discover distinct partition values from the source table.
149164
*/
@@ -155,16 +170,19 @@ function buildPartitionDiscoverySQL(
155170
dialect: string,
156171
whereClause?: string,
157172
): string {
158-
const isNumeric = bucketSize != null
173+
const where = whereClause ? `WHERE ${whereClause}` : ""
174+
const mode = partitionMode(granularity, bucketSize)
159175

160176
let expr: string
161-
if (isNumeric) {
177+
if (mode === "numeric") {
162178
expr = `FLOOR(${partitionColumn} / ${bucketSize}) * ${bucketSize}`
179+
} else if (mode === "date") {
180+
expr = dateTruncExpr(granularity!, partitionColumn, dialect)
163181
} else {
164-
expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect)
182+
// categorical — raw distinct values, no transformation
183+
expr = partitionColumn
165184
}
166185

167-
const where = whereClause ? `WHERE ${whereClause}` : ""
168186
return `SELECT DISTINCT ${expr} AS _p FROM ${table} ${where} ORDER BY _p`
169187
}
170188

@@ -178,13 +196,22 @@ function buildPartitionWhereClause(
178196
bucketSize: number | undefined,
179197
dialect: string,
180198
): string {
181-
if (bucketSize != null) {
199+
const mode = partitionMode(granularity, bucketSize)
200+
201+
if (mode === "numeric") {
182202
const lo = Number(partitionValue)
183-
const hi = lo + bucketSize
203+
const hi = lo + bucketSize!
184204
return `${partitionColumn} >= ${lo} AND ${partitionColumn} < ${hi}`
185205
}
186206

187-
const expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect)
207+
if (mode === "categorical") {
208+
// Quote the value — works for strings, enums, booleans
209+
const escaped = partitionValue.replace(/'/g, "''")
210+
return `${partitionColumn} = '${escaped}'`
211+
}
212+
213+
// date mode
214+
const expr = dateTruncExpr(granularity!, partitionColumn, dialect)
188215

189216
// Cast the literal appropriately per dialect
190217
switch (dialect) {

packages/opencode/src/altimate/tools/data-diff.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,18 +55,20 @@ export const DataDiffTool = Tool.define("data_diff", {
5555
.optional()
5656
.describe(
5757
"Column to partition on before diffing. Splits the table into groups and diffs each independently. " +
58-
"Use for large tables to get faster, more precise results. " +
59-
"Examples: 'l_shipdate' (date), 'l_orderkey' (numeric). " +
58+
"Three modes depending on which other params you set:\n" +
59+
" • Date column → set partition_granularity (day/week/month/year). E.g. partition_column='l_shipdate', partition_granularity='month'\n" +
60+
" • Numeric column → set partition_bucket_size. E.g. partition_column='l_orderkey', partition_bucket_size=100000\n" +
61+
" • Categorical → set neither. Works for string/enum/boolean columns like 'status', 'region', 'country'. Groups by distinct values.\n" +
6062
"Results are aggregated with a per-partition breakdown showing which groups have differences.",
6163
),
6264
partition_granularity: z
6365
.enum(["day", "week", "month", "year"])
6466
.optional()
65-
.describe("Granularity for date partition columns. Defaults to 'month'."),
67+
.describe("For date partition columns: truncation granularity. Omit for numeric or categorical columns."),
6668
partition_bucket_size: z
6769
.number()
6870
.optional()
69-
.describe("For numeric partition columns: size of each bucket. E.g. 100000 splits orders into ranges of 100K keys."),
71+
.describe("For numeric partition columns: size of each bucket. E.g. 100000 splits l_orderkey into ranges of 100K. Omit for date or categorical columns."),
7072
}),
7173
async execute(args, ctx) {
7274
// Require read permission — data diff executes SELECT queries

0 commit comments

Comments
 (0)