feat: add categorical partition mode (string, enum, boolean)

suryaiyer95 · suryaiyer95 · commit fb9cb190b81d · 2026-03-30T19:00:31.000-07:00
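When partition_column is set without partition_granularity or
partition_bucket_size, the diff now partitions by the column's raw
DISTINCT values. This works for any non-date, non-numeric column:
status, region, country, etc.

Note: partition_granularity no longer defaults to 'month'; date
columns must set it explicitly to get date-truncated partitions.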

The per-partition WHERE clause uses equality (col = 'value'); single
quotes inside the value are escaped by doubling them.
diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md
@@ -44,9 +44,12 @@ description: Validate that two tables or query results are identical — or diag
 - `extra_columns` — columns to compare beyond keys (omit = compare all)
 - `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade`
 - `where_clause` — filter applied to both tables
-- `partition_column` — split the table by this column and diff each group independently (recommended for large tables)
-- `partition_granularity` — `day` | `week` | `month` | `year` for date columns (default: `month`)
-- `partition_bucket_size` — for numeric columns: bucket width (e.g. `100000` splits by ranges of 100K)
+- `partition_column` — split the table by this column and diff each group independently (recommended for large tables); three modes:
+  - **Date column**: set `partition_granularity` → groups by truncated date periods
+  - **Numeric column**: set `partition_bucket_size` → groups by equal-width numeric ranges
+  - **Categorical column**: set neither → groups by distinct values (strings, enums, booleans like `status`, `region`, `country`)
+- `partition_granularity` — `day` | `week` | `month` | `year` — only for date columns
+- `partition_bucket_size` — bucket width for numeric columns (e.g. `100000`)
 
 > **CRITICAL — Algorithm choice:**
 > - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`).
@@ -142,6 +145,13 @@ data_diff(source="orders", target="orders",
   source_warehouse="pg_source", target_warehouse="pg_target",
   partition_column="o_orderkey", partition_bucket_size=100000,
   algorithm="hashdiff")
+
+// Categorical column — partition by distinct status values ('O', 'F', 'P')
+data_diff(source="orders", target="orders",
+  key_columns=["o_orderkey"],
+  source_warehouse="pg_source", target_warehouse="pg_target",
+  partition_column="o_orderstatus",   // no granularity or bucket_size needed
+  algorithm="hashdiff")
 ```
 
 Output includes an aggregate diff plus a per-partition table showing exactly which ranges differ.
diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts
@@ -144,6 +144,21 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st
   }
 }
 
+/**
+ * Determine the partition mode purely from which params are provided.
+ * - "numeric"     → partition_bucket_size is set (takes precedence if both are set)
+ * - "date"        → only partition_granularity is set
+ * - "categorical" → neither — use DISTINCT values directly (string, enum, boolean)
+ */
+function partitionMode(
+  granularity: string | undefined,
+  bucketSize: number | undefined,
+): "date" | "numeric" | "categorical" {
+  if (bucketSize != null) return "numeric" // checked first, so bucket size wins over granularity
+  if (granularity != null) return "date"
+  return "categorical"
+}
+
 /**
  * Build SQL to discover distinct partition values from the source table.
  */
@@ -155,16 +170,19 @@ function buildPartitionDiscoverySQL(
   dialect: string,
   whereClause?: string,
 ): string {
-  const isNumeric = bucketSize != null
+  const where = whereClause ? `WHERE ${whereClause}` : ""
+  const mode = partitionMode(granularity, bucketSize)
 
   let expr: string
-  if (isNumeric) {
+  if (mode === "numeric") {
     expr = `FLOOR(${partitionColumn} / ${bucketSize}) * ${bucketSize}`
+  } else if (mode === "date") {
+    expr = dateTruncExpr(granularity!, partitionColumn, dialect)
   } else {
-    expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect)
+    // categorical — raw distinct values, no transformation
+    expr = partitionColumn
   }
 
-  const where = whereClause ? `WHERE ${whereClause}` : ""
   return `SELECT DISTINCT ${expr} AS _p FROM ${table} ${where} ORDER BY _p`
 }
 
@@ -178,13 +196,22 @@ function buildPartitionWhereClause(
   bucketSize: number | undefined,
   dialect: string,
 ): string {
-  if (bucketSize != null) {
+  const mode = partitionMode(granularity, bucketSize)
+
+  if (mode === "numeric") {
     const lo = Number(partitionValue)
-    const hi = lo + bucketSize
+    const hi = lo + bucketSize!
     return `${partitionColumn} >= ${lo} AND ${partitionColumn} < ${hi}`
   }
 
-  const expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect)
+  if (mode === "categorical") {
+    // Quote the value — works for strings, enums, booleans
+    const escaped = partitionValue.replace(/'/g, "''")
+    return `${partitionColumn} = '${escaped}'`
+  }
+
+  // date mode
+  const expr = dateTruncExpr(granularity!, partitionColumn, dialect)
 
   // Cast the literal appropriately per dialect
   switch (dialect) {
diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts
@@ -55,18 +55,20 @@ export const DataDiffTool = Tool.define("data_diff", {
       .optional()
       .describe(
         "Column to partition on before diffing. Splits the table into groups and diffs each independently. " +
-        "Use for large tables to get faster, more precise results. " +
-        "Examples: 'l_shipdate' (date), 'l_orderkey' (numeric). " +
+        "Three modes depending on which other params you set:\n" +
+        "  • Date column   → set partition_granularity (day/week/month/year). E.g. partition_column='l_shipdate', partition_granularity='month'\n" +
+        "  • Numeric column → set partition_bucket_size. E.g. partition_column='l_orderkey', partition_bucket_size=100000\n" +
+        "  • Categorical   → set neither. Works for string/enum/boolean columns like 'status', 'region', 'country'. Groups by distinct values.\n" +
         "Results are aggregated with a per-partition breakdown showing which groups have differences.",
       ),
     partition_granularity: z
       .enum(["day", "week", "month", "year"])
       .optional()
-      .describe("Granularity for date partition columns. Defaults to 'month'."),
+      .describe("For date partition columns: truncation granularity. Omit for numeric or categorical columns."),
     partition_bucket_size: z
       .number()
       .optional()
-      .describe("For numeric partition columns: size of each bucket. E.g. 100000 splits orders into ranges of 100K keys."),
+      .describe("For numeric partition columns: size of each bucket. E.g. 100000 splits l_orderkey into ranges of 100K. Omit for date or categorical columns."),
   }),
   async execute(args, ctx) {
     // Require read permission — data diff executes SELECT queries