Merge branch 'main' into feat/databricks-provider

anandgupta42 · web-flow · commit 822a92c47ea3 · 2026-04-04T19:51:38.000-07:00
diff --git a/packages/opencode/src/altimate/telemetry/index.ts b/packages/opencode/src/altimate/telemetry/index.ts
@@ -637,6 +637,28 @@ export namespace Telemetry {
         total_cost: number
       }
   // altimate_change end
+    // altimate_change start — pre-execution SQL validation telemetry
+    | {
+        type: "sql_pre_validation"
+        timestamp: number
+        session_id: string
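+        /** skipped = no warehouse resolved, or cache missing / stale / empty; passed = valid SQL or only non-structural errors; blocked = structural error detected (shadow mode: the query still executes); error = validation itself failed */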
+        outcome: "skipped" | "passed" | "blocked" | "error"
+        /** reason code: no_cache (also used when no warehouse could be resolved), stale_cache, empty_cache, valid, non_structural, structural_error, dispatcher_failed, validation_exception */
+        reason: string
+        /** warehouse driver type (postgres, snowflake, bigquery, ...) — enables per-warehouse catch-rate analysis */
+        warehouse_type: string
+        /** read / write / unknown — enables per-query-type analysis */
+        query_type: string
+        /** SHA-256 prefix of masked SQL — join key to sql_execute_failure events for same query */
+        masked_sql_hash: string
+        schema_columns: number
+        /** true when schema scan hit the column-scan cap — flags samples biased by large-warehouse truncation */
+        schema_truncated: boolean
+        duration_ms: number
+        error_message?: string
+      }
+    // altimate_change end
 
   /** SHA256 hash a masked error message for anonymous grouping. */
   export function hashError(maskedMessage: string): string {
diff --git a/packages/opencode/src/altimate/tools/sql-execute.ts b/packages/opencode/src/altimate/tools/sql-execute.ts
@@ -9,6 +9,10 @@ import { Telemetry } from "../telemetry"
 // altimate_change start — progressive disclosure suggestions
 import { PostConnectSuggestions } from "./post-connect-suggestions"
 // altimate_change end
+// altimate_change start — pre-execution SQL validation via cached schema
+import { getCache } from "../native/schema/cache"
+import * as Registry from "../native/connections/registry"
+// altimate_change end
 
 export const SqlExecuteTool = Tool.define("sql_execute", {
   description: "Execute SQL against a connected data warehouse. Returns results as a formatted table.",
@@ -34,6 +38,14 @@ export const SqlExecuteTool = Tool.define("sql_execute", {
     }
     // altimate_change end
 
+    // altimate_change start — shadow-mode pre-execution SQL validation
+    // Runs validation against cached schema and emits sql_pre_validation telemetry,
+    // but does NOT block execution. Used to measure catch rate before deciding
+    // whether to enable blocking in a future release. Fire-and-forget so it
+    // doesn't add latency to the sql_execute hot path.
+    preValidateSql(args.query, args.warehouse, queryType).catch(() => {})
+    // altimate_change end
+
     try {
       const result = await Dispatcher.call("sql.execute", {
         sql: args.query,
@@ -91,6 +103,184 @@ export const SqlExecuteTool = Tool.define("sql_execute", {
   },
 })
 
+// altimate_change start — pre-execution SQL validation via cached schema
+const CACHE_TTL_MS = 24 * 60 * 60 * 1000 // 24 hours
+// High ceiling so large warehouses aren't arbitrarily truncated; we emit
+// schema_truncated in telemetry when the cap is reached so the shadow sample
+// can be interpreted correctly.
+const COLUMN_SCAN_LIMIT = 500_000
+
+interface PreValidationResult {
+  blocked: boolean
+  error?: string
+}
+
+async function preValidateSql(sql: string, warehouse: string | undefined, queryType: string): Promise<PreValidationResult> {
+  const startTime = Date.now()
+  // Defer to the next event-loop turn so the caller can dispatch sql.execute
+  // before the heavy SQLite work below starts. Bun's sqlite API is sync, so once
+  // started, listColumns over hundreds of thousands of rows still blocks the loop.
+  await new Promise<void>((resolve) => setImmediate(resolve))
+
+  // Precompute correlation fields used in every telemetry event this function emits.
+  const maskedSqlHash = Telemetry.hashError(Telemetry.maskString(sql))
+
+  try {
+    // Resolve the warehouse the same way sql.execute's fallback path does:
+    // when the caller omits `warehouse`, sql.execute uses the first registered warehouse (Registry.list().warehouses[0]).
+    // Matching that here keeps the shadow validation aligned with actual
+    // execution (dbt-routed queries are a known gap — they short-circuit
+    // before this fallback, so validation may use a different warehouse
+    // than the one dbt selects).
+    const registered = Registry.list().warehouses
+    let warehouseName = warehouse
+    if (!warehouseName) {
+      warehouseName = registered[0]?.name
+    }
+    const warehouseInfo = registered.find((w) => w.name === warehouseName)
+    const warehouseType = warehouseInfo?.type ?? "unknown"
+
+    const ctx: TrackCtx = {
+      warehouse_type: warehouseType,
+      query_type: queryType,
+      masked_sql_hash: maskedSqlHash,
+    }
+
+    if (!warehouseName) {
+      trackPreValidation("skipped", "no_cache", 0, Date.now() - startTime, false, ctx)
+      return { blocked: false }
+    }
+
+    const cache = await getCache()
+    const status = cache.cacheStatus()
+
+    const warehouseStatus = status.warehouses.find((w) => w.name === warehouseName)
+    if (!warehouseStatus?.last_indexed) {
+      trackPreValidation("skipped", "no_cache", 0, Date.now() - startTime, false, ctx)
+      return { blocked: false }
+    }
+
+    // Check cache freshness
+    const cacheAge = Date.now() - new Date(warehouseStatus.last_indexed).getTime()
+    if (cacheAge > CACHE_TTL_MS) {
+      trackPreValidation("skipped", "stale_cache", 0, Date.now() - startTime, false, ctx)
+      return { blocked: false }
+    }
+
+    // Build schema context from cached columns
+    const columns = cache.listColumns(warehouseName, COLUMN_SCAN_LIMIT)
+    const schemaTruncated = columns.length >= COLUMN_SCAN_LIMIT
+    if (columns.length === 0) {
+      trackPreValidation("skipped", "empty_cache", 0, Date.now() - startTime, false, ctx)
+      return { blocked: false }
+    }
+
+    // Build schema context keyed by fully-qualified name (database.schema.table)
+    // so multi-database warehouses don't collide on schema+table alone.
+    // Dedupe columns per table to defend against residual collisions.
+    const schemaContext: Record<string, { name: string; type: string; nullable: boolean }[]> = {}
+    const seenColumns: Record<string, Set<string>> = {}
+    for (const col of columns) {
+      const tableName = [col.database, col.schema_name, col.table].filter(Boolean).join(".")
+      if (!tableName) continue
+      if (!schemaContext[tableName]) {
+        schemaContext[tableName] = []
+        seenColumns[tableName] = new Set()
+      }
+      if (seenColumns[tableName].has(col.name)) continue
+      seenColumns[tableName].add(col.name)
+      schemaContext[tableName].push({
+        name: col.name,
+        type: col.data_type || "VARCHAR",
+        nullable: col.nullable,
+      })
+    }
+
+    // Validate SQL against cached schema
+    const validationResult = await Dispatcher.call("altimate_core.validate", {
+      sql,
+      schema_path: "",
+      schema_context: schemaContext,
+    })
+
+    // If the dispatcher itself failed, don't treat missing data as "valid".
+    if (!validationResult.success) {
+      const errMsg = typeof validationResult.error === "string" ? validationResult.error : undefined
+      trackPreValidation("error", "dispatcher_failed", 0, Date.now() - startTime, false, ctx, errMsg)
+      return { blocked: false }
+    }
+
+    const data = (validationResult.data ?? {}) as Record<string, any>
+    const errors = Array.isArray(data.errors) ? data.errors : []
+    const isValid = data.valid !== false && errors.length === 0
+
+    if (isValid) {
+      trackPreValidation("passed", "valid", columns.length, Date.now() - startTime, schemaTruncated, ctx)
+      return { blocked: false }
+    }
+
+    // Only report "blocked" for high-confidence structural errors (shadow mode: nothing is actually blocked)
+    const structuralErrors = errors.filter((e: any) => {
+      const msg = (e.message ?? "").toLowerCase()
+      return /\b(column|table|view|relation|identifier|not found|does not exist)\b/.test(msg)
+    })
+
+    if (structuralErrors.length === 0) {
+      // Non-structural errors (ambiguous cases) — let them through
+      trackPreValidation("passed", "non_structural", columns.length, Date.now() - startTime, schemaTruncated, ctx)
+      return { blocked: false }
+    }
+
+    const errorMsgs = structuralErrors.map((e: any) => e.message).join("\n")
+    trackPreValidation("blocked", "structural_error", columns.length, Date.now() - startTime, schemaTruncated, ctx, errorMsgs)
+    // Shadow mode: caller discards the result. When blocking is enabled in the
+    // future, build errorOutput here with the structural errors and
+    // schemaContext keys for user-facing guidance.
+    return { blocked: false }
+  } catch {
+    // Validation failure should never block execution
+    const ctx: TrackCtx = { warehouse_type: "unknown", query_type: queryType, masked_sql_hash: maskedSqlHash }
+    trackPreValidation("error", "validation_exception", 0, Date.now() - startTime, false, ctx)
+    return { blocked: false }
+  }
+}
+
+interface TrackCtx {
+  warehouse_type: string
+  query_type: string
+  masked_sql_hash: string
+}
+
+function trackPreValidation(
+  outcome: "skipped" | "passed" | "blocked" | "error",
+  reason: string,
+  schema_columns: number,
+  duration_ms: number,
+  schema_truncated: boolean,
+  ctx: TrackCtx,
+  error_message?: string,
+) {
+  // Mask schema identifiers (table / column names, paths, user IDs) from the
+  // validator error BEFORE it leaves the process — these are PII-adjacent and
+  // must not land in App Insights as raw strings.
+  const masked = error_message ? Telemetry.maskString(error_message).slice(0, 500) : undefined
+  Telemetry.track({
+    type: "sql_pre_validation",
+    timestamp: Date.now(),
+    session_id: Telemetry.getContext().sessionId,
+    outcome,
+    reason,
+    warehouse_type: ctx.warehouse_type,
+    query_type: ctx.query_type,
+    masked_sql_hash: ctx.masked_sql_hash,
+    schema_columns,
+    schema_truncated,
+    duration_ms,
+    ...(masked && { error_message: masked }),
+  })
+}
+// altimate_change end
+
 function formatResult(result: SqlExecuteResult): string {
   if (result.row_count === 0) return "(0 rows)"
 
diff --git a/packages/opencode/test/altimate/connections.test.ts b/packages/opencode/test/altimate/connections.test.ts
@@ -51,6 +51,36 @@ describe("ConnectionRegistry", () => {
     )
   })
 
+  test("cassandra gives helpful hint instead of generic unsupported error", async () => {
+    Registry.setConfigs({
+      mydb: { type: "cassandra", host: "localhost" },
+    })
+    await expect(Registry.get("mydb")).rejects.toThrow("not yet supported")
+    await expect(Registry.get("mydb")).rejects.toThrow("cqlsh")
+  })
+
+  test("cockroachdb suggests using postgres type", async () => {
+    Registry.setConfigs({
+      mydb: { type: "cockroachdb", host: "localhost" },
+    })
+    await expect(Registry.get("mydb")).rejects.toThrow("postgres")
+  })
+
+  test("timescaledb suggests using postgres type", async () => {
+    Registry.setConfigs({
+      mydb: { type: "timescaledb", host: "localhost" },
+    })
+    await expect(Registry.get("mydb")).rejects.toThrow("postgres")
+  })
+
+  test("truly unknown type gives generic unsupported error with supported list", async () => {
+    Registry.setConfigs({
+      mydb: { type: "neo4j", host: "localhost" },
+    })
+    await expect(Registry.get("mydb")).rejects.toThrow("Unsupported database type")
+    await expect(Registry.get("mydb")).rejects.toThrow("Supported:")
+  })
+
   test("getConfig returns config for known connection", () => {
     Registry.setConfigs({
       mydb: { type: "postgres", host: "localhost" },
@@ -608,6 +638,44 @@ trino_project:
       fs.rmSync(tmpDir, { recursive: true })
     }
   })
+
+  test("clickhouse adapter maps correctly from dbt profiles", async () => {
+    const fs = await import("fs")
+    const os = await import("os")
+    const path = await import("path")
+
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "dbt-test-"))
+    const profilesPath = path.join(tmpDir, "profiles.yml")
+
+    fs.writeFileSync(
+      profilesPath,
+      `
+ch_project:
+  outputs:
+    dev:
+      type: clickhouse
+      host: clickhouse.example.com
+      port: 8443
+      user: default
+      password: secret
+      database: analytics
+      schema: default
+`,
+    )
+
+    try {
+      const connections = await parseDbtProfiles(profilesPath)
+      expect(connections).toHaveLength(1)
+      expect(connections[0].type).toBe("clickhouse")
+      expect(connections[0].config.type).toBe("clickhouse")
+      expect(connections[0].config.host).toBe("clickhouse.example.com")
+      expect(connections[0].config.port).toBe(8443)
+      expect(connections[0].config.user).toBe("default")
+      expect(connections[0].config.database).toBe("analytics")
+    } finally {
+      fs.rmSync(tmpDir, { recursive: true })
+    }
+  })
 })
 
 // ---------------------------------------------------------------------------
diff --git a/packages/opencode/test/altimate/schema-finops-dbt.test.ts b/packages/opencode/test/altimate/schema-finops-dbt.test.ts
@@ -158,6 +158,53 @@ describe("FinOps: SQL template generation", () => {
       const built = HistoryTemplates.buildHistoryQuery("databricks", 7, 50)
       expect(built?.sql).toContain("system.query.history")
     })
+
+    test("builds ClickHouse history SQL with clamped integer days and limit", () => {
+      const built = HistoryTemplates.buildHistoryQuery("clickhouse", 7, 100)
+      expect(built).not.toBeNull()
+      expect(built?.sql).toContain("system.query_log")
+      expect(built?.sql).toContain("QueryFinish")
+      // Days and limit should be integer-substituted, not bind params
+      expect(built?.binds).toEqual([])
+      // Verify the clamped values are in the SQL
+      expect(built?.sql).toContain("today() - 7")
+      expect(built?.sql).toContain("LIMIT 100")
+    })
+
+    test("ClickHouse buildHistoryQuery clamps extreme days and limit values", () => {
+      // Days clamped to [1, 365]
+      const extremeDays = HistoryTemplates.buildHistoryQuery("clickhouse", 9999, 50)
+      expect(extremeDays?.sql).toContain("today() - 365")
+
+      const zeroDays = HistoryTemplates.buildHistoryQuery("clickhouse", 0, 50)
+      // Math.floor(0) || 30 = 30 (0 is falsy), then Math.max(1, Math.min(30, 365)) = 30
+      expect(zeroDays?.sql).toContain("today() - 30")
+
+      // Limit clamped to [1, 10000]
+      const extremeLimit = HistoryTemplates.buildHistoryQuery("clickhouse", 7, 999999)
+      expect(extremeLimit?.sql).toContain("LIMIT 10000")
+
+      const zeroLimit = HistoryTemplates.buildHistoryQuery("clickhouse", 7, 0)
+      // Math.floor(0) || 100 = 100 (0 is falsy), then Math.max(1, Math.min(100, 10000)) = 100
+      expect(zeroLimit?.sql).toContain("LIMIT 100")
+    })
+
+    test("ClickHouse buildHistoryQuery handles NaN and float inputs safely", () => {
+      // NaN days defaults to 30 via || 30 fallback
+      const nanDays = HistoryTemplates.buildHistoryQuery("clickhouse", NaN, 50)
+      expect(nanDays?.sql).toContain("today() - 30")
+      expect(nanDays?.sql).not.toContain("NaN")
+
+      // NaN limit defaults to 100 via || 100 fallback
+      const nanLimit = HistoryTemplates.buildHistoryQuery("clickhouse", 7, NaN)
+      expect(nanLimit?.sql).toContain("LIMIT 100")
+      expect(nanLimit?.sql).not.toContain("NaN")
+
+      // Float values should be floored
+      const floatInputs = HistoryTemplates.buildHistoryQuery("clickhouse", 7.9, 50.5)
+      expect(floatInputs?.sql).toContain("today() - 7")
+      expect(floatInputs?.sql).toContain("LIMIT 50")
+    })
   })
 
   describe("warehouse-advisor", () => {
diff --git a/packages/opencode/test/telemetry/telemetry.test.ts b/packages/opencode/test/telemetry/telemetry.test.ts
@@ -245,11 +245,12 @@ const ALL_EVENT_TYPES: Telemetry.Event["type"][] = [
   "sql_execute_failure",
   "feature_suggestion",
   "core_failure",
+  "sql_pre_validation",
 ]
 
 describe("telemetry.event-types", () => {
   test("all event types are valid", () => {
-    expect(ALL_EVENT_TYPES.length).toBe(42)
+    expect(ALL_EVENT_TYPES.length).toBe(43)
   })
 })