Skip to content

Commit 5cc579d

Browse files
committed
fix: address cubic review feedback on SQL pre-validation
- P1: mask validator error via `Telemetry.maskString()` before emitting `sql_pre_validation` telemetry. Raw schema identifiers (table/column names, paths) no longer leak to App Insights. - P2: resolve fallback warehouse via `Registry.list().warehouses[0]` (same path `sql.execute` uses) instead of the cache's first warehouse. Keeps shadow validation aligned with actual execution. - P2: raise column-scan cap from 10k to 500k and add `schema_truncated` boolean to the event. Avoids false structural errors on large warehouses and lets analysis flag biased samples.
1 parent d355511 commit 5cc579d

2 files changed

Lines changed: 38 additions & 15 deletions

File tree

packages/opencode/src/altimate/telemetry/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,8 @@ export namespace Telemetry {
647647
/** why: no_cache, stale_cache, empty_cache, valid, non_structural, structural_error, validation_exception */
648648
reason: string
649649
schema_columns: number
650+
/** true when schema scan hit the column-scan cap — flags samples biased by large-warehouse truncation */
651+
schema_truncated: boolean
650652
duration_ms: number
651653
error_message?: string
652654
}

packages/opencode/src/altimate/tools/sql-execute.ts

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import { PostConnectSuggestions } from "./post-connect-suggestions"
1111
// altimate_change end
1212
// altimate_change start — pre-execution SQL validation via cached schema
1313
import { getCache } from "../native/schema/cache"
14+
import * as Registry from "../native/connections/registry"
1415
// altimate_change end
1516

1617
export const SqlExecuteTool = Tool.define("sql_execute", {
@@ -104,6 +105,10 @@ export const SqlExecuteTool = Tool.define("sql_execute", {
104105

105106
// altimate_change start — pre-execution SQL validation via cached schema
106107
const CACHE_TTL_MS = 24 * 60 * 60 * 1000 // 24 hours
108+
// High ceiling so large warehouses aren't arbitrarily truncated; we emit
109+
// schema_truncated in telemetry when the cap is reached so the shadow sample
110+
// can be interpreted correctly.
111+
const COLUMN_SCAN_LIMIT = 500_000
107112

108113
interface PreValidationResult {
109114
blocked: boolean
@@ -113,33 +118,43 @@ interface PreValidationResult {
113118
async function preValidateSql(sql: string, warehouse?: string): Promise<PreValidationResult> {
114119
const startTime = Date.now()
115120
try {
116-
const cache = await getCache()
117-
const status = cache.cacheStatus()
118-
119-
// Find the target warehouse in cache
120-
const warehouseName = warehouse || status.warehouses[0]?.name
121+
// Resolve the warehouse the same way sql.execute's fallback path does:
122+
// when caller omits `warehouse`, sql.execute uses Registry.list()[0].
123+
// Matching that here keeps the shadow validation aligned with actual
124+
// execution (dbt-routed queries are a known gap — they short-circuit
125+
// before this fallback, so validation may use a different warehouse
126+
// than the one dbt selects).
127+
let warehouseName = warehouse
128+
if (!warehouseName) {
129+
const registered = Registry.list().warehouses
130+
warehouseName = registered[0]?.name
131+
}
121132
if (!warehouseName) {
122-
trackPreValidation("skipped", "no_cache", 0, Date.now() - startTime)
133+
trackPreValidation("skipped", "no_cache", 0, Date.now() - startTime, false)
123134
return { blocked: false }
124135
}
125136

137+
const cache = await getCache()
138+
const status = cache.cacheStatus()
139+
126140
const warehouseStatus = status.warehouses.find((w) => w.name === warehouseName)
127141
if (!warehouseStatus?.last_indexed) {
128-
trackPreValidation("skipped", "no_cache", 0, Date.now() - startTime)
142+
trackPreValidation("skipped", "no_cache", 0, Date.now() - startTime, false)
129143
return { blocked: false }
130144
}
131145

132146
// Check cache freshness
133147
const cacheAge = Date.now() - new Date(warehouseStatus.last_indexed).getTime()
134148
if (cacheAge > CACHE_TTL_MS) {
135-
trackPreValidation("skipped", "stale_cache", 0, Date.now() - startTime)
149+
trackPreValidation("skipped", "stale_cache", 0, Date.now() - startTime, false)
136150
return { blocked: false }
137151
}
138152

139153
// Build schema context from cached columns
140-
const columns = cache.listColumns(warehouseName, 10_000)
154+
const columns = cache.listColumns(warehouseName, COLUMN_SCAN_LIMIT)
155+
const schemaTruncated = columns.length >= COLUMN_SCAN_LIMIT
141156
if (columns.length === 0) {
142-
trackPreValidation("skipped", "empty_cache", 0, Date.now() - startTime)
157+
trackPreValidation("skipped", "empty_cache", 0, Date.now() - startTime, false)
143158
return { blocked: false }
144159
}
145160

@@ -168,7 +183,7 @@ async function preValidateSql(sql: string, warehouse?: string): Promise<PreValid
168183
const isValid = data.valid !== false && errors.length === 0
169184

170185
if (isValid) {
171-
trackPreValidation("passed", "valid", columns.length, Date.now() - startTime)
186+
trackPreValidation("passed", "valid", columns.length, Date.now() - startTime, schemaTruncated)
172187
return { blocked: false }
173188
}
174189

@@ -180,7 +195,7 @@ async function preValidateSql(sql: string, warehouse?: string): Promise<PreValid
180195

181196
if (structuralErrors.length === 0) {
182197
// Non-structural errors (ambiguous cases) — let them through
183-
trackPreValidation("passed", "non_structural", columns.length, Date.now() - startTime)
198+
trackPreValidation("passed", "non_structural", columns.length, Date.now() - startTime, schemaTruncated)
184199
return { blocked: false }
185200
}
186201

@@ -202,11 +217,11 @@ async function preValidateSql(sql: string, warehouse?: string): Promise<PreValid
202217
`Fix the query and retry. If the schema cache is outdated, run schema_index to refresh it.`,
203218
].join("\n")
204219

205-
trackPreValidation("blocked", "structural_error", columns.length, Date.now() - startTime, errorMsgs)
220+
trackPreValidation("blocked", "structural_error", columns.length, Date.now() - startTime, schemaTruncated, errorMsgs)
206221
return { blocked: true, error: errorOutput }
207222
} catch {
208223
// Validation failure should never block execution
209-
trackPreValidation("error", "validation_exception", 0, Date.now() - startTime)
224+
trackPreValidation("error", "validation_exception", 0, Date.now() - startTime, false)
210225
return { blocked: false }
211226
}
212227
}
@@ -216,17 +231,23 @@ function trackPreValidation(
216231
reason: string,
217232
schema_columns: number,
218233
duration_ms: number,
234+
schema_truncated: boolean,
219235
error_message?: string,
220236
) {
237+
// Mask schema identifiers (table / column names, paths, user IDs) from the
238+
// validator error BEFORE it leaves the process — these are PII-adjacent and
239+
// must not land in App Insights as raw strings.
240+
const masked = error_message ? Telemetry.maskString(error_message).slice(0, 500) : undefined
221241
Telemetry.track({
222242
type: "sql_pre_validation",
223243
timestamp: Date.now(),
224244
session_id: Telemetry.getContext().sessionId,
225245
outcome,
226246
reason,
227247
schema_columns,
248+
schema_truncated,
228249
duration_ms,
229-
...(error_message && { error_message: error_message.slice(0, 500) }),
250+
...(masked && { error_message: masked }),
230251
})
231252
}
232253
// altimate_change end

0 commit comments

Comments
 (0)