@@ -9,6 +9,10 @@ import { Telemetry } from "../telemetry"
99// altimate_change start — progressive disclosure suggestions
1010import { PostConnectSuggestions } from "./post-connect-suggestions"
1111// altimate_change end
12+ // altimate_change start — pre-execution SQL validation via cached schema
13+ import { getCache } from "../native/schema/cache"
14+ import * as Registry from "../native/connections/registry"
15+ // altimate_change end
1216
1317export const SqlExecuteTool = Tool . define ( "sql_execute" , {
1418 description : "Execute SQL against a connected data warehouse. Returns results as a formatted table." ,
@@ -34,6 +38,14 @@ export const SqlExecuteTool = Tool.define("sql_execute", {
3438 }
3539 // altimate_change end
3640
41+ // altimate_change start — shadow-mode pre-execution SQL validation
42+ // Runs validation against cached schema and emits sql_pre_validation telemetry,
43+ // but does NOT block execution. Used to measure catch rate before deciding
44+ // whether to enable blocking in a future release. Fire-and-forget so it
45+ // doesn't add latency to the sql_execute hot path.
46+ preValidateSql ( args . query , args . warehouse , queryType ) . catch ( ( ) => { } )
47+ // altimate_change end
48+
3749 try {
3850 const result = await Dispatcher . call ( "sql.execute" , {
3951 sql : args . query ,
@@ -91,6 +103,184 @@ export const SqlExecuteTool = Tool.define("sql_execute", {
91103 } ,
92104} )
93105
106+ // altimate_change start — pre-execution SQL validation via cached schema
// Maximum age of a warehouse's cached schema index. preValidateSql compares the
// age of `last_indexed` against this and skips validation ("stale_cache") when
// the cache is older.
107+ const CACHE_TTL_MS = 24 * 60 * 60 * 1000 // 24 hours
108+ // High ceiling so large warehouses aren't arbitrarily truncated; we emit
109+ // schema_truncated in telemetry when the cap is reached so the shadow sample
110+ // can be interpreted correctly.
// Passed as the row limit to cache.listColumns(); a result with at least this
// many columns is what sets schema_truncated.
111+ const COLUMN_SCAN_LIMIT = 500_000
112+
/** Result returned by `preValidateSql`. */
interface PreValidationResult {
  /** Whether execution should be blocked. Every current code path returns `false` (shadow mode). */
  blocked: boolean
  /** Optional error text. Not populated by any current code path. */
  error?: string
}

// Validator messages matching this pattern are classified as high-confidence
// structural errors (missing column/table/view, unknown identifier, ...).
// Hoisted so the regex is compiled once rather than once per validator error.
const STRUCTURAL_ERROR_PATTERN = /\b(column|table|view|relation|identifier|not found|does not exist)\b/

/** Returns the `message` string of a validator error entry, or "" when it is absent or not a string. */
function validatorMessage(entry: unknown): string {
  const message = (entry as { message?: unknown } | null | undefined)?.message
  return typeof message === "string" ? message : ""
}

/**
 * Shadow-mode pre-execution validation of `sql` against the cached schema.
 *
 * Resolves the target warehouse, checks that a fresh, non-empty schema cache
 * exists for it, runs `altimate_core.validate` with the cached columns as
 * schema context, and emits exactly one `sql_pre_validation` telemetry event
 * describing the outcome. It never blocks: every path resolves to
 * `{ blocked: false }`.
 *
 * @param sql - The SQL text about to be executed.
 * @param warehouse - Explicit warehouse name; when omitted the first registered warehouse is used.
 * @param queryType - Query classification, forwarded into telemetry as `query_type`.
 * @returns Always `{ blocked: false }` while validation runs in shadow mode.
 */
async function preValidateSql(sql: string, warehouse: string | undefined, queryType: string): Promise<PreValidationResult> {
  const startTime = Date.now()
  // Yield the event loop before heavy synchronous SQLite work so concurrent
  // tasks aren't blocked. Bun's sqlite API is sync and listColumns can touch
  // hundreds of thousands of rows for large warehouses.
  await new Promise<void>((resolve) => setImmediate(resolve))

  // Correlation fields used in every telemetry event this function emits.
  const maskedSqlHash = Telemetry.hashError(Telemetry.maskString(sql))
  // Declared outside the try block so the catch handler reports the resolved
  // warehouse type (when resolution got that far) instead of always "unknown".
  let ctx: TrackCtx = { warehouse_type: "unknown", query_type: queryType, masked_sql_hash: maskedSqlHash }
  const elapsed = (): number => Date.now() - startTime

  try {
    // Resolve the warehouse the same way sql.execute's fallback path does:
    // when the caller omits `warehouse`, sql.execute uses Registry.list()[0].
    // Matching that here keeps the shadow validation aligned with actual
    // execution (dbt-routed queries are a known gap — they short-circuit
    // before this fallback, so validation may use a different warehouse
    // than the one dbt selects).
    const registered = Registry.list().warehouses
    const warehouseName = warehouse || registered[0]?.name
    const warehouseInfo = registered.find((w) => w.name === warehouseName)
    ctx = { ...ctx, warehouse_type: warehouseInfo?.type ?? "unknown" }

    if (!warehouseName) {
      trackPreValidation("skipped", "no_cache", 0, elapsed(), false, ctx)
      return { blocked: false }
    }

    const cache = await getCache()
    const warehouseStatus = cache.cacheStatus().warehouses.find((w) => w.name === warehouseName)
    if (!warehouseStatus?.last_indexed) {
      trackPreValidation("skipped", "no_cache", 0, elapsed(), false, ctx)
      return { blocked: false }
    }

    // Check cache freshness. An unparseable timestamp yields NaN, and
    // `NaN > CACHE_TTL_MS` is false, so it must be rejected explicitly or an
    // invalid `last_indexed` would be treated as a fresh cache.
    const cacheAge = Date.now() - new Date(warehouseStatus.last_indexed).getTime()
    if (!Number.isFinite(cacheAge) || cacheAge > CACHE_TTL_MS) {
      trackPreValidation("skipped", "stale_cache", 0, elapsed(), false, ctx)
      return { blocked: false }
    }

    const columns = cache.listColumns(warehouseName, COLUMN_SCAN_LIMIT)
    const schemaTruncated = columns.length >= COLUMN_SCAN_LIMIT
    if (columns.length === 0) {
      trackPreValidation("skipped", "empty_cache", 0, elapsed(), false, ctx)
      return { blocked: false }
    }

    // Build schema context keyed by fully-qualified name (database.schema.table)
    // so multi-database warehouses don't collide on schema+table alone, and
    // dedupe columns per table to defend against residual collisions.
    // A Map is used while accumulating so table names that shadow
    // Object.prototype members ("constructor", "toString", ...) are handled
    // like any other name instead of hitting inherited properties.
    const tables = new Map<string, { columns: { name: string; type: string; nullable: boolean }[]; seen: Set<string> }>()
    for (const col of columns) {
      const tableName = [col.database, col.schema_name, col.table].filter(Boolean).join(".")
      if (!tableName) continue
      let entry = tables.get(tableName)
      if (!entry) {
        entry = { columns: [], seen: new Set<string>() }
        tables.set(tableName, entry)
      }
      if (entry.seen.has(col.name)) continue
      entry.seen.add(col.name)
      entry.columns.push({
        name: col.name,
        type: col.data_type || "VARCHAR",
        nullable: col.nullable,
      })
    }
    // Object.fromEntries defines own data properties, so every table name ends
    // up as a real key of the payload, in first-seen order.
    const schemaContext: Record<string, { name: string; type: string; nullable: boolean }[]> = Object.fromEntries(
      Array.from(tables, ([tableName, entry]) => [tableName, entry.columns] as const),
    )

    // Validate SQL against cached schema
    const validationResult = await Dispatcher.call("altimate_core.validate", {
      sql,
      schema_path: "",
      schema_context: schemaContext,
    })

    // If the dispatcher itself failed, don't treat missing data as "valid".
    if (!validationResult.success) {
      const errMsg = typeof validationResult.error === "string" ? validationResult.error : undefined
      trackPreValidation("error", "dispatcher_failed", 0, elapsed(), false, ctx, errMsg)
      return { blocked: false }
    }

    const data = (validationResult.data ?? {}) as Record<string, any>
    const errors: unknown[] = Array.isArray(data.errors) ? data.errors : []
    const isValid = data.valid !== false && errors.length === 0

    if (isValid) {
      trackPreValidation("passed", "valid", columns.length, elapsed(), schemaTruncated, ctx)
      return { blocked: false }
    }

    // Only high-confidence structural errors count as "blocked". Entries
    // without a string message are treated as non-structural rather than
    // aborting classification with a TypeError.
    const structuralErrors = errors.filter((e) => STRUCTURAL_ERROR_PATTERN.test(validatorMessage(e).toLowerCase()))

    if (structuralErrors.length === 0) {
      // Non-structural errors (ambiguous cases) — let them through
      trackPreValidation("passed", "non_structural", columns.length, elapsed(), schemaTruncated, ctx)
      return { blocked: false }
    }

    const errorMsgs = structuralErrors.map((e) => validatorMessage(e)).join("\n")
    trackPreValidation("blocked", "structural_error", columns.length, elapsed(), schemaTruncated, ctx, errorMsgs)
    // Shadow mode: caller discards the result. When blocking is enabled in the
    // future, build errorOutput here with the structural errors and
    // schemaContext keys for user-facing guidance.
    return { blocked: false }
  } catch (err: unknown) {
    // Validation failure should never block execution. The exception message
    // is forwarded (trackPreValidation masks it) so failures can be diagnosed.
    const errMsg = err instanceof Error ? err.message : undefined
    trackPreValidation("error", "validation_exception", 0, elapsed(), false, ctx, errMsg)
    return { blocked: false }
  }
}
247+
/** Correlation fields copied onto every `sql_pre_validation` telemetry event. */
interface TrackCtx {
  warehouse_type: string
  query_type: string
  masked_sql_hash: string
}

/**
 * Emits a single `sql_pre_validation` telemetry event.
 *
 * @param outcome - Coarse result of the validation attempt.
 * @param reason - Finer-grained reason code recorded alongside the outcome.
 * @param schema_columns - Number of cached columns reported for this attempt.
 * @param duration_ms - Elapsed time of the attempt in milliseconds.
 * @param schema_truncated - Whether the column scan hit its cap.
 * @param ctx - Correlation fields (warehouse type, query type, masked SQL hash).
 * @param error_message - Optional raw error text; masked and capped at 500
 *   characters before it is attached to the event.
 */
function trackPreValidation(
  outcome: "skipped" | "passed" | "blocked" | "error",
  reason: string,
  schema_columns: number,
  duration_ms: number,
  schema_truncated: boolean,
  ctx: TrackCtx,
  error_message?: string,
) {
  // Validator errors can carry schema identifiers (table / column names,
  // paths, user IDs). Those are PII-adjacent, so the text is masked before it
  // leaves the process and never reaches App Insights as a raw string.
  let safeError: string | undefined
  if (error_message) {
    safeError = Telemetry.maskString(error_message).slice(0, 500)
  }
  // Only attach error_message when there is non-empty masked text to send.
  const errorField = safeError ? { error_message: safeError } : {}
  const { warehouse_type, query_type, masked_sql_hash } = ctx

  Telemetry.track({
    type: "sql_pre_validation",
    timestamp: Date.now(),
    session_id: Telemetry.getContext().sessionId,
    outcome,
    reason,
    warehouse_type,
    query_type,
    masked_sql_hash,
    schema_columns,
    schema_truncated,
    duration_ms,
    ...errorField,
  })
}
282+ // altimate_change end
283+
94284function formatResult ( result : SqlExecuteResult ) : string {
95285 if ( result . row_count === 0 ) return "(0 rows)"
96286
0 commit comments