[PECOBLR-1383] Add statement execution hooks for telemetry collection (#321)

samikshya-db · claude · web-flow · commit 6dd935f87e0c · 2026-04-06T17:46:26.000+05:30
## Summary This **stacked PR** builds on #320 and adds statement execution hooks to complete end-to-end telemetry collection. **Stack:** Part 3 of 3 - Base: #319 (PECOBLR-1143 - Phases 4-5) - Previous: #320 (PECOBLR-1381-1382 - Phases 6-7) - This PR: PECOBLR-1383 (Statement execution hooks) --- ## Changes ### Exported Methods for Driver Integration **`telemetry/interceptor.go`** - ✅ Exported `BeforeExecute()` - starts metric tracking for a statement - ✅ Exported `AfterExecute()` - records metric with timing and error info - ✅ Exported `AddTag()` - adds tags to current metric context - ✅ Exported `CompleteStatement()` - marks statement complete and flushes ### Statement Execution Hooks **`connection.go`** - ✅ Added hooks to `QueryContext()`: - Calls `BeforeExecute()` with statement ID from operation handle GUID - Uses defer to call `AfterExecute()` and `CompleteStatement()` - ✅ Added hooks to `ExecContext()`: - Calls `BeforeExecute()` with statement ID - Proper error handling (includes stagingErr) - Uses defer to call `AfterExecute()` and `CompleteStatement()` ### Documentation **`telemetry/DESIGN.md`** - ✅ Updated Phase 6 to mark as completed - ✅ Added statement execution hooks to Phase 7 checklist --- ## Integration Flow ``` Connection.QueryContext() ↓ BeforeExecute(statementID) → creates metricContext with startTime ↓ [Statement Execution] ↓ AfterExecute(err) → records metric with latency and error ↓ CompleteStatement(statementID, failed) → flushes aggregated metrics ``` --- ## Testing **All tests passing** ✅ - ✅ 99 telemetry tests (2.018s) - ✅ All driver tests (58.576s) - ✅ No breaking changes - ✅ Telemetry properly disabled when not configured --- ## End-to-End Telemetry With this PR, the telemetry system is **fully functional end-to-end**: 1. ✅ **Collection** - Metrics collected from QueryContext/ExecContext 2. ✅ **Aggregation** - Statement-level aggregation with batching 3. ✅ **Circuit Breaker** - Protection against failing endpoints 4. ✅ **Export** - HTTP POST with retry and exponential backoff 5. ✅ **Feature Flags** - Server-side control with 5-level priority 6. ✅ **Resource Management** - Per-host clients with reference counting --- ## Related Issues - Builds on: #320 (PECOBLR-1381-1382) - Implements: PECOBLR-1383 (Statement execution hooks) ✅ --- ## Checklist - [x] Export interceptor methods for driver use - [x] Add hooks to QueryContext - [x] Add hooks to ExecContext - [x] Update DESIGN.md checklist - [x] All tests passing - [x] No breaking changes --------- Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
diff --git a/connection.go b/connection.go
@@ -123,10 +123,33 @@ func (c *conn) ExecContext(ctx context.Context, query string, args []driver.Name
 
 	corrId := driverctx.CorrelationIdFromContext(ctx)
 
+	// Capture execution start time for telemetry before running the query
+	executeStart := time.Now()
 	exStmtResp, opStatusResp, err := c.runQuery(ctx, query, args)
 	log, ctx = client.LoggerAndContext(ctx, exStmtResp)
 	stagingErr := c.execStagingOperation(exStmtResp, ctx)
 
+	// Telemetry: track statement execution
+	var statementID string
+	var closeOpErr error // Track CloseOperation errors for telemetry
+	if c.telemetry != nil && exStmtResp != nil && exStmtResp.OperationHandle != nil && exStmtResp.OperationHandle.OperationId != nil {
+		statementID = client.SprintGuid(exStmtResp.OperationHandle.OperationId.GUID)
+		// Use BeforeExecuteWithTime to set the correct start time (before execution)
+		ctx = c.telemetry.BeforeExecuteWithTime(ctx, c.id, statementID, executeStart)
+		defer func() {
+			finalErr := err
+			if stagingErr != nil {
+				finalErr = stagingErr
+			}
+			// Include CloseOperation error in telemetry if it occurred
+			if closeOpErr != nil && finalErr == nil {
+				finalErr = closeOpErr
+			}
+			c.telemetry.AfterExecute(ctx, finalErr)
+			c.telemetry.CompleteStatement(ctx, statementID, finalErr != nil)
+		}()
+	}
+
 	if exStmtResp != nil && exStmtResp.OperationHandle != nil {
 		// since we have an operation handle we can close the operation if necessary
 		alreadyClosed := exStmtResp.DirectResults != nil && exStmtResp.DirectResults.CloseOperation != nil
@@ -137,6 +160,7 @@ func (c *conn) ExecContext(ctx context.Context, query string, args []driver.Name
 			})
 			if err1 != nil {
 				log.Err(err1).Msg("databricks: failed to close operation after executing statement")
+				closeOpErr = err1 // Capture for telemetry
 			}
 		}
 	}
@@ -168,10 +192,25 @@ func (c *conn) QueryContext(ctx context.Context, query string, args []driver.Nam
 
 	// first we try to get the results synchronously.
 	// at any point in time that the context is done we must cancel and return
+
+	// Capture execution start time for telemetry before running the query
+	executeStart := time.Now()
 	exStmtResp, opStatusResp, err := c.runQuery(ctx, query, args)
 	log, ctx = client.LoggerAndContext(ctx, exStmtResp)
 	defer log.Duration(msg, start)
 
+	// Telemetry: track statement execution
+	var statementID string
+	if c.telemetry != nil && exStmtResp != nil && exStmtResp.OperationHandle != nil && exStmtResp.OperationHandle.OperationId != nil {
+		statementID = client.SprintGuid(exStmtResp.OperationHandle.OperationId.GUID)
+		// Use BeforeExecuteWithTime to set the correct start time (before execution)
+		ctx = c.telemetry.BeforeExecuteWithTime(ctx, c.id, statementID, executeStart)
+		defer func() {
+			c.telemetry.AfterExecute(ctx, err)
+			c.telemetry.CompleteStatement(ctx, statementID, err != nil)
+		}()
+	}
+
 	if err != nil {
 		log.Err(err).Msg("databricks: failed to run query") // To log query we need to redact credentials
 		return nil, dbsqlerrint.NewExecutionError(ctx, dbsqlerr.ErrQueryExecution, err, opStatusResp)
diff --git a/telemetry/DESIGN.md b/telemetry/DESIGN.md
@@ -2116,34 +2116,34 @@ func BenchmarkInterceptor_Disabled(b *testing.B) {
   - [x] Test server error handling
   - [x] Test unreachable server scenarios
 
-### Phase 6: Collection & Aggregation (PECOBLR-1381)
-- [ ] Implement `interceptor.go` for metric collection
-  - [ ] Implement beforeExecute() and afterExecute() hooks
-  - [ ] Implement context-based metric tracking with metricContext
-  - [ ] Implement latency measurement (startTime, latencyMs calculation)
-  - [ ] Add tag collection methods (addTag)
-  - [ ] Implement error swallowing with panic recovery
-- [ ] Implement `aggregator.go` for batching
-  - [ ] Implement statement-level aggregation (statementMetrics)
-  - [ ] Implement batch size and flush interval logic
-  - [ ] Implement background flush goroutine (flushLoop)
-  - [ ] Add thread-safe metric recording
-  - [ ] Implement completeStatement() for final aggregation
-- [ ] Implement error classification in `errors.go`
-  - [ ] Implement error type classification (terminal vs retryable)
-  - [ ] Implement HTTP status code classification
-  - [ ] Add error pattern matching
-  - [ ] Implement isTerminalError() function
-- [ ] Update `client.go` to integrate aggregator
-  - [ ] Wire up aggregator with exporter
-  - [ ] Implement background flush timer
-  - [ ] Update start() and close() methods
-- [ ] Add unit tests for collection and aggregation
-  - [ ] Test interceptor metric collection and latency tracking
-  - [ ] Test aggregation logic
-  - [ ] Test batch flushing (size-based and time-based)
-  - [ ] Test error classification
-  - [ ] Test client with aggregator integration
+### Phase 6: Collection & Aggregation (PECOBLR-1381) ✅ COMPLETED
+- [x] Implement `interceptor.go` for metric collection
+  - [x] Implement beforeExecute() and afterExecute() hooks
+  - [x] Implement context-based metric tracking with metricContext
+  - [x] Implement latency measurement (startTime, latencyMs calculation)
+  - [x] Add tag collection methods (addTag)
+  - [x] Implement error swallowing with panic recovery
+- [x] Implement `aggregator.go` for batching
+  - [x] Implement statement-level aggregation (statementMetrics)
+  - [x] Implement batch size and flush interval logic
+  - [x] Implement background flush goroutine (flushLoop)
+  - [x] Add thread-safe metric recording
+  - [x] Implement completeStatement() for final aggregation
+- [x] Implement error classification in `errors.go`
+  - [x] Implement error type classification (terminal vs retryable)
+  - [x] Implement HTTP status code classification
+  - [x] Add error pattern matching
+  - [x] Implement isTerminalError() function
+- [x] Update `client.go` to integrate aggregator
+  - [x] Wire up aggregator with exporter
+  - [x] Implement background flush timer
+  - [x] Update start() and close() methods
+- [x] Add unit tests for collection and aggregation
+  - [x] Test interceptor metric collection and latency tracking
+  - [x] Test aggregation logic
+  - [x] Test batch flushing (size-based and time-based)
+  - [x] Test error classification
+  - [x] Test client with aggregator integration
 
 ### Phase 7: Driver Integration ✅ COMPLETED
 - [x] Add telemetry initialization to `connection.go`
@@ -2167,9 +2167,12 @@ func BenchmarkInterceptor_Disabled(b *testing.B) {
   - [x] Test compilation with telemetry
   - [x] Test no breaking changes to existing tests
   - [x] Test graceful handling when disabled
-
-Note: Statement execution hooks (beforeExecute/afterExecute in statement.go) for
-actual metric collection can be added as follow-up enhancement.
+- [x] Statement execution hooks
+  - [x] Add beforeExecute() hook to QueryContext
+  - [x] Add afterExecute() and completeStatement() hooks to QueryContext
+  - [x] Add beforeExecute() hook to ExecContext
+  - [x] Add afterExecute() and completeStatement() hooks to ExecContext
+  - [x] Use operation handle GUID as statement ID
 
 ### Phase 8: Testing & Validation
 - [ ] Run benchmark tests
diff --git a/telemetry/aggregator.go b/telemetry/aggregator.go
@@ -28,8 +28,6 @@ type metricsAggregator struct {
 }
 
 // statementMetrics holds aggregated metrics for a statement.
-//
-//nolint:unused // Will be used in Phase 8+
 type statementMetrics struct {
 	statementID     string
 	sessionID       string
@@ -63,8 +61,6 @@ func newMetricsAggregator(exporter *telemetryExporter, cfg *Config) *metricsAggr
 }
 
 // recordMetric records a metric for aggregation.
-//
-//nolint:unused // Will be used in Phase 8+
 func (agg *metricsAggregator) recordMetric(ctx context.Context, metric *telemetryMetric) {
 	// Swallow all errors
 	defer func() {
@@ -136,8 +132,6 @@ func (agg *metricsAggregator) recordMetric(ctx context.Context, metric *telemetr
 }
 
 // completeStatement marks a statement as complete and emits aggregated metric.
-//
-//nolint:unused // Will be used in Phase 8+
 func (agg *metricsAggregator) completeStatement(ctx context.Context, statementID string, failed bool) {
 	defer func() {
 		if r := recover(); r != nil {
@@ -248,13 +242,10 @@ func (agg *metricsAggregator) close(ctx context.Context) error {
 }
 
 // simpleError is a simple error implementation for testing.
-//
-//nolint:unused // Will be used in Phase 8+
 type simpleError struct {
 	msg string
 }
 
-//nolint:unused // Will be used in Phase 8+
 func (e *simpleError) Error() string {
 	return e.msg
 }
diff --git a/telemetry/errors.go b/telemetry/errors.go
@@ -8,8 +8,7 @@ import (
 // isTerminalError returns true if error is terminal (non-retryable).
 // Terminal errors indicate user errors or permanent failures that won't
 // be resolved by retrying the operation.
-//
-//nolint:unused // Will be used in Phase 8+
+
 func isTerminalError(err error) bool {
 	if err == nil {
 		return false
@@ -45,8 +44,7 @@ func isTerminalError(err error) bool {
 
 // classifyError classifies an error for telemetry purposes.
 // Returns a string representation of the error type.
-//
-//nolint:unused // Will be used in Phase 8+
+
 func classifyError(err error) string {
 	if err == nil {
 		return ""
@@ -89,14 +87,12 @@ func isRetryableError(err error) bool {
 }
 
 // httpError represents an HTTP error with status code.
-//
-//nolint:unused // Will be used in Phase 8+
+
 type httpError struct {
 	statusCode int
 	message    string
 }
 
-//nolint:unused // Will be used in Phase 8+
 func (e *httpError) Error() string {
 	return e.message
 }
@@ -112,16 +108,14 @@ func newHTTPError(statusCode int, message string) error {
 }
 
 // isTerminalHTTPStatus returns true for non-retryable HTTP status codes.
-//
-//nolint:unused // Will be used in Phase 8+
+
 func isTerminalHTTPStatus(status int) bool {
 	// 4xx errors (except 429) are terminal
 	return status >= 400 && status < 500 && status != 429
 }
 
 // extractHTTPError extracts HTTP error information if available.
-//
-//nolint:unused // Will be used in Phase 8+
+
 func extractHTTPError(err error) (*httpError, bool) {
 	var httpErr *httpError
 	if errors.As(err, &httpErr) {
diff --git a/telemetry/interceptor.go b/telemetry/interceptor.go
@@ -15,18 +15,15 @@ type Interceptor struct {
 }
 
 // metricContext holds metric collection state in context.
-//
-//nolint:unused // Will be used in Phase 8+
 type metricContext struct {
+	sessionID   string
 	statementID string
 	startTime   time.Time
 	tags        map[string]interface{}
 }
 
-//nolint:unused // Will be used in Phase 8+
 type contextKey int
 
-//nolint:unused // Will be used in Phase 8+
 const metricContextKey contextKey = 0
 
 // newInterceptor creates a new telemetry interceptor.
@@ -38,32 +35,28 @@ func newInterceptor(aggregator *metricsAggregator, enabled bool) *Interceptor {
 }
 
 // withMetricContext adds metric context to the context.
-//
-//nolint:unused // Will be used in Phase 8+
 func withMetricContext(ctx context.Context, mc *metricContext) context.Context {
 	return context.WithValue(ctx, metricContextKey, mc)
 }
 
 // getMetricContext retrieves metric context from the context.
-//
-//nolint:unused // Will be used in Phase 8+
 func getMetricContext(ctx context.Context) *metricContext {
 	if mc, ok := ctx.Value(metricContextKey).(*metricContext); ok {
 		return mc
 	}
 	return nil
 }
 
-// beforeExecute is called before statement execution.
+// BeforeExecute is called before statement execution.
 // Returns a new context with metric tracking attached.
-//
-//nolint:unused // Will be used in Phase 8+
-func (i *Interceptor) beforeExecute(ctx context.Context, statementID string) context.Context {
+// Exported for use by the driver package.
+func (i *Interceptor) BeforeExecute(ctx context.Context, sessionID string, statementID string) context.Context {
 	if !i.enabled {
 		return ctx
 	}
 
 	mc := &metricContext{
+		sessionID:   sessionID,
 		statementID: statementID,
 		startTime:   time.Now(),
 		tags:        make(map[string]interface{}),
@@ -72,11 +65,28 @@ func (i *Interceptor) beforeExecute(ctx context.Context, statementID string) con
 	return withMetricContext(ctx, mc)
 }
 
-// afterExecute is called after statement execution.
+// BeforeExecuteWithTime is called before statement execution with a custom start time.
+// This is useful when the statement ID is not known until after execution starts.
+// Exported for use by the driver package.
+func (i *Interceptor) BeforeExecuteWithTime(ctx context.Context, sessionID string, statementID string, startTime time.Time) context.Context {
+	if !i.enabled {
+		return ctx
+	}
+
+	mc := &metricContext{
+		sessionID:   sessionID,
+		statementID: statementID,
+		startTime:   startTime,
+		tags:        make(map[string]interface{}),
+	}
+
+	return withMetricContext(ctx, mc)
+}
+
+// AfterExecute is called after statement execution.
 // Records the metric with timing and error information.
-//
-//nolint:unused // Will be used in Phase 8+
-func (i *Interceptor) afterExecute(ctx context.Context, err error) {
+// Exported for use by the driver package.
+func (i *Interceptor) AfterExecute(ctx context.Context, err error) {
 	if !i.enabled {
 		return
 	}
@@ -96,6 +106,7 @@ func (i *Interceptor) afterExecute(ctx context.Context, err error) {
 	metric := &telemetryMetric{
 		metricType:  "statement",
 		timestamp:   mc.startTime,
+		sessionID:   mc.sessionID,
 		statementID: mc.statementID,
 		latencyMs:   time.Since(mc.startTime).Milliseconds(),
 		tags:        mc.tags,
@@ -109,10 +120,9 @@ func (i *Interceptor) afterExecute(ctx context.Context, err error) {
 	i.aggregator.recordMetric(ctx, metric)
 }
 
-// addTag adds a tag to the current metric context.
-//
-//nolint:unused // Will be used in Phase 8+
-func (i *Interceptor) addTag(ctx context.Context, key string, value interface{}) {
+// AddTag adds a tag to the current metric context.
+// Exported for use by the driver package.
+func (i *Interceptor) AddTag(ctx context.Context, key string, value interface{}) {
 	if !i.enabled {
 		return
 	}
@@ -146,10 +156,9 @@ func (i *Interceptor) recordConnection(ctx context.Context, tags map[string]inte
 	i.aggregator.recordMetric(ctx, metric)
 }
 
-// completeStatement marks a statement as complete and flushes aggregated metrics.
-//
-//nolint:unused // Will be used in Phase 8+
-func (i *Interceptor) completeStatement(ctx context.Context, statementID string, failed bool) {
+// CompleteStatement marks a statement as complete and flushes aggregated metrics.
+// Exported for use by the driver package.
+func (i *Interceptor) CompleteStatement(ctx context.Context, statementID string, failed bool) {
 	if !i.enabled {
 		return
 	}

Original file line number	Diff line number	Diff line change
`@@ -28,8 +28,6 @@ type metricsAggregator struct {`
`28`	`28`	`}`
`29`	`29`
`30`	`30`	`// statementMetrics holds aggregated metrics for a statement.`
`31`		`-//`
`32`		`-//nolint:unused // Will be used in Phase 8+`
`33`	`31`	`type statementMetrics struct {`
`34`	`32`	`statementID string`
`35`	`33`	`sessionID string`
`@@ -63,8 +61,6 @@ func newMetricsAggregator(exporter telemetryExporter, cfg Config) *metricsAggr`
`63`	`61`	`}`
`64`	`62`
`65`	`63`	`// recordMetric records a metric for aggregation.`
`66`		`-//`
`67`		`-//nolint:unused // Will be used in Phase 8+`
`68`	`64`	`func (agg metricsAggregator) recordMetric(ctx context.Context, metric telemetryMetric) {`
`69`	`65`	`// Swallow all errors`
`70`	`66`	`defer func() {`
`@@ -136,8 +132,6 @@ func (agg metricsAggregator) recordMetric(ctx context.Context, metric telemetr`
`136`	`132`	`}`
`137`	`133`
`138`	`134`	`// completeStatement marks a statement as complete and emits aggregated metric.`
`139`		`-//`
`140`		`-//nolint:unused // Will be used in Phase 8+`
`141`	`135`	`func (agg *metricsAggregator) completeStatement(ctx context.Context, statementID string, failed bool) {`
`142`	`136`	`defer func() {`
`143`	`137`	`if r := recover(); r != nil {`
`@@ -248,13 +242,10 @@ func (agg *metricsAggregator) close(ctx context.Context) error {`
`248`	`242`	`}`
`249`	`243`
`250`	`244`	`// simpleError is a simple error implementation for testing.`
`251`		`-//`
`252`		`-//nolint:unused // Will be used in Phase 8+`
`253`	`245`	`type simpleError struct {`
`254`	`246`	`msg string`
`255`	`247`	`}`
`256`	`248`
`257`		`-//nolint:unused // Will be used in Phase 8+`
`258`	`249`	`func (e *simpleError) Error() string {`
`259`	`250`	`return e.msg`
`260`	`251`	`}`