Skip to content

Commit 22a88f9

Browse files
colexecerror: allow-list vecindex packages for panic catching
Previously, panics originating from the vecindex packages were not in the colexecerror allow-list, causing them to crash the server instead of being returned as SQL errors. For example, a dimension mismatch query against a vector index would bring down the node. Catching these panics is safe for two reasons. The vecindex search path holds no in-memory locks (it relies on KV-layer locking via vecstore), so a panic mid-operation cannot tear shared state. And Manager.Get — the one in-memory cache on the path — now cleans up after a panic during cache-miss construction: waiters are woken with an error and the failed entry is removed so the next call retries. Caught panics are returned as assertion errors with sentry reports, preserving bug visibility without crashing the server. Resolves: #146694 Epic: none Release note: None Co-Authored-By: roachdev-claude <roachdev-claude-bot@cockroachlabs.com>
1 parent fc8cddc commit 22a88f9

10 files changed

Lines changed: 263 additions & 15 deletions

File tree

pkg/server/server_sql.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,11 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) {
808808
rangeStatsFetcher := rangestats.NewFetcher(cfg.db)
809809

810810
vecIndexManager := vecindex.NewManager(ctx, cfg.stopper, &cfg.Settings.SV, codec, cfg.internalDB)
811+
812+
if vecIndexKnobs, _ := cfg.TestingKnobs.VecIndexTestingKnobs.(*vecindex.VecIndexTestingKnobs); vecIndexKnobs != nil {
813+
vecIndexManager.SetTestingKnobs(vecIndexKnobs)
814+
}
815+
811816
cfg.registry.AddMetricStruct(vecIndexManager.Metrics())
812817

813818
// Set up the DistSQL server.

pkg/sql/colexecerror/error.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -170,12 +170,13 @@ func CatchVectorizedRuntimeError(operation func()) (retErr error) {
170170
// Multiple actual packages can have the same prefix as a single constant string
171171
// defined below, but all of such packages are allowed to be caught from.
172172
const (
173-
colPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/col"
174-
encodingPackagePrefix = "github.com/cockroachdb/cockroach/pkg/util/encoding"
175-
execinfraPackagePrefix = "github.com/cockroachdb/cockroach/pkg/sql/execinfra"
176-
sqlColPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/col"
177-
sqlRowPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/row"
178-
sqlSemPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/sem"
173+
colPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/col"
174+
encodingPackagePrefix = "github.com/cockroachdb/cockroach/pkg/util/encoding"
175+
execinfraPackagePrefix = "github.com/cockroachdb/cockroach/pkg/sql/execinfra"
176+
sqlColPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/col"
177+
sqlRowPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/row"
178+
sqlSemPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/sem"
179+
sqlVecindexPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/vecindex"
179180
// When running BenchmarkCatchVectorizedRuntimeError under bazel, the
180181
// repository prefix is missing.
181182
testSqlColPackagesPrefix = "pkg/sql/col"
@@ -217,6 +218,7 @@ func shouldCatchPanic(panicEmittedFrom string) bool {
217218
strings.HasPrefix(panicEmittedFrom, sqlColPackagesPrefix) ||
218219
strings.HasPrefix(panicEmittedFrom, sqlRowPackagesPrefix) ||
219220
strings.HasPrefix(panicEmittedFrom, sqlSemPackagesPrefix) ||
221+
strings.HasPrefix(panicEmittedFrom, sqlVecindexPackagesPrefix) ||
220222
strings.HasPrefix(panicEmittedFrom, testSqlColPackagesPrefix)
221223
}
222224

pkg/sql/rowexec/vector_search.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ func newVectorSearchProcessor(
7171
rerankMultiplier := int(flowCtx.EvalCtx.SessionData().VectorSearchRerankMultiplier)
7272
v.searcher.Init(flowCtx.EvalCtx,
7373
idx, flowCtx.Txn, &spec.GetFullVectorsFetchSpec, searchBeamSize, maxResults, rerankMultiplier)
74+
if mgr, ok := flowCtx.Cfg.VecIndexManager.(*vecindex.Manager); ok {
75+
v.searcher.SetTestingKnobs(mgr.TestingKnobs())
76+
}
7477
colTypes := make([]*types.T, len(v.fetchSpec.FetchedColumns))
7578
for i, col := range v.fetchSpec.FetchedColumns {
7679
colTypes[i] = col.Type
@@ -276,6 +279,9 @@ func newVectorMutationSearchProcessor(
276279
return nil, err
277280
}
278281
v.searcher.Init(flowCtx.EvalCtx, idx, flowCtx.Txn, &spec.GetFullVectorsFetchSpec)
282+
if mgr, ok := flowCtx.Cfg.VecIndexManager.(*vecindex.Manager); ok {
283+
v.searcher.SetTestingKnobs(mgr.TestingKnobs())
284+
}
279285

280286
// Pass through the input columns, and add the partition column and optional
281287
// quantized vector column.

pkg/sql/vecindex/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ go_test(
6969
"//pkg/sql/catalog/descs",
7070
"//pkg/sql/catalog/desctestutils",
7171
"//pkg/sql/catalog/tabledesc",
72+
"//pkg/sql/colexecerror",
7273
"//pkg/sql/rowenc",
7374
"//pkg/sql/sem/catid",
7475
"//pkg/sql/sem/eval",
@@ -93,6 +94,7 @@ go_test(
9394
"//pkg/util/randutil",
9495
"//pkg/util/vector",
9596
"@com_github_cockroachdb_datadriven//:datadriven",
97+
"@com_github_cockroachdb_errors//:errors",
9698
"@com_github_stretchr_testify//require",
9799
],
98100
)

pkg/sql/vecindex/manager.go

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,13 @@ func (m *Manager) SetTestingKnobs(knobs *VecIndexTestingKnobs) {
8383
m.testingKnobs = knobs
8484
}
8585

86+
// TestingKnobs returns the testing knobs configured on the manager, or nil if
87+
// none have been set. Exposed so that executor processors can propagate the
88+
// knobs to per-flow helpers such as Searcher.
89+
func (m *Manager) TestingKnobs() *VecIndexTestingKnobs {
90+
return m.testingKnobs
91+
}
92+
8693
// Metrics returns a metric.Struct which holds metrics for all vector indexes
8794
// maintained by the manager.
8895
func (m *Manager) Metrics() metric.Struct {
@@ -121,6 +128,39 @@ func (m *Manager) Get(
121128
e = &indexEntry{mustWait: true, waitCond: sync.Cond{L: &m.mu}}
122129
m.mu.indexes[idxKey] = e
123130

131+
// If the index construction below panics, mustWait will still be true
132+
// because the normal completion path (which sets mustWait=false and calls
133+
// Broadcast) is skipped. Wrap the panic cause into e.err so parked waiters
134+
// get a meaningful error, drop the cache entry so subsequent callers retry
135+
// instead of hanging, then rethrow so colexecerror catches the original
136+
// panic for the calling goroutine as usual.
137+
defer func() {
138+
if !e.mustWait {
139+
return
140+
}
141+
142+
e.mustWait = false
143+
144+
r := recover()
145+
switch cause := r.(type) {
146+
case error:
147+
e.err = errors.NewAssertionErrorWithWrappedErrf(
148+
cause, "vector index construction panicked")
149+
case nil:
150+
e.err = errors.AssertionFailedf("vector index construction did not complete")
151+
default:
152+
e.err = errors.AssertionFailedf(
153+
"vector index construction panicked: %v", cause)
154+
}
155+
156+
e.waitCond.Broadcast()
157+
delete(m.mu.indexes, idxKey)
158+
159+
if r != nil {
160+
panic(r)
161+
}
162+
}()
163+
124164
idx, err := func() (*cspann.Index, error) {
125165
// Unlock while we build the index structure so that concurrent requests can be
126166
// serviced. We've already set mustWait to true, so other requests will wait
@@ -167,7 +207,7 @@ func (m *Manager) Get(
167207

168208
if err != nil {
169209
// Don't keep the index entry around, so that we retry the query.
170-
m.mu.indexes[idxKey] = nil
210+
delete(m.mu.indexes, idxKey)
171211
}
172212
return idx, err
173213
}

pkg/sql/vecindex/manager_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package vecindex_test
88
import (
99
"context"
1010
"strconv"
11+
"strings"
1112
"sync"
1213
"testing"
1314
"time"
@@ -30,6 +31,7 @@ import (
3031
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
3132
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
3233
"github.com/cockroachdb/cockroach/pkg/util/log"
34+
"github.com/cockroachdb/errors"
3335
"github.com/stretchr/testify/require"
3436
)
3537

@@ -240,4 +242,82 @@ func TestVectorManager(t *testing.T) {
240242
}
241243
vectorMgr.SetTestingKnobs(nil)
242244
})
245+
246+
t.Run("test panic during pull recovers cleanly", func(t *testing.T) {
247+
// Block the panicking goroutine until the other 10 callers are parked on
248+
// e.waitCond, so the panic propagates exactly when 10 waiters depend on
249+
// the cleanup defer in Manager.Get.
250+
pullDelayer := sync.WaitGroup{}
251+
pullDelayer.Add(10)
252+
testingKnobs := vecindex.VecIndexTestingKnobs{
253+
DuringVecIndexPull: func() {
254+
pullDelayer.Wait()
255+
panic(errors.AssertionFailedf("injected vecindex panic"))
256+
},
257+
BeforeVecIndexWait: func() {
258+
pullDelayer.Done()
259+
},
260+
}
261+
vectorMgr.SetTestingKnobs(&testingKnobs)
262+
263+
// Table 150 is created in setup but not pulled by any earlier subtest,
264+
// so DuringVecIndexPull will fire (cached entries skip the inner
265+
// closure).
266+
const tableID catid.DescID = 150
267+
268+
// Fire 11 concurrent Get calls. The first to acquire m.mu installs the
269+
// indexEntry and runs the inner closure, which panics. The other 10
270+
// observe e.mustWait=true and block on e.waitCond. The cleanup defer
271+
// in Manager.Get must unblock all 10 and delete the cache entry so a
272+
// subsequent Get retries instead of hanging.
273+
results := make([]error, 11)
274+
wg := sync.WaitGroup{}
275+
for i := range 11 {
276+
wg.Add(1)
277+
go func(idx int) {
278+
defer wg.Done()
279+
defer func() {
280+
if r := recover(); r != nil {
281+
if err, ok := r.(error); ok {
282+
results[idx] = err
283+
} else {
284+
results[idx] = errors.Newf("non-error panic: %v", r)
285+
}
286+
}
287+
}()
288+
_, err := vectorMgr.Get(ctx, tableID, 2)
289+
if err != nil {
290+
results[idx] = err
291+
}
292+
}(i)
293+
}
294+
wg.Wait()
295+
296+
// The panicker propagates the raw injected panic; the 10 waiters get the
297+
// wrapped sentinel, which contains the original cause as well. Check the
298+
// sentinel substring first so a wrapped error isn't miscounted as the
299+
// raw panic.
300+
var injectedCount, wrappedCount int
301+
for _, e := range results {
302+
require.Error(t, e)
303+
require.Contains(t, e.Error(), "injected vecindex panic",
304+
"all errors should carry the original panic cause")
305+
if strings.Contains(e.Error(), "vector index construction panicked") {
306+
wrappedCount++
307+
} else {
308+
injectedCount++
309+
}
310+
}
311+
require.Equal(t, 1, injectedCount,
312+
"exactly one goroutine should propagate the original panic unwrapped")
313+
require.Equal(t, 10, wrappedCount,
314+
"waiters should receive the wrapped sentinel error")
315+
316+
// After cleanup, the cache entry must be gone so a second Get retries.
317+
// Clear the panicking knob first so the retry can succeed.
318+
vectorMgr.SetTestingKnobs(nil)
319+
idx, err := vectorMgr.Get(ctx, tableID, 2)
320+
require.NoError(t, err)
321+
require.NotNil(t, idx)
322+
})
243323
}

pkg/sql/vecindex/mutation_searcher.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type MutationSearcher struct {
3232
partitionKey tree.Datum
3333
encoded tree.Datum
3434
evalCtx *eval.Context
35+
testingKnobs *VecIndexTestingKnobs
3536
}
3637

3738
// Init wraps the given KV transaction in a C-SPANN transaction and initializes
@@ -61,6 +62,14 @@ func (s *MutationSearcher) KVStats() vecstore.KVStats {
6162
return s.txn.KVStats()
6263
}
6364

65+
// SetTestingKnobs configures testing knobs on the mutation searcher. Invoked
66+
// by executor processors after Init to propagate knobs from the Manager. The
67+
// pointer is nil in production, and a nil pointer is a no-op for subsequent
68+
// knob lookups.
69+
func (s *MutationSearcher) SetTestingKnobs(knobs *VecIndexTestingKnobs) {
70+
s.testingKnobs = knobs
71+
}
72+
6473
// SearchForInsert triggers a search for the partition in which to insert the
6574
// input vector. The partition's key is returned by PartitionKey() and the
6675
// input vector's quantized and encoded form is returned by EncodedVector().
@@ -70,6 +79,10 @@ func (s *MutationSearcher) KVStats() vecstore.KVStats {
7079
func (s *MutationSearcher) SearchForInsert(
7180
ctx context.Context, prefix roachpb.Key, vec vector.T,
7281
) error {
82+
if s.testingKnobs != nil && s.testingKnobs.PanicDuringMutationSearch != nil {
83+
s.testingKnobs.PanicDuringMutationSearch()
84+
}
85+
7386
res, err := s.idx.SearchForInsert(ctx, &s.idxCtx, cspann.TreeKey(prefix), vec)
7487
if err != nil {
7588
return err
@@ -101,6 +114,10 @@ func (s *MutationSearcher) SearchForInsert(
101114
func (s *MutationSearcher) SearchForDelete(
102115
ctx context.Context, prefix roachpb.Key, vec vector.T, key cspann.KeyBytes,
103116
) error {
117+
if s.testingKnobs != nil && s.testingKnobs.PanicDuringMutationSearch != nil {
118+
s.testingKnobs.PanicDuringMutationSearch()
119+
}
120+
104121
res, err := s.idx.SearchForDelete(ctx, &s.idxCtx, cspann.TreeKey(prefix), vec, key)
105122
if err != nil {
106123
return err

pkg/sql/vecindex/searcher.go

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,15 @@ import (
2424
// NOTE: Searcher is intended to be embedded within an execution engine
2525
// processor object.
2626
type Searcher struct {
27-
idx *cspann.Index
28-
txn vecstore.Txn
29-
idxCtx cspann.Context
30-
options cspann.SearchOptions
31-
searchSet cspann.SearchSet
32-
results cspann.SearchResults
33-
resultIdx int
34-
evalCtx *eval.Context
27+
idx *cspann.Index
28+
txn vecstore.Txn
29+
idxCtx cspann.Context
30+
options cspann.SearchOptions
31+
searchSet cspann.SearchSet
32+
results cspann.SearchResults
33+
resultIdx int
34+
evalCtx *eval.Context
35+
testingKnobs *VecIndexTestingKnobs
3536
}
3637

3738
// Init wraps the given KV transaction in a C-SPANN transaction and initializes
@@ -65,13 +66,25 @@ func (s *Searcher) Init(
6566
cspann.IncreaseRerankResults(baseBeamSize, maxResults, rerankMultiplier)
6667
}
6768

69+
// SetTestingKnobs configures testing knobs on the searcher. Invoked by
70+
// executor processors after Init to propagate knobs from the Manager. The
71+
// pointer is nil in production, and a nil pointer is a no-op for subsequent
72+
// knob lookups.
73+
func (s *Searcher) SetTestingKnobs(knobs *VecIndexTestingKnobs) {
74+
s.testingKnobs = knobs
75+
}
76+
6877
// Search triggers a search over the index for the given vector, within the
6978
// scope of the given prefix. "maxResults" specifies the maximum number of
7079
// results that will be returned.
7180
//
7281
// NOTE: The caller is assumed to own the memory for all parameters and can
7382
// reuse the memory after the call returns.
7483
func (s *Searcher) Search(ctx context.Context, prefix roachpb.Key, vec vector.T) error {
84+
if s.testingKnobs != nil && s.testingKnobs.PanicDuringSearch != nil {
85+
s.testingKnobs.PanicDuringSearch()
86+
}
87+
7588
err := s.idx.Search(ctx, &s.idxCtx, cspann.TreeKey(prefix), vec, &s.searchSet, s.options)
7689
if err != nil {
7790
return err

0 commit comments

Comments
 (0)