Skip to content

Commit 952fb25

Browse files
colexecerror: allow-list vecindex packages for panic catching
Previously, panics originating from the vecindex packages were not in the colexecerror allow-list, causing them to crash the server instead of being returned as SQL errors. For example, a dimension mismatch query against a vector index would bring down the node. The vecindex search path does not manipulate shared state or hold in-memory locks in production (it uses KV-layer locking via vecstore), so it is safe to catch these panics. Caught panics are returned as assertion errors with sentry reports, preserving bug visibility without crashing the server. Resolves: #146694 Epic: none Release note: None Co-Authored-By: roachdev-claude <roachdev-claude-bot@cockroachlabs.com>
1 parent 0c5b33a commit 952fb25

10 files changed

Lines changed: 250 additions & 15 deletions

File tree

pkg/server/server_sql.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,11 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) {
808808
rangeStatsFetcher := rangestats.NewFetcher(cfg.db)
809809

810810
vecIndexManager := vecindex.NewManager(ctx, cfg.stopper, &cfg.Settings.SV, codec, cfg.internalDB)
811+
812+
if knobs := cfg.TestingKnobs.VecIndexTestingKnobs; knobs != nil {
813+
vecIndexManager.SetTestingKnobs(knobs.(*vecindex.VecIndexTestingKnobs))
814+
}
815+
811816
cfg.registry.AddMetricStruct(vecIndexManager.Metrics())
812817

813818
// Set up the DistSQL server.

pkg/sql/colexecerror/error.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -170,12 +170,13 @@ func CatchVectorizedRuntimeError(operation func()) (retErr error) {
170170
// Multiple actual packages can have the same prefix as a single constant string
171171
// defined below, but all of such packages are allowed to be caught from.
172172
const (
173-
colPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/col"
174-
encodingPackagePrefix = "github.com/cockroachdb/cockroach/pkg/util/encoding"
175-
execinfraPackagePrefix = "github.com/cockroachdb/cockroach/pkg/sql/execinfra"
176-
sqlColPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/col"
177-
sqlRowPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/row"
178-
sqlSemPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/sem"
173+
colPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/col"
174+
encodingPackagePrefix = "github.com/cockroachdb/cockroach/pkg/util/encoding"
175+
execinfraPackagePrefix = "github.com/cockroachdb/cockroach/pkg/sql/execinfra"
176+
sqlColPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/col"
177+
sqlRowPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/row"
178+
sqlSemPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/sem"
179+
sqlVecindexPackagesPrefix = "github.com/cockroachdb/cockroach/pkg/sql/vecindex"
179180
// When running BenchmarkCatchVectorizedRuntimeError under bazel, the
180181
// repository prefix is missing.
181182
testSqlColPackagesPrefix = "pkg/sql/col"
@@ -217,6 +218,7 @@ func shouldCatchPanic(panicEmittedFrom string) bool {
217218
strings.HasPrefix(panicEmittedFrom, sqlColPackagesPrefix) ||
218219
strings.HasPrefix(panicEmittedFrom, sqlRowPackagesPrefix) ||
219220
strings.HasPrefix(panicEmittedFrom, sqlSemPackagesPrefix) ||
221+
strings.HasPrefix(panicEmittedFrom, sqlVecindexPackagesPrefix) ||
220222
strings.HasPrefix(panicEmittedFrom, testSqlColPackagesPrefix)
221223
}
222224

pkg/sql/rowexec/vector_search.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ func newVectorSearchProcessor(
7171
rerankMultiplier := int(flowCtx.EvalCtx.SessionData().VectorSearchRerankMultiplier)
7272
v.searcher.Init(flowCtx.EvalCtx,
7373
idx, flowCtx.Txn, &spec.GetFullVectorsFetchSpec, searchBeamSize, maxResults, rerankMultiplier)
74+
v.searcher.SetTestingKnobs(flowCtx.Cfg.VecIndexManager.(*vecindex.Manager).TestingKnobs())
7475
colTypes := make([]*types.T, len(v.fetchSpec.FetchedColumns))
7576
for i, col := range v.fetchSpec.FetchedColumns {
7677
colTypes[i] = col.Type
@@ -276,6 +277,7 @@ func newVectorMutationSearchProcessor(
276277
return nil, err
277278
}
278279
v.searcher.Init(flowCtx.EvalCtx, idx, flowCtx.Txn, &spec.GetFullVectorsFetchSpec)
280+
v.searcher.SetTestingKnobs(flowCtx.Cfg.VecIndexManager.(*vecindex.Manager).TestingKnobs())
279281

280282
// Pass through the input columns, and add the partition column and optional
281283
// quantized vector column.

pkg/sql/vecindex/BUILD.bazel

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ go_test(
6969
"//pkg/sql/catalog/descs",
7070
"//pkg/sql/catalog/desctestutils",
7171
"//pkg/sql/catalog/tabledesc",
72+
"//pkg/sql/colexecerror",
73+
"//pkg/sql/pgwire/pgcode",
74+
"//pkg/sql/pgwire/pgerror",
7275
"//pkg/sql/rowenc",
7376
"//pkg/sql/sem/catid",
7477
"//pkg/sql/sem/eval",
@@ -93,6 +96,7 @@ go_test(
9396
"//pkg/util/randutil",
9497
"//pkg/util/vector",
9598
"@com_github_cockroachdb_datadriven//:datadriven",
99+
"@com_github_cockroachdb_errors//:errors",
96100
"@com_github_stretchr_testify//require",
97101
],
98102
)

pkg/sql/vecindex/manager.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,13 @@ func (m *Manager) SetTestingKnobs(knobs *VecIndexTestingKnobs) {
8383
m.testingKnobs = knobs
8484
}
8585

86+
// TestingKnobs returns the testing knobs configured on the manager, or nil if
87+
// none have been set. Exposed so that executor processors can propagate the
88+
// knobs to per-flow helpers such as Searcher.
89+
func (m *Manager) TestingKnobs() *VecIndexTestingKnobs {
90+
return m.testingKnobs
91+
}
92+
8693
// Metrics returns a metric.Struct which holds metrics for all vector indexes
8794
// maintained by the manager.
8895
func (m *Manager) Metrics() metric.Struct {
@@ -121,6 +128,19 @@ func (m *Manager) Get(
121128
e = &indexEntry{mustWait: true, waitCond: sync.Cond{L: &m.mu}}
122129
m.mu.indexes[idxKey] = e
123130

131+
// If the index construction below panics, mustWait will still be true
132+
// because the normal completion path (which sets mustWait=false and calls
133+
// Broadcast) is skipped. Clean up the cache entry so that waiters are
134+
// unblocked and subsequent callers retry instead of hanging.
135+
defer func() {
136+
if e.mustWait {
137+
e.mustWait = false
138+
e.err = errors.AssertionFailedf("vector index construction panicked")
139+
e.waitCond.Broadcast()
140+
delete(m.mu.indexes, idxKey)
141+
}
142+
}()
143+
124144
idx, err := func() (*cspann.Index, error) {
125145
// Unlock while we build the index structure so that concurrent requests can be
126146
// serviced. We've already set mustWait to true, so other requests will wait
@@ -167,7 +187,7 @@ func (m *Manager) Get(
167187

168188
if err != nil {
169189
// Don't keep the index entry around, so that we retry the query.
170-
m.mu.indexes[idxKey] = nil
190+
delete(m.mu.indexes, idxKey)
171191
}
172192
return idx, err
173193
}

pkg/sql/vecindex/manager_test.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package vecindex_test
88
import (
99
"context"
1010
"strconv"
11+
"strings"
1112
"sync"
1213
"testing"
1314
"time"
@@ -30,6 +31,7 @@ import (
3031
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
3132
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
3233
"github.com/cockroachdb/cockroach/pkg/util/log"
34+
"github.com/cockroachdb/errors"
3335
"github.com/stretchr/testify/require"
3436
)
3537

@@ -240,4 +242,77 @@ func TestVectorManager(t *testing.T) {
240242
}
241243
vectorMgr.SetTestingKnobs(nil)
242244
})
245+
246+
t.Run("test panic during pull recovers cleanly", func(t *testing.T) {
247+
// Block the panicking goroutine until the other 10 callers are parked on
248+
// e.waitCond, so the panic propagates exactly when 10 waiters depend on
249+
// the cleanup defer in Manager.Get.
250+
pullDelayer := sync.WaitGroup{}
251+
pullDelayer.Add(10)
252+
testingKnobs := vecindex.VecIndexTestingKnobs{
253+
DuringVecIndexPull: func() {
254+
pullDelayer.Wait()
255+
panic(errors.AssertionFailedf("injected vecindex panic"))
256+
},
257+
BeforeVecIndexWait: func() {
258+
pullDelayer.Done()
259+
},
260+
}
261+
vectorMgr.SetTestingKnobs(&testingKnobs)
262+
263+
// Table 150 is created in setup but not pulled by any earlier subtest,
264+
// so DuringVecIndexPull will fire (cached entries skip the inner
265+
// closure).
266+
const tableID catid.DescID = 150
267+
268+
// Fire 11 concurrent Get calls. The first to acquire m.mu installs the
269+
// indexEntry and runs the inner closure, which panics. The other 10
270+
// observe e.mustWait=true and block on e.waitCond. The cleanup defer
271+
// in Manager.Get must unblock all 10 and delete the cache entry so a
272+
// subsequent Get retries instead of hanging.
273+
results := make([]error, 11)
274+
wg := sync.WaitGroup{}
275+
for i := range 11 {
276+
wg.Add(1)
277+
go func(idx int) {
278+
defer wg.Done()
279+
defer func() {
280+
if r := recover(); r != nil {
281+
if err, ok := r.(error); ok {
282+
results[idx] = err
283+
} else {
284+
results[idx] = errors.Newf("non-error panic: %v", r)
285+
}
286+
}
287+
}()
288+
_, err := vectorMgr.Get(ctx, tableID, 2)
289+
if err != nil {
290+
results[idx] = err
291+
}
292+
}(i)
293+
}
294+
wg.Wait()
295+
296+
var injectedCount, sentinelCount int
297+
for _, e := range results {
298+
require.Error(t, e)
299+
switch {
300+
case strings.Contains(e.Error(), "injected vecindex panic"):
301+
injectedCount++
302+
case strings.Contains(e.Error(), "vector index construction panicked"):
303+
sentinelCount++
304+
}
305+
}
306+
require.Equal(t, 1, injectedCount,
307+
"exactly one goroutine should propagate the original panic")
308+
require.Equal(t, 10, sentinelCount,
309+
"waiters should receive the post-cleanup sentinel error")
310+
311+
// After cleanup, the cache entry must be gone so a second Get retries.
312+
// Clear the panicking knob first so the retry can succeed.
313+
vectorMgr.SetTestingKnobs(nil)
314+
idx, err := vectorMgr.Get(ctx, tableID, 2)
315+
require.NoError(t, err)
316+
require.NotNil(t, idx)
317+
})
243318
}

pkg/sql/vecindex/mutation_searcher.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type MutationSearcher struct {
3232
partitionKey tree.Datum
3333
encoded tree.Datum
3434
evalCtx *eval.Context
35+
testingKnobs *VecIndexTestingKnobs
3536
}
3637

3738
// Init wraps the given KV transaction in a C-SPANN transaction and initializes
@@ -61,6 +62,13 @@ func (s *MutationSearcher) KVStats() vecstore.KVStats {
6162
return s.txn.KVStats()
6263
}
6364

65+
// SetTestingKnobs configures testing knobs on the mutation searcher. A nil
66+
// knobs pointer is a no-op. Called by executor processors after Init to
67+
// propagate knobs from the Manager; production code paths do not invoke it.
68+
func (s *MutationSearcher) SetTestingKnobs(knobs *VecIndexTestingKnobs) {
69+
s.testingKnobs = knobs
70+
}
71+
6472
// SearchForInsert triggers a search for the partition in which to insert the
6573
// input vector. The partition's key is returned by PartitionKey() and the
6674
// input vector's quantized and encoded form is returned by EncodedVector().
@@ -70,6 +78,10 @@ func (s *MutationSearcher) KVStats() vecstore.KVStats {
7078
func (s *MutationSearcher) SearchForInsert(
7179
ctx context.Context, prefix roachpb.Key, vec vector.T,
7280
) error {
81+
if s.testingKnobs != nil && s.testingKnobs.PanicDuringMutationSearch != nil {
82+
s.testingKnobs.PanicDuringMutationSearch()
83+
}
84+
7385
res, err := s.idx.SearchForInsert(ctx, &s.idxCtx, cspann.TreeKey(prefix), vec)
7486
if err != nil {
7587
return err
@@ -101,6 +113,10 @@ func (s *MutationSearcher) SearchForInsert(
101113
func (s *MutationSearcher) SearchForDelete(
102114
ctx context.Context, prefix roachpb.Key, vec vector.T, key cspann.KeyBytes,
103115
) error {
116+
if s.testingKnobs != nil && s.testingKnobs.PanicDuringMutationSearch != nil {
117+
s.testingKnobs.PanicDuringMutationSearch()
118+
}
119+
104120
res, err := s.idx.SearchForDelete(ctx, &s.idxCtx, cspann.TreeKey(prefix), vec, key)
105121
if err != nil {
106122
return err

pkg/sql/vecindex/searcher.go

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,15 @@ import (
2424
// NOTE: Searcher is intended to be embedded within an execution engine
2525
// processor object.
2626
type Searcher struct {
27-
idx *cspann.Index
28-
txn vecstore.Txn
29-
idxCtx cspann.Context
30-
options cspann.SearchOptions
31-
searchSet cspann.SearchSet
32-
results cspann.SearchResults
33-
resultIdx int
34-
evalCtx *eval.Context
27+
idx *cspann.Index
28+
txn vecstore.Txn
29+
idxCtx cspann.Context
30+
options cspann.SearchOptions
31+
searchSet cspann.SearchSet
32+
results cspann.SearchResults
33+
resultIdx int
34+
evalCtx *eval.Context
35+
testingKnobs *VecIndexTestingKnobs
3536
}
3637

3738
// Init wraps the given KV transaction in a C-SPANN transaction and initializes
@@ -65,13 +66,24 @@ func (s *Searcher) Init(
6566
cspann.IncreaseRerankResults(baseBeamSize, maxResults, rerankMultiplier)
6667
}
6768

69+
// SetTestingKnobs configures testing knobs on the searcher. A nil knobs
70+
// pointer is a no-op. Called by executor processors after Init to propagate
71+
// knobs from the Manager; production code paths do not invoke it.
72+
func (s *Searcher) SetTestingKnobs(knobs *VecIndexTestingKnobs) {
73+
s.testingKnobs = knobs
74+
}
75+
6876
// Search triggers a search over the index for the given vector, within the
6977
// scope of the given prefix. "maxResults" specifies the maximum number of
7078
// results that will be returned.
7179
//
7280
// NOTE: The caller is assumed to own the memory for all parameters and can
7381
// reuse the memory after the call returns.
7482
func (s *Searcher) Search(ctx context.Context, prefix roachpb.Key, vec vector.T) error {
83+
if s.testingKnobs != nil && s.testingKnobs.PanicDuringSearch != nil {
84+
s.testingKnobs.PanicDuringSearch()
85+
}
86+
7587
err := s.idx.Search(ctx, &s.idxCtx, cspann.TreeKey(prefix), vec, &s.searchSet, s.options)
7688
if err != nil {
7789
return err

0 commit comments

Comments
 (0)