Skip to content

Commit 11377f8

Browse files
release-26.2: mmaprototype: fix nil-pointer panic when StorePool sees stores MMA does not (#170939)
release-26.2: mmaprototype: fix nil-pointer panic when StorePool sees stores MMA does not
2 parents ac72c86 + 808a893 commit 11377f8

11 files changed

Lines changed: 165 additions & 15 deletions

File tree

pkg/kv/kvserver/allocator/allocatorimpl/allocator.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1914,7 +1914,7 @@ func (a Allocator) RebalanceTarget(
19141914
// continues to correspond to the original candidate set and source store,
19151915
// even as candidates are removed.
19161916
for {
1917-
target, existingCandidate, bestIdx = bestRebalanceTarget(a.randGen, results, a.as)
1917+
target, existingCandidate, bestIdx = bestRebalanceTarget(ctx, a.randGen, results, a.as)
19181918
if target == nil {
19191919
return zero, zero, "", false
19201920
}
@@ -2555,7 +2555,7 @@ func (a *Allocator) TransferLeaseTarget(
25552555
for _, s := range sl.Stores {
25562556
targetStores = append(targetStores, s.StoreID)
25572557
}
2558-
handle := a.as.BuildMMARebalanceAdvisor(source.StoreID, targetStores)
2558+
handle := a.as.BuildMMARebalanceAdvisor(ctx, source.StoreID, targetStores)
25592559
var bestOption roachpb.ReplicaDescriptor
25602560
candidates := make([]roachpb.ReplicaDescriptor, 0, len(validTargets))
25612561
bestOptionLeaseCount := int32(math.MaxInt32)

pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1952,7 +1952,10 @@ func rankedCandidateListForRebalancing(
19521952
// Contract: responsible for making sure that the returned bestIdx has the
19531953
// corresponding MMA advisor in advisors.
19541954
func bestRebalanceTarget(
1955-
randGen allocatorRand, options []rebalanceOptions, as *mmaintegration.AllocatorSync,
1955+
ctx context.Context,
1956+
randGen allocatorRand,
1957+
options []rebalanceOptions,
1958+
as *mmaintegration.AllocatorSync,
19561959
) (target, existingCandidate *candidate, bestIdx int) {
19571960
bestIdx = -1
19581961
var bestTarget *candidate
@@ -1982,7 +1985,7 @@ func bestRebalanceTarget(
19821985
for _, cand := range options[bestIdx].candidates {
19831986
stores = append(stores, cand.store.StoreID)
19841987
}
1985-
options[bestIdx].advisor = as.BuildMMARebalanceAdvisor(options[bestIdx].existing.store.StoreID, stores)
1988+
options[bestIdx].advisor = as.BuildMMARebalanceAdvisor(ctx, options[bestIdx].existing.store.StoreID, stores)
19861989
}
19871990

19881991
// Copy the selected target out of the candidates slice before modifying

pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,7 @@ func TestBestRebalanceTarget(t *testing.T) {
448448
var i int
449449
for {
450450
i++
451-
target, existing, _ := bestRebalanceTarget(allocRand, candidates, a.as)
451+
target, existing, _ := bestRebalanceTarget(ctx, allocRand, candidates, a.as)
452452
if len(expectedTargets) == 0 {
453453
if target == nil {
454454
break

pkg/kv/kvserver/allocator/mmaprototype/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ go_test(
6161
"logging_test.go",
6262
"memo_helper_test.go",
6363
"mma_metrics_test.go",
64+
"rebalance_advisor_test.go",
6465
],
6566
data = glob(["testdata/**"]),
6667
embed = [":mmaprototype"],

pkg/kv/kvserver/allocator/mmaprototype/allocator.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,9 @@ type Allocator interface {
193193
//
194194
// TODO(sumeer): merge the above comment with the comment in the
195195
// implementation.
196-
BuildMMARebalanceAdvisor(existing roachpb.StoreID, cands []roachpb.StoreID) *MMARebalanceAdvisor
196+
BuildMMARebalanceAdvisor(
197+
ctx context.Context, existing roachpb.StoreID, cands []roachpb.StoreID,
198+
) *MMARebalanceAdvisor
197199

198200
// IsInConflictWithMMA is called by the allocator sync to determine if the
199201
// given candidate is in conflict with the existing store.

pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2409,6 +2409,22 @@ func (cs *clusterState) getStoreReportedLoad(storeID roachpb.StoreID) (roachpb.N
24092409
return 0, nil
24102410
}
24112411

2412+
// hasStore reports whether storeID is known to mma's clusterState.
2413+
// Used by BuildMMARebalanceAdvisor to filter candidate slices that come
2414+
// from the legacy allocator's StorePool view, which can include stores
2415+
// that gossip has not yet announced to MMA.
2416+
func (cs *clusterState) hasStore(storeID roachpb.StoreID) bool {
2417+
_, ok := cs.stores[storeID]
2418+
return ok
2419+
}
2420+
2421+
// notHasStore is the negation of hasStore, exposed so callers can pass
2422+
// it as a method value to slices.IndexFunc / slices.DeleteFunc without
2423+
// an inline closure.
2424+
func (cs *clusterState) notHasStore(storeID roachpb.StoreID) bool {
2425+
return !cs.hasStore(storeID)
2426+
}
2427+
24122428
func (cs *clusterState) getNodeReportedLoad(nodeID roachpb.NodeID) *NodeLoad {
24132429
if nodeState, ok := cs.nodes[nodeID]; ok {
24142430
return &nodeState.NodeLoad

pkg/kv/kvserver/allocator/mmaprototype/load.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,15 @@ func (mm *meansMemo) getStoreLoadSummary(
484484
// stores may contain duplicate storeIDs, in which case computeMeansForStoreSet
485485
// should deduplicate processing of the stores. stores should be immutable.
486486
//
487+
// Precondition: every storeID in stores must be known to loadProvider
488+
// (loadProvider.getStoreReportedLoad must return a non-nil *storeLoad).
489+
// Callers receiving storeIDs from outside MMA (notably the legacy
490+
// allocator's StorePool view, which can run ahead of MMA's gossip
491+
// updates) must filter unknown stores out first. BuildMMARebalanceAdvisor
492+
// is the canonical filtering point; clusterState.hasStore is the helper
493+
// to use. A violation is asserted in test builds and logged-and-skipped
494+
// in production rather than nil-dereferenced (see #170703).
495+
//
487496
// If stores is empty, computeMeansForStoreSet returns the zero meansLoad
488497
// and ok=false. Callers must check ok before consuming the returned means;
489498
// the zero value would otherwise misclassify stores as overloaded due to
@@ -510,6 +519,17 @@ func computeMeansForStoreSet(
510519
if _, ok := scratchStores[storeID]; ok {
511520
continue
512521
}
522+
if sload == nil {
523+
// Precondition violation: a caller passed an unknown storeID. In
524+
// test builds this panics, surfacing the bug; in production it
525+
// logs and skips, avoiding the nil-pointer crash from #170703.
526+
// context.Background is used because plumbing ctx into this
527+
// internal helper would touch every memo path; the assertion is a
528+
// last-resort safety net and is not expected to fire.
529+
assertTruef(context.Background(), false,
530+
"computeMeansForStoreSet: storeID %d not known to loadProvider", storeID)
531+
continue
532+
}
513533
n++
514534
scratchStores[storeID] = struct{}{}
515535
for j := range sload.reportedLoad {
@@ -530,6 +550,11 @@ func computeMeansForStoreSet(
530550
scratchNodes[nodeID] = loadProvider.getNodeReportedLoad(nodeID)
531551
}
532552
}
553+
if n == 0 {
554+
// Every storeID hit the nil-sload guard above. Bail out before the
555+
// divisions below would panic with division by zero.
556+
return meansLoad{}, false
557+
}
533558
for i := range means.storeLoad.load {
534559
if means.storeLoad.capacity[i] != UnknownCapacity {
535560
// NB: capacity can be 0 for CPURate when nodeCPURateUsage >>

pkg/kv/kvserver/allocator/mmaprototype/rebalance_advisor.go

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package mmaprototype
77

88
import (
99
"context"
10+
"slices"
1011

1112
"github.com/cockroachdb/cockroach/pkg/roachpb"
1213
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -71,29 +72,59 @@ func NoopMMARebalanceAdvisor() *MMARebalanceAdvisor {
7172
// duplicate storeIDs. It is up to computeMeansForStoreSet to handle
7273
// de-duplication of storeIDs from the cands list.
7374
//
75+
// Callers may pass storeIDs (in `existing` or `cands`) that MMA's
76+
// clusterState has not yet seen. This happens during startup, when the
77+
// legacy allocator's StorePool reflects gossiped store descriptors that
78+
// MMA has not yet been notified of via SetStore. Such stores are filtered
79+
// out of `cands` before computing means; if `existing` itself is unknown,
80+
// a NoopMMARebalanceAdvisor is returned because MMA has no load history
81+
// against which to judge candidates. The same asymmetry is acknowledged
82+
// in updateStoreStatuses, which logs and skips unknown stores rather
83+
// than panicking. See #170703.
84+
//
7485
// The returned advisor should be passed to IsInConflictWithMMA as a helper to
7586
// determine if a candidate is vetoed by the multi-metric allocator due to
7687
// running counter to its goals.
7788
func (a *allocatorState) BuildMMARebalanceAdvisor(
78-
existing roachpb.StoreID, cands []roachpb.StoreID,
89+
ctx context.Context, existing roachpb.StoreID, cands []roachpb.StoreID,
7990
) *MMARebalanceAdvisor {
8091
// a.cs is mutated by gossip-driven callbacks (e.g. ProcessStoreLoadMsg) and
8192
// must only be accessed under a.mu. The other public methods on
8293
// allocatorState follow this discipline.
8394
a.mu.Lock()
8495
defer a.mu.Unlock()
96+
if !a.cs.hasStore(existing) {
97+
// MMA has no load history for the source store, so it cannot judge
98+
// whether any candidate is more overloaded than existing. Fall back to
99+
// a no-op advisor rather than risk a misclassification (or, before the
100+
// hasStore filter on cands below was added, a nil-pointer panic).
101+
log.KvDistribution.VEventf(ctx, 2,
102+
"mma skipping advisor: existing store s%d not yet known to mma", existing)
103+
return NoopMMARebalanceAdvisor()
104+
}
105+
// Drop any cand the integration layer learned about (via gossip / StorePool)
106+
// before MMA did. computeMeansForStoreSet would otherwise nil-deref on the
107+
// missing storeLoad. In the steady state every cand is known, so the
108+
// IndexFunc walk completes without allocating; only when an unknown cand
109+
// is present do we copy and compact.
110+
if slices.IndexFunc(cands, a.cs.notHasStore) != -1 {
111+
cp := make([]roachpb.StoreID, len(cands))
112+
copy(cp, cands)
113+
cands = slices.DeleteFunc(cp, a.cs.notHasStore)
114+
}
85115
// TODO(wenyihu6): for simplicity, we create a new scratchNodes every call.
86116
// We should reuse the scratchNodes instead.
87117
scratchNodes := map[roachpb.NodeID]*NodeLoad{}
88118
scratchStores := map[roachpb.StoreID]struct{}{}
89119
cands = append(cands, existing)
90120
means, ok := computeMeansForStoreSet(a.cs, cands, scratchNodes, scratchStores)
91121
if !ok {
92-
// Unreachable: cands always contains at least `existing`. Assert in
93-
// test builds; in production, fall back to a no-op advisor rather than
94-
// return a zero-valued means that would misclassify stores. Gating on
95-
// !ok avoids variadic arg boxing on the success path.
96-
assertTruef(context.Background(), false, "computeMeansForStoreSet returned !ok for non-empty cands=%v", cands)
122+
// Unreachable: cands always contains at least `existing`, which we
123+
// just verified is known to MMA. Assert in test builds; in production,
124+
// fall back to a no-op advisor rather than return a zero-valued means
125+
// that would misclassify stores. Gating on !ok avoids variadic arg
126+
// boxing on the success path.
127+
assertTruef(ctx, false, "computeMeansForStoreSet returned !ok for non-empty cands=%v", cands)
97128
return NoopMMARebalanceAdvisor()
98129
}
99130
return &MMARebalanceAdvisor{
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
// Copyright 2026 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package mmaprototype
7+
8+
import (
9+
"context"
10+
"math/rand"
11+
"testing"
12+
13+
"github.com/cockroachdb/cockroach/pkg/roachpb"
14+
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
15+
"github.com/cockroachdb/cockroach/pkg/util/log"
16+
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
17+
"github.com/stretchr/testify/require"
18+
)
19+
20+
// TestBuildMMARebalanceAdvisorUnknownStores covers the crash described in
21+
// #170703: before the fix in this commit, BuildMMARebalanceAdvisor would
22+
// nil-pointer-deref if any storeID it was given (either `existing` or
23+
// anything in `cands`) was not yet known to MMA's clusterState. This
24+
// happens during startup, when the caller (the legacy allocator) builds
25+
// candidate lists from StorePool's gossip-driven view before MMA has been
26+
// notified of the corresponding stores via SetStore.
27+
//
28+
// After the fix, unknown cands are silently dropped, and an unknown
29+
// existing falls back to a no-op advisor (which always reports no
30+
// conflict).
31+
func TestBuildMMARebalanceAdvisorUnknownStores(t *testing.T) {
32+
defer leaktest.AfterTest(t)()
33+
defer log.Scope(t).Close(t)
34+
35+
ctx := context.Background()
36+
37+
const (
38+
knownStore roachpb.StoreID = 1
39+
unknownStore roachpb.StoreID = 99
40+
)
41+
42+
makeAllocator := func() *allocatorState {
43+
a := NewAllocatorState(timeutil.DefaultTimeSource{}, rand.New(rand.NewSource(0)))
44+
a.SetStore(StoreAttributesAndLocality{
45+
StoreID: knownStore,
46+
NodeID: roachpb.NodeID(knownStore),
47+
})
48+
return a
49+
}
50+
51+
t.Run("unknown cand", func(t *testing.T) {
52+
a := makeAllocator()
53+
// Unknown cand is silently dropped; the advisor is built over just
54+
// the known existing store.
55+
advisor := a.BuildMMARebalanceAdvisor(ctx, knownStore, []roachpb.StoreID{unknownStore})
56+
require.NotNil(t, advisor)
57+
require.False(t, advisor.disabled)
58+
require.Equal(t, knownStore, advisor.existingStoreID)
59+
})
60+
61+
t.Run("unknown existing", func(t *testing.T) {
62+
a := makeAllocator()
63+
// MMA cannot judge candidates against a source it does not know;
64+
// the no-op advisor short-circuits IsInConflictWithMMA to false.
65+
advisor := a.BuildMMARebalanceAdvisor(ctx, unknownStore, nil)
66+
require.NotNil(t, advisor)
67+
require.True(t, advisor.disabled)
68+
require.False(t, a.IsInConflictWithMMA(ctx, knownStore, advisor, false /* cpuOnly */))
69+
})
70+
}

pkg/kv/kvserver/mmaintegration/allocator_sync.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ type mmaState interface {
5858
// MMARebalanceAdvisor for the given existing store and candidates. The
5959
// advisor should be later passed to IsInConflictWithMMA to determine if a
6060
// given candidate is in conflict with the existing store.
61-
BuildMMARebalanceAdvisor(existing roachpb.StoreID, cands []roachpb.StoreID) *mmaprototype.MMARebalanceAdvisor
61+
BuildMMARebalanceAdvisor(
62+
ctx context.Context, existing roachpb.StoreID, cands []roachpb.StoreID,
63+
) *mmaprototype.MMARebalanceAdvisor
6264
// IsInConflictWithMMA is called by the allocator sync to determine if the
6365
// given candidate is in conflict with the existing store.
6466
IsInConflictWithMMA(ctx context.Context, cand roachpb.StoreID, advisor *mmaprototype.MMARebalanceAdvisor, cpuOnly bool) bool

0 commit comments

Comments
 (0)