Skip to content

Commit 0be9fc7

Browse files
committed
test(kv): address PR #745 round-1 review
Round-1 review on commit ad924ad: - gemini medium: assert minimum elapsed time, not just upper bound. Without a lower bound a future regression returning DeadlineExceeded before actually waiting would silently pass. Added a lower-bound check with 200ms slack so a slow CI scheduler that trips ctx.Done() a hair before the wall clock catches up does not flake. - claude bot (style): trim incident narrative from the verifyLeaderTimeout doc comment. Specific IPs, goroutine counts, and timestamps belong in the commit message and PR body, not the source — they go stale and add no design value at the call site. Kept the rationale (why 5s, the O(N) pendingReads feedback loop, match leaderForwardTimeout) and added a "see PR #745" reference for readers who want the full incident detail. - claude bot: switch the regression test from stdlib `stderrors "errors"` to `github.com/cockroachdb/errors` for consistency with the project convention (CLAUDE.md mandates cockroachdb/errors at boundaries; the test is a boundary). Also drops the import alias which made gci complain about ordering. - claude bot: add `if testing.Short() { t.Skip(...) }` guard so `go test -short ./kv/...` does not pay the verifyLeaderTimeout (5s) wait. Default `make test` still runs it. - reviewdog/golangci (gci): import order fixed by reordering third-party (cockroachdb) above local (bootjp) per the project's default gci sections (standard, default, prefix(...)). No behavior change. Test still passes: go test -race -count=1 -run TestVerifyLeaderEngine ./kv -- 6.06s go test -race -count=1 -short ./kv -- 5.55s (test skipped under -short) No semantic change to the production fix; this commit is test + comment polish only, so no caller audit needed.
1 parent ad924ad commit 0be9fc7

2 files changed

Lines changed: 39 additions & 26 deletions

File tree

kv/raft_engine.go

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,28 +10,20 @@ import (
1010
)
1111

1212
// verifyLeaderTimeout caps how long the no-context verifyLeaderEngine path
13-
// is willing to wait for a ReadIndex round-trip.
14-
//
15-
// A previous version called engine.VerifyLeader with context.Background(),
16-
// so callers without an upstream deadline (LeaderProxy.Commit/Abort,
13+
// is willing to wait for a ReadIndex round-trip. Without this bound,
14+
// callers that hold context.Background() (LeaderProxy.Commit/Abort,
1715
// Coordinate.VerifyLeader, ShardedCoordinator.VerifyLeader[ForKey], and
1816
// the S3/SQS/admin /healthz/leader handlers) blocked indefinitely whenever
19-
// ReadIndex completion stalled — a single transient stall accumulated
20-
// callers permanently.
21-
//
22-
// Production hit this on 2026-05-08: a follower (192.168.0.214) lost its
23-
// network route mid-flight and the leader's ReadIndex completion stalled
24-
// intermittently. verifyLeaderEngine callers piled up at ~9/sec without
25-
// bound; after ~37 minutes the leader was holding 20,560 goroutines
26-
// (20,478 in submitRead select, oldest 39 minutes), CPU pinned at 1870%
27-
// (the Engine.run Ready loop walks pendingReads O(N) per tick, so the
28-
// queue feeds back on itself), and host MemAvailable trended toward 0
29-
// until OOM. The same pattern repeated on each new leader after failover.
17+
// ReadIndex completion stalled, and a single transient stall accumulated
18+
// callers permanently — Engine.run's Ready loop walks pendingReads O(N)
19+
// per tick, so the queue feeds back on itself once it grows.
3020
//
3121
// 5s matches leaderForwardTimeout: a verify that takes longer than a
32-
// single forward RPC is, by definition, useless as a freshness check,
33-
// and the proxy's verify-then-forward path stays within its 5s retry
34-
// budget.
22+
// single forward RPC is useless as a freshness check, and the proxy's
23+
// verify-then-forward path stays within its 5s retry budget.
24+
//
25+
// See PR #745 / incident 2026-05-08 for the goroutine-pile production
26+
// failure this prevents.
3527
const verifyLeaderTimeout = 5 * time.Second
3628

3729
func engineForGroup(g *ShardGroup) raftengine.Engine {

kv/raft_engine_test.go

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@ package kv
22

33
import (
44
"context"
5-
stderrors "errors"
65
"testing"
76
"time"
87

8+
"github.com/cockroachdb/errors"
9+
910
"github.com/bootjp/elastickv/internal/raftengine"
1011
)
1112

@@ -16,8 +17,8 @@ import (
1617
// callers under test.
1718
type blockingLeaderView struct{}
1819

19-
func (blockingLeaderView) State() raftengine.State { return raftengine.StateLeader }
20-
func (blockingLeaderView) Leader() raftengine.LeaderInfo { return raftengine.LeaderInfo{ID: "self"} }
20+
func (blockingLeaderView) State() raftengine.State { return raftengine.StateLeader }
21+
func (blockingLeaderView) Leader() raftengine.LeaderInfo { return raftengine.LeaderInfo{ID: "self"} }
2122
func (blockingLeaderView) VerifyLeader(ctx context.Context) error {
2223
<-ctx.Done()
2324
return ctx.Err()
@@ -30,10 +31,17 @@ func (blockingLeaderView) LinearizableRead(ctx context.Context) (uint64, error)
3031
// TestVerifyLeaderEngine_BoundsBlockingReadIndex pins the regression: if a
3132
// stalled ReadIndex used to return only when the underlying ctx fired, but
3233
// callers passed context.Background(), the goroutine pinned forever. After
33-
// 2026-05-08-style stalls in production this must complete within roughly
34+
// the 2026-05-08 incident this must complete within roughly
3435
// verifyLeaderTimeout, surfacing context.DeadlineExceeded.
36+
//
37+
// Skipped under -short because the whole point is to wait for the deadline
38+
// to fire; the no-skip path adds verifyLeaderTimeout (5s) to every default
39+
// `make test` run.
3540
func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) {
3641
t.Parallel()
42+
if testing.Short() {
43+
t.Skip("skipping: blocks for verifyLeaderTimeout (5s)")
44+
}
3745

3846
start := time.Now()
3947
err := verifyLeaderEngine(blockingLeaderView{})
@@ -42,13 +50,26 @@ func TestVerifyLeaderEngine_BoundsBlockingReadIndex(t *testing.T) {
4250
if err == nil {
4351
t.Fatalf("verifyLeaderEngine(blocking) returned nil; expected DeadlineExceeded")
4452
}
45-
if !stderrors.Is(err, context.DeadlineExceeded) {
53+
if !errors.Is(err, context.DeadlineExceeded) {
4654
t.Fatalf("verifyLeaderEngine(blocking) err = %v; want DeadlineExceeded", err)
4755
}
48-
// Allow generous slack so a slow CI host does not flake; the point is
49-
// not to assert a tight bound but to prove the call returns at all.
56+
// Lower bound: confirm the engine actually held the call until the
57+
// deadline fired. Without this, a future regression that returned
58+
// DeadlineExceeded immediately (e.g. a misplaced ctx check before
59+
// the engine call) would silently pass. Pulled in from gemini's
60+
// PR #745 round-1 review — the upper bound alone proved bounded
61+
// completion but not "the timeout actually fired."
62+
//
63+
// Tolerate a 200ms early-return slack so a slow CI scheduler that
64+
// trips ctx.Done() a hair before the wall clock catches up does
65+
// not flake.
66+
const slack = 200 * time.Millisecond
67+
if elapsed+slack < verifyLeaderTimeout {
68+
t.Fatalf("verifyLeaderEngine(blocking) returned too early after %s; want >= %s (-%s slack)", elapsed, verifyLeaderTimeout, slack)
69+
}
70+
// Upper bound: prove the call returned at all. Generous so a slow
71+
// CI host does not flake.
5072
if elapsed > 2*verifyLeaderTimeout {
5173
t.Fatalf("verifyLeaderEngine(blocking) returned after %s; want <= 2x verifyLeaderTimeout (%s)", elapsed, verifyLeaderTimeout)
5274
}
5375
}
54-

0 commit comments

Comments
 (0)