Skip to content

Commit dbe4725

Browse files
authored
test(grpc): absorb leader churn in consistency loops without weakening the check (#632)
## Summary `Test_consistency_satisfy_write_after_read_sequence` and `Test_grpc_transaction` both run a 9999-iteration Put/Get loop against a 3-node Raft cluster. On a busy CI runner Raft re-election can fire mid-loop and the in-flight RPC surfaces as `rpc error: code = Unknown desc = leader not found` (also `etcd raft engine is not leader`). Today both tests abort on the first such error even though it is purely an availability hiccup — nothing committed, so consistency cannot be violated. Wrap every RPC in the existing `retryNotLeader` helper so transient leader churn is absorbed within `leaderChurnRetryTimeout`. The consistency assertions (`assert.Equal` for value reads, `assert.Nil` for post-delete reads) are unchanged: once the Put / Delete eventually commits, the subsequent Get must agree, and a stale read still fails the test loudly. **This is the explicit constraint — 一貫性確認そのものは損なわず — and it holds because `retryNotLeader` only inspects RPC error codes, never the response payload.** The fix is symmetric with how `rpushEventually` / `lpushEventually` already wrap Lua list tests for the same class of CI flake. ## Test plan - [x] Build / vet / `golangci-lint` clean. - [x] Locally: `go test ./adapter/ -run 'Test_grpc_transaction|Test_consistency_satisfy_write_after_read_sequence' -count=1 -timeout 600s` passes (this is the same race-mode shape that was failing on https://github.com/bootjp/elastickv/actions/runs/24930560663/job/73007486310). - [x] No change to the consistency invariants — only the RPC failure path is retried. /gemini review @codex review <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Tests** * Enhanced robustness of gRPC transaction tests by implementing retry logic for transient failures during leader elections, reducing flaky test failures and improving overall test reliability in distributed cluster scenarios. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
2 parents e055e98 + 1aed386 commit dbe4725

1 file changed

Lines changed: 60 additions & 17 deletions

File tree

adapter/grpc_test.go

Lines changed: 60 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -188,32 +188,56 @@ func Test_consistency_satisfy_write_after_read_sequence(t *testing.T) {
188188
c := rawKVClient(t, adders)
189189
defer shutdown(nodes)
190190

191+
// Use t.Context() so a test-level cancel (timeout, parent test
192+
// stopping) propagates into every RPC and the retry loop alike,
193+
// rather than leaking work via context.Background() once the test
194+
// goroutine returns.
195+
ctx := t.Context()
191196
key := []byte("test-key-sequence")
192197

198+
// Each RPC is wrapped in retryNotLeader so an in-flight Raft
199+
// re-election (which can fire mid-loop on a busy CI runner — emit
200+
// "leader not found" / "etcd raft engine is not leader" — and is
201+
// purely an availability hiccup, not a consistency violation) does
202+
// not abort the test. The post-RPC assert.Equal still pins the
203+
// consistency invariant: once Put eventually succeeds, the
204+
// subsequent Get must return the same value, otherwise we fail.
193205
for i := range 9999 {
194206
want := []byte("sequence" + strconv.Itoa(i))
195-
_, err := c.RawPut(
196-
context.Background(),
197-
&pb.RawPutRequest{Key: key, Value: want},
198-
)
199-
// Stop at the first RPC failure instead of continuing: a
200-
// genuine regression would otherwise cascade into 9998 more
201-
// iterations, each reporting the same broken invariant, and
202-
// drown the real cause in test-output noise.
207+
err := retryNotLeader(ctx, func() error {
208+
_, perr := c.RawPut(ctx, &pb.RawPutRequest{Key: key, Value: want})
209+
return perr
210+
})
211+
// Stop at the first non-leader-churn RPC failure instead of
212+
// continuing: a genuine regression would otherwise cascade
213+
// into 9998 more iterations, each reporting the same broken
214+
// invariant, and drown the real cause in test-output noise.
203215
if !assert.NoError(t, err, "Put RPC failed") {
204216
break
205217
}
206218

207-
_, err = c.RawPut(context.Background(), &pb.RawPutRequest{Key: key, Value: want})
219+
err = retryNotLeader(ctx, func() error {
220+
_, perr := c.RawPut(ctx, &pb.RawPutRequest{Key: key, Value: want})
221+
return perr
222+
})
208223
if !assert.NoError(t, err, "Put RPC failed") {
209224
break
210225
}
211226

212-
resp, err := c.RawGet(context.Background(), &pb.RawGetRequest{Key: key})
227+
var resp *pb.RawGetResponse
228+
err = retryNotLeader(ctx, func() error {
229+
var gerr error
230+
resp, gerr = c.RawGet(ctx, &pb.RawGetRequest{Key: key})
231+
return gerr
232+
})
213233
if !assert.NoError(t, err, "Get RPC failed") {
214234
break
215235
}
216236

237+
// Consistency invariant — the entire reason this test exists.
238+
// Wrapped RPCs only mask transport-layer flakes; if the
239+
// cluster ever returns a stale Get result here it is still
240+
// flagged loudly.
217241
assert.Equal(t, want, resp.Value, "consistency check failed")
218242
}
219243
}
@@ -224,32 +248,51 @@ func Test_grpc_transaction(t *testing.T) {
224248
c := transactionalKVClient(t, adders)
225249
defer shutdown(nodes)
226250

251+
// See Test_consistency_satisfy_write_after_read_sequence for why
252+
// we use t.Context() and retryNotLeader together.
253+
ctx := t.Context()
227254
key := []byte("test-key-sequence")
228255

256+
// Same retryNotLeader wrap as Test_consistency_satisfy_write_after_read
257+
// _sequence: tolerate transient leader churn (purely availability,
258+
// not consistency) while keeping the Put → Get → Delete → Get
259+
// invariants strict.
229260
for i := range 9999 {
230261
want := []byte("sequence" + strconv.Itoa(i))
231-
_, err := c.Put(
232-
context.Background(),
233-
&pb.PutRequest{Key: key, Value: want},
234-
)
262+
err := retryNotLeader(ctx, func() error {
263+
_, perr := c.Put(ctx, &pb.PutRequest{Key: key, Value: want})
264+
return perr
265+
})
235266
// See Test_consistency_satisfy_write_after_read_sequence:
236267
// break on first RPC failure so a single broken invariant
237268
// does not amplify into thousands of assertion lines.
238269
if !assert.NoError(t, err, "Put RPC failed") {
239270
break
240271
}
241-
resp, err := c.Get(context.Background(), &pb.GetRequest{Key: key})
272+
var resp *pb.GetResponse
273+
err = retryNotLeader(ctx, func() error {
274+
var gerr error
275+
resp, gerr = c.Get(ctx, &pb.GetRequest{Key: key})
276+
return gerr
277+
})
242278
if !assert.NoError(t, err, "Get RPC failed") {
243279
break
244280
}
245281
assert.Equal(t, want, resp.Value, "consistency check failed")
246282

247-
_, err = c.Delete(context.Background(), &pb.DeleteRequest{Key: key})
283+
err = retryNotLeader(ctx, func() error {
284+
_, derr := c.Delete(ctx, &pb.DeleteRequest{Key: key})
285+
return derr
286+
})
248287
if !assert.NoError(t, err, "Delete RPC failed") {
249288
break
250289
}
251290

252-
resp, err = c.Get(context.Background(), &pb.GetRequest{Key: key})
291+
err = retryNotLeader(ctx, func() error {
292+
var gerr error
293+
resp, gerr = c.Get(ctx, &pb.GetRequest{Key: key})
294+
return gerr
295+
})
253296
if !assert.NoError(t, err, "Get RPC failed") {
254297
break
255298
}

0 commit comments

Comments
 (0)