From e2a5a896ab4a845b778d8ed142c7bf702487d54c Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 2 Jun 2026 00:40:59 +0900 Subject: [PATCH 1/2] fix(redis): finish option-2 dedup HLC-4 / ctx-parent parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Picks up the two follow-up items noted in claude[bot]'s PR #887 verdict but deferred from that PR: (a) prepareDispatch() takes its bounded ctx from t.server. handlerContext() — the server-lifetime ctx, NOT the caller's dispatchCtx. The reuseCtx fix on PR #887 round 1 corrected runTransactionWithDedup to derive from dispatchCtx; this commit applies the symmetric fix to prepareDispatch so a disconnected client / retryRedisWrite timeout interrupts the prepare+dispatch promptly. Nil-guard falls back to handlerContext for test fixtures that construct a txnContext without setting ctx. (b) dispatchListPushReuse and the listPushCoreWithDedup first- attempt site both shipped before the HLC-4 physical-ceiling migration and were missed in that pass. dispatchExecReuse already uses NextFenced (PR #887 round 1) and so do all first-attempt commit_ts allocations through prepareDispatch and the post-raise SetQueueAttributes etc. These two list-push dedup sites were the last persistence-grade Clock().Next() callers in adapter/redis.go — verified by grep '\.Clock()\.Next\b' adapter/redis.go now returning only the NextFenced variants. Without HLC-4 parity here, a stale-leader window could let the list-push dedup path mint a commit_ts that collides with the successor's freshly-fenced range — defeating the same anomaly class option-2 was introduced to prevent. Caller audit per /loop semantic-change rule ============================================ - prepareDispatch: sole caller is commit() (in this file) and firstExecAttempt (in this file). Both honor defer prepared.cancel() and check len(prepared.elems) == 0; ctx- parent change is additive — same defer-cancel discipline, same error mapping. No external callers. - dispatchListPushReuse: sole caller is listPushCoreWithDedup (in this file). NextFenced error wired through the existing (newLen, drop, err) tuple position; the new error is non- retryable so retryRedisWrite surfaces it directly to the caller — same shape as PR #887 round-1's dispatchExecReuse NextFenced wiring. - listPushCoreWithDedup first-attempt site: surfaces NextFenced error through the existing retryRedisWrite closure return, matching the pattern in prepareDispatch. Validation ========== go test ./adapter/ -run 'ListPushDedup|ExecDedup|StandaloneSetDedup' -race -count=2 -> ok 1.2s gofmt + go vet + golangci-lint run -> 0 issues This PR closes the explicitly-noted follow-up from PR #887's "Note on pre-existing Clock().Next() callers" + the prepareDispatch ctx-parent deviation the round-1 review marked for symmetric treatment. No new functionality. --- adapter/redis.go | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/adapter/redis.go b/adapter/redis.go index 170f0fa7..fa481e4f 100644 --- a/adapter/redis.go +++ b/adapter/redis.go @@ -2778,12 +2778,21 @@ func (t *txnContext) prepareDispatch() (preparedTxnDispatch, error) { // non-string keys get a !redis|ttl| element written in the same transaction. ttlElems := t.buildTTLElems() - // Derive a single redisDispatchTimeout-bounded context covering both the - // stream-deletion scans (paginated ScanAt/ExistsAt over StreamEntryScanPrefix) - // and the final Dispatch. Without this bound, buildStreamDeletionElems would - // run on the server-lifetime handlerContext, leaving its scans uncancellable - // from the request side on a slow disk or hot-key pathological commit. - ctx, cancel := context.WithTimeout(t.server.handlerContext(), redisDispatchTimeout) + // Derive a single redisDispatchTimeout-bounded context covering both + // the stream-deletion scans (paginated ScanAt/ExistsAt over + // StreamEntryScanPrefix) and the final Dispatch. The parent is the + // txnContext's own ctx (the caller's dispatchCtx), not the server- + // lifetime handlerContext, so an outer cancellation (client + // disconnect, retryRedisWrite timeout) interrupts the prepare+dispatch + // promptly instead of waiting the full redisDispatchTimeout — same + // rationale as runTransactionWithDedup's reuseCtx fix on PR #887. The + // nil-guard falls back to handlerContext for callers that construct a + // txnContext without setting ctx (test fixtures). + parentCtx := t.ctx + if parentCtx == nil { + parentCtx = t.server.handlerContext() + } + ctx, cancel := context.WithTimeout(parentCtx, redisDispatchTimeout) streamElems, err := t.buildStreamDeletionElems(ctx) if err != nil { @@ -3607,7 +3616,16 @@ type reusableListPush struct { // fresh meta. Extracted from listPushCoreWithDedup to keep that closure // under the cyclop / gocognit / nestif limits. func (r *RedisServer) dispatchListPushReuse(ctx context.Context, key []byte, pending *reusableListPush) (newLen int64, drop bool, err error) { - commitTS := r.coordinator.Clock().Next() + // HLC-4 parity: persistence-grade commit_ts allocation must honor + // the physical-ceiling fence so a stale-leader window cannot mint a + // timestamp that collides with the successor's. dispatchExecReuse + // already uses NextFenced (PR #887 round 1); the two listPush dedup + // sites shipped before the HLC-4 migration and were missed in that + // pass — see PR #887 verdict for the noted gap. + commitTS, allocErr := r.coordinator.Clock().NextFenced() + if allocErr != nil { + return 0, false, errors.Wrap(allocErr, "redis list-push reuse: allocate commitTS") + } _, dispErr := r.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{ IsTxn: true, StartTS: pending.startTS, @@ -3778,7 +3796,12 @@ func (r *RedisServer) listPushCoreWithDedup(ctx context.Context, key []byte, val return err } - commitTS := r.coordinator.Clock().Next() + // HLC-4 parity with prepareDispatch / dispatchExecReuse — + // see dispatchListPushReuse above for the rationale. + commitTS, allocErr := r.coordinator.Clock().NextFenced() + if allocErr != nil { + return errors.Wrap(allocErr, "redis list-push first-attempt: allocate commitTS") + } ops, updatedMeta, err := buildFn(meta, key, values, commitTS, 0) if err != nil { return err From 216ca94bdc3cf92ade3332f6bd8e325533c3a5a2 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 2 Jun 2026 13:35:10 +0900 Subject: [PATCH 2/2] fix(docs): strip PR-process refs from inline comments claude[bot] PR #901 minor #2 (CLAUDE.md convention): inline comments should state the underlying reason, not reference issue/PR numbers that rot as the codebase evolves. Dropped "PR #887" / "PR #887 round 1" / "PR #887 verdict" / "PR #887 review" attributions from the three doc blocks. The WHY content is kept (ctx-parent symmetry, HLC-4 fence rationale, ErrCeilingExpired non-retryable shape). The PR description is the durable home for the flake-fix lineage and the related PR refs. No behavior change. Comment-only. --- adapter/redis.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/adapter/redis.go b/adapter/redis.go index fa481e4f..5f132733 100644 --- a/adapter/redis.go +++ b/adapter/redis.go @@ -2784,10 +2784,10 @@ func (t *txnContext) prepareDispatch() (preparedTxnDispatch, error) { // txnContext's own ctx (the caller's dispatchCtx), not the server- // lifetime handlerContext, so an outer cancellation (client // disconnect, retryRedisWrite timeout) interrupts the prepare+dispatch - // promptly instead of waiting the full redisDispatchTimeout — same - // rationale as runTransactionWithDedup's reuseCtx fix on PR #887. The - // nil-guard falls back to handlerContext for callers that construct a - // txnContext without setting ctx (test fixtures). + // promptly instead of waiting the full redisDispatchTimeout. Symmetric + // with the reuseCtx threading in runTransactionWithDedup. The nil-guard + // falls back to handlerContext for callers that construct a txnContext + // without setting ctx (test fixtures). parentCtx := t.ctx if parentCtx == nil { parentCtx = t.server.handlerContext() @@ -3282,7 +3282,7 @@ func (r *RedisServer) runTransactionWithDedup(queue []redcon.Command) ([]redisRe // fresh 10 s budget elapses. The earlier "fresh ctx from // handlerContext" pattern (noted in design doc §M3) was strictly // more conservative but wasted resources on a disconnected - // client — see PR #887 review. + // client. reuseCtx, reuseCancel := context.WithTimeout(dispatchCtx, redisDispatchTimeout) defer reuseCancel() res, drop, dispErr := r.dispatchExecReuse(reuseCtx, pending) @@ -3618,10 +3618,11 @@ type reusableListPush struct { func (r *RedisServer) dispatchListPushReuse(ctx context.Context, key []byte, pending *reusableListPush) (newLen int64, drop bool, err error) { // HLC-4 parity: persistence-grade commit_ts allocation must honor // the physical-ceiling fence so a stale-leader window cannot mint a - // timestamp that collides with the successor's. dispatchExecReuse - // already uses NextFenced (PR #887 round 1); the two listPush dedup - // sites shipped before the HLC-4 migration and were missed in that - // pass — see PR #887 verdict for the noted gap. + // timestamp that collides with the successor's. The error path + // returns ErrCeilingExpired which isRetryableRedisTxnErr classifies + // as non-retryable, so it exits retryRedisWrite directly to the + // client — same shape as the other persistence-grade Next call + // sites in this file. commitTS, allocErr := r.coordinator.Clock().NextFenced() if allocErr != nil { return 0, false, errors.Wrap(allocErr, "redis list-push reuse: allocate commitTS")