ops(rolling-update): bump raftadmin RPC timeout + retry transfer (#799)

bootjp · web-flow · commit ab8eac87fe69 · 2026-05-25T15:41:12.000+09:00
## Summary
Make `scripts/rolling-update.sh` survive the post-restart catch-up
window during multi-node rolling updates:

1. Raise the default `RAFTADMIN_RPC_TIMEOUT_SECONDS` from `5` to `15`
(single-RPC headroom).
2. Add `LEADERSHIP_TRANSFER_RETRY_ATTEMPTS` (default `3`) and
`LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS` (default `5`). The targeted
`leadership_transfer_to_server` RPC is now retried with backoff before
falling back to the generic transfer; the generic fallback is only used
after all targeted retries are exhausted.

## Why
The 2026-05-21 production re-deploy reproduction:

```
==> [n2@192.168.0.211] start
node is leader; transferring leadership to n1@192.168.0.210:50051
targeted leadership transfer RPC failed: rpc error: code = FailedPrecondition desc = etcd raft leadership transfer aborted
falling back to generic leadership transfer
generic leadership transfer RPC failed: rpc error: code = FailedPrecondition desc = etcd raft leadership transfer aborted
[bailed out, cluster half-deployed]
```

n1 had been restarted by the rolling update ~10 s earlier and its log had
not yet caught up. Raft refused both the targeted and the generic transfer
for the same reason. Manual recovery required
`RAFTADMIN_RPC_TIMEOUT_SECONDS=30` plus a hand-issued `raftadmin` call.

## Caller audit
- `leadership_transfer_to_server` retry: callers
(`maybe_transfer_leadership`) interpret any return failure as a refusal
to restart. The change only delays that decision under transient
failure, never widens its scope.
- `RAFTADMIN_RPC_TIMEOUT_SECONDS`: every raftadmin RPC respects this.
Raising the default does not change which RPCs succeed — only widens the
kill window for a slow RPC.

## Test plan
- [x] `bash -n scripts/rolling-update.sh` — clean
- [ ] Production re-run exercises retry path (would surface as `attempt
N/3` log lines if FailedPrecondition recurs)
diff --git a/scripts/rolling-update.env.example b/scripts/rolling-update.env.example
@@ -64,9 +64,22 @@ SSH_STRICT_HOST_KEY_CHECKING="accept-new"
 # If set, this binary must already be executable on the local control host.
 # RAFTADMIN_BIN="/absolute/path/to/linux/raftadmin"
 RAFTADMIN_REMOTE_BIN="/tmp/elastickv-raftadmin"
-RAFTADMIN_RPC_TIMEOUT_SECONDS="5"
+# Bumped from 5 to 15 (2026-05-22) so leadership-transfer RPCs survive
+# raft's transient pre-stable state right after a peer restart. The
+# 2026-05-21 reproduction (Actions run 26198185540) needed ~10 s of
+# headroom for the candidate's log to catch up before the transfer
+# could commit.
+RAFTADMIN_RPC_TIMEOUT_SECONDS="15"
 RAFTADMIN_ALLOW_INSECURE="true"
 
+# Retry the targeted leadership_transfer_to_server RPC up to N times
+# before falling back to generic transfer. Each retry waits
+# LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS to let the candidate's
+# log catch up. The first attempt counts toward the budget; set to 1
+# to disable retry.
+LEADERSHIP_TRANSFER_RETRY_ATTEMPTS="3"
+LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS="5"
+
 # OOM defenses applied on 2026-04-24 after kernel OOM-SIGKILL cascades.
 # GOMEMLIMIT makes Go GC before the container hits --memory; --memory keeps
 # any kill scoped to the container, not host processes. Set either to "" to
diff --git a/scripts/rolling-update.sh b/scripts/rolling-update.sh
@@ -79,6 +79,8 @@ Optional environment:
   RAFTADMIN_REMOTE_BIN
   RAFTADMIN_RPC_TIMEOUT_SECONDS
   RAFTADMIN_ALLOW_INSECURE
+  LEADERSHIP_TRANSFER_RETRY_ATTEMPTS
+  LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS
 
   EXTRA_ENV
     Whitespace-separated list of additional container environment variables to
@@ -198,8 +200,23 @@ ROLLING_DELAY_SECONDS="${ROLLING_DELAY_SECONDS:-2}"
 SSH_CONNECT_TIMEOUT_SECONDS="${SSH_CONNECT_TIMEOUT_SECONDS:-10}"
 SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"
 RAFTADMIN_REMOTE_BIN="${RAFTADMIN_REMOTE_BIN:-/tmp/elastickv-raftadmin}"
-RAFTADMIN_RPC_TIMEOUT_SECONDS="${RAFTADMIN_RPC_TIMEOUT_SECONDS:-5}"
+# Default raised from 5 s to 15 s after the 2026-05-21 reproduction
+# (https://github.com/bootjp/elastickv/actions/runs/26198185540) where
+# the previous node's restart left raft in a transient pre-stable state
+# for the next leadership-transfer RPC. 5 s gave the RPC no headroom
+# over a brief raft-internal abort and the script bailed out. 15 s is
+# still small enough that a truly stuck call surfaces quickly.
+RAFTADMIN_RPC_TIMEOUT_SECONDS="${RAFTADMIN_RPC_TIMEOUT_SECONDS:-15}"
 RAFTADMIN_ALLOW_INSECURE="${RAFTADMIN_ALLOW_INSECURE:-true}"
+# LEADERSHIP_TRANSFER_RETRY_ATTEMPTS bounds how many times we re-issue
+# a leadership_transfer_to_server RPC when raft returns
+# FailedPrecondition (e.g. "etcd raft leadership transfer aborted")
+# because the candidate's log has not yet caught up or an in-flight
+# conf change is blocking the transfer. The first attempt counts
+# toward the budget; ATTEMPTS=1 means "no retry". Default 3 covers
+# the ~10 s catch-up window that follows a peer's rolling restart.
+LEADERSHIP_TRANSFER_RETRY_ATTEMPTS="${LEADERSHIP_TRANSFER_RETRY_ATTEMPTS:-3}"
+LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS="${LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS:-5}"
 NODES="${NODES:-}"
 SSH_TARGETS="${SSH_TARGETS:-}"
 ROLLING_ORDER="${ROLLING_ORDER:-}"
@@ -574,6 +591,8 @@ update_one_node() {
       SQS_FIFO_PARTITION_MAP="$SQS_FIFO_PARTITION_MAP_Q" \
       HEALTH_TIMEOUT_SECONDS="$HEALTH_TIMEOUT_SECONDS" \
       LEADERSHIP_TRANSFER_TIMEOUT_SECONDS="$LEADERSHIP_TRANSFER_TIMEOUT_SECONDS" \
+      LEADERSHIP_TRANSFER_RETRY_ATTEMPTS="$LEADERSHIP_TRANSFER_RETRY_ATTEMPTS" \
+      LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS="$LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS" \
       LEADER_DISCOVERY_TIMEOUT_SECONDS="$LEADER_DISCOVERY_TIMEOUT_SECONDS" \
       RAFTADMIN_RPC_TIMEOUT_SECONDS="$RAFTADMIN_RPC_TIMEOUT_SECONDS" \
       RAFTADMIN_ALLOW_INSECURE="$RAFTADMIN_ALLOW_INSECURE" \
@@ -800,15 +819,34 @@ ensure_not_leader_before_restart() {
   candidate_addr="${candidate_host}:${RAFT_PORT}"
 
   echo "node is leader; transferring leadership to ${candidate_id}@${candidate_addr}"
-  rpc_output="$(raftadmin_text "${NODE_HOST}:${RAFT_PORT}" leadership_transfer_to_server "${candidate_id}" "${candidate_addr}")" || {
-    echo "targeted leadership transfer RPC failed: $rpc_output" >&2
+  # Retry the targeted transfer up to LEADERSHIP_TRANSFER_RETRY_ATTEMPTS
+  # times. A common failure shape under rolling restarts is etcd raft
+  # rejecting the transfer with "etcd raft leadership transfer aborted"
+  # when the candidate's log has not yet caught up to the leader. The
+  # candidate typically becomes ready within a few seconds, so a brief
+  # backoff between attempts is usually enough. Only when ALL targeted
+  # retries are exhausted do we fall back to the generic transfer.
+  local attempt=1 transfer_succeeded=false
+  while (( attempt <= LEADERSHIP_TRANSFER_RETRY_ATTEMPTS )); do
+    if rpc_output="$(raftadmin_text "${NODE_HOST}:${RAFT_PORT}" leadership_transfer_to_server "${candidate_id}" "${candidate_addr}")"; then
+      transfer_succeeded=true
+      break
+    fi
+    echo "targeted leadership transfer attempt ${attempt}/${LEADERSHIP_TRANSFER_RETRY_ATTEMPTS} failed: $rpc_output" >&2
+    if (( attempt < LEADERSHIP_TRANSFER_RETRY_ATTEMPTS )); then
+      echo "retrying in ${LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS}s..." >&2
+      sleep "${LEADERSHIP_TRANSFER_RETRY_BACKOFF_SECONDS}"
+    fi
+    attempt=$(( attempt + 1 ))
+  done
+  if [[ "$transfer_succeeded" != "true" ]]; then
     echo "falling back to generic leadership transfer"
     rpc_output="$(raftadmin_text "${NODE_HOST}:${RAFT_PORT}" leadership_transfer)" || {
       echo "generic leadership transfer RPC failed: $rpc_output" >&2
       return 1
     }
     candidate_addr=""
-  }
+  fi
 
   if ! wait_for_leader_change "${NODE_HOST}:${RAFT_PORT}" "$candidate_addr"; then
     echo "leadership did not move away from ${NODE_HOST}:${RAFT_PORT} within ${LEADERSHIP_TRANSFER_TIMEOUT_SECONDS}s" >&2