fix race condition between forget and failover (valkey-io#105)

ysqyang · yang.qiu · jdheyburn · web-flow · commit 2042a4727447 · 2026-04-10T14:26:46.000+01:00
## Summary Fix a race condition between `forgetStaleNodes` and Valkey's auto-failover that can permanently prevent a replica from being promoted after its primary dies. See valkey-io#103 for more context. ### The bug When a primary's deployment is deleted, the controller's `forgetStaleNodes` issues `CLUSTER FORGET` for the dead node from every surviving node. If this runs before Valkey's auto-failover election completes, it removes the dead primary from the other masters' node tables. Those masters can then no longer validate the replica's `FAILOVER_AUTH_REQUEST` (they don't recognize the dead node), so they never vote. The replica is permanently stuck as a slave, `findShardPrimary` never finds a primary for the shard, and the cluster enters an infinite loop of: ``` ERROR command failed: CLUSTER FORGET {"error": "Can't forget my master!"} DEBUG skipping replica; primary not ready yet DEBUG missing replicas, requeue.. ``` This is a timing-dependent race. The window is roughly 0.5–1 second between the `fail` flag being set and the failover election completing. It was reported by a user who hit it when deleting a primary deployment. ### The fix Before issuing `CLUSTER FORGET`, check whether any live node in the cluster still considers the failing node as its master (`HasReplicaOf`). If so, skip the FORGET — the replica needs the dead node in the other masters' node tables to complete the failover election. Once the failover completes and the replica is promoted, it no longer reports itself as a slave of the dead node, so the next reconcile will proceed with FORGET normally. ### Changes - **`internal/valkey/clusterstate.go`**: Add `HasReplicaOf(nodeId)` method on `ClusterState` that checks if any node's `CLUSTER NODES` self-report shows it as a replica of the given node ID. Add `PrimaryIdFromSelf()` helper on `NodeState` that extracts `fields[3]` (primary ID) from the `myself` line. 
- **`internal/controller/valkeycluster_controller.go`**: Guard `forgetStaleNodes` with the `HasReplicaOf` check. When skipped, log `"skipping forget; failover pending for node"` at V(1). ### Why this is safe - **Dead replica (not a master):** No node claims a replica as its master → `HasReplicaOf` returns false → FORGET proceeds immediately. No behavior change. - **Both master and replica are dead:** The dead replica isn't in `state.Shards` (connection failed) → `HasReplicaOf` returns false → FORGET proceeds. Correct — no failover is possible anyway. - **Scale-down stale nodes:** Drained masters have no replicas left → `HasReplicaOf` returns false → FORGET proceeds. No behavior change. - **Failover permanently blocked for other reasons** (e.g., replica too far behind): `HasReplicaOf` returns true, FORGET is deferred. This is no worse than today where FORGET runs but the failover is also permanently blocked. With this fix, at least the failover has a chance if the blocking condition resolves. --------- Signed-off-by: yang.qiu <yang.qiu@reddit.com> Signed-off-by: Joseph Heyburn <jdheyburn@gmail.com> Co-authored-by: yang.qiu <yang.qiu@reddit.com> Co-authored-by: Joseph Heyburn <jdheyburn@gmail.com>
diff --git a/internal/controller/valkeycluster_controller.go b/internal/controller/valkeycluster_controller.go
@@ -805,14 +805,25 @@ func (r *ValkeyClusterReconciler) forgetStaleNodes(ctx context.Context, cluster
 				idx := slices.IndexFunc(nodes.Items, func(n valkeyiov1alpha1.ValkeyNode) bool {
 					return n.Status.PodIP == failing.Address
 				})
-				if idx == -1 {
-					log.V(1).Info("forget a failing node", "address", failing.Address, "Id", failing.Id)
-					if err := node.Client.Do(ctx, node.Client.B().ClusterForget().NodeId(failing.Id).Build()).Error(); err != nil {
-						log.Error(err, "command failed: CLUSTER FORGET")
-						r.Recorder.Eventf(cluster, nil, corev1.EventTypeWarning, "NodeForgetFailed", "ForgetNode", "Failed to forget node: %v", err)
-					} else {
-						r.Recorder.Eventf(cluster, nil, corev1.EventTypeNormal, "StaleNodeForgotten", "ForgetNode", "Forgot stale node %v", failing.Address)
-					}
+				if idx != -1 {
+					continue
+				}
+				// A live replica still considers this failing node its
+				// primary. Forgetting it from the other primaries now would
+				// remove it from their node tables and prevent them from
+				// voting in the auto-failover election, permanently
+				// blocking the replica's promotion.
+				if state.HasReplicaOf(failing.Id) {
+					log.V(1).Info("skipping forget; failover pending for node",
+						"address", failing.Address, "Id", failing.Id)
+					continue
+				}
+				log.V(1).Info("forget a failing node", "address", failing.Address, "Id", failing.Id)
+				if err := node.Client.Do(ctx, node.Client.B().ClusterForget().NodeId(failing.Id).Build()).Error(); err != nil {
+					log.Error(err, "command failed: CLUSTER FORGET")
+					r.Recorder.Eventf(cluster, nil, corev1.EventTypeWarning, "NodeForgetFailed", "ForgetNode", "Failed to forget node: %v", err)
+				} else {
+					r.Recorder.Eventf(cluster, nil, corev1.EventTypeNormal, "StaleNodeForgotten", "ForgetNode", "Forgot stale node %v", failing.Address)
 				}
 			}
 		}
diff --git a/internal/valkey/clusterstate.go b/internal/valkey/clusterstate.go
@@ -193,6 +193,38 @@ func (n *NodeState) IsReplicationInSync() bool {
 	return n.Info["master_link_status"] == "up"
 }
 
+// HasReplicaOf reports whether any node in s.Shards names nodeId as its
+// primary in its own CLUSTER NODES "myself" line (see PrimaryIdFromSelf).
+// It guards CLUSTER FORGET against racing with auto-failover: forgetting a
+// failed primary on the other primaries removes it from their node tables,
+// which prevents them from voting in the replica's failover election.
+func (s *ClusterState) HasReplicaOf(nodeId string) bool {
+	for _, shard := range s.Shards {
+		for _, node := range shard.Nodes {
+			if node.PrimaryIdFromSelf() == nodeId { // node's self-reported primary matches
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// PrimaryIdFromSelf returns fields[3] (the primary ID column) of the "myself"
+// line in this node's CLUSTER NODES output: "-" for primaries, the primary's
+// node ID for replicas, or "" when no well-formed "myself" line is found.
+func (n *NodeState) PrimaryIdFromSelf() string {
+	for line := range strings.SplitSeq(n.ClusterNodes, "\n") {
+		fields := strings.Fields(line)
+		if len(fields) < 8 { // too few whitespace-separated fields; skip the line
+			continue
+		}
+		if strings.Contains(fields[2], "myself") { // fields[2] marks this node's own entry
+			return fields[3]
+		}
+	}
+	return "" // no line tagged "myself" with at least 8 fields
+}
+
 // GetFailingNodes returns all known nodes that are failing.
 func (n *NodeState) GetFailingNodes() []NodeState {
 	nodes := []NodeState{}