Fix redis split-brain after pod-0 restart during failover

lmiccini · claude · lmiccini · commit 8bb9d354b95d · 2026-04-20T14:31:14.000+02:00
When redis-redis-0 (the bootstrap pod) is deleted during a failover,
it restarts and tries to contact sentinel to find the current master.
Three problems caused it to fall through to the bootstrap path and
start a new independent master, creating a split-brain:

1. Single-try timeout: if sentinel was momentarily unreachable (e.g.
   the sentinel container on pod-0 itself was still starting), the
   3-second timeout expired and pod-0 immediately bootstrapped.

2. Headless service DNS: with PublishNotReadyAddresses: true, the
   headless service DNS can resolve to pod-0's own IP, so redis-cli
   connects to its own uninitialized sentinel instead of a peer.

3. Stale master identity: even when contacting a peer sentinel, it
   may still report the restarting pod as master (within the
   down-after-milliseconds window before failover completes).

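Fix by adding a wait_for_master() function in common.sh that:
- Contacts each peer pod individually by FQDN (skipping self)
- Retries up to 10 times, sleeping 3s between attempts (at least 30s
  in total, plus any per-peer query timeouts), before allowing bootstrap
- Rejects answers where the peer still thinks we are master
- Falls back to asking the peer's redis for its ROLE when the peer's
  sentinel query fails or returns nothing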

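Also increase InitialDelaySeconds to 40s on all redis and sentinel
probes so Kubernetes doesn't kill the pod before the retry loop
completes, and remove unused TCP probe variables that were never
referenced by the redis container.

As a last safety net, add has_alive_peers(): when no master was found
but a peer redis still answers PING, the bootstrap pod now logs an
error and exits non-zero instead of bootstrapping. The operator passes
the replica count to the scripts through a new REPLICAS environment
variable so they can enumerate the peer pods.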

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/internal/redis/statefulset.go b/internal/redis/statefulset.go
@@ -27,36 +27,15 @@ func StatefulSet(
 	}
 	ls := labels.GetLabels(r, "redis", matchls)
 
-	livenessProbe := &corev1.Probe{
-		// TODO might need tuning
-		TimeoutSeconds:      5,
-		PeriodSeconds:       3,
-		InitialDelaySeconds: 3,
-	}
-	readinessProbe := &corev1.Probe{
-		// TODO might need tuning
-		TimeoutSeconds:      5,
-		PeriodSeconds:       5,
-		InitialDelaySeconds: 5,
-	}
 	sentinelLivenessProbe := &corev1.Probe{
-		// TODO might need tuning
 		TimeoutSeconds:      5,
 		PeriodSeconds:       3,
-		InitialDelaySeconds: 3,
+		InitialDelaySeconds: 40,
 	}
 	sentinelReadinessProbe := &corev1.Probe{
-		// TODO might need tuning
 		TimeoutSeconds:      5,
 		PeriodSeconds:       5,
-		InitialDelaySeconds: 5,
-	}
-
-	livenessProbe.TCPSocket = &corev1.TCPSocketAction{
-		Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(6379)},
-	}
-	readinessProbe.TCPSocket = &corev1.TCPSocketAction{
-		Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(6379)},
+		InitialDelaySeconds: 40,
 	}
 	sentinelLivenessProbe.TCPSocket = &corev1.TCPSocketAction{
 		Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(26379)},
@@ -78,6 +57,9 @@ func StatefulSet(
 	}, {
 		Name:  "CONFIG_HASH",
 		Value: configHash,
+	}, {
+		Name:  "REPLICAS",
+		Value: strconv.Itoa(int(*r.Spec.Replicas)),
 	}}
 
 	sts := &appsv1.StatefulSet{
@@ -115,13 +97,15 @@ func StatefulSet(
 										Command: []string{"/var/lib/operator-scripts/redis_probe.sh", "liveness"},
 									},
 								},
+								InitialDelaySeconds: 40,
 							},
 							ReadinessProbe: &corev1.Probe{
 								ProbeHandler: corev1.ProbeHandler{
 									Exec: &corev1.ExecAction{
 										Command: []string{"/var/lib/operator-scripts/redis_probe.sh", "readiness"},
 									},
 								},
+								InitialDelaySeconds: 40,
 							},
 						}, {
 							Image:   r.Spec.ContainerImage,
diff --git a/templates/redis/bin/common.sh b/templates/redis/bin/common.sh
@@ -80,8 +80,77 @@ function remove_pod_label() {
     local pod="$1"
     local label="$2"
     local patch="[{\"op\": \"remove\", \"path\": \"/metadata/labels/${label}\"}]"
-    # 200: OK, 422: not found
-    configure_pod_label $pod "$patch" "(200|422)"
+    # 200: OK, 404: pod not found, 422: label not found
+    configure_pod_label $pod "$patch" "(200|404|422)"
+}
+
+# Wait for a peer sentinel to report a valid master for the cluster.
+# Contacts each peer pod individually by FQDN (skipping self) to avoid
+# the headless service DNS resolving to our own uninitialized sentinel.
+# If a peer still reports US as master (stale info before
+# down-after-milliseconds triggers failover), keeps retrying until
+# failover completes and a different master is elected.
+# Falls back to querying peer redis directly via the ROLE command
+# when the peer's sentinel query fails or returns no output.
+# Prints the master address on success (FQDN or IP).
+function wait_for_master() {
+    local retries=${SENTINEL_RETRIES:-10}
+    local delay=${SENTINEL_RETRY_DELAY:-3}
+    local pod_ordinal=${POD_NAME##*-}
+    local pod_base=${POD_NAME%-*}
+    local max_ordinal=$(( ${REPLICAS:-3} - 1 ))
+
+    for i in $(seq 1 $retries); do
+        local ordinal=0
+        while [ $ordinal -le $max_ordinal ]; do
+            if [ "$ordinal" != "$pod_ordinal" ]; then
+                local peer="${pod_base}-${ordinal}.${SVC_FQDN}"
+                local output
+                output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD --raw -h ${peer} -p 26379 sentinel master redis 2>/dev/null)
+                if [ $? -eq 0 ] && [ -n "$output" ]; then
+                    local master
+                    master=$(echo "$output" | tr -d '\r' | awk '/^ip$/{getline; print; exit}')
+                    if [ -n "$master" ] && [ "$master" != "$POD_IP" ] && ! echo "$master" | grep -q "^${POD_NAME}\."; then
+                        echo "$master"
+                        return 0
+                    fi
+                    log "Peer ${peer} sentinel reports master=${master} (stale, skipping)" >&2
+                else
+                    # Sentinel unreachable; try redis ROLE as fallback
+                    local role
+                    role=$(timeout ${TIMEOUT} $REDIS_CLI_CMD --raw -h ${peer} -p 6379 role 2>/dev/null | head -1 | tr -d '\r')
+                    if [ "$role" = "master" ]; then
+                        echo "$peer"
+                        return 0
+                    fi
+                fi
+            fi
+            ordinal=$((ordinal + 1))
+        done
+        log "Attempt $i/$retries: no valid master found, retrying in ${delay}s..." >&2
+        sleep $delay
+    done
+    return 1
+}
+
+# Check if any peer redis is alive (responds to PING on port 6379).
+# Used as a safety net before bootstrapping to distinguish a fresh
+# deployment (no peers) from a pod restart (peers alive).
+function has_alive_peers() {
+    local pod_ordinal=${POD_NAME##*-}
+    local pod_base=${POD_NAME%-*}
+    local max_ordinal=$(( ${REPLICAS:-3} - 1 ))
+    local ordinal=0
+    while [ $ordinal -le $max_ordinal ]; do
+        if [ "$ordinal" != "$pod_ordinal" ]; then
+            local peer="${pod_base}-${ordinal}.${SVC_FQDN}"
+            if timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${peer} -p 6379 ping 2>/dev/null | grep -q PONG; then
+                return 0
+            fi
+        fi
+        ordinal=$((ordinal + 1))
+    done
+    return 1
 }
 
 function set_pod_label() {
diff --git a/templates/redis/bin/start_redis_replication.sh b/templates/redis/bin/start_redis_replication.sh
@@ -5,17 +5,19 @@
 generate_configs
 sudo -E kolla_set_configs
 
-# 1. check if a redis cluster is already running by contacting sentinel
-output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${SVC_FQDN} -p 26379 sentinel master redis)
+# 1. check if a redis cluster is already running by contacting peer sentinels
+master=$(wait_for_master)
 if [ $? -eq 0 ]; then
-    master=$(echo "$output" | awk '/^ip$/ {getline; print $0; exit}')
-    # TODO skip if no master was found
     log "Connecting to the existing Redis cluster (master: ${master})"
     exec redis-server $REDIS_CONFIG --protected-mode no --replicaof "$master" 6379
 fi
 
-# 2. else bootstrap a new cluster (assume we should be the first redis pod)
+# 2. else bootstrap a new cluster if no peers are alive (fresh deployment)
 if is_bootstrap_pod $POD_NAME; then
+    if has_alive_peers; then
+        log_error "Peers are alive but no master found. Refusing to bootstrap to avoid split-brain."
+        exit 1
+    fi
     log "Bootstrapping a new Redis cluster from ${POD_NAME}"
     set_pod_label $POD_NAME redis~1master
     exec redis-server $REDIS_CONFIG --protected-mode no
diff --git a/templates/redis/bin/start_sentinel.sh b/templates/redis/bin/start_sentinel.sh
@@ -5,19 +5,21 @@
 generate_configs
 sudo -E kolla_set_configs
 
-# 1. check if a redis cluster is already running by contacting sentinel
-output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${SVC_FQDN} -p 26379 sentinel master redis)
+# 1. check if a redis cluster is already running by contacting peer sentinels
+master=$(wait_for_master)
 if [ $? -eq 0 ]; then
-    master=$(echo "$output" | awk '/^ip$/ {getline; print $0; exit}')
-    # TODO skip if no master was found
     log "Connecting to the existing sentinel cluster (master: $master)"
     echo "sentinel monitor redis ${master} 6379 ${SENTINEL_QUORUM}" >> $SENTINEL_CONFIG
     exec redis-sentinel $SENTINEL_CONFIG
 fi
 
 # 2. else let the pod's redis server bootstrap a new cluster and monitor it
-# (assume we should be the first redis pod)
+# (only if no peers are alive, meaning this is a fresh deployment)
 if is_bootstrap_pod $POD_NAME; then
+    if has_alive_peers; then
+        log_error "Peers are alive but no master found. Refusing to bootstrap sentinel to avoid split-brain."
+        exit 1
+    fi
     log "Bootstrapping a new sentinel cluster"
     echo "sentinel monitor redis ${POD_FQDN} 6379 ${SENTINEL_QUORUM}" >> $SENTINEL_CONFIG
     exec redis-sentinel $SENTINEL_CONFIG