Skip to content

Commit 8bb9d35

Browse files
lmicciniclaude
andcommitted
Fix redis split-brain after pod-0 restart during failover
When redis-redis-0 (the bootstrap pod) is deleted during a failover, it restarts and tries to contact sentinel to find the current master. Three problems caused it to fall through to the bootstrap path and start a new independent master, creating a split-brain: 1. Single-try timeout: if sentinel was momentarily unreachable (e.g. the sentinel container on pod-0 itself was still starting), the 3-second timeout expired and pod-0 immediately bootstrapped. 2. Headless service DNS: with PublishNotReadyAddresses: true, the headless service DNS can resolve to pod-0's own IP, so redis-cli connects to its own uninitialized sentinel instead of a peer. 3. Stale master identity: even when contacting a peer sentinel, it may still report the restarting pod as master (within the down-after-milliseconds window before failover completes). Fix by adding a wait_for_master() function in common.sh that: - Contacts each peer pod individually by FQDN (skipping self) - Retries up to 10 times (30s total) before allowing bootstrap - Rejects answers where the peer still thinks we are master Also increase InitialDelaySeconds to 40s on all redis and sentinel probes so Kubernetes doesn't kill the pod before the retry loop completes, and remove unused TCP probe variables that were never referenced by the redis container. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b98bf2f commit 8bb9d35

4 files changed

Lines changed: 92 additions & 35 deletions

File tree

internal/redis/statefulset.go

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -27,36 +27,15 @@ func StatefulSet(
2727
}
2828
ls := labels.GetLabels(r, "redis", matchls)
2929

30-
livenessProbe := &corev1.Probe{
31-
// TODO might need tuning
32-
TimeoutSeconds: 5,
33-
PeriodSeconds: 3,
34-
InitialDelaySeconds: 3,
35-
}
36-
readinessProbe := &corev1.Probe{
37-
// TODO might need tuning
38-
TimeoutSeconds: 5,
39-
PeriodSeconds: 5,
40-
InitialDelaySeconds: 5,
41-
}
4230
sentinelLivenessProbe := &corev1.Probe{
43-
// TODO might need tuning
4431
TimeoutSeconds: 5,
4532
PeriodSeconds: 3,
46-
InitialDelaySeconds: 3,
33+
InitialDelaySeconds: 40,
4734
}
4835
sentinelReadinessProbe := &corev1.Probe{
49-
// TODO might need tuning
5036
TimeoutSeconds: 5,
5137
PeriodSeconds: 5,
52-
InitialDelaySeconds: 5,
53-
}
54-
55-
livenessProbe.TCPSocket = &corev1.TCPSocketAction{
56-
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(6379)},
57-
}
58-
readinessProbe.TCPSocket = &corev1.TCPSocketAction{
59-
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(6379)},
38+
InitialDelaySeconds: 40,
6039
}
6140
sentinelLivenessProbe.TCPSocket = &corev1.TCPSocketAction{
6241
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(26379)},
@@ -78,6 +57,9 @@ func StatefulSet(
7857
}, {
7958
Name: "CONFIG_HASH",
8059
Value: configHash,
60+
}, {
61+
Name: "REPLICAS",
62+
Value: strconv.Itoa(int(*r.Spec.Replicas)),
8163
}}
8264

8365
sts := &appsv1.StatefulSet{
@@ -115,13 +97,15 @@ func StatefulSet(
11597
Command: []string{"/var/lib/operator-scripts/redis_probe.sh", "liveness"},
11698
},
11799
},
100+
InitialDelaySeconds: 40,
118101
},
119102
ReadinessProbe: &corev1.Probe{
120103
ProbeHandler: corev1.ProbeHandler{
121104
Exec: &corev1.ExecAction{
122105
Command: []string{"/var/lib/operator-scripts/redis_probe.sh", "readiness"},
123106
},
124107
},
108+
InitialDelaySeconds: 40,
125109
},
126110
}, {
127111
Image: r.Spec.ContainerImage,

templates/redis/bin/common.sh

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,77 @@ function remove_pod_label() {
8080
local pod="$1"
8181
local label="$2"
8282
local patch="[{\"op\": \"remove\", \"path\": \"/metadata/labels/${label}\"}]"
83-
# 200: OK, 422: not found
84-
configure_pod_label $pod "$patch" "(200|422)"
83+
# 200: OK, 404: pod not found, 422: label not found
84+
configure_pod_label $pod "$patch" "(200|404|422)"
85+
}
86+
87+
# Wait for a peer sentinel to report a valid master for the cluster.
88+
# Contacts each peer pod individually by FQDN (skipping self) to avoid
89+
# the headless service DNS resolving to our own uninitialized sentinel.
90+
# If a peer still reports US as master (stale info before
91+
# down-after-milliseconds triggers failover), keeps retrying until
92+
# failover completes and a different master is elected.
93+
# Falls back to querying peer redis directly via the ROLE command
94+
# when sentinel output cannot be parsed.
95+
# Prints the master address on success (FQDN or IP).
96+
function wait_for_master() {
97+
local retries=${SENTINEL_RETRIES:-10}
98+
local delay=${SENTINEL_RETRY_DELAY:-3}
99+
local pod_ordinal=${POD_NAME##*-}
100+
local pod_base=${POD_NAME%-*}
101+
local max_ordinal=$(( ${REPLICAS:-3} - 1 ))
102+
103+
for i in $(seq 1 $retries); do
104+
local ordinal=0
105+
while [ $ordinal -le $max_ordinal ]; do
106+
if [ "$ordinal" != "$pod_ordinal" ]; then
107+
local peer="${pod_base}-${ordinal}.${SVC_FQDN}"
108+
local output
109+
output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD --raw -h ${peer} -p 26379 sentinel master redis 2>/dev/null)
110+
if [ $? -eq 0 ] && [ -n "$output" ]; then
111+
local master
112+
master=$(echo "$output" | tr -d '\r' | awk '/^ip$/{getline; print; exit}')
113+
if [ -n "$master" ] && [ "$master" != "$POD_IP" ] && ! echo "$master" | grep -q "^${POD_NAME}\."; then
114+
echo "$master"
115+
return 0
116+
fi
117+
log "Peer ${peer} sentinel reports master=${master} (stale, skipping)" >&2
118+
else
119+
# Sentinel unreachable; try redis ROLE as fallback
120+
local role
121+
role=$(timeout ${TIMEOUT} $REDIS_CLI_CMD --raw -h ${peer} -p 6379 role 2>/dev/null | head -1 | tr -d '\r')
122+
if [ "$role" = "master" ]; then
123+
echo "$peer"
124+
return 0
125+
fi
126+
fi
127+
fi
128+
ordinal=$((ordinal + 1))
129+
done
130+
log "Attempt $i/$retries: no valid master found, retrying in ${delay}s..." >&2
131+
sleep $delay
132+
done
133+
return 1
134+
}
135+
136+
# Check if any peer redis is alive (responds to PING on port 6379).
137+
# Used as a safety net before bootstrapping to distinguish a fresh
138+
# deployment (no peers) from a pod restart (peers alive).
139+
function has_alive_peers() {
140+
local pod_ordinal=${POD_NAME##*-}
141+
local pod_base=${POD_NAME%-*}
142+
local max_ordinal=$(( ${REPLICAS:-3} - 1 ))
143+
local ordinal=0
144+
while [ $ordinal -le $max_ordinal ]; do
145+
if [ "$ordinal" != "$pod_ordinal" ]; then
146+
local peer="${pod_base}-${ordinal}.${SVC_FQDN}"
147+
if timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${peer} -p 6379 ping 2>/dev/null | grep -q PONG; then
148+
return 0
149+
fi
150+
fi
151+
ordinal=$((ordinal + 1))
152+
done
153+
return 1
85154
}
86155

87156
function set_pod_label() {

templates/redis/bin/start_redis_replication.sh

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,19 @@
55
generate_configs
66
sudo -E kolla_set_configs
77

8-
# 1. check if a redis cluster is already running by contacting sentinel
9-
output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${SVC_FQDN} -p 26379 sentinel master redis)
8+
# 1. check if a redis cluster is already running by contacting peer sentinels
9+
master=$(wait_for_master)
1010
if [ $? -eq 0 ]; then
11-
master=$(echo "$output" | awk '/^ip$/ {getline; print $0; exit}')
12-
# TODO skip if no master was found
1311
log "Connecting to the existing Redis cluster (master: ${master})"
1412
exec redis-server $REDIS_CONFIG --protected-mode no --replicaof "$master" 6379
1513
fi
1614

17-
# 2. else bootstrap a new cluster (assume we should be the first redis pod)
15+
# 2. else bootstrap a new cluster if no peers are alive (fresh deployment)
1816
if is_bootstrap_pod $POD_NAME; then
17+
if has_alive_peers; then
18+
log_error "Peers are alive but no master found. Refusing to bootstrap to avoid split-brain."
19+
exit 1
20+
fi
1921
log "Bootstrapping a new Redis cluster from ${POD_NAME}"
2022
set_pod_label $POD_NAME redis~1master
2123
exec redis-server $REDIS_CONFIG --protected-mode no

templates/redis/bin/start_sentinel.sh

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,21 @@
55
generate_configs
66
sudo -E kolla_set_configs
77

8-
# 1. check if a redis cluster is already running by contacting sentinel
9-
output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${SVC_FQDN} -p 26379 sentinel master redis)
8+
# 1. check if a redis cluster is already running by contacting peer sentinels
9+
master=$(wait_for_master)
1010
if [ $? -eq 0 ]; then
11-
master=$(echo "$output" | awk '/^ip$/ {getline; print $0; exit}')
12-
# TODO skip if no master was found
1311
log "Connecting to the existing sentinel cluster (master: $master)"
1412
echo "sentinel monitor redis ${master} 6379 ${SENTINEL_QUORUM}" >> $SENTINEL_CONFIG
1513
exec redis-sentinel $SENTINEL_CONFIG
1614
fi
1715

1816
# 2. else let the pod's redis server bootstrap a new cluster and monitor it
19-
# (assume we should be the first redis pod)
17+
# (only if no peers are alive, meaning this is a fresh deployment)
2018
if is_bootstrap_pod $POD_NAME; then
19+
if has_alive_peers; then
20+
log_error "Peers are alive but no master found. Refusing to bootstrap sentinel to avoid split-brain."
21+
exit 1
22+
fi
2123
log "Bootstrapping a new sentinel cluster"
2224
echo "sentinel monitor redis ${POD_FQDN} 6379 ${SENTINEL_QUORUM}" >> $SENTINEL_CONFIG
2325
exec redis-sentinel $SENTINEL_CONFIG

0 commit comments

Comments
 (0)