Skip to content

Commit 7a1843b

Browse files
fix(redis-ha): split-brain script race condition and silent quorum failure (#404)
* fix(redis-ha): fix split-brain script race condition and silent quorum failure

  Fixes #383 and #398 in the fix-split-brain.sh template.

  https://claude.ai/code/session_015PLT3F82jivLQrxjfV9Hgg

* Chart Update

  Signed-off-by: Aaron Layfield <aaron.layfield@gmail.com>

---------

Signed-off-by: Aaron Layfield <aaron.layfield@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
1 parent 065bc67 commit 7a1843b

2 files changed

Lines changed: 27 additions & 6 deletions

File tree

charts/redis-ha/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ keywords:
55
- redis
66
- keyvalue
77
- database
8-
version: 4.36.2
8+
version: 4.37.0
99
appVersion: 8.2.4
1010
description: This Helm chart provides a highly available Redis implementation with a master/slave configuration and uses Sentinel sidecars for failover management
1111
icon: https://img.icons8.com/external-tal-revivo-shadow-tal-revivo/24/external-redis-an-in-memory-data-structure-project-implementing-a-distributed-logo-shadow-tal-revivo.png

charts/redis-ha/templates/_configs.tpl

Lines changed: 26 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -501,6 +501,9 @@
501501
identify_announce_ip
502502
done
503503

504+
QUORUM_FAIL_COUNT=0
505+
MAX_QUORUM_FAILURES=${MAX_QUORUM_FAILURES:-5}
506+
504507
trap "exit 0" TERM
505508
while true; do
506509
sleep {{ .Values.splitBrainDetection.interval }}
@@ -509,6 +512,7 @@
509512
identify_master
510513

511514
if [ "$MASTER" = "$ANNOUNCE_IP" ]; then
515+
QUORUM_FAIL_COUNT=0
512516
redis_role
513517
if [ "$ROLE" != "master" ]; then
514518
echo "waiting for redis to become master"
@@ -522,17 +526,34 @@
522526
fi
523527
fi
524528
elif [ "${MASTER}" ]; then
529+
QUORUM_FAIL_COUNT=0
525530
identify_redis_master
526531
if [ "$REDIS_MASTER" != "$MASTER" ]; then
527532
echo "Redis master and local master are not the same. waiting."
528533
sleep {{ .Values.splitBrainDetection.retryInterval }}
529534
identify_master
530-
identify_redis_master
531-
echo "Redis master is ${MASTER}, expected master is ${REDIS_MASTER}. No need to reinitialize."
532-
if [ "${REDIS_MASTER}" != "${MASTER}" ]; then
533-
echo "Redis master is ${MASTER}, expected master is ${REDIS_MASTER}, reinitializing"
534-
reinit
535+
if [ "$MASTER" = "$ANNOUNCE_IP" ]; then
536+
echo "This pod became master during wait, skipping reinit"
537+
else
538+
identify_redis_master
539+
echo "Redis master is ${MASTER}, expected master is ${REDIS_MASTER}. No need to reinitialize."
540+
if [ "${REDIS_MASTER}" != "${MASTER}" ]; then
541+
echo "Redis master is ${MASTER}, expected master is ${REDIS_MASTER}, reinitializing"
542+
reinit
543+
fi
544+
fi
545+
fi
546+
else
547+
QUORUM_FAIL_COUNT=$((QUORUM_FAIL_COUNT + 1))
548+
echo "WARNING: Sentinel returned no master (quorum may be broken). Failure count: $QUORUM_FAIL_COUNT/$MAX_QUORUM_FAILURES"
549+
if [ "$QUORUM_FAIL_COUNT" -ge "$MAX_QUORUM_FAILURES" ]; then
550+
echo "ERROR: Quorum broken for $MAX_QUORUM_FAILURES consecutive checks. Attempting sentinel reset..."
551+
if [ "$SENTINEL_PORT" -eq 0 ]; then
552+
redis-cli -h "${SERVICE}" -p "${SENTINEL_TLS_PORT}" {{ if .Values.sentinel.auth }} -a "${SENTINELAUTH}" --no-auth-warning{{ end }} --tls --cacert /tls-certs/{{ .Values.tls.caCertFile }} {{ if ne (default "yes" .Values.sentinel.authClients) "no"}} --cert /tls-certs/{{ .Values.tls.certFile }} --key /tls-certs/{{ .Values.tls.keyFile }}{{ end }} sentinel reset "${MASTER_GROUP}" || true
553+
else
554+
redis-cli -h "${SERVICE}" -p "${SENTINEL_PORT}" {{ if .Values.sentinel.auth }} -a "${SENTINELAUTH}" --no-auth-warning{{ end }} sentinel reset "${MASTER_GROUP}" || true
535555
fi
556+
QUORUM_FAIL_COUNT=0
536557
fi
537558
fi
538559
done

0 commit comments

Comments
 (0)