
Commit 97f4de7

annielzy, FxKu, and idanovinda authored
Fix rolling update deadlock when pods are stuck in non-running state (#3051)
* add fix to recreate non running pods in syncStatefulsets
* remove TestSyncStatefulSetNonRunningPodsDoNotBlockRecreatio
* revert pod_test
* pod without status

Co-authored-by: Felix Kunde <felix-kunde@gmx.de>
Co-authored-by: Ida Novindasari <idanovinda@gmail.com>
1 parent 688bbf1 commit 97f4de7
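For context on the deadlock: a pod stuck in a non-running state can never answer the Patroni API, so the sync loop kept treating the resulting errors as a reason to postpone pod recreation, even though recreation is the only action that can actually fix such a pod. Below is a minimal standalone sketch of the classification this commit introduces; the helper duplicates podIsNotRunning from the pod.go diff further down, and the crash-looping pod and main wrapper are purely illustrative.

// Sketch only: a standalone reproduction of the non-running check added in
// pkg/cluster/pod.go, exercised against a hypothetical crash-looping pod.
package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
)

// podIsNotRunning mirrors the helper added by this commit: a pod counts as
// non-running when a phase is reported and is not Running, or when any
// container is stuck Waiting or has Terminated.
func podIsNotRunning(pod *v1.Pod) bool {
    if pod.Status.Phase == "" {
        return false // no status reported yet, don't treat as non-running
    }
    if pod.Status.Phase != v1.PodRunning {
        return true
    }
    for _, cs := range pod.Status.ContainerStatuses {
        if cs.State.Waiting != nil || cs.State.Terminated != nil {
            return true
        }
    }
    return false
}

func main() {
    stuck := v1.Pod{
        Status: v1.PodStatus{
            Phase: v1.PodRunning,
            ContainerStatuses: []v1.ContainerStatus{
                {State: v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}},
            },
        },
    }
    fmt.Println(podIsNotRunning(&stuck)) // true: the waiting container marks the pod as stuck
}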

3 files changed

Lines changed: 349 additions & 3 deletions


pkg/cluster/pod.go

Lines changed: 35 additions & 1 deletion
@@ -376,6 +376,36 @@ func (c *Cluster) getPatroniMemberData(pod *v1.Pod) (patroni.MemberData, error)
     return memberData, nil
 }
 
+// podIsNotRunning returns true if a pod is known to be in a non-running state,
+// e.g. stuck in CreateContainerConfigError, CrashLoopBackOff, ImagePullBackOff, etc.
+// Pods with no status information are not considered non-running, as they may
+// simply not have reported status yet.
+func podIsNotRunning(pod *v1.Pod) bool {
+    if pod.Status.Phase == "" {
+        // No status reported yet — don't treat as non-running
+        return false
+    }
+    if pod.Status.Phase != v1.PodRunning {
+        return true
+    }
+    for _, cs := range pod.Status.ContainerStatuses {
+        if cs.State.Waiting != nil || cs.State.Terminated != nil {
+            return true
+        }
+    }
+    return false
+}
+
+// allPodsRunning returns true only if every pod in the list is in a healthy running state.
+func (c *Cluster) allPodsRunning(pods []v1.Pod) bool {
+    for i := range pods {
+        if podIsNotRunning(&pods[i]) {
+            return false
+        }
+    }
+    return true
+}
+
 func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) {
     stopCh := make(chan struct{})
     ch := c.registerPodSubscriber(podName)
@@ -444,7 +474,8 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
     // switchover if
     // 1. we have not observed a new master pod when re-creating former replicas
     // 2. we know possible switchover targets even when no replicas were recreated
-    if newMasterPod == nil && len(replicas) > 0 {
+    // 3. the master pod is actually running (can't switchover a dead master)
+    if newMasterPod == nil && len(replicas) > 0 && !podIsNotRunning(masterPod) {
         masterCandidate, err := c.getSwitchoverCandidate(masterPod)
         if err != nil {
             // do not recreate master now so it will keep the update flag and switchover will be retried on next sync
@@ -455,6 +486,9 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
         }
     } else if newMasterPod == nil && len(replicas) == 0 {
         c.logger.Warningf("cannot perform switch over before re-creating the pod: no replicas")
+    } else if podIsNotRunning(masterPod) {
+        c.logger.Warningf("master pod %q is not running, skipping switchover and recreating directly",
+            util.NameFromMeta(masterPod.ObjectMeta))
     }
     c.logger.Infof("recreating old master pod %q", util.NameFromMeta(masterPod.ObjectMeta))
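The net effect of the recreatePods change above: a healthy master still gets a switchover to a replica before being recreated, while a master stuck in a non-running state is now recreated directly instead of blocking the rolling update on a switchover that cannot succeed.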

pkg/cluster/pod_test.go

Lines changed: 300 additions & 0 deletions
@@ -15,6 +15,7 @@ import (
     "github.com/zalando/postgres-operator/pkg/util/config"
     "github.com/zalando/postgres-operator/pkg/util/k8sutil"
     "github.com/zalando/postgres-operator/pkg/util/patroni"
+    v1 "k8s.io/api/core/v1"
 )
 
 func TestGetSwitchoverCandidate(t *testing.T) {
@@ -112,3 +113,302 @@ func TestGetSwitchoverCandidate(t *testing.T) {
         }
     }
 }
+
+func TestPodIsNotRunning(t *testing.T) {
+    tests := []struct {
+        subtest  string
+        pod      v1.Pod
+        expected bool
+    }{
+        {
+            subtest: "pod with no status reported yet",
+            pod: v1.Pod{
+                Status: v1.PodStatus{},
+            },
+            expected: false,
+        },
+        {
+            subtest: "pod running with all containers ready",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Running: &v1.ContainerStateRunning{},
+                            },
+                        },
+                    },
+                },
+            },
+            expected: false,
+        },
+        {
+            subtest: "pod in pending phase",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodPending,
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running but container in CreateContainerConfigError",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Waiting: &v1.ContainerStateWaiting{
+                                    Reason:  "CreateContainerConfigError",
+                                    Message: `secret "some-secret" not found`,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running but container in CrashLoopBackOff",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Waiting: &v1.ContainerStateWaiting{
+                                    Reason: "CrashLoopBackOff",
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running but container terminated",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Terminated: &v1.ContainerStateTerminated{
+                                    ExitCode: 137,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running with mixed container states - one healthy one broken",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Running: &v1.ContainerStateRunning{},
+                            },
+                        },
+                        {
+                            State: v1.ContainerState{
+                                Waiting: &v1.ContainerStateWaiting{
+                                    Reason: "CreateContainerConfigError",
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod in failed phase",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodFailed,
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running with multiple healthy containers",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Running: &v1.ContainerStateRunning{},
+                            },
+                        },
+                        {
+                            State: v1.ContainerState{
+                                Running: &v1.ContainerStateRunning{},
+                            },
+                        },
+                    },
+                },
+            },
+            expected: false,
+        },
+        {
+            subtest: "pod running with ImagePullBackOff",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Waiting: &v1.ContainerStateWaiting{
+                                    Reason: "ImagePullBackOff",
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.subtest, func(t *testing.T) {
+            result := podIsNotRunning(&tt.pod)
+            if result != tt.expected {
+                t.Errorf("podIsNotRunning() = %v, expected %v", result, tt.expected)
+            }
+        })
+    }
+}
+
+func TestAllPodsRunning(t *testing.T) {
+    client, _ := newFakeK8sSyncClient()
+
+    var cluster = New(
+        Config{
+            OpConfig: config.Config{
+                Resources: config.Resources{
+                    ClusterLabels:    map[string]string{"application": "spilo"},
+                    ClusterNameLabel: "cluster-name",
+                    PodRoleLabel:     "spilo-role",
+                },
+            },
+        }, client, acidv1.Postgresql{}, logger, eventRecorder)
+
+    tests := []struct {
+        subtest  string
+        pods     []v1.Pod
+        expected bool
+    }{
+        {
+            subtest: "all pods running",
+            pods: []v1.Pod{
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+                        },
+                    },
+                },
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "one pod not running",
+            pods: []v1.Pod{
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+                        },
+                    },
+                },
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {
+                                State: v1.ContainerState{
+                                    Waiting: &v1.ContainerStateWaiting{
+                                        Reason: "CreateContainerConfigError",
+                                    },
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: false,
+        },
+        {
+            subtest: "all pods not running",
+            pods: []v1.Pod{
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodPending,
+                    },
+                },
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {
+                                State: v1.ContainerState{
+                                    Waiting: &v1.ContainerStateWaiting{
+                                        Reason: "CrashLoopBackOff",
+                                    },
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: false,
+        },
+        {
+            subtest:  "empty pod list",
+            pods:     []v1.Pod{},
+            expected: true,
+        },
+        {
+            subtest: "pods with no status reported yet",
+            pods: []v1.Pod{
+                {
+                    Status: v1.PodStatus{},
+                },
+                {
+                    Status: v1.PodStatus{},
+                },
+            },
+            expected: true,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.subtest, func(t *testing.T) {
+            result := cluster.allPodsRunning(tt.pods)
+            if result != tt.expected {
+                t.Errorf("allPodsRunning() = %v, expected %v", result, tt.expected)
+            }
+        })
+    }
+}
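These table-driven cases follow the package's existing test style. A typical way to run only the new tests with plain Go tooling (nothing repo-specific assumed) would be: go test ./pkg/cluster -run 'TestPodIsNotRunning|TestAllPodsRunning'.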

pkg/cluster/sync.go

Lines changed: 14 additions & 2 deletions
@@ -719,14 +719,26 @@ func (c *Cluster) syncStatefulSet() error {
     if configPatched, restartPrimaryFirst, restartWait, err = c.syncPatroniConfig(pods, c.Spec.Patroni, requiredPgParameters); err != nil {
         c.logger.Warningf("Patroni config updated? %v - errors during config sync: %v", configPatched, err)
         postponeReasons = append(postponeReasons, "errors during Patroni config sync")
-        isSafeToRecreatePods = false
+        // Only mark unsafe if all pods are running. If some pods are not running,
+        // Patroni API errors are expected and should not block pod recreation,
+        // which is the only way to fix non-running pods.
+        if c.allPodsRunning(pods) {
+            isSafeToRecreatePods = false
+        } else {
+            c.logger.Warningf("ignoring Patroni config sync errors because some pods are not running")
+        }
     }
 
     // restart Postgres where it is still pending
     if err = c.restartInstances(pods, restartWait, restartPrimaryFirst); err != nil {
         c.logger.Errorf("errors while restarting Postgres in pods via Patroni API: %v", err)
         postponeReasons = append(postponeReasons, "errors while restarting Postgres via Patroni API")
-        isSafeToRecreatePods = false
+        // Same logic: don't let unreachable non-running pods block recreation.
+        if c.allPodsRunning(pods) {
+            isSafeToRecreatePods = false
+        } else {
+            c.logger.Warningf("ignoring Patroni restart errors because some pods are not running")
+        }
     }
 
     // if we get here we also need to re-create the pods (either leftovers from the old
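A compact sketch of the gating pattern this sync.go hunk introduces, with stand-in values in place of the real Patroni calls (the error value and the allPodsRunning flag below are hypothetical): an API error only blocks pod recreation when every pod is healthy, so pods that are already stuck can still be replaced.

// Sketch only: stand-in reproduction of the new gating in syncStatefulSet.
package main

import (
    "errors"
    "fmt"
)

func main() {
    isSafeToRecreatePods := true

    // Hypothetical inputs: the Patroni config sync failed and one pod is stuck
    // (e.g. CrashLoopBackOff), so the API error is expected rather than alarming.
    patroniSyncErr := errors.New("patroni API unreachable")
    allPodsRunning := false

    if patroniSyncErr != nil {
        if allPodsRunning {
            // every pod is healthy, so the error is a real problem: postpone recreation
            isSafeToRecreatePods = false
        } else {
            fmt.Println("ignoring Patroni config sync errors because some pods are not running")
        }
    }

    // recreation stays allowed, so the rolling update can replace the stuck pod
    fmt.Println("safe to recreate pods:", isSafeToRecreatePods)
}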
