
Commit 97f4de7

annielzy, FxKu, and idanovinda authored
Fix rolling update deadlock when pods are stuck in non-running state (#3051)
* add fix to recreate non running pods in syncStatefulsets
* remove TestSyncStatefulSetNonRunningPodsDoNotBlockRecreatio
* revert pod_test
* pod without status

Co-authored-by: Felix Kunde <felix-kunde@gmx.de>
Co-authored-by: Ida Novindasari <idanovinda@gmail.com>
1 parent 688bbf1 commit 97f4de7
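For context on the deadlock: a pod stuck in a non-running state can never answer the Patroni API, so the sync loop kept treating the resulting errors as a reason to postpone pod recreation, even though recreation is the only action that can actually fix such a pod. Below is a minimal standalone sketch of the classification this commit introduces; the helper duplicates podIsNotRunning from the pod.go diff further down, and the crash-looping pod and main wrapper are purely illustrative.

// Sketch only: a standalone reproduction of the non-running check added in
// pkg/cluster/pod.go, exercised against a hypothetical crash-looping pod.
package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
)

// podIsNotRunning mirrors the helper added by this commit: a pod counts as
// non-running when a phase is reported and is not Running, or when any
// container is stuck Waiting or has Terminated.
func podIsNotRunning(pod *v1.Pod) bool {
    if pod.Status.Phase == "" {
        return false // no status reported yet, don't treat as non-running
    }
    if pod.Status.Phase != v1.PodRunning {
        return true
    }
    for _, cs := range pod.Status.ContainerStatuses {
        if cs.State.Waiting != nil || cs.State.Terminated != nil {
            return true
        }
    }
    return false
}

func main() {
    stuck := v1.Pod{
        Status: v1.PodStatus{
            Phase: v1.PodRunning,
            ContainerStatuses: []v1.ContainerStatus{
                {State: v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: "CrashLoopBackOff"}}},
            },
        },
    }
    fmt.Println(podIsNotRunning(&stuck)) // true: the waiting container marks the pod as stuck
}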

3 files changed

Lines changed: 349 additions & 3 deletions


pkg/cluster/pod.go

Lines changed: 35 additions & 1 deletion
@@ -376,6 +376,36 @@ func (c *Cluster) getPatroniMemberData(pod *v1.Pod) (patroni.MemberData, error)
     return memberData, nil
 }
 
+// podIsNotRunning returns true if a pod is known to be in a non-running state,
+// e.g. stuck in CreateContainerConfigError, CrashLoopBackOff, ImagePullBackOff, etc.
+// Pods with no status information are not considered non-running, as they may
+// simply not have reported status yet.
+func podIsNotRunning(pod *v1.Pod) bool {
+    if pod.Status.Phase == "" {
+        // No status reported yet — don't treat as non-running
+        return false
+    }
+    if pod.Status.Phase != v1.PodRunning {
+        return true
+    }
+    for _, cs := range pod.Status.ContainerStatuses {
+        if cs.State.Waiting != nil || cs.State.Terminated != nil {
+            return true
+        }
+    }
+    return false
+}
+
+// allPodsRunning returns true only if every pod in the list is in a healthy running state.
+func (c *Cluster) allPodsRunning(pods []v1.Pod) bool {
+    for i := range pods {
+        if podIsNotRunning(&pods[i]) {
+            return false
+        }
+    }
+    return true
+}
+
 func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) {
     stopCh := make(chan struct{})
     ch := c.registerPodSubscriber(podName)
@@ -444,7 +474,8 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
     // switchover if
     // 1. we have not observed a new master pod when re-creating former replicas
     // 2. we know possible switchover targets even when no replicas were recreated
-    if newMasterPod == nil && len(replicas) > 0 {
+    // 3. the master pod is actually running (can't switchover a dead master)
+    if newMasterPod == nil && len(replicas) > 0 && !podIsNotRunning(masterPod) {
         masterCandidate, err := c.getSwitchoverCandidate(masterPod)
         if err != nil {
             // do not recreate master now so it will keep the update flag and switchover will be retried on next sync
@@ -455,6 +486,9 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
         }
     } else if newMasterPod == nil && len(replicas) == 0 {
         c.logger.Warningf("cannot perform switch over before re-creating the pod: no replicas")
+    } else if podIsNotRunning(masterPod) {
+        c.logger.Warningf("master pod %q is not running, skipping switchover and recreating directly",
+            util.NameFromMeta(masterPod.ObjectMeta))
     }
     c.logger.Infof("recreating old master pod %q", util.NameFromMeta(masterPod.ObjectMeta))
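The net effect of the recreatePods change above: a healthy master still gets a switchover to a replica before being recreated, while a master stuck in a non-running state is now recreated directly instead of blocking the rolling update on a switchover that cannot succeed.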

pkg/cluster/pod_test.go

Lines changed: 300 additions & 0 deletions
@@ -15,6 +15,7 @@ import (
     "github.com/zalando/postgres-operator/pkg/util/config"
     "github.com/zalando/postgres-operator/pkg/util/k8sutil"
     "github.com/zalando/postgres-operator/pkg/util/patroni"
+    v1 "k8s.io/api/core/v1"
 )
 
 func TestGetSwitchoverCandidate(t *testing.T) {
@@ -112,3 +113,302 @@ func TestGetSwitchoverCandidate(t *testing.T) {
         }
     }
 }
+
+func TestPodIsNotRunning(t *testing.T) {
+    tests := []struct {
+        subtest  string
+        pod      v1.Pod
+        expected bool
+    }{
+        {
+            subtest: "pod with no status reported yet",
+            pod: v1.Pod{
+                Status: v1.PodStatus{},
+            },
+            expected: false,
+        },
+        {
+            subtest: "pod running with all containers ready",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Running: &v1.ContainerStateRunning{},
+                            },
+                        },
+                    },
+                },
+            },
+            expected: false,
+        },
+        {
+            subtest: "pod in pending phase",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodPending,
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running but container in CreateContainerConfigError",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Waiting: &v1.ContainerStateWaiting{
+                                    Reason:  "CreateContainerConfigError",
+                                    Message: `secret "some-secret" not found`,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running but container in CrashLoopBackOff",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Waiting: &v1.ContainerStateWaiting{
+                                    Reason: "CrashLoopBackOff",
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running but container terminated",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Terminated: &v1.ContainerStateTerminated{
+                                    ExitCode: 137,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running with mixed container states - one healthy one broken",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Running: &v1.ContainerStateRunning{},
+                            },
+                        },
+                        {
+                            State: v1.ContainerState{
+                                Waiting: &v1.ContainerStateWaiting{
+                                    Reason: "CreateContainerConfigError",
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod in failed phase",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodFailed,
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "pod running with multiple healthy containers",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Running: &v1.ContainerStateRunning{},
+                            },
+                        },
+                        {
+                            State: v1.ContainerState{
+                                Running: &v1.ContainerStateRunning{},
+                            },
+                        },
+                    },
+                },
+            },
+            expected: false,
+        },
+        {
+            subtest: "pod running with ImagePullBackOff",
+            pod: v1.Pod{
+                Status: v1.PodStatus{
+                    Phase: v1.PodRunning,
+                    ContainerStatuses: []v1.ContainerStatus{
+                        {
+                            State: v1.ContainerState{
+                                Waiting: &v1.ContainerStateWaiting{
+                                    Reason: "ImagePullBackOff",
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.subtest, func(t *testing.T) {
+            result := podIsNotRunning(&tt.pod)
+            if result != tt.expected {
+                t.Errorf("podIsNotRunning() = %v, expected %v", result, tt.expected)
+            }
+        })
+    }
+}
+
+func TestAllPodsRunning(t *testing.T) {
+    client, _ := newFakeK8sSyncClient()
+
+    var cluster = New(
+        Config{
+            OpConfig: config.Config{
+                Resources: config.Resources{
+                    ClusterLabels:    map[string]string{"application": "spilo"},
+                    ClusterNameLabel: "cluster-name",
+                    PodRoleLabel:     "spilo-role",
+                },
+            },
+        }, client, acidv1.Postgresql{}, logger, eventRecorder)
+
+    tests := []struct {
+        subtest  string
+        pods     []v1.Pod
+        expected bool
+    }{
+        {
+            subtest: "all pods running",
+            pods: []v1.Pod{
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+                        },
+                    },
+                },
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+                        },
+                    },
+                },
+            },
+            expected: true,
+        },
+        {
+            subtest: "one pod not running",
+            pods: []v1.Pod{
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+                        },
+                    },
+                },
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {
+                                State: v1.ContainerState{
+                                    Waiting: &v1.ContainerStateWaiting{
+                                        Reason: "CreateContainerConfigError",
+                                    },
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: false,
+        },
+        {
+            subtest: "all pods not running",
+            pods: []v1.Pod{
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodPending,
+                    },
+                },
+                {
+                    Status: v1.PodStatus{
+                        Phase: v1.PodRunning,
+                        ContainerStatuses: []v1.ContainerStatus{
+                            {
+                                State: v1.ContainerState{
+                                    Waiting: &v1.ContainerStateWaiting{
+                                        Reason: "CrashLoopBackOff",
+                                    },
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            expected: false,
+        },
+        {
+            subtest:  "empty pod list",
+            pods:     []v1.Pod{},
+            expected: true,
+        },
+        {
+            subtest: "pods with no status reported yet",
+            pods: []v1.Pod{
+                {
+                    Status: v1.PodStatus{},
+                },
+                {
+                    Status: v1.PodStatus{},
+                },
+            },
+            expected: true,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.subtest, func(t *testing.T) {
+            result := cluster.allPodsRunning(tt.pods)
+            if result != tt.expected {
+                t.Errorf("allPodsRunning() = %v, expected %v", result, tt.expected)
+            }
+        })
+    }
+}
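These table-driven cases follow the package's existing test style. A typical way to run only the new tests with plain Go tooling (nothing repo-specific assumed) would be: go test ./pkg/cluster -run 'TestPodIsNotRunning|TestAllPodsRunning'.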

pkg/cluster/sync.go

Lines changed: 14 additions & 2 deletions
@@ -719,14 +719,26 @@ func (c *Cluster) syncStatefulSet() error {
     if configPatched, restartPrimaryFirst, restartWait, err = c.syncPatroniConfig(pods, c.Spec.Patroni, requiredPgParameters); err != nil {
         c.logger.Warningf("Patroni config updated? %v - errors during config sync: %v", configPatched, err)
         postponeReasons = append(postponeReasons, "errors during Patroni config sync")
-        isSafeToRecreatePods = false
+        // Only mark unsafe if all pods are running. If some pods are not running,
+        // Patroni API errors are expected and should not block pod recreation,
+        // which is the only way to fix non-running pods.
+        if c.allPodsRunning(pods) {
+            isSafeToRecreatePods = false
+        } else {
+            c.logger.Warningf("ignoring Patroni config sync errors because some pods are not running")
+        }
     }
 
     // restart Postgres where it is still pending
     if err = c.restartInstances(pods, restartWait, restartPrimaryFirst); err != nil {
         c.logger.Errorf("errors while restarting Postgres in pods via Patroni API: %v", err)
         postponeReasons = append(postponeReasons, "errors while restarting Postgres via Patroni API")
-        isSafeToRecreatePods = false
+        // Same logic: don't let unreachable non-running pods block recreation.
+        if c.allPodsRunning(pods) {
+            isSafeToRecreatePods = false
+        } else {
+            c.logger.Warningf("ignoring Patroni restart errors because some pods are not running")
+        }
     }
 
     // if we get here we also need to re-create the pods (either leftovers from the old
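A compact sketch of the gating pattern this sync.go hunk introduces, with stand-in values in place of the real Patroni calls (the error value and the allPodsRunning flag below are hypothetical): an API error only blocks pod recreation when every pod is healthy, so pods that are already stuck can still be replaced.

// Sketch only: stand-in reproduction of the new gating in syncStatefulSet.
package main

import (
    "errors"
    "fmt"
)

func main() {
    isSafeToRecreatePods := true

    // Hypothetical inputs: the Patroni config sync failed and one pod is stuck
    // (e.g. CrashLoopBackOff), so the API error is expected rather than alarming.
    patroniSyncErr := errors.New("patroni API unreachable")
    allPodsRunning := false

    if patroniSyncErr != nil {
        if allPodsRunning {
            // every pod is healthy, so the error is a real problem: postpone recreation
            isSafeToRecreatePods = false
        } else {
            fmt.Println("ignoring Patroni config sync errors because some pods are not running")
        }
    }

    // recreation stays allowed, so the rolling update can replace the stuck pod
    fmt.Println("safe to recreate pods:", isSafeToRecreatePods)
}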
