Skip to content

Commit 4af5bd6

Browse files
committed
controller/workload: Rework Progressing condition
Replace the generation-based progressing check and the 15-minute v1helpers.IsUpdatingTooLong timer with the deployment controller's native ProgressDeadlineExceeded condition. This avoids false Progressing=True signals on scaling operations (where generation changes but no rollout occurs) and uses a more accurate stall detection mechanism. Extract progressing condition logic into a new DeploymentProgressingCondition helper in pkg/apps/deployment, along with exported HasDeploymentProgressed and HasDeploymentTimedOutProgressing helpers.
1 parent b7e366d commit 4af5bd6

4 files changed

Lines changed: 384 additions & 122 deletions

File tree

pkg/apps/deployment/progressing.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package deployment
2+
3+
import (
4+
"fmt"
5+
6+
appsv1 "k8s.io/api/apps/v1"
7+
corev1 "k8s.io/api/core/v1"
8+
"k8s.io/utils/ptr"
9+
10+
operatorv1 "github.com/openshift/api/operator/v1"
11+
)
12+
13+
// DeploymentProgressingCondition computes an operator Progressing condition from
14+
// the deployment's own status conditions and replica counts.
15+
func DeploymentProgressingCondition(deployment *appsv1.Deployment) operatorv1.OperatorCondition {
16+
desiredReplicas := ptr.Deref(deployment.Spec.Replicas, 1)
17+
timedOutMessage, timedOut := HasDeploymentTimedOutProgressing(deployment.Status)
18+
19+
switch {
20+
case !HasDeploymentProgressed(deployment.Status) && !timedOut:
21+
return operatorv1.OperatorCondition{
22+
Type: operatorv1.OperatorStatusTypeProgressing,
23+
Status: operatorv1.ConditionTrue,
24+
Reason: "PodsUpdating",
25+
Message: fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest revision and %d/%d pods are available", deployment.Name, deployment.Namespace, deployment.Status.UpdatedReplicas, desiredReplicas, deployment.Status.AvailableReplicas, desiredReplicas),
26+
}
27+
28+
case timedOut:
29+
return operatorv1.OperatorCondition{
30+
Type: operatorv1.OperatorStatusTypeProgressing,
31+
Status: operatorv1.ConditionFalse,
32+
Reason: "ProgressDeadlineExceeded",
33+
Message: fmt.Sprintf("deployment/%s.%s has timed out progressing: %s", deployment.Name, deployment.Namespace, timedOutMessage),
34+
}
35+
36+
default:
37+
return operatorv1.OperatorCondition{
38+
Type: operatorv1.OperatorStatusTypeProgressing,
39+
Status: operatorv1.ConditionFalse,
40+
Reason: "AsExpected",
41+
}
42+
}
43+
}
44+
45+
// HasDeploymentProgressed returns true if the deployment reports NewReplicaSetAvailable
46+
// via the DeploymentProgressing condition.
47+
func HasDeploymentProgressed(status appsv1.DeploymentStatus) bool {
48+
for _, cond := range status.Conditions {
49+
if cond.Type == appsv1.DeploymentProgressing {
50+
return cond.Status == corev1.ConditionTrue && cond.Reason == "NewReplicaSetAvailable"
51+
}
52+
}
53+
return false
54+
}
55+
56+
// HasDeploymentTimedOutProgressing returns true if the deployment reports ProgressDeadlineExceeded.
57+
// The function returns the Progressing condition message as the first return value.
58+
func HasDeploymentTimedOutProgressing(status appsv1.DeploymentStatus) (string, bool) {
59+
for _, cond := range status.Conditions {
60+
if cond.Type == appsv1.DeploymentProgressing {
61+
return cond.Message, cond.Status == corev1.ConditionFalse && cond.Reason == "ProgressDeadlineExceeded"
62+
}
63+
}
64+
return "", false
65+
}
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
package deployment
2+
3+
import (
4+
"testing"
5+
6+
"github.com/google/go-cmp/cmp"
7+
8+
appsv1 "k8s.io/api/apps/v1"
9+
corev1 "k8s.io/api/core/v1"
10+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11+
"k8s.io/utils/ptr"
12+
13+
operatorv1 "github.com/openshift/api/operator/v1"
14+
)
15+
16+
func TestDeploymentProgressingCondition(t *testing.T) {
17+
tests := []struct {
18+
name string
19+
deploy *appsv1.Deployment
20+
expected operatorv1.OperatorCondition
21+
}{
22+
{
23+
name: "active rollout, not all pods updated",
24+
deploy: &appsv1.Deployment{
25+
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "ns"},
26+
Spec: appsv1.DeploymentSpec{Replicas: ptr.To[int32](3)},
27+
Status: appsv1.DeploymentStatus{
28+
UpdatedReplicas: 1,
29+
AvailableReplicas: 2,
30+
Conditions: []appsv1.DeploymentCondition{
31+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, Reason: "ReplicaSetUpdated"},
32+
},
33+
},
34+
},
35+
expected: operatorv1.OperatorCondition{
36+
Type: operatorv1.OperatorStatusTypeProgressing,
37+
Status: operatorv1.ConditionTrue,
38+
Reason: "PodsUpdating",
39+
Message: "deployment/web.ns: 1/3 pods have been updated to the latest revision and 2/3 pods are available",
40+
},
41+
},
42+
{
43+
name: "rollout complete",
44+
deploy: &appsv1.Deployment{
45+
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "ns"},
46+
Spec: appsv1.DeploymentSpec{Replicas: ptr.To[int32](3)},
47+
Status: appsv1.DeploymentStatus{
48+
UpdatedReplicas: 3,
49+
AvailableReplicas: 3,
50+
Conditions: []appsv1.DeploymentCondition{
51+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, Reason: "NewReplicaSetAvailable"},
52+
},
53+
},
54+
},
55+
expected: operatorv1.OperatorCondition{
56+
Type: operatorv1.OperatorStatusTypeProgressing,
57+
Status: operatorv1.ConditionFalse,
58+
Reason: "AsExpected",
59+
},
60+
},
61+
{
62+
name: "progress deadline exceeded",
63+
deploy: &appsv1.Deployment{
64+
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "ns"},
65+
Spec: appsv1.DeploymentSpec{Replicas: ptr.To[int32](3)},
66+
Status: appsv1.DeploymentStatus{
67+
UpdatedReplicas: 1,
68+
AvailableReplicas: 2,
69+
Conditions: []appsv1.DeploymentCondition{
70+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionFalse, Reason: "ProgressDeadlineExceeded", Message: "timed out waiting"},
71+
},
72+
},
73+
},
74+
expected: operatorv1.OperatorCondition{
75+
Type: operatorv1.OperatorStatusTypeProgressing,
76+
Status: operatorv1.ConditionFalse,
77+
Reason: "ProgressDeadlineExceeded",
78+
Message: "deployment/web.ns has timed out progressing: timed out waiting",
79+
},
80+
},
81+
{
82+
name: "no progressing condition on deployment",
83+
deploy: &appsv1.Deployment{
84+
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "ns"},
85+
Spec: appsv1.DeploymentSpec{Replicas: ptr.To[int32](1)},
86+
Status: appsv1.DeploymentStatus{},
87+
},
88+
expected: operatorv1.OperatorCondition{
89+
Type: operatorv1.OperatorStatusTypeProgressing,
90+
Status: operatorv1.ConditionTrue,
91+
Reason: "PodsUpdating",
92+
Message: "deployment/web.ns: 0/1 pods have been updated to the latest revision and 0/1 pods are available",
93+
},
94+
},
95+
{
96+
name: "nil replicas defaults to 1",
97+
deploy: &appsv1.Deployment{
98+
ObjectMeta: metav1.ObjectMeta{Name: "web", Namespace: "ns"},
99+
Spec: appsv1.DeploymentSpec{},
100+
Status: appsv1.DeploymentStatus{
101+
UpdatedReplicas: 1,
102+
AvailableReplicas: 1,
103+
Conditions: []appsv1.DeploymentCondition{
104+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, Reason: "NewReplicaSetAvailable"},
105+
},
106+
},
107+
},
108+
expected: operatorv1.OperatorCondition{
109+
Type: operatorv1.OperatorStatusTypeProgressing,
110+
Status: operatorv1.ConditionFalse,
111+
Reason: "AsExpected",
112+
},
113+
},
114+
}
115+
116+
for _, tt := range tests {
117+
t.Run(tt.name, func(t *testing.T) {
118+
got := DeploymentProgressingCondition(tt.deploy)
119+
if d := cmp.Diff(tt.expected, got); d != "" {
120+
t.Errorf("unexpected condition (-want +got):\n%s", d)
121+
}
122+
})
123+
}
124+
}
125+
126+
func TestHasDeploymentProgressed(t *testing.T) {
127+
tests := []struct {
128+
name string
129+
conditions []appsv1.DeploymentCondition
130+
want bool
131+
}{
132+
{
133+
name: "NewReplicaSetAvailable and True",
134+
conditions: []appsv1.DeploymentCondition{
135+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, Reason: "NewReplicaSetAvailable"},
136+
},
137+
want: true,
138+
},
139+
{
140+
name: "ReplicaSetUpdated and True",
141+
conditions: []appsv1.DeploymentCondition{
142+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, Reason: "ReplicaSetUpdated"},
143+
},
144+
want: false,
145+
},
146+
{
147+
name: "NewReplicaSetAvailable but False",
148+
conditions: []appsv1.DeploymentCondition{
149+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionFalse, Reason: "NewReplicaSetAvailable"},
150+
},
151+
want: false,
152+
},
153+
{
154+
name: "no conditions",
155+
conditions: nil,
156+
want: false,
157+
},
158+
{
159+
name: "unrelated condition only",
160+
conditions: []appsv1.DeploymentCondition{
161+
{Type: appsv1.DeploymentAvailable, Status: corev1.ConditionTrue},
162+
},
163+
want: false,
164+
},
165+
}
166+
for _, tt := range tests {
167+
t.Run(tt.name, func(t *testing.T) {
168+
got := HasDeploymentProgressed(appsv1.DeploymentStatus{Conditions: tt.conditions})
169+
if got != tt.want {
170+
t.Errorf("got %v, want %v", got, tt.want)
171+
}
172+
})
173+
}
174+
}
175+
176+
func TestHasDeploymentTimedOutProgressing(t *testing.T) {
177+
tests := []struct {
178+
name string
179+
conditions []appsv1.DeploymentCondition
180+
wantMessage string
181+
wantTimedOut bool
182+
}{
183+
{
184+
name: "ProgressDeadlineExceeded",
185+
conditions: []appsv1.DeploymentCondition{
186+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionFalse, Reason: "ProgressDeadlineExceeded", Message: "timed out"},
187+
},
188+
wantMessage: "timed out",
189+
wantTimedOut: true,
190+
},
191+
{
192+
name: "ProgressDeadlineExceeded but True status",
193+
conditions: []appsv1.DeploymentCondition{
194+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, Reason: "ProgressDeadlineExceeded", Message: "timed out"},
195+
},
196+
wantMessage: "timed out",
197+
wantTimedOut: false,
198+
},
199+
{
200+
name: "normal progressing",
201+
conditions: []appsv1.DeploymentCondition{
202+
{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, Reason: "ReplicaSetUpdated", Message: "progressing"},
203+
},
204+
wantMessage: "progressing",
205+
wantTimedOut: false,
206+
},
207+
{
208+
name: "no conditions",
209+
conditions: nil,
210+
wantMessage: "",
211+
wantTimedOut: false,
212+
},
213+
}
214+
for _, tt := range tests {
215+
t.Run(tt.name, func(t *testing.T) {
216+
gotMsg, gotTimedOut := HasDeploymentTimedOutProgressing(appsv1.DeploymentStatus{Conditions: tt.conditions})
217+
if gotMsg != tt.wantMessage {
218+
t.Errorf("message: got %q, want %q", gotMsg, tt.wantMessage)
219+
}
220+
if gotTimedOut != tt.wantTimedOut {
221+
t.Errorf("timedOut: got %v, want %v", gotTimedOut, tt.wantTimedOut)
222+
}
223+
})
224+
}
225+
}

pkg/operator/apiserver/controller/workload/workload.go

Lines changed: 11 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -301,34 +301,16 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
301301

302302
desiredReplicas := ptr.Deref(workload.Spec.Replicas, 1)
303303

304-
// If the workload is up to date, then we are no longer progressing
305-
workloadAtHighestGeneration := workload.ObjectMeta.Generation == workload.Status.ObservedGeneration
306-
// Update is done when all pods have been updated to the latest revision
307-
// and the deployment controller has reported NewReplicaSetAvailable
308-
workloadIsBeingUpdated := !workloadAtHighestGeneration || !hasDeploymentProgressed(workload.Status)
309-
workloadIsBeingUpdatedTooLong := v1helpers.IsUpdatingTooLong(previousStatus, *deploymentProgressingCondition.Type)
310-
if !workloadAtHighestGeneration {
311-
deploymentProgressingCondition = deploymentProgressingCondition.
312-
WithStatus(operatorv1.ConditionTrue).
313-
WithReason("NewGeneration").
314-
WithMessage(fmt.Sprintf("deployment/%s.%s: observed generation is %d, desired generation is %d.", workload.Name, c.targetNamespace, workload.Status.ObservedGeneration, workload.ObjectMeta.Generation))
315-
} else if workloadIsBeingUpdated {
316-
deploymentProgressingCondition = deploymentProgressingCondition.
317-
WithStatus(operatorv1.ConditionTrue).
318-
WithReason("PodsUpdating").
319-
WithMessage(fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest generation and %d/%d pods are available", workload.Name, c.targetNamespace, workload.Status.UpdatedReplicas, desiredReplicas, workload.Status.AvailableReplicas, desiredReplicas))
320-
} else {
321-
// Terminating pods don't account for any of the other status fields but
322-
// still can exist in a state when they are accepting connections and would
323-
// contribute to unexpected behavior when we report Progressing=False.
324-
// The case of too many pods might occur for example if `TerminationGracePeriodSeconds` is set
325-
//
326-
// The workload should ensure this does not happen by using for example EnsureAtMostOnePodPerNode
327-
// so that the old pods terminate before the new ones are started.
328-
deploymentProgressingCondition = deploymentProgressingCondition.
329-
WithStatus(operatorv1.ConditionFalse).
330-
WithReason("AsExpected")
331-
}
304+
// Update is done when the deployment controller has reported NewReplicaSetAvailable.
305+
// Checking the current vs. observed generation here is not possible since we don't want to be Progressing on scaling.
306+
progressingCond := deployment.DeploymentProgressingCondition(workload)
307+
_, workloadIsBeingUpdatedTooLong := deployment.HasDeploymentTimedOutProgressing(workload.Status)
308+
workloadIsBeingUpdated := !deployment.HasDeploymentProgressed(workload.Status) && !workloadIsBeingUpdatedTooLong
309+
310+
deploymentProgressingCondition = deploymentProgressingCondition.
311+
WithStatus(progressingCond.Status).
312+
WithReason(progressingCond.Reason).
313+
WithMessage(progressingCond.Message)
332314

333315
// During a rollout the default maxSurge (25%) will allow the available
334316
// replicas to temporarily exceed the desired replica count. If this were
@@ -354,6 +336,7 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
354336
// if the deployment is all available and at the expected generation, then update the version to the latest
355337
// when we update, the image pull spec should immediately be different, which should immediately cause a deployment rollout
356338
// which should immediately result in a deployment generation diff, which should cause this block to be skipped until it is ready.
339+
workloadAtHighestGeneration := workload.ObjectMeta.Generation == workload.Status.ObservedGeneration
357340
workloadHasAllPodsUpdated := workload.Status.UpdatedReplicas == desiredReplicas
358341
if workloadAtHighestGeneration && workloadHasAllPodsAvailable && workloadHasAllPodsUpdated && operatorConfigAtHighestGeneration {
359342
c.versionRecorder.SetVersion(c.constructOperandNameFor(workload.Name), c.targetOperandVersion)
@@ -373,17 +356,6 @@ func (c *Controller) constructOperandNameFor(name string) string {
373356
return name
374357
}
375358

376-
// hasDeploymentProgressed returns true if the deployment reports NewReplicaSetAvailable
377-
// via the DeploymentProgressing condition
378-
func hasDeploymentProgressed(status appsv1.DeploymentStatus) bool {
379-
for _, cond := range status.Conditions {
380-
if cond.Type == appsv1.DeploymentProgressing {
381-
return cond.Status == corev1.ConditionTrue && cond.Reason == "NewReplicaSetAvailable"
382-
}
383-
}
384-
return false
385-
}
386-
387359
// EnsureAtMostOnePodPerNode updates the deployment spec to prevent more than
388360
// one pod of a given replicaset from landing on a node. It accomplishes this
389361
// by adding a label on the template and updates the pod anti-affinity term to include that label.

0 commit comments

Comments
 (0)