Skip to content

Commit 08c8c15

Browse files
committed
controller/workload: Rework Degraded/Progressing conditions
To enable scaling not to automatically set Progressing/Degraded, conditions handling is aligned so that the Deployment generation is not consulted any more. Degraded only happens when the Deployment is not Available or it times out progressing. Progressing is set to True when the Deployment is not in NewReplicaSetAvailable and it hasn't timed out progressing.
1 parent d2db42c commit 08c8c15

2 files changed

Lines changed: 268 additions & 90 deletions

File tree

pkg/operator/apiserver/controller/workload/workload.go

Lines changed: 45 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"context"
55
"errors"
66
"fmt"
7-
"strings"
87

98
appsv1 "k8s.io/api/apps/v1"
109
corev1 "k8s.io/api/core/v1"
@@ -19,7 +18,6 @@ import (
1918

2019
operatorv1 "github.com/openshift/api/operator/v1"
2120
applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1"
22-
"github.com/openshift/library-go/pkg/apps/deployment"
2321
"github.com/openshift/library-go/pkg/controller/factory"
2422
"github.com/openshift/library-go/pkg/operator/events"
2523
"github.com/openshift/library-go/pkg/operator/status"
@@ -291,7 +289,7 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
291289
deploymentAvailableCondition = deploymentAvailableCondition.
292290
WithStatus(operatorv1.ConditionFalse).
293291
WithReason("NoPod").
294-
WithMessage(fmt.Sprintf("no %s.%s pods available on any node.", workload.Name, c.targetNamespace))
292+
WithMessage(fmt.Sprintf("no %s.%s pods available on any node", workload.Name, c.targetNamespace))
295293
} else {
296294
deploymentAvailableCondition = deploymentAvailableCondition.
297295
WithStatus(operatorv1.ConditionTrue).
@@ -303,23 +301,24 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
303301
desiredReplicas = *(workload.Spec.Replicas)
304302
}
305303

306-
// If the workload is up to date, then we are no longer progressing
307-
workloadAtHighestGeneration := workload.ObjectMeta.Generation == workload.Status.ObservedGeneration
308-
// Update is done when all pods have been updated to the latest revision
309-
// and the deployment controller has reported NewReplicaSetAvailable
310-
workloadIsBeingUpdated := !workloadAtHighestGeneration || !hasDeploymentProgressed(workload.Status)
311-
workloadIsBeingUpdatedTooLong := v1helpers.IsUpdatingTooLong(previousStatus, *deploymentProgressingCondition.Type)
312-
if !workloadAtHighestGeneration {
313-
deploymentProgressingCondition = deploymentProgressingCondition.
314-
WithStatus(operatorv1.ConditionTrue).
315-
WithReason("NewGeneration").
316-
WithMessage(fmt.Sprintf("deployment/%s.%s: observed generation is %d, desired generation is %d.", workload.Name, c.targetNamespace, workload.Status.ObservedGeneration, workload.ObjectMeta.Generation))
317-
} else if workloadIsBeingUpdated {
304+
// Update is done when the deployment controller has reported NewReplicaSetAvailable.
305+
// Checking the current vs. observed generation here is not possible since we don't want to be Progressing on scaling.
306+
progressTimedOutMessage, workloadIsBeingUpdatedTooLong := hasDeploymentTimedOutProgressing(workload.Status)
307+
workloadIsBeingUpdated := !hasDeploymentProgressed(workload.Status) && !workloadIsBeingUpdatedTooLong
308+
switch {
309+
case workloadIsBeingUpdated:
318310
deploymentProgressingCondition = deploymentProgressingCondition.
319311
WithStatus(operatorv1.ConditionTrue).
320312
WithReason("PodsUpdating").
321-
WithMessage(fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest generation and %d/%d pods are available", workload.Name, c.targetNamespace, workload.Status.UpdatedReplicas, desiredReplicas, workload.Status.AvailableReplicas, desiredReplicas))
322-
} else {
313+
WithMessage(fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest revision and %d/%d pods are available", workload.Name, c.targetNamespace, workload.Status.UpdatedReplicas, desiredReplicas, workload.Status.AvailableReplicas, desiredReplicas))
314+
315+
case workloadIsBeingUpdatedTooLong:
316+
deploymentProgressingCondition = deploymentProgressingCondition.
317+
WithStatus(operatorv1.ConditionFalse).
318+
WithReason("ProgressDeadlineExceeded").
319+
WithMessage(fmt.Sprintf("deployment/%s.%s has timed out progressing: %s", workload.Name, c.targetNamespace, progressTimedOutMessage))
320+
321+
default:
323322
// Terminating pods don't account for any of the other status fields but
324323
// still can exist in a state when they are accepting connections and would
325324
// contribute to unexpected behavior when we report Progressing=False.
@@ -332,22 +331,22 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
332331
WithReason("AsExpected")
333332
}
334333

335-
// During a rollout the default maxSurge (25%) will allow the available
336-
// replicas to temporarily exceed the desired replica count. If this were
337-
// to occur, the operator should not report degraded.
338-
workloadHasAllPodsAvailable := workload.Status.AvailableReplicas >= desiredReplicas
339-
if !workloadHasAllPodsAvailable && (!workloadIsBeingUpdated || workloadIsBeingUpdatedTooLong) {
340-
numNonAvailablePods := desiredReplicas - workload.Status.AvailableReplicas
334+
// Degraded is set when the deployment is not Available or timed out progressing.
335+
// This will cause the operator to go Degraded during the initial rollout.
336+
switch {
337+
case workload.Status.AvailableReplicas == 0:
341338
deploymentDegradedCondition = deploymentDegradedCondition.
342339
WithStatus(operatorv1.ConditionTrue).
343-
WithReason("UnavailablePod")
344-
podContainersStatus, err := deployment.PodContainersStatus(workload, c.podsLister)
345-
if err != nil {
346-
podContainersStatus = []string{fmt.Sprintf("failed to get pod containers details: %v", err)}
347-
}
340+
WithReason("Unavailable").
341+
WithMessage(fmt.Sprintf("no %s.%s pods available on any node", workload.Name, c.targetNamespace))
342+
343+
case workloadIsBeingUpdatedTooLong:
348344
deploymentDegradedCondition = deploymentDegradedCondition.
349-
WithMessage(fmt.Sprintf("%v of %v requested instances are unavailable for %s.%s (%s)", numNonAvailablePods, desiredReplicas, workload.Name, c.targetNamespace, strings.Join(podContainersStatus, ", ")))
350-
} else {
345+
WithStatus(operatorv1.ConditionTrue).
346+
WithReason("ProgressDeadlineExceeded").
347+
WithMessage(fmt.Sprintf("deployment/%s.%s has timed out progressing: %s", workload.Name, c.targetNamespace, progressTimedOutMessage))
348+
349+
default:
351350
deploymentDegradedCondition = deploymentDegradedCondition.
352351
WithStatus(operatorv1.ConditionFalse).
353352
WithReason("AsExpected")
@@ -356,8 +355,11 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
356355
// if the deployment is all available and at the expected generation, then update the version to the latest
357356
// when we update, the image pull spec should immediately be different, which should immediately cause a deployment rollout
358357
// which should immediately result in a deployment generation diff, which should cause this block to be skipped until it is ready.
359-
workloadHasAllPodsUpdated := workload.Status.UpdatedReplicas == desiredReplicas
360-
if workloadAtHighestGeneration && workloadHasAllPodsAvailable && workloadHasAllPodsUpdated && operatorConfigAtHighestGeneration {
358+
if operatorConfigAtHighestGeneration &&
359+
workload.ObjectMeta.Generation == workload.Status.ObservedGeneration &&
360+
workload.Status.AvailableReplicas == desiredReplicas &&
361+
workload.Status.UpdatedReplicas == desiredReplicas {
362+
361363
c.versionRecorder.SetVersion(c.constructOperandNameFor(workload.Name), c.targetOperandVersion)
362364
}
363365

@@ -386,6 +388,17 @@ func hasDeploymentProgressed(status appsv1.DeploymentStatus) bool {
386388
return false
387389
}
388390

391+
// hasDeploymentTimedOutProgressing returns true if the deployment reports ProgressDeadlineExceeded.
392+
// The function returns the Progressing condition message as the first return value.
393+
func hasDeploymentTimedOutProgressing(status appsv1.DeploymentStatus) (string, bool) {
394+
for _, cond := range status.Conditions {
395+
if cond.Type == appsv1.DeploymentProgressing {
396+
return cond.Message, cond.Status == corev1.ConditionFalse && cond.Reason == "ProgressDeadlineExceeded"
397+
}
398+
}
399+
return "", false
400+
}
401+
389402
// EnsureAtMostOnePodPerNode updates the deployment spec to prevent more than
390403
// one pod of a given replicaset from landing on a node. It accomplishes this
391404
// by adding a label on the template and updates the pod anti-affinity term to include that label.

0 commit comments

Comments
 (0)