44 "context"
55 "errors"
66 "fmt"
7+ "slices"
8+ "strings"
79
810 appsv1 "k8s.io/api/apps/v1"
911 corev1 "k8s.io/api/core/v1"
@@ -15,6 +17,7 @@ import (
1517 corev1listers "k8s.io/client-go/listers/core/v1"
1618 "k8s.io/client-go/tools/cache"
1719 "k8s.io/client-go/util/workqueue"
20+ "k8s.io/utils/ptr"
1821
1922 operatorv1 "github.com/openshift/api/operator/v1"
2023 applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1"
@@ -215,10 +218,7 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
215218 }()
216219
217220 if ! preconditionsReady {
218- var message string
219- for _ , err := range errs {
220- message = message + err .Error () + "\n "
221- }
221+ message := errMessage (errs )
222222 if len (message ) == 0 {
223223 message = "the operator didn't specify what preconditions are missing"
224224 }
@@ -246,24 +246,14 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
246246 return kerrors .NewAggregate (errs )
247247 }
248248
249- if len (errs ) > 0 {
250- message := ""
251- for _ , err := range errs {
252- message = message + err .Error () + "\n "
249+ defer func () {
250+ if len (errs ) > 0 {
251+ workloadDegradedCondition = workloadDegradedCondition .
252+ WithStatus (operatorv1 .ConditionTrue ).
253+ WithReason ("SyncError" ).
254+ WithMessage (errMessage (errs ))
253255 }
254- workloadDegradedCondition = workloadDegradedCondition .
255- WithStatus (operatorv1 .ConditionTrue ).
256- WithReason ("SyncError" ).
257- WithMessage (message )
258- } else if workload == nil {
259- workloadDegradedCondition = workloadDegradedCondition .
260- WithStatus (operatorv1 .ConditionTrue ).
261- WithReason ("NoDeployment" ).
262- WithMessage (fmt .Sprintf ("deployment/%s: could not be retrieved" , c .targetNamespace ))
263- } else {
264- workloadDegradedCondition = workloadDegradedCondition .
265- WithStatus (operatorv1 .ConditionFalse )
266- }
256+ }()
267257
268258 if workload == nil {
269259 message := fmt .Sprintf ("deployment/%s: could not be retrieved" , c .targetNamespace )
@@ -282,9 +272,16 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
282272 WithReason ("NoDeployment" ).
283273 WithMessage (message )
284274
275+ workloadDegradedCondition = workloadDegradedCondition .
276+ WithStatus (operatorv1 .ConditionTrue ).
277+ WithReason ("NoDeployment" ).
278+ WithMessage (message )
279+
285280 return kerrors .NewAggregate (errs )
286281 }
287282
283+ workloadDegradedCondition = workloadDegradedCondition .WithStatus (operatorv1 .ConditionFalse )
284+
288285 if workload .Status .AvailableReplicas == 0 {
289286 deploymentAvailableCondition = deploymentAvailableCondition .
290287 WithStatus (operatorv1 .ConditionFalse ).
@@ -296,15 +293,18 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
296293 WithReason ("AsExpected" )
297294 }
298295
299- desiredReplicas := int32 (1 )
300- if workload .Spec .Replicas != nil {
301- desiredReplicas = * (workload .Spec .Replicas )
302- }
296+ desiredReplicas := ptr .Deref (workload .Spec .Replicas , 1 )
303297
304298 // Update is done when the deployment controller has reported NewReplicaSetAvailable.
305299 // Checking the current vs. observed generation here is not possible since we don't want to be Progressing on scaling.
306300 progressTimedOutMessage , workloadIsBeingUpdatedTooLong := hasDeploymentTimedOutProgressing (workload .Status )
307301 workloadIsBeingUpdated := ! hasDeploymentProgressed (workload .Status ) && ! workloadIsBeingUpdatedTooLong
302+
303+ var progressDeadlineExceededMessage string
304+ if workloadIsBeingUpdatedTooLong {
305+ progressDeadlineExceededMessage = fmt .Sprintf ("deployment/%s.%s has timed out progressing: %s" , workload .Name , c .targetNamespace , progressTimedOutMessage )
306+ }
307+
308308 switch {
309309 case workloadIsBeingUpdated :
310310 deploymentProgressingCondition = deploymentProgressingCondition .
@@ -316,7 +316,7 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
316316 deploymentProgressingCondition = deploymentProgressingCondition .
317317 WithStatus (operatorv1 .ConditionFalse ).
318318 WithReason ("ProgressDeadlineExceeded" ).
319- WithMessage (fmt . Sprintf ( "deployment/%s.%s has timed out progressing: %s" , workload . Name , c . targetNamespace , progressTimedOutMessage ) )
319+ WithMessage (progressDeadlineExceededMessage )
320320
321321 default :
322322 // Terminating pods don't account for any of the other status fields but
@@ -331,20 +331,37 @@ func (c *Controller) updateOperatorStatus(ctx context.Context, previousStatus *o
331331 WithReason ("AsExpected" )
332332 }
333333
334- // Degraded is set when the deployment is not Available or timed out progressing.
335- // This will cause the operator to go Degraded during the initial rollout.
336334 switch {
337- case workload .Status .AvailableReplicas == 0 :
338- deploymentDegradedCondition = deploymentDegradedCondition .
339- WithStatus (operatorv1 .ConditionTrue ).
340- WithReason ("Unavailable" ).
341- WithMessage (fmt .Sprintf ("no %s.%s pods available on any node" , workload .Name , c .targetNamespace ))
342-
343335 case workloadIsBeingUpdatedTooLong :
344336 deploymentDegradedCondition = deploymentDegradedCondition .
345337 WithStatus (operatorv1 .ConditionTrue ).
346338 WithReason ("ProgressDeadlineExceeded" ).
347- WithMessage (fmt .Sprintf ("deployment/%s.%s has timed out progressing: %s" , workload .Name , c .targetNamespace , progressTimedOutMessage ))
339+ WithMessage (progressDeadlineExceededMessage )
340+
341+ case workload .Status .AvailableReplicas < desiredReplicas :
342+ failureMessages , err := hasFailingPods (workload , c .podsLister )
343+ if err != nil {
344+ errs = append (errs , err )
345+ }
346+ if len (failureMessages ) > 0 || (workload .Status .AvailableReplicas == 0 && ! workloadIsBeingUpdated ) {
347+ var failureDescription string
348+ if len (failureMessages ) > 0 {
349+ failureDescription = ` (` + strings .Join (failureMessages , ", " ) + `)`
350+ }
351+
352+ numUnavailable := desiredReplicas - workload .Status .AvailableReplicas
353+ message := fmt .Sprintf ("%d of %d requested instances are unavailable for %s.%s%s" ,
354+ numUnavailable , desiredReplicas , workload .Name , c .targetNamespace , failureDescription )
355+
356+ deploymentDegradedCondition = deploymentDegradedCondition .
357+ WithStatus (operatorv1 .ConditionTrue ).
358+ WithReason ("UnavailablePod" ).
359+ WithMessage (message )
360+ } else {
361+ deploymentDegradedCondition = deploymentDegradedCondition .
362+ WithStatus (operatorv1 .ConditionFalse ).
363+ WithReason ("AsExpected" )
364+ }
348365
349366 default :
350367 deploymentDegradedCondition = deploymentDegradedCondition .
@@ -399,6 +416,55 @@ func hasDeploymentTimedOutProgressing(status appsv1.DeploymentStatus) (string, b
399416 return "" , false
400417}
401418
419+ func hasFailingPods (workload * appsv1.Deployment , podsLister corev1listers.PodLister ) ([]string , error ) {
420+ pods , err := podsLister .Pods (workload .Namespace ).List (labels .SelectorFromSet (workload .Spec .Template .Labels ))
421+ if err != nil {
422+ return nil , err
423+ }
424+
425+ var failureMessages []string
426+ for _ , pod := range pods {
427+ for _ , c := range slices .Concat (pod .Status .ContainerStatuses , pod .Status .InitContainerStatuses ) {
428+ if c .Ready {
429+ continue
430+ }
431+
432+ var msg string
433+ switch {
434+ case c .State .Terminated != nil && c .State .Terminated .ExitCode != 0 :
435+ msg = fmt .Sprintf ("container %s terminated with exit code %d in pod %s" , c .Name , c .State .Terminated .ExitCode , pod .Name )
436+ case c .RestartCount > 1 :
437+ msg = fmt .Sprintf ("container %s is crashlooping in pod %s" , c .Name , pod .Name )
438+ case c .State .Waiting != nil && isFailureWaitingReason (c .State .Waiting .Reason ):
439+ msg = fmt .Sprintf ("container %s is waiting (%s) in pod %s" , c .Name , c .State .Waiting .Reason , pod .Name )
440+ }
441+ if len (msg ) > 0 {
442+ failureMessages = append (failureMessages , msg )
443+ }
444+ }
445+ }
446+ return failureMessages , nil
447+ }
448+
449+ func isFailureWaitingReason (reason string ) bool {
450+ switch reason {
451+ case "ContainerCreating" , "PodInitializing" , "RestartingAllContainers" :
452+ return false
453+ }
454+ return true
455+ }
456+
457+ func errMessage (errs []error ) string {
458+ var b strings.Builder
459+ for i , err := range errs {
460+ if i > 0 {
461+ b .WriteString ("\n " )
462+ }
463+ b .WriteString (err .Error ())
464+ }
465+ return b .String ()
466+ }
467+
402468// EnsureAtMostOnePodPerNode updates the deployment spec to prevent more than
403469// one pod of a given replicaset from landing on a node. It accomplishes this
404470// by adding a label on the template and updates the pod anti-affinity term to include that label.
0 commit comments