99
1010 opv1 "github.com/openshift/api/operator/v1"
1111 applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1"
12+ "github.com/openshift/library-go/pkg/apps/deployment"
1213 "github.com/openshift/library-go/pkg/controller/factory"
1314 "github.com/openshift/library-go/pkg/operator/events"
1415 "github.com/openshift/library-go/pkg/operator/management"
@@ -42,7 +43,7 @@ type ManifestHookFunc func(*opv1.OperatorSpec, []byte) ([]byte, error)
4243// This controller optionally produces the following conditions:
4344// <name>Available: indicates that the deployment controller was successfully deployed and at least one Deployment replica is available.
4445// <name>Progressing: indicates that the Deployment is in progress.
45- // <name>Degraded: produced when the sync() method returns an error.
46+ // <name>Degraded: true when the deployment has timed out progressing, when failing pods reduce availability (while not mid-rollout), or when sync returns an error.
4647type DeploymentController struct {
4748 // instanceName is the name to identify what instance this belongs too: FooDriver for instance
4849 instanceName string
@@ -193,7 +194,9 @@ func (c *DeploymentController) ToController() (factory.Controller, error) {
193194 ).ResyncEvery (
194195 time .Minute ,
195196 )
196- if slices .Contains (c .conditions , opv1 .OperatorStatusTypeDegraded ) {
197+ // When this controller owns <name>Degraded in status, do not use WithSyncDegradedOnError: reconcile would set
198+ // Degraded=False on every successful sync and clear deployment operand degradation (see openshift/library-go#2128).
199+ if ! slices .Contains (c .conditions , opv1 .OperatorStatusTypeDegraded ) {
197200 controller = controller .WithSyncDegradedOnError (c .operatorClient )
198201 }
199202 return controller .ToController (
@@ -207,7 +210,18 @@ func (c *DeploymentController) Name() string {
207210 return c .instanceName
208211}
209212
210- func (c * DeploymentController ) sync (ctx context.Context , syncContext factory.SyncContext ) error {
213+ func (c * DeploymentController ) sync (ctx context.Context , syncContext factory.SyncContext ) (err error ) {
214+ if slices .Contains (c .conditions , opv1 .OperatorStatusTypeDegraded ) {
215+ defer func () {
216+ if err != nil {
217+ applyErr := c .applySyncErrorDegraded (ctx , err )
218+ if applyErr != nil {
219+ klog .V (2 ).Infof ("failed to apply sync error degraded status: %v" , applyErr )
220+ }
221+ }
222+ }()
223+ }
224+
211225 opSpec , opStatus , _ , err := c .operatorClient .GetOperatorState ()
212226 if apierrors .IsNotFound (err ) && management .IsOperatorRemovable () {
213227 return nil
@@ -217,7 +231,7 @@ func (c *DeploymentController) sync(ctx context.Context, syncContext factory.Syn
217231 }
218232
219233 if opSpec .ManagementState == opv1 .Removed && management .IsOperatorRemovable () {
220- return c .syncDeleting (ctx , opSpec , opStatus , syncContext )
234+ return c .syncDeleting (ctx , opSpec , syncContext )
221235 }
222236
223237 if opSpec .ManagementState != opv1 .Managed {
@@ -229,11 +243,21 @@ func (c *DeploymentController) sync(ctx context.Context, syncContext factory.Syn
229243 return err
230244 }
231245 if management .IsOperatorRemovable () && meta .DeletionTimestamp != nil {
232- return c .syncDeleting (ctx , opSpec , opStatus , syncContext )
246+ return c .syncDeleting (ctx , opSpec , syncContext )
233247 }
234248 return c .syncManaged (ctx , opSpec , opStatus , syncContext )
235249}
236250
251+ func (c * DeploymentController ) applySyncErrorDegraded (ctx context.Context , syncErr error ) error {
252+ degraded := applyoperatorv1 .OperatorCondition ().
253+ WithType (c .instanceName + opv1 .OperatorStatusTypeDegraded ).
254+ WithStatus (opv1 .ConditionTrue ).
255+ WithReason ("SyncError" ).
256+ WithMessage (syncErr .Error ())
257+ status := applyoperatorv1 .OperatorStatus ().WithConditions (degraded )
258+ return c .operatorClient .ApplyOperatorStatus (ctx , c .controllerInstanceName , status )
259+ }
260+
237261func (c * DeploymentController ) syncManaged (ctx context.Context , opSpec * opv1.OperatorSpec , opStatus * opv1.OperatorStatus , syncContext factory.SyncContext ) error {
238262 klog .V (4 ).Infof ("syncManaged" )
239263
@@ -247,7 +271,7 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
247271 return err
248272 }
249273
250- deployment , _ , err := resourceapply .ApplyDeployment (
274+ deploymentObj , _ , err := resourceapply .ApplyDeployment (
251275 ctx ,
252276 c .kubeClient .AppsV1 (),
253277 syncContext .Recorder (),
@@ -257,57 +281,112 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
257281 if err != nil {
258282 return err
259283 }
260- // Create an OperatorStatusApplyConfiguration with generations
284+
261285 status := applyoperatorv1 .OperatorStatus ().
262286 WithGenerations (& applyoperatorv1.GenerationStatusApplyConfiguration {
263287 Group : ptr .To ("apps" ),
264288 Resource : ptr .To ("deployments" ),
265- Namespace : ptr .To (deployment .Namespace ),
266- Name : ptr .To (deployment .Name ),
267- LastGeneration : ptr .To (deployment .Generation ),
289+ Namespace : ptr .To (deploymentObj .Namespace ),
290+ Name : ptr .To (deploymentObj .Name ),
291+ LastGeneration : ptr .To (deploymentObj .Generation ),
268292 })
269293
270- // Set Available condition
294+ now := time .Now ()
295+
271296 if slices .Contains (c .conditions , opv1 .OperatorStatusTypeAvailable ) {
272297 availableCondition := applyoperatorv1 .
273298 OperatorCondition ().WithType (c .instanceName + opv1 .OperatorStatusTypeAvailable )
274- if deployment .Status .AvailableReplicas > 0 {
299+ if deploymentObj .Status .AvailableReplicas > 0 {
275300 availableCondition = availableCondition .
276301 WithStatus (opv1 .ConditionTrue ).
277302 WithMessage ("Deployment is available" ).
278303 WithReason ("AsExpected" )
279-
280304 } else {
281305 availableCondition = availableCondition .
282306 WithStatus (opv1 .ConditionFalse ).
283- WithMessage ( "Waiting for Deployment " ).
284- WithReason ( "Deploying" )
307+ WithReason ( "NoPod " ).
308+ WithMessage ( fmt . Sprintf ( "no %s.%s pods available on any node" , deploymentObj . Name , deploymentObj . Namespace ) )
285309 }
286310 status = status .WithConditions (availableCondition )
287311 }
288312
289- // Set Progressing condition
313+ desiredReplicas := ptr .Deref (deploymentObj .Spec .Replicas , 1 )
314+
315+ progressTimedOutMessage , workloadIsBeingUpdatedTooLong := hasDeploymentTimedOutProgressing (deploymentObj .Status )
316+ workloadIsBeingUpdated := ! hasDeploymentProgressed (deploymentObj .Status ) && ! workloadIsBeingUpdatedTooLong
317+
318+ var progressDeadlineExceededMessage string
319+ if workloadIsBeingUpdatedTooLong {
320+ progressDeadlineExceededMessage = fmt .Sprintf ("deployment/%s.%s has timed out progressing: %s" , deploymentObj .Name , deploymentObj .Namespace , progressTimedOutMessage )
321+ }
322+
290323 if slices .Contains (c .conditions , opv1 .OperatorStatusTypeProgressing ) {
291324 progressingCondition := applyoperatorv1 .OperatorCondition ().
292325 WithType (c .instanceName + opv1 .OperatorStatusTypeProgressing ).
293326 WithStatus (opv1 .ConditionFalse ).
294327 WithMessage ("Deployment is not progressing" ).
295328 WithReason ("AsExpected" )
296329
297- if ok , msg := isProgressing (deployment ); ok {
330+ switch {
331+ case workloadIsBeingUpdated :
298332 progressingCondition = progressingCondition .
299333 WithStatus (opv1 .ConditionTrue ).
300- WithMessage (msg ).
301- WithReason ("Deploying" )
334+ WithReason ("PodsUpdating" ).
335+ WithMessage (fmt .Sprintf ("deployment/%s.%s: %d/%d pods have been updated to the latest revision and %d/%d pods are available" , deploymentObj .Name , deploymentObj .Namespace , deploymentObj .Status .UpdatedReplicas , desiredReplicas , deploymentObj .Status .AvailableReplicas , desiredReplicas ))
336+ case workloadIsBeingUpdatedTooLong :
337+ progressingCondition = progressingCondition .
338+ WithStatus (opv1 .ConditionFalse ).
339+ WithReason ("ProgressDeadlineExceeded" ).
340+ WithMessage (progressDeadlineExceededMessage )
341+ default :
342+ progressingCondition = progressingCondition .
343+ WithStatus (opv1 .ConditionFalse ).
344+ WithReason ("AsExpected" )
345+ }
346+ status = status .WithConditions (progressingCondition )
347+ }
348+
349+ if slices .Contains (c .conditions , opv1 .OperatorStatusTypeDegraded ) {
350+ degradedCondition := applyoperatorv1 .OperatorCondition ().
351+ WithType (c .instanceName + opv1 .OperatorStatusTypeDegraded ).
352+ WithStatus (opv1 .ConditionFalse ).
353+ WithReason ("AsExpected" )
302354
303- // Degrade when operator is progressing too long.
304- // Only do this if we would continue to be in the Progressing state, otherwise, we'll never get out
305- if v1helpers .IsUpdatingTooLong (opStatus , c .instanceName + opv1 .OperatorStatusTypeProgressing ) {
306- return fmt .Errorf ("Deployment was progressing too long" )
355+ switch {
356+ case workloadIsBeingUpdatedTooLong :
357+ degradedCondition = degradedCondition .
358+ WithStatus (opv1 .ConditionTrue ).
359+ WithReason ("ProgressDeadlineExceeded" ).
360+ WithMessage (progressDeadlineExceededMessage )
361+
362+ case ! workloadIsBeingUpdated && deploymentObj .Status .AvailableReplicas < desiredReplicas :
363+ operandPods := c .listOperandPodsForDiagnostics (ctx , deploymentObj , syncContext )
364+ livePods := nonDeletingPods (operandPods )
365+ hasFailing := hasFailingPods (deploymentObj , livePods , now )
366+ if hasFailing || deploymentObj .Status .AvailableReplicas == 0 {
367+ containerMessages := deployment .ContainerMessagesForPods (deploymentObj , livePods )
368+ var failureDescription string
369+ if len (containerMessages ) > 0 {
370+ failureDescription = ` (` + strings .Join (containerMessages , ", " ) + `)`
371+ }
372+ numUnavailable := desiredReplicas - deploymentObj .Status .AvailableReplicas
373+ message := fmt .Sprintf ("%d of %d requested instances are unavailable for %s.%s%s" , numUnavailable , desiredReplicas , deploymentObj .Name , deploymentObj .Namespace , failureDescription )
374+ degradedCondition = degradedCondition .
375+ WithStatus (opv1 .ConditionTrue ).
376+ WithReason ("UnavailablePod" ).
377+ WithMessage (message )
378+ } else {
379+ degradedCondition = degradedCondition .
380+ WithStatus (opv1 .ConditionFalse ).
381+ WithReason ("AsExpected" )
307382 }
308- }
309383
310- status = status .WithConditions (progressingCondition )
384+ default :
385+ degradedCondition = degradedCondition .
386+ WithStatus (opv1 .ConditionFalse ).
387+ WithReason ("AsExpected" )
388+ }
389+ status = status .WithConditions (degradedCondition )
311390 }
312391
313392 return c .operatorClient .ApplyOperatorStatus (
@@ -317,7 +396,34 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
317396 )
318397}
319398
320- func (c * DeploymentController ) syncDeleting (ctx context.Context , opSpec * opv1.OperatorSpec , opStatus * opv1.OperatorStatus , syncContext factory.SyncContext ) error {
399+ // listOperandPodsForDiagnostics lists pods matched by the deployment selector for UnavailablePod diagnostics.
400+ // A nil selector, selector conversion errors, or API list errors are logged and recorded as warnings; no error is returned.
401+ func (c * DeploymentController ) listOperandPodsForDiagnostics (ctx context.Context , deploymentObj * appsv1.Deployment , syncContext factory.SyncContext ) []* corev1.Pod {
402+ if deploymentObj .Spec .Selector == nil {
403+ klog .Warningf ("deployment/%s/%s has no spec.selector, skipping pod diagnostics" , deploymentObj .Namespace , deploymentObj .Name )
404+ syncContext .Recorder ().Warningf ("DeploymentSelectorMissing" , "deployment %s/%s has no spec.selector, skipping pod diagnostics" , deploymentObj .Namespace , deploymentObj .Name )
405+ return nil
406+ }
407+ selector , err := metav1 .LabelSelectorAsSelector (deploymentObj .Spec .Selector )
408+ if err != nil {
409+ klog .Warningf ("deployment/%s/%s has invalid spec.selector: %v" , deploymentObj .Namespace , deploymentObj .Name , err )
410+ syncContext .Recorder ().Warningf ("DeploymentSelectorInvalid" , "deployment %s/%s has invalid spec.selector: %v" , deploymentObj .Namespace , deploymentObj .Name , err )
411+ return nil
412+ }
413+ podList , err := c .kubeClient .CoreV1 ().Pods (deploymentObj .Namespace ).List (ctx , metav1.ListOptions {LabelSelector : selector .String ()})
414+ if err != nil {
415+ klog .Warningf ("deployment/%s/%s: pod list for diagnostics failed: %v" , deploymentObj .Namespace , deploymentObj .Name , err )
416+ syncContext .Recorder ().Warningf ("PodListFailed" , "listing pods for deployment %s/%s diagnostics: %v" , deploymentObj .Namespace , deploymentObj .Name , err )
417+ return nil
418+ }
419+ out := make ([]* corev1.Pod , len (podList .Items ))
420+ for i := range podList .Items {
421+ out [i ] = & podList .Items [i ]
422+ }
423+ return out
424+ }
425+
426+ func (c * DeploymentController ) syncDeleting (ctx context.Context , opSpec * opv1.OperatorSpec , syncContext factory.SyncContext ) error {
321427 klog .V (4 ).Infof ("syncDeleting" )
322428 required , err := c .getDeployment (opSpec )
323429 if err != nil {
@@ -356,36 +462,70 @@ func (c *DeploymentController) getDeployment(opSpec *opv1.OperatorSpec) (*appsv1
356462 return required , nil
357463}
358464
359- func isProgressing (deployment * appsv1.Deployment ) (bool , string ) {
465+ // hasDeploymentProgressed returns true if the deployment reports NewReplicaSetAvailable
466+ // via the DeploymentProgressing condition.
467+ func hasDeploymentProgressed (status appsv1.DeploymentStatus ) bool {
468+ for _ , cond := range status .Conditions {
469+ if cond .Type == appsv1 .DeploymentProgressing {
470+ return cond .Status == corev1 .ConditionTrue && cond .Reason == "NewReplicaSetAvailable"
471+ }
472+ }
473+ return false
474+ }
360475
361- var deploymentExpectedReplicas int32
362- if deployment .Spec .Replicas != nil {
363- deploymentExpectedReplicas = * deployment .Spec .Replicas
476+ // hasDeploymentTimedOutProgressing returns true if the deployment reports ProgressDeadlineExceeded.
477+ // The function returns the Progressing condition message as the first return value.
478+ func hasDeploymentTimedOutProgressing (status appsv1.DeploymentStatus ) (string , bool ) {
479+ for _ , cond := range status .Conditions {
480+ if cond .Type == appsv1 .DeploymentProgressing {
481+ return cond .Message , cond .Status == corev1 .ConditionFalse && cond .Reason == "ProgressDeadlineExceeded"
482+ }
364483 }
484+ return "" , false
485+ }
365486
366- switch {
367- case deployment .Generation != deployment .Status .ObservedGeneration :
368- return true , "Waiting for Deployment to act on changes"
369- case hasFinishedProgressing (deployment ):
370- return false , ""
371- case deployment .Status .UnavailableReplicas > 0 :
372- return true , "Waiting for Deployment to deploy pods"
373- case deployment .Status .UpdatedReplicas < deploymentExpectedReplicas :
374- return true , "Waiting for Deployment to update pods"
375- case deployment .Status .AvailableReplicas < deploymentExpectedReplicas :
376- return true , "Waiting for Deployment to deploy pods"
487+ // nonDeletingPods returns pods that are not terminating (no deletion timestamp).
488+ func nonDeletingPods (pods []* corev1.Pod ) []* corev1.Pod {
489+ out := make ([]* corev1.Pod , 0 , len (pods ))
490+ for _ , p := range pods {
491+ if p != nil && p .DeletionTimestamp == nil {
492+ out = append (out , p )
493+ }
377494 }
378- return false , ""
495+ return out
379496}
380497
381- func hasFinishedProgressing (deployment * appsv1.Deployment ) bool {
382- // Deployment whose rollout is complete gets Progressing condition with Reason NewReplicaSetAvailable condition.
383- // https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#complete-deployment
384- // Any subsequent missing replicas (e.g. caused by a node reboot) must not not change the Progressing condition.
385- for _ , cond := range deployment .Status .Conditions {
386- if cond .Type == appsv1 .DeploymentProgressing {
387- return cond .Status == corev1 .ConditionTrue && cond .Reason == "NewReplicaSetAvailable"
498+ func hasFailingPods (workload * appsv1.Deployment , pods []* corev1.Pod , now time.Time ) bool {
499+ progressDeadline := time .Duration (ptr .Deref (workload .Spec .ProgressDeadlineSeconds , 600 )) * time .Second
500+ minReady := time .Duration (workload .Spec .MinReadySeconds ) * time .Second
501+
502+ for _ , pod := range pods {
503+ if pod .DeletionTimestamp != nil {
504+ continue
505+ }
506+
507+ readyCond := findPodReadyCondition (pod )
508+ deadline := pod .CreationTimestamp .Time .Add (progressDeadline )
509+
510+ if (readyCond == nil || readyCond .Status != corev1 .ConditionTrue ) && now .After (deadline ) {
511+ return true
512+ }
513+
514+ if minReady > 0 && readyCond != nil && readyCond .Status == corev1 .ConditionTrue {
515+ isRelevant := now .After (pod .CreationTimestamp .Time .Add (progressDeadline + minReady ))
516+ if isRelevant && now .Sub (readyCond .LastTransitionTime .Time ) < minReady {
517+ return true
518+ }
388519 }
389520 }
390521 return false
391522}
523+
524+ func findPodReadyCondition (pod * corev1.Pod ) * corev1.PodCondition {
525+ for i := range pod .Status .Conditions {
526+ if pod .Status .Conditions [i ].Type == corev1 .PodReady {
527+ return & pod .Status .Conditions [i ]
528+ }
529+ }
530+ return nil
531+ }
0 commit comments