Skip to content

Commit 7d80de2

Browse files
committed
OCPBUGS-62629: reflect workload controller changes in the deployment controller
Contains the degraded/progressing condition changes made to the workload controller from openshift#2128. Signed-off-by: Thomas Jungblut <tjungblu@redhat.com>
1 parent d6efc4e commit 7d80de2

5 files changed

Lines changed: 305 additions & 64 deletions

File tree

pkg/apps/deployment/status.go

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,20 @@ func PodContainersStatus(deployment *appsv1.Deployment, podClient corelistersv1.
1616
if err != nil {
1717
return nil, err
1818
}
19+
return ContainerMessagesForPods(deployment, deploymentPods), nil
20+
}
21+
22+
// ContainerMessagesForPods returns human-readable container status messages for the given pods.
23+
// If pods is empty, a single message is included describing that no pods matched (using the deployment template labels).
24+
func ContainerMessagesForPods(deployment *appsv1.Deployment, pods []*corev1.Pod) []string {
25+
containerStates := containerStatusMessagesForPods(pods)
26+
if len(pods) == 0 {
27+
containerStates = append(containerStates, fmt.Sprintf("no pods found with labels %q", labels.SelectorFromSet(deployment.Spec.Template.Labels).String()))
28+
}
29+
return containerStates
30+
}
31+
32+
func containerStatusMessagesForPods(deploymentPods []*corev1.Pod) []string {
1933
containerStates := []string{}
2034

2135
for i := range deploymentPods {
@@ -66,10 +80,7 @@ func PodContainersStatus(deployment *appsv1.Deployment, podClient corelistersv1.
6680
}
6781
}
6882

69-
if len(deploymentPods) == 0 {
70-
containerStates = append(containerStates, fmt.Sprintf("no pods found with labels %q", labels.SelectorFromSet(deployment.Spec.Template.Labels).String()))
71-
}
72-
return containerStates, nil
83+
return containerStates
7384
}
7485

7586
func containerPlural(c int, crashloop bool) string {

pkg/operator/csi/csidrivercontrollerservicecontroller/csi_driver_controller_service_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ import (
6161
//
6262
// <name>Available: indicates that the CSI Controller Service was successfully deployed and at least one Deployment replica is available.
6363
// <name>Progressing: indicates that the CSI Controller Service is being deployed.
64-
// <name>Degraded: produced when the sync() method returns an error.
64+
// <name>Degraded: true when the deployment has timed out progressing, when failing pods reduce availability (while not mid-rollout), or when sync returns an error.
6565

6666
func NewCSIDriverControllerServiceController(
6767
name string,

pkg/operator/deploymentcontroller/deployment_controller.go

Lines changed: 189 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99

1010
opv1 "github.com/openshift/api/operator/v1"
1111
applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1"
12+
"github.com/openshift/library-go/pkg/apps/deployment"
1213
"github.com/openshift/library-go/pkg/controller/factory"
1314
"github.com/openshift/library-go/pkg/operator/events"
1415
"github.com/openshift/library-go/pkg/operator/management"
@@ -42,7 +43,7 @@ type ManifestHookFunc func(*opv1.OperatorSpec, []byte) ([]byte, error)
4243
// This controller optionally produces the following conditions:
4344
// <name>Available: indicates that the deployment controller was successfully deployed and at least one Deployment replica is available.
4445
// <name>Progressing: indicates that the Deployment is in progress.
45-
// <name>Degraded: produced when the sync() method returns an error.
46+
// <name>Degraded: true when the deployment has timed out progressing, when failing pods reduce availability (while not mid-rollout), or when sync returns an error.
4647
type DeploymentController struct {
4748
// instanceName is the name to identify what instance this belongs too: FooDriver for instance
4849
instanceName string
@@ -193,7 +194,9 @@ func (c *DeploymentController) ToController() (factory.Controller, error) {
193194
).ResyncEvery(
194195
time.Minute,
195196
)
196-
if slices.Contains(c.conditions, opv1.OperatorStatusTypeDegraded) {
197+
// When this controller owns <name>Degraded in status, do not use WithSyncDegradedOnError: reconcile would set
198+
// Degraded=False on every successful sync and clear deployment operand degradation (see openshift/library-go#2128).
199+
if !slices.Contains(c.conditions, opv1.OperatorStatusTypeDegraded) {
197200
controller = controller.WithSyncDegradedOnError(c.operatorClient)
198201
}
199202
return controller.ToController(
@@ -207,7 +210,18 @@ func (c *DeploymentController) Name() string {
207210
return c.instanceName
208211
}
209212

210-
func (c *DeploymentController) sync(ctx context.Context, syncContext factory.SyncContext) error {
213+
func (c *DeploymentController) sync(ctx context.Context, syncContext factory.SyncContext) (err error) {
214+
if slices.Contains(c.conditions, opv1.OperatorStatusTypeDegraded) {
215+
defer func() {
216+
if err != nil {
217+
applyErr := c.applySyncErrorDegraded(ctx, err)
218+
if applyErr != nil {
219+
klog.V(2).Infof("failed to apply sync error degraded status: %v", applyErr)
220+
}
221+
}
222+
}()
223+
}
224+
211225
opSpec, opStatus, _, err := c.operatorClient.GetOperatorState()
212226
if apierrors.IsNotFound(err) && management.IsOperatorRemovable() {
213227
return nil
@@ -217,7 +231,7 @@ func (c *DeploymentController) sync(ctx context.Context, syncContext factory.Syn
217231
}
218232

219233
if opSpec.ManagementState == opv1.Removed && management.IsOperatorRemovable() {
220-
return c.syncDeleting(ctx, opSpec, opStatus, syncContext)
234+
return c.syncDeleting(ctx, opSpec, syncContext)
221235
}
222236

223237
if opSpec.ManagementState != opv1.Managed {
@@ -229,11 +243,21 @@ func (c *DeploymentController) sync(ctx context.Context, syncContext factory.Syn
229243
return err
230244
}
231245
if management.IsOperatorRemovable() && meta.DeletionTimestamp != nil {
232-
return c.syncDeleting(ctx, opSpec, opStatus, syncContext)
246+
return c.syncDeleting(ctx, opSpec, syncContext)
233247
}
234248
return c.syncManaged(ctx, opSpec, opStatus, syncContext)
235249
}
236250

251+
func (c *DeploymentController) applySyncErrorDegraded(ctx context.Context, syncErr error) error {
252+
degraded := applyoperatorv1.OperatorCondition().
253+
WithType(c.instanceName + opv1.OperatorStatusTypeDegraded).
254+
WithStatus(opv1.ConditionTrue).
255+
WithReason("SyncError").
256+
WithMessage(syncErr.Error())
257+
status := applyoperatorv1.OperatorStatus().WithConditions(degraded)
258+
return c.operatorClient.ApplyOperatorStatus(ctx, c.controllerInstanceName, status)
259+
}
260+
237261
func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.OperatorSpec, opStatus *opv1.OperatorStatus, syncContext factory.SyncContext) error {
238262
klog.V(4).Infof("syncManaged")
239263

@@ -247,7 +271,7 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
247271
return err
248272
}
249273

250-
deployment, _, err := resourceapply.ApplyDeployment(
274+
deploymentObj, _, err := resourceapply.ApplyDeployment(
251275
ctx,
252276
c.kubeClient.AppsV1(),
253277
syncContext.Recorder(),
@@ -257,57 +281,112 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
257281
if err != nil {
258282
return err
259283
}
260-
// Create an OperatorStatusApplyConfiguration with generations
284+
261285
status := applyoperatorv1.OperatorStatus().
262286
WithGenerations(&applyoperatorv1.GenerationStatusApplyConfiguration{
263287
Group: ptr.To("apps"),
264288
Resource: ptr.To("deployments"),
265-
Namespace: ptr.To(deployment.Namespace),
266-
Name: ptr.To(deployment.Name),
267-
LastGeneration: ptr.To(deployment.Generation),
289+
Namespace: ptr.To(deploymentObj.Namespace),
290+
Name: ptr.To(deploymentObj.Name),
291+
LastGeneration: ptr.To(deploymentObj.Generation),
268292
})
269293

270-
// Set Available condition
294+
now := time.Now()
295+
271296
if slices.Contains(c.conditions, opv1.OperatorStatusTypeAvailable) {
272297
availableCondition := applyoperatorv1.
273298
OperatorCondition().WithType(c.instanceName + opv1.OperatorStatusTypeAvailable)
274-
if deployment.Status.AvailableReplicas > 0 {
299+
if deploymentObj.Status.AvailableReplicas > 0 {
275300
availableCondition = availableCondition.
276301
WithStatus(opv1.ConditionTrue).
277302
WithMessage("Deployment is available").
278303
WithReason("AsExpected")
279-
280304
} else {
281305
availableCondition = availableCondition.
282306
WithStatus(opv1.ConditionFalse).
283-
WithMessage("Waiting for Deployment").
284-
WithReason("Deploying")
307+
WithReason("NoPod").
308+
WithMessage(fmt.Sprintf("no %s.%s pods available on any node", deploymentObj.Name, deploymentObj.Namespace))
285309
}
286310
status = status.WithConditions(availableCondition)
287311
}
288312

289-
// Set Progressing condition
313+
desiredReplicas := ptr.Deref(deploymentObj.Spec.Replicas, 1)
314+
315+
progressTimedOutMessage, workloadIsBeingUpdatedTooLong := hasDeploymentTimedOutProgressing(deploymentObj.Status)
316+
workloadIsBeingUpdated := !hasDeploymentProgressed(deploymentObj.Status) && !workloadIsBeingUpdatedTooLong
317+
318+
var progressDeadlineExceededMessage string
319+
if workloadIsBeingUpdatedTooLong {
320+
progressDeadlineExceededMessage = fmt.Sprintf("deployment/%s.%s has timed out progressing: %s", deploymentObj.Name, deploymentObj.Namespace, progressTimedOutMessage)
321+
}
322+
290323
if slices.Contains(c.conditions, opv1.OperatorStatusTypeProgressing) {
291324
progressingCondition := applyoperatorv1.OperatorCondition().
292325
WithType(c.instanceName + opv1.OperatorStatusTypeProgressing).
293326
WithStatus(opv1.ConditionFalse).
294327
WithMessage("Deployment is not progressing").
295328
WithReason("AsExpected")
296329

297-
if ok, msg := isProgressing(deployment); ok {
330+
switch {
331+
case workloadIsBeingUpdated:
298332
progressingCondition = progressingCondition.
299333
WithStatus(opv1.ConditionTrue).
300-
WithMessage(msg).
301-
WithReason("Deploying")
334+
WithReason("PodsUpdating").
335+
WithMessage(fmt.Sprintf("deployment/%s.%s: %d/%d pods have been updated to the latest revision and %d/%d pods are available", deploymentObj.Name, deploymentObj.Namespace, deploymentObj.Status.UpdatedReplicas, desiredReplicas, deploymentObj.Status.AvailableReplicas, desiredReplicas))
336+
case workloadIsBeingUpdatedTooLong:
337+
progressingCondition = progressingCondition.
338+
WithStatus(opv1.ConditionFalse).
339+
WithReason("ProgressDeadlineExceeded").
340+
WithMessage(progressDeadlineExceededMessage)
341+
default:
342+
progressingCondition = progressingCondition.
343+
WithStatus(opv1.ConditionFalse).
344+
WithReason("AsExpected")
345+
}
346+
status = status.WithConditions(progressingCondition)
347+
}
348+
349+
if slices.Contains(c.conditions, opv1.OperatorStatusTypeDegraded) {
350+
degradedCondition := applyoperatorv1.OperatorCondition().
351+
WithType(c.instanceName + opv1.OperatorStatusTypeDegraded).
352+
WithStatus(opv1.ConditionFalse).
353+
WithReason("AsExpected")
302354

303-
// Degrade when operator is progressing too long.
304-
// Only do this if we would continue to be in the Progressing state, otherwise, we'll never get out
305-
if v1helpers.IsUpdatingTooLong(opStatus, c.instanceName+opv1.OperatorStatusTypeProgressing) {
306-
return fmt.Errorf("Deployment was progressing too long")
355+
switch {
356+
case workloadIsBeingUpdatedTooLong:
357+
degradedCondition = degradedCondition.
358+
WithStatus(opv1.ConditionTrue).
359+
WithReason("ProgressDeadlineExceeded").
360+
WithMessage(progressDeadlineExceededMessage)
361+
362+
case !workloadIsBeingUpdated && deploymentObj.Status.AvailableReplicas < desiredReplicas:
363+
operandPods := c.listOperandPodsForDiagnostics(ctx, deploymentObj, syncContext)
364+
livePods := nonDeletingPods(operandPods)
365+
hasFailing := hasFailingPods(deploymentObj, livePods, now)
366+
if hasFailing || deploymentObj.Status.AvailableReplicas == 0 {
367+
containerMessages := deployment.ContainerMessagesForPods(deploymentObj, livePods)
368+
var failureDescription string
369+
if len(containerMessages) > 0 {
370+
failureDescription = ` (` + strings.Join(containerMessages, ", ") + `)`
371+
}
372+
numUnavailable := desiredReplicas - deploymentObj.Status.AvailableReplicas
373+
message := fmt.Sprintf("%d of %d requested instances are unavailable for %s.%s%s", numUnavailable, desiredReplicas, deploymentObj.Name, deploymentObj.Namespace, failureDescription)
374+
degradedCondition = degradedCondition.
375+
WithStatus(opv1.ConditionTrue).
376+
WithReason("UnavailablePod").
377+
WithMessage(message)
378+
} else {
379+
degradedCondition = degradedCondition.
380+
WithStatus(opv1.ConditionFalse).
381+
WithReason("AsExpected")
307382
}
308-
}
309383

310-
status = status.WithConditions(progressingCondition)
384+
default:
385+
degradedCondition = degradedCondition.
386+
WithStatus(opv1.ConditionFalse).
387+
WithReason("AsExpected")
388+
}
389+
status = status.WithConditions(degradedCondition)
311390
}
312391

313392
return c.operatorClient.ApplyOperatorStatus(
@@ -317,7 +396,34 @@ func (c *DeploymentController) syncManaged(ctx context.Context, opSpec *opv1.Ope
317396
)
318397
}
319398

320-
func (c *DeploymentController) syncDeleting(ctx context.Context, opSpec *opv1.OperatorSpec, opStatus *opv1.OperatorStatus, syncContext factory.SyncContext) error {
399+
// listOperandPodsForDiagnostics lists pods matched by the deployment selector for UnavailablePod diagnostics.
400+
// A nil selector, selector conversion errors, or API list errors are logged and recorded as warnings; no error is returned.
401+
func (c *DeploymentController) listOperandPodsForDiagnostics(ctx context.Context, deploymentObj *appsv1.Deployment, syncContext factory.SyncContext) []*corev1.Pod {
402+
if deploymentObj.Spec.Selector == nil {
403+
klog.Warningf("deployment/%s/%s has no spec.selector, skipping pod diagnostics", deploymentObj.Namespace, deploymentObj.Name)
404+
syncContext.Recorder().Warningf("DeploymentSelectorMissing", "deployment %s/%s has no spec.selector, skipping pod diagnostics", deploymentObj.Namespace, deploymentObj.Name)
405+
return nil
406+
}
407+
selector, err := metav1.LabelSelectorAsSelector(deploymentObj.Spec.Selector)
408+
if err != nil {
409+
klog.Warningf("deployment/%s/%s has invalid spec.selector: %v", deploymentObj.Namespace, deploymentObj.Name, err)
410+
syncContext.Recorder().Warningf("DeploymentSelectorInvalid", "deployment %s/%s has invalid spec.selector: %v", deploymentObj.Namespace, deploymentObj.Name, err)
411+
return nil
412+
}
413+
podList, err := c.kubeClient.CoreV1().Pods(deploymentObj.Namespace).List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
414+
if err != nil {
415+
klog.Warningf("deployment/%s/%s: pod list for diagnostics failed: %v", deploymentObj.Namespace, deploymentObj.Name, err)
416+
syncContext.Recorder().Warningf("PodListFailed", "listing pods for deployment %s/%s diagnostics: %v", deploymentObj.Namespace, deploymentObj.Name, err)
417+
return nil
418+
}
419+
out := make([]*corev1.Pod, len(podList.Items))
420+
for i := range podList.Items {
421+
out[i] = &podList.Items[i]
422+
}
423+
return out
424+
}
425+
426+
func (c *DeploymentController) syncDeleting(ctx context.Context, opSpec *opv1.OperatorSpec, syncContext factory.SyncContext) error {
321427
klog.V(4).Infof("syncDeleting")
322428
required, err := c.getDeployment(opSpec)
323429
if err != nil {
@@ -356,36 +462,70 @@ func (c *DeploymentController) getDeployment(opSpec *opv1.OperatorSpec) (*appsv1
356462
return required, nil
357463
}
358464

359-
func isProgressing(deployment *appsv1.Deployment) (bool, string) {
465+
// hasDeploymentProgressed returns true if the deployment reports NewReplicaSetAvailable
466+
// via the DeploymentProgressing condition.
467+
func hasDeploymentProgressed(status appsv1.DeploymentStatus) bool {
468+
for _, cond := range status.Conditions {
469+
if cond.Type == appsv1.DeploymentProgressing {
470+
return cond.Status == corev1.ConditionTrue && cond.Reason == "NewReplicaSetAvailable"
471+
}
472+
}
473+
return false
474+
}
360475

361-
var deploymentExpectedReplicas int32
362-
if deployment.Spec.Replicas != nil {
363-
deploymentExpectedReplicas = *deployment.Spec.Replicas
476+
// hasDeploymentTimedOutProgressing returns true if the deployment reports ProgressDeadlineExceeded.
477+
// The function returns the Progressing condition message as the first return value.
478+
func hasDeploymentTimedOutProgressing(status appsv1.DeploymentStatus) (string, bool) {
479+
for _, cond := range status.Conditions {
480+
if cond.Type == appsv1.DeploymentProgressing {
481+
return cond.Message, cond.Status == corev1.ConditionFalse && cond.Reason == "ProgressDeadlineExceeded"
482+
}
364483
}
484+
return "", false
485+
}
365486

366-
switch {
367-
case deployment.Generation != deployment.Status.ObservedGeneration:
368-
return true, "Waiting for Deployment to act on changes"
369-
case hasFinishedProgressing(deployment):
370-
return false, ""
371-
case deployment.Status.UnavailableReplicas > 0:
372-
return true, "Waiting for Deployment to deploy pods"
373-
case deployment.Status.UpdatedReplicas < deploymentExpectedReplicas:
374-
return true, "Waiting for Deployment to update pods"
375-
case deployment.Status.AvailableReplicas < deploymentExpectedReplicas:
376-
return true, "Waiting for Deployment to deploy pods"
487+
// nonDeletingPods returns pods that are not terminating (no deletion timestamp).
488+
func nonDeletingPods(pods []*corev1.Pod) []*corev1.Pod {
489+
out := make([]*corev1.Pod, 0, len(pods))
490+
for _, p := range pods {
491+
if p != nil && p.DeletionTimestamp == nil {
492+
out = append(out, p)
493+
}
377494
}
378-
return false, ""
495+
return out
379496
}
380497

381-
func hasFinishedProgressing(deployment *appsv1.Deployment) bool {
382-
// Deployment whose rollout is complete gets Progressing condition with Reason NewReplicaSetAvailable condition.
383-
// https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#complete-deployment
384-
// Any subsequent missing replicas (e.g. caused by a node reboot) must not not change the Progressing condition.
385-
for _, cond := range deployment.Status.Conditions {
386-
if cond.Type == appsv1.DeploymentProgressing {
387-
return cond.Status == corev1.ConditionTrue && cond.Reason == "NewReplicaSetAvailable"
498+
func hasFailingPods(workload *appsv1.Deployment, pods []*corev1.Pod, now time.Time) bool {
499+
progressDeadline := time.Duration(ptr.Deref(workload.Spec.ProgressDeadlineSeconds, 600)) * time.Second
500+
minReady := time.Duration(workload.Spec.MinReadySeconds) * time.Second
501+
502+
for _, pod := range pods {
503+
if pod.DeletionTimestamp != nil {
504+
continue
505+
}
506+
507+
readyCond := findPodReadyCondition(pod)
508+
deadline := pod.CreationTimestamp.Time.Add(progressDeadline)
509+
510+
if (readyCond == nil || readyCond.Status != corev1.ConditionTrue) && now.After(deadline) {
511+
return true
512+
}
513+
514+
if minReady > 0 && readyCond != nil && readyCond.Status == corev1.ConditionTrue {
515+
isRelevant := now.After(pod.CreationTimestamp.Time.Add(progressDeadline + minReady))
516+
if isRelevant && now.Sub(readyCond.LastTransitionTime.Time) < minReady {
517+
return true
518+
}
388519
}
389520
}
390521
return false
391522
}
523+
524+
func findPodReadyCondition(pod *corev1.Pod) *corev1.PodCondition {
525+
for i := range pod.Status.Conditions {
526+
if pod.Status.Conditions[i].Type == corev1.PodReady {
527+
return &pod.Status.Conditions[i]
528+
}
529+
}
530+
return nil
531+
}

0 commit comments

Comments
 (0)