Skip to content

Commit 489a317

Browse files
committed
controllers: cleanup status conditions update
So far we had only base conditions for the NRO object, but we want to have a more flexible interface to interact with while supporting non-base conditions, just like we have for NRS controller. In this commit: 1. preserve the consistency of updating status conditions for numaresources CRs (operator and scheduler). 2. minimize Status.Update calls on degraded condition updates. 3. keep using `conditioninfo` to maintain related commit modifications as much as possible, refactor later (switch to metav1 conditions). 4. update conditions and return them instead of mutation. Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent 37cac9a commit 489a317

7 files changed

Lines changed: 145 additions & 185 deletions

File tree

internal/controller/numaresourcesoperator_controller.go

Lines changed: 27 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,14 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
139139
return ctrl.Result{}, err
140140
}
141141

142+
initialStatus := *instance.Status.DeepCopy()
143+
if len(initialStatus.Conditions) == 0 {
144+
instance.Status.Conditions = status.DefaultBaseConditions(time.Now())
145+
}
146+
142147
if req.Name != objectnames.DefaultNUMAResourcesOperatorCrName {
143148
err := fmt.Errorf("incorrect NUMAResourcesOperator resource name: %s", instance.Name)
144-
return r.degradeStatus(ctx, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
149+
return r.degradeStatus(ctx, initialStatus, instance, status.ConditionTypeIncorrectNUMAResourcesOperatorResourceName, err)
145150
}
146151

147152
if annotations.IsPauseReconciliationEnabled(instance.Annotations) {
@@ -150,80 +155,64 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
150155
}
151156

152157
if err := validation.NodeGroups(instance.Spec.NodeGroups, r.Platform); err != nil {
153-
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
158+
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
154159
}
155160

156161
trees, err := getTreesByNodeGroup(ctx, r.Client, instance.Spec.NodeGroups, r.Platform)
157162
if err != nil {
158-
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
163+
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
159164
}
160165

161166
tolerable, err := validation.NodeGroupsTree(instance, trees, r.Platform)
162167
if err != nil {
163-
return r.degradeStatus(ctx, instance, validation.NodeGroupsError, err)
168+
return r.degradeStatus(ctx, initialStatus, instance, validation.NodeGroupsError, err)
164169
}
165170

166171
for idx := range trees {
167172
conf := trees[idx].NodeGroup.NormalizeConfig()
168173
trees[idx].NodeGroup.Config = &conf
169174
}
170175

171-
curStatus := instance.Status.DeepCopy()
172-
173176
step := r.reconcileResource(ctx, instance, trees)
177+
instance.Status.Conditions, _ = status.ComputeConditions(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())
174178

175179
if step.Done() && tolerable != nil {
176-
return r.degradeStatus(ctx, instance, tolerable.Reason, tolerable.Error)
177-
}
178-
179-
if !status.NUMAResourceOperatorNeedsUpdate(curStatus, &instance.Status) {
180-
return step.Result, step.Error
180+
return r.degradeStatus(ctx, initialStatus, instance, tolerable.Reason, tolerable.Error)
181181
}
182182

183-
updErr := r.Client.Status().Update(ctx, instance)
184-
if updErr != nil {
185-
klog.InfoS("Failed to update numaresourcesoperator status", "error", updErr)
186-
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
183+
if err := r.updateStatus(ctx, initialStatus, instance); err != nil {
184+
klog.InfoS("Failed to update numaresources-operator status", "error", err)
187185
}
188186

189187
return step.Result, step.Error
190188
}
191189

192-
// updateStatusConditionsIfNeeded returns true if conditions were updated.
193-
func updateStatusConditionsIfNeeded(instance *nropv1.NUMAResourcesOperator, cond conditioninfo.ConditionInfo) {
194-
if cond.Type == "" { // backward (=legacy) compatibility
195-
return
196-
}
197-
klog.InfoS("updateStatus", "condition", cond.Type, "reason", cond.Reason, "message", cond.Message)
198-
conditions, ok := status.ComputeConditions(instance.Status.Conditions, metav1.Condition{
199-
Type: cond.Type,
200-
Reason: cond.Reason,
201-
Message: cond.Message,
202-
ObservedGeneration: instance.Generation,
203-
}, time.Now())
204-
if ok {
205-
instance.Status.Conditions = conditions
190+
func (r *NUMAResourcesOperatorReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesOperatorStatus, instance *nropv1.NUMAResourcesOperator) error {
191+
if !status.NUMAResourceOperatorNeedsUpdate(initialStatus, instance.Status) {
192+
return nil
206193
}
194+
195+
updErr := r.Client.Status().Update(ctx, instance)
196+
if updErr != nil {
197+
return fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), updErr)
198+
}
199+
return nil
207200
}
208201

209-
func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
202+
func (r *NUMAResourcesOperatorReconciler) degradeStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesOperatorStatus, instance *nropv1.NUMAResourcesOperator, reason string, stErr error) (ctrl.Result, error) {
210203
info := conditioninfo.DegradedFromError(stErr)
211204
if reason != "" { // intentionally overwrite
212205
info.Reason = reason
213206
}
214207

215-
updateStatusConditionsIfNeeded(instance, info)
216-
// TODO: if we keep being degraded, we likely (= if we don't, it's too implicit) keep sending possibly redundant updates to the apiserver
208+
instance.Status.Conditions, _ = status.ComputeConditions(instance.Status.Conditions, info.ToMetav1Condition(instance.Generation), time.Now())
217209

218-
err := r.Client.Status().Update(ctx, instance)
210+
err := r.updateStatus(ctx, initialStatus, instance)
219211
if err != nil {
220212
klog.InfoS("Failed to update numaresourcesoperator status", "error", err)
221-
return ctrl.Result{}, fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(instance), err)
222213
}
223214

224-
// we do not return an error here because to pass the validation error a user will need to update NRO CR
225-
// that will anyway initiate to reconcile loop
226-
return ctrl.Result{}, nil
215+
return ctrl.Result{}, err
227216
}
228217

229218
func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
@@ -298,34 +287,27 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceDaemonSet(ctx context
298287

299288
func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesOperator, trees []nodegroupv1.Tree) intreconcile.Step {
300289
if step := r.reconcileResourceAPI(ctx, instance, trees); step.EarlyStop() {
301-
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
302290
return step
303291
}
304292

305293
existing := rtestate.FromClient(ctx, r.Client, r.Platform, r.RTEManifests, instance, trees, r.Namespace)
306294

307295
if r.Platform == platform.OpenShift {
308296
if step := r.reconcileResourceMachineConfig(ctx, instance, existing, trees); step.EarlyStop() {
309-
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
310297
return step
311298
}
312299
}
313300

314301
dsPerPool, step := r.reconcileResourceDaemonSet(ctx, instance, existing, trees)
315302
if step.EarlyStop() {
316-
updateStatusConditionsIfNeeded(instance, step.ConditionInfo)
317303
return step
318304
}
319305

320306
// all fields of NodeGroupStatus are required so publish the status only when all daemonset and MCPs are updated which
321307
// is a certain thing if we got to this point otherwise the function would have returned already
322308
instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsPerPool)
323309

324-
updateStatusConditionsIfNeeded(instance, conditioninfo.Available())
325-
return intreconcile.Step{
326-
Result: ctrl.Result{},
327-
ConditionInfo: conditioninfo.Available(),
328-
}
310+
return intreconcile.StepSuccess()
329311
}
330312

331313
func (r *NUMAResourcesOperatorReconciler) syncDaemonSetsStatuses(ctx context.Context, rd client.Reader, daemonSetsInfo []poolDaemonSet) ([]nropv1.NamespacedName, string, error) {

internal/controller/numaresourcesscheduler_controller.go

Lines changed: 48 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ import (
2626
rbacv1 "k8s.io/api/rbac/v1"
2727
apiequality "k8s.io/apimachinery/pkg/api/equality"
2828
apierrors "k8s.io/apimachinery/pkg/api/errors"
29-
metahelper "k8s.io/apimachinery/pkg/api/meta"
3029
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3130
"k8s.io/apimachinery/pkg/labels"
3231
"k8s.io/apimachinery/pkg/runtime"
@@ -49,6 +48,7 @@ import (
4948
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
5049
"github.com/openshift-kni/numaresources-operator/internal/api/annotations"
5150
"github.com/openshift-kni/numaresources-operator/internal/platforminfo"
51+
intreconcile "github.com/openshift-kni/numaresources-operator/internal/reconcile"
5252
"github.com/openshift-kni/numaresources-operator/internal/relatedobjects"
5353
"github.com/openshift-kni/numaresources-operator/pkg/apply"
5454
"github.com/openshift-kni/numaresources-operator/pkg/hash"
@@ -116,23 +116,45 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct
116116
}
117117

118118
initialStatus := *instance.Status.DeepCopy()
119+
if len(initialStatus.Conditions) == 0 {
120+
instance.Status.Conditions = status.NewNUMAResourcesSchedulerBaseConditions()
121+
}
119122

120123
if req.Name != objectnames.DefaultNUMAResourcesSchedulerCrName {
121124
message := fmt.Sprintf("incorrect NUMAResourcesScheduler resource name: %s", instance.Name)
122-
return ctrl.Result{}, r.updateStatus(ctx, initialStatus, instance, status.ConditionDegraded, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
125+
return ctrl.Result{}, r.degradeStatus(ctx, initialStatus, instance, status.ConditionTypeIncorrectNUMAResourcesSchedulerResourceName, message)
123126
}
124127

125128
if annotations.IsPauseReconciliationEnabled(instance.Annotations) {
126129
klog.InfoS("Pause reconciliation enabled", "object", req.NamespacedName)
127130
return ctrl.Result{}, nil
128131
}
129132

130-
result, condition, err := r.reconcileResource(ctx, instance)
131-
if err := r.updateStatus(ctx, initialStatus, instance, condition, status.ReasonFromError(err), status.MessageFromError(err)); err != nil {
132-
klog.InfoS("Failed to update numaresourcesscheduler status", "Desired condition", condition, "error", err)
133+
step := r.reconcileResource(ctx, instance)
134+
instance.Status.Conditions, _ = status.ComputeConditions(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())
135+
if err := r.updateStatus(ctx, initialStatus, instance); err != nil {
136+
klog.InfoS("Failed to update numaresourcesscheduler status", "error", err)
133137
}
134138

135-
return result, err
139+
return step.Result, step.Error
140+
}
141+
142+
func (r *NUMAResourcesSchedulerReconciler) degradeStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, instance *nropv1.NUMAResourcesScheduler, reason string, message string) error {
143+
condition := metav1.Condition{
144+
Type: status.ConditionDegraded,
145+
Status: metav1.ConditionTrue,
146+
Reason: reason,
147+
Message: message,
148+
ObservedGeneration: instance.Generation,
149+
}
150+
151+
instance.Status.Conditions, _ = status.ComputeConditions(instance.Status.Conditions, condition, time.Now())
152+
153+
err := r.updateStatus(ctx, initialStatus, instance)
154+
if err != nil {
155+
klog.InfoS("Failed to update numaresourcesoperator status", "error", err)
156+
}
157+
return err
136158
}
137159

138160
// SetupWithManager sets up the controller with the Manager.
@@ -184,24 +206,24 @@ func (r *NUMAResourcesSchedulerReconciler) nodeToNUMAResourcesScheduler(ctx cont
184206
return requests
185207
}
186208

187-
func (r *NUMAResourcesSchedulerReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesScheduler) (reconcile.Result, string, error) {
209+
func (r *NUMAResourcesSchedulerReconciler) reconcileResource(ctx context.Context, instance *nropv1.NUMAResourcesScheduler) intreconcile.Step {
188210
schedStatus, err := r.syncNUMASchedulerResources(ctx, instance)
189211
if err != nil {
190-
return ctrl.Result{}, status.ConditionDegraded, fmt.Errorf("FailedSchedulerSync: %w", err)
212+
return intreconcile.StepFailed(fmt.Errorf("FailedSchedulerSync: %w", err))
191213
}
192214

193215
instance.Status = schedStatus
194216
instance.Status.RelatedObjects = relatedobjects.Scheduler(r.Namespace, instance.Status.Deployment)
195217

196218
ok, err := isDeploymentRunning(ctx, r.Client, schedStatus.Deployment)
197219
if err != nil {
198-
return ctrl.Result{}, status.ConditionDegraded, err
220+
return intreconcile.StepFailed(err)
199221
}
200222
if !ok {
201-
return ctrl.Result{RequeueAfter: 5 * time.Second}, status.ConditionProgressing, nil
223+
return intreconcile.StepOngoing(5 * time.Second)
202224
}
203225

204-
return ctrl.Result{}, status.ConditionAvailable, nil
226+
return intreconcile.StepSuccess()
205227
}
206228

207229
func isDeploymentRunning(ctx context.Context, c client.Client, key nropv1.NamespacedName) (bool, error) {
@@ -304,7 +326,7 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
304326
Duration: cacheResyncPeriod,
305327
}
306328

307-
schedStatus.Conditions = updateDedicatedInformerCondition(status.NewNUMAResourcesSchedulerBaseConditions(), *instance, schedSpec)
329+
schedStatus.Conditions = updateDedicatedInformerCondition(schedStatus.Conditions, instance.Generation, schedSpec)
308330

309331
r.SchedulerManifests.Deployment.Spec.Replicas = schedSpec.Replicas
310332
klog.V(4).InfoS("using scheduler replicas", "replicas", *r.SchedulerManifests.Deployment.Spec.Replicas)
@@ -368,52 +390,29 @@ func platformNormalize(spec *nropv1.NUMAResourcesSchedulerSpec, platInfo platfor
368390
klog.V(4).InfoS("SchedulerInformer default is overridden", "Platform", platInfo.Platform, "PlatformVersion", platInfo.Version.String(), "SchedulerInformer", *spec.SchedulerInformer)
369391
}
370392

371-
func updateDedicatedInformerCondition(conds []metav1.Condition, instance nropv1.NUMAResourcesScheduler, normalized nropv1.NUMAResourcesSchedulerSpec) []metav1.Condition {
372-
condition := metahelper.FindStatusCondition(conds, status.ConditionDedicatedInformerActive)
373-
if condition == nil { // should never happen
374-
klog.InfoS("missing condition: %q", status.ConditionDedicatedInformerActive)
375-
return conds
376-
}
377-
if normalized.SchedulerInformer == nil { // should never happen
378-
klog.InfoS("nil SchedulerInformer in spec")
379-
condition.Status = metav1.ConditionUnknown
380-
return conds
381-
}
393+
func updateDedicatedInformerCondition(conds []metav1.Condition, instanceGeneration int64, normalized nropv1.NUMAResourcesSchedulerSpec) []metav1.Condition {
394+
dedicatedStatus := metav1.ConditionFalse
382395
if *normalized.SchedulerInformer == nropv1.SchedulerInformerDedicated {
383-
condition.Status = metav1.ConditionTrue
384-
} else {
385-
condition.Status = metav1.ConditionFalse
396+
dedicatedStatus = metav1.ConditionTrue
386397
}
387-
condition.ObservedGeneration = instance.Generation
388-
return conds
389-
}
390398

391-
func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, sched *nropv1.NUMAResourcesScheduler, condition string, reason string, message string) error {
392-
conds := status.CloneConditions(sched.Status.Conditions)
393-
if len(conds) == 0 {
394-
conds = status.NewNUMAResourcesSchedulerBaseConditions()
399+
informerCondition := metav1.Condition{
400+
Type: status.ConditionDedicatedInformerActive,
401+
Status: dedicatedStatus,
402+
Reason: status.ConditionDedicatedInformerActive,
403+
ObservedGeneration: instanceGeneration,
404+
LastTransitionTime: metav1.Now(),
395405
}
396-
ok := status.UpdateConditionsInPlace(conds, metav1.Condition{
397-
Type: condition,
398-
Status: metav1.ConditionTrue,
399-
ObservedGeneration: sched.Generation,
400-
Reason: reason,
401-
Message: message,
402-
}, time.Now())
403-
if !ok {
404-
klog.InfoS("fail to update condition", "conditionType", condition, "reason", reason, "message", message)
405-
}
406-
// we need to set something anyway
407-
sched.Status.Conditions = conds
408406

407+
conds, _ = status.ComputeConditions(conds, informerCondition, time.Time{})
408+
return conds
409+
}
410+
411+
func (r *NUMAResourcesSchedulerReconciler) updateStatus(ctx context.Context, initialStatus nropv1.NUMAResourcesSchedulerStatus, sched *nropv1.NUMAResourcesScheduler) error {
409412
if !status.NUMAResourcesSchedulerNeedsUpdate(initialStatus, sched.Status) {
410413
return nil
411414
}
412415

413-
if status.EqualConditions(initialStatus.Conditions, sched.Status.Conditions) {
414-
sched.Status.Conditions = initialStatus.Conditions
415-
}
416-
417416
if err := r.Client.Status().Update(ctx, sched); err != nil {
418417
return fmt.Errorf("could not update status for object %s: %w", client.ObjectKeyFromObject(sched), err)
419418
}

internal/controller/numaresourcesscheduler_controller_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
121121

122122
node := &corev1.Node{
123123
ObjectMeta: metav1.ObjectMeta{
124-
Name: fmt.Sprintf("master-node-0"),
124+
Name: "master-node-0",
125125
Labels: map[string]string{
126126
"node-role.kubernetes.io/control-plane": "",
127127
},
@@ -132,13 +132,13 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
132132
Expect(err).ToNot(HaveOccurred())
133133

134134
Expect(reconciler.Client.Get(context.TODO(), key, nrs)).To(Succeed())
135+
136+
progressingCondition := getConditionByType(nrs.Status.Conditions, status.ConditionProgressing)
137+
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue), "scheduler not reported Progressing: %v", nrs.Status.Conditions)
135138
degradedCondition = getConditionByType(nrs.Status.Conditions, status.ConditionDegraded)
136139
Expect(degradedCondition.Status).To(Equal(metav1.ConditionFalse), "scheduler reported as Degraded: %v", degradedCondition)
137140
Expect(degradedCondition.Reason).To(Equal(status.ConditionDegraded))
138141
Expect(degradedCondition.Message).To(BeEmpty())
139-
140-
progressingCondition := getConditionByType(nrs.Status.Conditions, status.ConditionProgressing)
141-
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue), "scheduler not reported Progressing: %v", nrs.Status.Conditions)
142142
})
143143
})
144144

pkg/status/conditioninfo/conditioninfo.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
package conditioninfo
1818

1919
import (
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
2022
"github.com/openshift-kni/numaresources-operator/pkg/status"
2123
)
2224

@@ -73,3 +75,13 @@ func DegradedFromError(err error) ConditionInfo {
7375
Reason: status.ReasonFromError(err),
7476
}
7577
}
78+
79+
func (ci ConditionInfo) ToMetav1Condition(gen int64) metav1.Condition {
80+
return metav1.Condition{
81+
Type: ci.Type,
82+
Status: metav1.ConditionTrue,
83+
Reason: ci.Reason,
84+
Message: ci.Message,
85+
ObservedGeneration: gen,
86+
}
87+
}

0 commit comments

Comments
 (0)