Skip to content

Commit ead160c

Browse files
scotwellsclaude
andcommitted
feat(controller): surface referenced-data progress on deployment status
Roll the per-Instance ReferencedDataReady signal up to the WorkloadDeployment so users see why replicas are pending: ReplicasReady reports how many desired replicas are waiting for referenced data (after quota, which takes precedence as the earlier blocker). Rolling up a condition the resolver also writes makes reconcile loops the main hazard, so the reconciler gains two complementary guards: - The WorkloadDeployment watch only re-enqueues on spec changes or ReferencedDataReady transitions, filtering out this reconciler's own Available/ReplicasReady writes. - Status is only written when it actually changed, skipping the superfluous API call (and the resourceVersion bump that would otherwise fire another event) on no-op passes. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1 parent d960c18 commit ead160c

2 files changed

Lines changed: 418 additions & 17 deletions

File tree

internal/controller/workloaddeployment_controller.go

Lines changed: 116 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99
"slices"
1010

11+
"k8s.io/apimachinery/pkg/api/equality"
1112
apierrors "k8s.io/apimachinery/pkg/api/errors"
1213
apimeta "k8s.io/apimachinery/pkg/api/meta"
1314
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -17,9 +18,11 @@ import (
1718
"sigs.k8s.io/controller-runtime/pkg/client"
1819
"sigs.k8s.io/controller-runtime/pkg/cluster"
1920
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
21+
"sigs.k8s.io/controller-runtime/pkg/event"
2022
"sigs.k8s.io/controller-runtime/pkg/finalizer"
2123
"sigs.k8s.io/controller-runtime/pkg/handler"
2224
"sigs.k8s.io/controller-runtime/pkg/log"
25+
"sigs.k8s.io/controller-runtime/pkg/predicate"
2326
"sigs.k8s.io/controller-runtime/pkg/reconcile"
2427
mcbuilder "sigs.k8s.io/multicluster-runtime/pkg/builder"
2528
mccontext "sigs.k8s.io/multicluster-runtime/pkg/context"
@@ -34,6 +37,13 @@ import (
3437
instancecontrolstateful "go.datum.net/compute/internal/controller/instancecontrol/stateful"
3538
)
3639

40+
const (
41+
// reasonReplicasAvailable is used in the ReplicasReady condition when replicas
42+
// are either available or pending; it has no equivalent in the API constants
43+
// package because it is an internal detail of this controller.
44+
reasonReplicasAvailable = "ReplicasAvailable"
45+
)
46+
3747
// WorkloadDeploymentReconciler reconciles a WorkloadDeployment object
3848
type WorkloadDeploymentReconciler struct {
3949
mgr mcmanager.Manager
@@ -107,6 +117,10 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco
107117
logger.Info("reconciling deployment")
108118
defer logger.Info("reconcile complete")
109119

120+
// Snapshot the existing status before any modifications so we can skip the
121+
// Status().Update call when nothing changed (see loop-prevention comment below).
122+
existingStatus := *deployment.Status.DeepCopy()
123+
110124
// Collect all instances for this deployment
111125
listOpts := client.MatchingLabels{
112126
computev1alpha.WorkloadDeploymentUIDLabel: string(deployment.GetUID()),
@@ -177,7 +191,7 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco
177191
desiredReplicas = 0
178192
}
179193

180-
currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, err := r.reconcileInstanceGates(ctx, cl.GetClient(), &deployment, instances.Items, networkReady)
194+
currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, referencedDataBlockedReplicas, err := r.reconcileInstanceGates(ctx, cl.GetClient(), &deployment, instances.Items, networkReady)
181195
if err != nil {
182196
return ctrl.Result{}, err
183197
}
@@ -189,18 +203,26 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco
189203
deployment.Status.ReadyReplicas = int32(readyReplicas)
190204
deployment.Status.ObservedGeneration = deployment.Generation
191205

192-
if quotaBlockedReplicas > 0 {
206+
switch {
207+
case quotaBlockedReplicas > 0:
193208
apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{
194209
Type: computev1alpha.WorkloadDeploymentReplicasReady,
195210
Status: metav1.ConditionFalse,
196211
Reason: computev1alpha.InstanceQuotaGrantedReasonQuotaExceeded,
197212
Message: fmt.Sprintf("%d of %d desired replicas are pending quota", quotaBlockedReplicas, desiredReplicas),
198213
})
199-
} else {
214+
case referencedDataBlockedReplicas > 0:
215+
apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{
216+
Type: computev1alpha.WorkloadDeploymentReplicasReady,
217+
Status: metav1.ConditionFalse,
218+
Reason: computev1alpha.ReferencedDataReasonAwaitingPropagation,
219+
Message: fmt.Sprintf("%d of %d desired replicas are waiting for referenced data companions", referencedDataBlockedReplicas, desiredReplicas),
220+
})
221+
default:
200222
apimeta.SetStatusCondition(&deployment.Status.Conditions, metav1.Condition{
201223
Type: computev1alpha.WorkloadDeploymentReplicasReady,
202224
Status: metav1.ConditionTrue,
203-
Reason: "ReplicasAvailable",
225+
Reason: reasonReplicasAvailable,
204226
Message: fmt.Sprintf("%d/%d replicas available", readyReplicas, desiredReplicas),
205227
})
206228
}
@@ -239,12 +261,19 @@ func (r *WorkloadDeploymentReconciler) Reconcile(ctx context.Context, req mcreco
239261
})
240262
}
241263

242-
if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil {
243-
return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err)
264+
// Skip the write when the status is unchanged. Without this guard the
265+
// reconciler's own Status().Update would always produce a new resourceVersion,
266+
// firing another Update event on the WD and creating an infinite reconcile loop
267+
// before the predicate on For() was added. The guard is a belt-and-suspenders
268+
// complement to the predicate: the predicate prevents re-queuing on own writes,
269+
// and this guard avoids the superfluous API call entirely.
270+
if !equality.Semantic.DeepEqual(existingStatus, deployment.Status) {
271+
if err := cl.GetClient().Status().Update(ctx, &deployment); err != nil {
272+
return ctrl.Result{}, fmt.Errorf("failed updating deployment status: %w", err)
273+
}
274+
logger.Info("deployment status updated")
244275
}
245276

246-
logger.Info("deployment status updated")
247-
248277
return ctrl.Result{}, nil
249278
}
250279

@@ -254,13 +283,17 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates(
254283
deployment *computev1alpha.WorkloadDeployment,
255284
instances []computev1alpha.Instance,
256285
networkReady bool,
257-
) (currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas int, err error) {
286+
) (currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, referencedDataBlockedReplicas int, err error) {
258287
templateHash := instancecontrol.ComputeHash(deployment.Spec.Template)
259288
for _, instance := range instances {
260289
if apimeta.IsStatusConditionPresentAndEqual(instance.Status.Conditions, computev1alpha.InstanceQuotaGranted, metav1.ConditionFalse) {
261290
quotaBlockedReplicas++
262291
}
263292

293+
if apimeta.IsStatusConditionPresentAndEqual(instance.Status.Conditions, computev1alpha.ReferencedDataReady, metav1.ConditionFalse) {
294+
referencedDataBlockedReplicas++
295+
}
296+
264297
// Spec.Controller is a nilable pointer; guard it before dereferencing the
265298
// scheduling gates so an instance without controller state cannot panic
266299
// the reconcile (mirrors the Status.Controller guard below).
@@ -273,7 +306,7 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates(
273306
instance.Spec.Controller.SchedulingGates = newGates
274307
return nil
275308
}); patchErr != nil {
276-
return 0, 0, 0, 0, fmt.Errorf("failed updating instance: %w", patchErr)
309+
return 0, 0, 0, 0, 0, fmt.Errorf("failed updating instance: %w", patchErr)
277310
}
278311
}
279312
}
@@ -300,7 +333,69 @@ func (r *WorkloadDeploymentReconciler) reconcileInstanceGates(
300333
readyReplicas++
301334
}
302335
}
303-
return currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, nil
336+
return currentReplicas, updatedReplicas, readyReplicas, quotaBlockedReplicas, referencedDataBlockedReplicas, nil
337+
}
338+
339+
// wdReferencedDataChangedPredicate returns a predicate for the WorkloadDeployment
340+
// For() watch that fires on:
341+
// - Any Create, Delete, or Generic event (always enqueue).
342+
// - An Update event where metadata.generation changed (spec updated), OR where
343+
// the ReferencedDataReady condition's Status, Reason, or Message changed.
344+
//
345+
// The predicate intentionally does NOT fire when only the Available or
346+
// ReplicasReady conditions change, because those are written by this reconciler
347+
// itself. Without this guard the reconciler's own Status().Update would re-enqueue
348+
// itself on every run, creating a tight reconcile loop. The equality check before
349+
// Status().Update is a complementary guard, but the predicate is the primary
350+
// protection: it prevents re-enqueuing entirely so the workqueue stays quiet between
351+
// meaningful state transitions.
352+
//
353+
// Loop prevention: the ReferencedDataController (the only other writer of the
354+
// ReferencedDataReady condition) is the intended trigger. When it sets
355+
// ReferencedDataReady=False/SourceNotFound the predicate passes and this
356+
// reconciler re-runs, sees the resolver verdict in deployment.Status.Conditions, and
357+
// promotes Available to ReferencedDataNotReady. Subsequent runs by this reconciler
358+
// (which write Available but not ReferencedDataReady) are filtered out.
359+
func wdReferencedDataChangedPredicate() predicate.Predicate {
360+
return predicate.Funcs{
361+
UpdateFunc: func(e event.UpdateEvent) bool {
362+
oldWD, ok1 := e.ObjectOld.(*computev1alpha.WorkloadDeployment)
363+
newWD, ok2 := e.ObjectNew.(*computev1alpha.WorkloadDeployment)
364+
if !ok1 || !ok2 {
365+
return true // be conservative when type assertion fails
366+
}
367+
// Spec change: always reconcile.
368+
if oldWD.Generation != newWD.Generation {
369+
return true
370+
}
371+
// ReferencedDataReady condition changed: reconcile so Available is
372+
// updated to reflect the resolver's verdict.
373+
return wdRefDataCondChanged(
374+
apimeta.FindStatusCondition(oldWD.Status.Conditions, computev1alpha.ReferencedDataReady),
375+
apimeta.FindStatusCondition(newWD.Status.Conditions, computev1alpha.ReferencedDataReady),
376+
)
377+
},
378+
CreateFunc: func(_ event.CreateEvent) bool { return true },
379+
DeleteFunc: func(_ event.DeleteEvent) bool { return true },
380+
GenericFunc: func(_ event.GenericEvent) bool { return true },
381+
}
382+
}
383+
384+
// wdRefDataCondChanged returns true when the ReferencedDataReady condition's
385+
// observable fields (Status, Reason, Message) differ between old and new. Presence
386+
// changes (nil → non-nil or vice versa) are also treated as a change. The
387+
// LastTransitionTime field is excluded because it changes on every status flip and
388+
// would defeat the loop-prevention intent of wdReferencedDataChangedPredicate.
389+
func wdRefDataCondChanged(old, new *metav1.Condition) bool {
390+
if (old == nil) != (new == nil) {
391+
return true // condition was added or removed
392+
}
393+
if old == nil {
394+
return false // both nil — no change
395+
}
396+
return old.Status != new.Status ||
397+
old.Reason != new.Reason ||
398+
old.Message != new.Message
304399
}
305400

306401
// reconcileNetworks ensures NetworkBindings and SubnetClaims exist for all
@@ -573,7 +668,16 @@ func (r *WorkloadDeploymentReconciler) SetupWithManager(mgr mcmanager.Manager, o
573668
}
574669

575670
b := mcbuilder.ControllerManagedBy(mgr).
576-
For(&computev1alpha.WorkloadDeployment{}, mcbuilder.WithEngageWithLocalCluster(false)).
671+
// The predicate gates re-enqueuing on meaningful WD changes: spec updates
672+
// (generation bump) or a ReferencedDataReady condition change written by
673+
// ReferencedDataController. Without it, each Status().Update by this
674+
// reconciler (writing Available/ReplicasReady) would re-enqueue itself,
675+
// creating a tight loop and delaying the ReferencedDataReady signal from
676+
// the resolver.
677+
For(&computev1alpha.WorkloadDeployment{},
678+
mcbuilder.WithEngageWithLocalCluster(false),
679+
mcbuilder.WithPredicates(wdReferencedDataChangedPredicate()),
680+
).
577681
Owns(&computev1alpha.Instance{})
578682

579683
// Only watch networking resources when the networking integration is enabled.

0 commit comments

Comments
 (0)