Skip to content

Commit ee5e842

Browse files
committed
controller: handle drain completion and set desiredImageState Booted
Add collectDrainResults, called at the start of driveRollout before assigning new reboot slots. It does a non-blocking check on each drain's result channel. On success, it sets desiredImageState to Booted on the BootcNode so the daemon knows to apply the staged image and reboot. Drain errors and cancellations are logged but otherwise deferred. Assisted-by: Pi (Claude Opus 4.6)
1 parent ea9be4b commit ee5e842

1 file changed

Lines changed: 68 additions & 1 deletion

File tree

internal/controller/rollout.go

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ func (rs *rolloutState) nodeCount() int {
5050
func (r *BootcNodePoolReconciler) driveRollout(ctx context.Context, pool *bootcv1alpha1.BootcNodePool, ownedBootcNodes map[string]*bootcv1alpha1.BootcNode) error {
5151
log := logf.FromContext(ctx)
5252

53+
// Process drain results first. This isn't really ordering dependent,
54+
// but it feels natural to do this upfront before classifying.
55+
if err := r.collectDrainResults(ctx, ownedBootcNodes); err != nil {
56+
return fmt.Errorf("collecting drain results: %w", err)
57+
}
58+
5359
rs := buildRolloutState(log, ownedBootcNodes)
5460

5561
maxUnavail, err := resolveMaxUnavailable(pool, rs.nodeCount())
@@ -84,11 +90,15 @@ func (r *BootcNodePoolReconciler) driveRollout(ctx context.Context, pool *bootcv
8490
}
8591
}
8692

87-
// Start drains for all slotted Staged nodes.
93+
// Start drains for all slotted Staged nodes that haven't already been
94+
// approved for reboot (which implies they've already been drained).
8895
for _, bn := range rs.staged {
8996
if !metav1.HasAnnotation(bn.ObjectMeta, bootcv1alpha1.AnnotationInRebootSlot) {
9097
continue
9198
}
99+
if bn.Spec.DesiredImageState == bootcv1alpha1.DesiredImageStateBooted {
100+
continue
101+
}
92102
r.ensureDrain(ctx, pool, bn)
93103
}
94104

@@ -211,6 +221,63 @@ func (r *BootcNodePoolReconciler) ensureDrain(ctx context.Context, pool *bootcv1
211221
}()
212222
}
213223

224+
// collectDrainResults checks all in-progress drains for completed
225+
// results. On success, it sets desiredImageState to Booted on the
226+
// BootcNode.
227+
func (r *BootcNodePoolReconciler) collectDrainResults(ctx context.Context, ownedBootcNodes map[string]*bootcv1alpha1.BootcNode) error {
228+
log := logf.FromContext(ctx)
229+
230+
r.drainsMu.Lock()
231+
defer r.drainsMu.Unlock()
232+
233+
for nodeName, ds := range r.drains {
234+
// Non-blocking check for drain result.
235+
var drainErr error
236+
select {
237+
case drainErr = <-ds.result:
238+
default:
239+
// Drain still in progress.
240+
continue
241+
}
242+
243+
// Drain finished; remove from the map.
244+
delete(r.drains, nodeName)
245+
246+
if drainErr != nil {
247+
// TODO: handle drain errors and cancellations.
248+
log.Info("Drain failed", "node", nodeName, "error", drainErr)
249+
continue
250+
}
251+
252+
// Drain succeeded. Set desiredImageState to Booted.
253+
bn, ok := ownedBootcNodes[nodeName]
254+
if !ok {
255+
// Node left the pool while draining. Once
256+
// removeBootcNode cancels the drain (see related TODO
257+
// there), this would normally be caught by the
258+
// drainErr cancellation check above. But if somehow we
259+
// raced and did actually successfully drain, just
260+
// ignore it; we should've already uncordoned the node.
261+
log.Info("Drain completed but node no longer in pool", "node", nodeName)
262+
continue
263+
}
264+
265+
log.Info("Drain completed, setting desiredImageState to Booted", "node", nodeName)
266+
modified := bn.DeepCopy()
267+
modified.Spec.DesiredImageState = bootcv1alpha1.DesiredImageStateBooted
268+
if err := r.Patch(ctx, modified, client.MergeFrom(bn)); err != nil {
269+
// The drain result was already consumed, so on retry a
270+
// redundant drain will run (completing instantly).
271+
// Could optimize this but meh... not worth the
272+
// complexity.
273+
return fmt.Errorf("setting desiredImageState on %s: %w", nodeName, err)
274+
}
275+
*bn = *modified
276+
}
277+
278+
return nil
279+
}
280+
214281
// buildRolloutState classifies all owned BootcNodes and counts occupied
215282
// reboot slots.
216283
func buildRolloutState(log logr.Logger, ownedBootcNodes map[string]*bootcv1alpha1.BootcNode) *rolloutState {

0 commit comments

Comments
 (0)