@@ -50,6 +50,12 @@ func (rs *rolloutState) nodeCount() int {
5050func (r * BootcNodePoolReconciler ) driveRollout (ctx context.Context , pool * bootcv1alpha1.BootcNodePool , ownedBootcNodes map [string ]* bootcv1alpha1.BootcNode ) error {
5151 log := logf .FromContext (ctx )
5252
53+ // Process drain results first. This isn't really ordering dependent,
54+ // but it feels natural to do this upfront before classifying.
55+ if err := r .collectDrainResults (ctx , ownedBootcNodes ); err != nil {
56+ return fmt .Errorf ("collecting drain results: %w" , err )
57+ }
58+
5359 rs := buildRolloutState (log , ownedBootcNodes )
5460
5561 maxUnavail , err := resolveMaxUnavailable (pool , rs .nodeCount ())
@@ -84,11 +90,15 @@ func (r *BootcNodePoolReconciler) driveRollout(ctx context.Context, pool *bootcv
8490 }
8591 }
8692
87- // Start drains for all slotted Staged nodes.
93+ // Start drains for all slotted Staged nodes that haven't already been
94+ // approved for reboot (which implies they've already been drained).
8895 for _ , bn := range rs .staged {
8996 if ! metav1 .HasAnnotation (bn .ObjectMeta , bootcv1alpha1 .AnnotationInRebootSlot ) {
9097 continue
9198 }
99+ if bn .Spec .DesiredImageState == bootcv1alpha1 .DesiredImageStateBooted {
100+ continue
101+ }
92102 r .ensureDrain (ctx , pool , bn )
93103 }
94104
@@ -211,6 +221,63 @@ func (r *BootcNodePoolReconciler) ensureDrain(ctx context.Context, pool *bootcv1
211221 }()
212222}
213223
224+ // collectDrainResults checks all in-progress drains for completed
225+ // results. On success, it sets desiredImageState to Booted on the
226+ // BootcNode.
227+ func (r * BootcNodePoolReconciler ) collectDrainResults (ctx context.Context , ownedBootcNodes map [string ]* bootcv1alpha1.BootcNode ) error {
228+ log := logf .FromContext (ctx )
229+
230+ r .drainsMu .Lock ()
231+ defer r .drainsMu .Unlock ()
232+
233+ for nodeName , ds := range r .drains {
234+ // Non-blocking check for drain result.
235+ var drainErr error
236+ select {
237+ case drainErr = <- ds .result :
238+ default :
239+ // Drain still in progress.
240+ continue
241+ }
242+
243+ // Drain finished; remove from the map.
244+ delete (r .drains , nodeName )
245+
246+ if drainErr != nil {
247+ // TODO: handle drain errors and cancellations.
248+ log .Info ("Drain failed" , "node" , nodeName , "error" , drainErr )
249+ continue
250+ }
251+
252+ // Drain succeeded. Set desiredImageState to Booted.
253+ bn , ok := ownedBootcNodes [nodeName ]
254+ if ! ok {
255+ // Node left the pool while draining. Once
256+ // removeBootcNode cancels the drain (see related TODO
257+ // there), this would normally be caught by the
258+ // drainErr cancellation check above. But if somehow we
259+ // raced and did actually successfully drain, just
260+ // ignore it; we should've already uncordoned the node.
261+ log .Info ("Drain completed but node no longer in pool" , "node" , nodeName )
262+ continue
263+ }
264+
265+ log .Info ("Drain completed, setting desiredImageState to Booted" , "node" , nodeName )
266+ modified := bn .DeepCopy ()
267+ modified .Spec .DesiredImageState = bootcv1alpha1 .DesiredImageStateBooted
268+ if err := r .Patch (ctx , modified , client .MergeFrom (bn )); err != nil {
269+ // The drain result was already consumed, so on retry a
270+ // redundant drain will run (completing instantly).
271+ // Could optimize this but meh... not worth the
272+ // complexity.
273+ return fmt .Errorf ("setting desiredImageState on %s: %w" , nodeName , err )
274+ }
275+ * bn = * modified
276+ }
277+
278+ return nil
279+ }
280+
214281// buildRolloutState classifies all owned BootcNodes and counts occupied
215282// reboot slots.
216283func buildRolloutState (log logr.Logger , ownedBootcNodes map [string ]* bootcv1alpha1.BootcNode ) * rolloutState {
0 commit comments