Skip to content

Commit e8e5159

Browse files
jlebonalicefr
authored andcommitted
controller: free reboot slots for nodes that complete their update
This is the crucial next step for the controller: we actually handle nodes rebooting into the desired image and free the reboot slot so rollout can continue on to other nodes. Assisted-by: Pi (Claude Opus 4.6)
1 parent d4240ea commit e8e5159

2 files changed

Lines changed: 119 additions & 46 deletions

File tree

internal/controller/rollout.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,13 @@ func (r *BootcNodePoolReconciler) driveRollout(ctx context.Context, pool *bootcv
5959

6060
rs := buildRolloutState(log, ownedBootcNodes)
6161

62+
// Free reboot slots for nodes that have successfully rebooted into
63+
// the desired image. This runs before computing available slots so
64+
// that freed capacity is immediately usable for new candidates.
65+
if err := r.freeCompletedSlots(ctx, rs); err != nil {
66+
return fmt.Errorf("freeing completed slots: %w", err)
67+
}
68+
6269
maxUnavail, err := resolveMaxUnavailable(pool, rs.nodeCount())
6370
if err != nil {
6471
return err
@@ -149,9 +156,40 @@ func (r *BootcNodePoolReconciler) assignRebootSlot(ctx context.Context, bn *boot
149156
return nil
150157
}
151158

159+
// freeCompletedSlots releases reboot slots for upToDate nodes that are Ready.
160+
// It decrements rs.occupiedSlots for each freed slot so the caller's available
161+
// slot count is accurate.
162+
func (r *BootcNodePoolReconciler) freeCompletedSlots(ctx context.Context, rs *rolloutState) error {
163+
log := logf.FromContext(ctx)
164+
165+
for _, bn := range rs.upToDate {
166+
if !metav1.HasAnnotation(bn.ObjectMeta, bootcv1alpha1.AnnotationInRebootSlot) {
167+
continue
168+
}
169+
170+
var node corev1.Node
171+
if err := r.Get(ctx, types.NamespacedName{Name: bn.Name}, &node); err != nil {
172+
return fmt.Errorf("fetching node %s: %w", bn.Name, err)
173+
}
174+
175+
if nodeReadyStatus(&node) != corev1.ConditionTrue {
176+
log.V(1).Info("Node rebooted but not yet Ready, holding reboot slot", "node", bn.Name)
177+
continue
178+
}
179+
180+
log.Info("Freeing reboot slot: node healthy and Ready", "node", bn.Name)
181+
if err := r.freeRebootSlot(ctx, bn, &node); err != nil {
182+
return fmt.Errorf("freeing reboot slot for %s: %w", bn.Name, err)
183+
}
184+
rs.occupiedSlots--
185+
}
186+
187+
return nil
188+
}
189+
152190
// freeRebootSlot releases a node's reboot slot by restoring its prior cordon
153191
// state and removing annotations from the BootcNode.
154-
func (r *BootcNodePoolReconciler) freeRebootSlot(ctx context.Context, bn *bootcv1alpha1.BootcNode, node *corev1.Node) error { //nolint:unused // used by post-reboot handling
192+
func (r *BootcNodePoolReconciler) freeRebootSlot(ctx context.Context, bn *bootcv1alpha1.BootcNode, node *corev1.Node) error {
155193
if err := r.restoreCordonState(ctx, bn, node); err != nil {
156194
return err
157195
}

internal/controller/rollout_envtest_test.go

Lines changed: 80 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ import (
1717
)
1818

1919
// TestSimpleRollout simulates a 3-node rollout with maxUnavailable: 1. It
20-
// verifies that only one node is cordoned at a time and that desiredImageState
21-
// is set to Booted after drain completes.
20+
// verifies that only one node is cordoned at a time, that desiredImageState is
21+
// set to Booted after drain completes, and that after each node reboots into
22+
// the desired image and becomes Ready, its reboot slot is freed and the next
23+
// node gets its turn.
2224
func TestSimpleRollout(t *testing.T) {
2325
g := NewWithT(t)
2426
g.SetDefaultEventuallyTimeout(pollTimeout)
@@ -69,53 +71,62 @@ func TestSimpleRollout(t *testing.T) {
6971
simulateDaemonStatus(g, ctx, name, testDigestA, bootcv1alpha1.NodeReasonStaged)
7072
}
7173

72-
// Wait for exactly one node to be cordoned (reboot slot assigned).
73-
var cordonedNode string
74-
g.Eventually(func() string {
75-
cordonedNode = ""
76-
for _, name := range nodeNames {
74+
// Verify the sequential rollout: with maxUnavailable: 1 and
75+
// alphabetical candidate selection, nodes are processed in
76+
// deterministic order w1 → w2 → w3.
77+
for i, name := range nodeNames {
78+
// Wait for this node to receive its reboot slot: cordoned,
79+
// annotated, desiredImageState set to Booted (drain completes
80+
// instantly in envtest since there are no pods).
81+
g.Eventually(func(g Gomega) {
82+
var bn bootcv1alpha1.BootcNode
83+
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: name}, &bn)).To(Succeed())
84+
g.Expect(bn.Annotations).To(HaveKey(bootcv1alpha1.AnnotationInRebootSlot),
85+
"node %s should have in-reboot-slot annotation", name)
86+
g.Expect(bn.Annotations).To(HaveKey(bootcv1alpha1.AnnotationWasCordoned),
87+
"node %s should have was-cordoned annotation", name)
88+
g.Expect(bn.Spec.DesiredImageState).To(Equal(bootcv1alpha1.DesiredImageStateBooted),
89+
"node %s should have desiredImageState Booted", name)
90+
7791
var node corev1.Node
7892
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: name}, &node)).To(Succeed())
79-
if node.Spec.Unschedulable {
80-
if cordonedNode != "" {
81-
// More than one node cordoned — fail.
82-
return "MULTIPLE"
83-
}
84-
cordonedNode = name
85-
}
86-
}
87-
return cordonedNode
88-
}).ShouldNot(BeEmpty())
89-
g.Expect(cordonedNode).NotTo(Equal("MULTIPLE"), "only one node should be cordoned with maxUnavailable: 1")
93+
g.Expect(node.Spec.Unschedulable).To(BeTrue(),
94+
"node %s should be cordoned", name)
95+
}).Should(Succeed())
9096

91-
// Verify the cordoned node has the in-reboot-slot annotation.
92-
var bn bootcv1alpha1.BootcNode
93-
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: cordonedNode}, &bn)).To(Succeed())
94-
g.Expect(bn.Annotations).To(HaveKey(bootcv1alpha1.AnnotationInRebootSlot))
95-
g.Expect(bn.Annotations).To(HaveKey(bootcv1alpha1.AnnotationWasCordoned))
96-
97-
// In envtest, drain completes instantly (no pods). Verify that
98-
// desiredImageState is set to Booted on the cordoned node.
99-
g.Eventually(func(g Gomega) {
100-
var bn bootcv1alpha1.BootcNode
101-
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: cordonedNode}, &bn)).To(Succeed())
102-
g.Expect(bn.Spec.DesiredImageState).To(Equal(bootcv1alpha1.DesiredImageStateBooted))
103-
}).Should(Succeed())
104-
105-
// Verify the other nodes are NOT cordoned and still have
106-
// desiredImageState: Staged.
107-
for _, name := range nodeNames {
108-
if name == cordonedNode {
109-
continue
97+
// Verify remaining nodes are not yet touched.
98+
for _, other := range nodeNames[i+1:] {
99+
var node corev1.Node
100+
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: other}, &node)).To(Succeed())
101+
g.Expect(node.Spec.Unschedulable).To(BeFalse(),
102+
"node %s should not be cordoned", other)
103+
104+
var bn bootcv1alpha1.BootcNode
105+
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: other}, &bn)).To(Succeed())
106+
g.Expect(bn.Spec.DesiredImageState).To(Equal(bootcv1alpha1.DesiredImageStateStaged),
107+
"node %s should still have desiredImageState Staged", other)
110108
}
111-
var node corev1.Node
112-
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: name}, &node)).To(Succeed())
113-
g.Expect(node.Spec.Unschedulable).To(BeFalse(), "node %s should not be cordoned", name)
114-
115-
var bn bootcv1alpha1.BootcNode
116-
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: name}, &bn)).To(Succeed())
117-
g.Expect(bn.Spec.DesiredImageState).To(Equal(bootcv1alpha1.DesiredImageStateStaged),
118-
"node %s should still have desiredImageState Staged", name)
109+
110+
// Simulate the daemon reporting a successful reboot and the
111+
// node becoming Ready.
112+
simulateDaemonStatus(g, ctx, name, testDigestB, bootcv1alpha1.NodeReasonIdle)
113+
setNodeReady(g, ctx, name)
114+
115+
// Verify the reboot slot is freed: annotations removed and
116+
// node uncordoned.
117+
g.Eventually(func(g Gomega) {
118+
var bn bootcv1alpha1.BootcNode
119+
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: name}, &bn)).To(Succeed())
120+
g.Expect(bn.Annotations).NotTo(HaveKey(bootcv1alpha1.AnnotationInRebootSlot),
121+
"node %s should have in-reboot-slot annotation removed", name)
122+
g.Expect(bn.Annotations).NotTo(HaveKey(bootcv1alpha1.AnnotationWasCordoned),
123+
"node %s should have was-cordoned annotation removed", name)
124+
125+
var node corev1.Node
126+
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: name}, &node)).To(Succeed())
127+
g.Expect(node.Spec.Unschedulable).To(BeFalse(),
128+
"node %s should be uncordoned after reboot", name)
129+
}).Should(Succeed())
119130
}
120131
}
121132

@@ -145,3 +156,27 @@ func simulateDaemonStatus(g Gomega, ctx context.Context, nodeName, bootedDigest,
145156

146157
g.Expect(k8sClient.Status().Update(ctx, &bn)).To(Succeed())
147158
}
159+
160+
// setNodeReady sets the Ready condition on a K8s Node to True. In
161+
// envtest there is no kubelet, so this simulates the node becoming
162+
// healthy after a reboot.
163+
func setNodeReady(g Gomega, ctx context.Context, nodeName string) {
164+
var node corev1.Node
165+
g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: nodeName}, &node)).To(Succeed())
166+
167+
// Replace any existing Ready condition, preserving other conditions.
168+
var filtered []corev1.NodeCondition
169+
for _, c := range node.Status.Conditions {
170+
if c.Type != corev1.NodeReady {
171+
filtered = append(filtered, c)
172+
}
173+
}
174+
node.Status.Conditions = append(filtered, corev1.NodeCondition{
175+
Type: corev1.NodeReady,
176+
Status: corev1.ConditionTrue,
177+
LastHeartbeatTime: metav1.Now(),
178+
LastTransitionTime: metav1.Now(),
179+
})
180+
181+
g.Expect(k8sClient.Status().Update(ctx, &node)).To(Succeed())
182+
}

0 commit comments

Comments
 (0)