Skip to content

Commit f34740a

Browse files
committed
controller: add reboot slot tracking and candidate selection
Add rolloutState struct to classify owned BootcNodes into state buckets and count occupied reboot slots from the in-reboot-slot annotation. Add resolveMaxUnavailable to compute the effective maxUnavailable from the pool spec (defaults to 1, rounds up for percentages, returns 0 when paused). Add selectDrainCandidates to pick Staged nodes needing the drain flow, always re-selecting already-slotted nodes regardless of capacity. driveRollout now computes slots and candidates but does not yet act on them. Also add testutil node builder options (WithBootedDigest, WithNodeCondition, WithNodeAnnotation) used across rollout tests. Assisted-by: Pi (Claude Opus 4.6)
1 parent ad98eaf commit f34740a

4 files changed

Lines changed: 337 additions & 21 deletions

File tree

internal/controller/bootcnodepool_controller.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ func (r *BootcNodePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques
256256

257257
// Drive the rollout state machine.
258258
if err := r.driveRollout(ctx, &pool, ownedBootcNodes); err != nil {
259+
if isInvalidSpecError(err) {
260+
return r.setInvalidSpecCondition(ctx, &pool, err)
261+
}
259262
return ctrl.Result{}, fmt.Errorf("driving rollout: %w", err)
260263
}
261264

internal/controller/rollout.go

Lines changed: 150 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,33 +5,174 @@ package controller
55
import (
66
"context"
77
"fmt"
8+
"slices"
9+
"strings"
810

911
"github.com/distribution/reference"
12+
"github.com/go-logr/logr"
1013
apimeta "k8s.io/apimachinery/pkg/api/meta"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/util/intstr"
1116
logf "sigs.k8s.io/controller-runtime/pkg/log"
1217

1318
bootcv1alpha1 "github.com/jlebon/bootc-operator/api/v1alpha1"
1419
)
1520

16-
// driveRollout iterates owned BootcNodes, classifies each by the state
17-
// table, and logs their states. Transition logic is added in later
18-
// commits.
21+
// rolloutState holds the classified BootcNodes for a single reconcile
22+
// pass.
23+
type rolloutState struct {
24+
// nodes are sorted into these buckets
25+
idle []*bootcv1alpha1.BootcNode
26+
staging []*bootcv1alpha1.BootcNode
27+
staged []*bootcv1alpha1.BootcNode
28+
rebooting []*bootcv1alpha1.BootcNode
29+
degraded []*bootcv1alpha1.BootcNode
30+
unclassified []*bootcv1alpha1.BootcNode
31+
32+
// BootcNodes with the in-reboot-slot annotation
33+
occupiedSlots int
34+
}
35+
36+
// nodeCount returns the total number of nodes in the pool, including
37+
// unclassified ones. Used for resolving percentage-based maxUnavailable.
38+
func (rs *rolloutState) nodeCount() int {
39+
return len(rs.idle) + len(rs.staging) + len(rs.staged) +
40+
len(rs.rebooting) + len(rs.degraded) + len(rs.unclassified)
41+
}
42+
43+
// driveRollout is the main function that advances the rollout state machine.
1944
func (r *BootcNodePoolReconciler) driveRollout(ctx context.Context, pool *bootcv1alpha1.BootcNodePool, ownedBootcNodes map[string]*bootcv1alpha1.BootcNode) error {
20-
log := logf.FromContext(ctx).WithValues("pool", pool.Name)
45+
log := logf.FromContext(ctx)
46+
47+
rs := buildRolloutState(log, ownedBootcNodes)
48+
49+
maxUnavail, err := resolveMaxUnavailable(pool, rs.nodeCount())
50+
if err != nil {
51+
return err
52+
}
53+
54+
avail := max(0, maxUnavail-rs.occupiedSlots)
55+
candidates := selectDrainCandidates(rs.staged, avail)
2156

57+
log.V(1).Info("Rollout state",
58+
"idle", len(rs.idle),
59+
"staging", len(rs.staging),
60+
"staged", len(rs.staged),
61+
"rebooting", len(rs.rebooting),
62+
"degraded", len(rs.degraded),
63+
"unclassified", nodeNames(rs.unclassified),
64+
"occupiedSlots", rs.occupiedSlots,
65+
"maxUnavailable", maxUnavail,
66+
"availableSlots", avail,
67+
"candidates", nodeNames(candidates),
68+
)
69+
70+
return nil
71+
}
72+
73+
// buildRolloutState classifies all owned BootcNodes and counts occupied
74+
// reboot slots.
75+
func buildRolloutState(log logr.Logger, ownedBootcNodes map[string]*bootcv1alpha1.BootcNode) *rolloutState {
76+
rs := &rolloutState{}
2277
for _, bn := range ownedBootcNodes {
78+
// Count occupied reboot slots from the persistent annotation.
79+
if metav1.HasAnnotation(bn.ObjectMeta, bootcv1alpha1.AnnotationInRebootSlot) {
80+
rs.occupiedSlots++
81+
}
82+
2383
state, err := classifyNode(bn)
2484
if err != nil {
25-
// This can happen transiently (e.g. daemon hasn't populated
26-
// booted status yet). Skip the node; it will be re-evaluated
27-
// when the daemon updates the BootcNode.
28-
log.V(1).Info("Skipping unclassifiable node", "node", bn.Name, "error", err)
85+
// as mentioned in classifyNode(), should never happen...
86+
log.Info("WARNING: skipping unclassifiable node", "node", bn.Name, "error", err)
87+
rs.unclassified = append(rs.unclassified, bn)
2988
continue
3089
}
3190
log.V(1).Info("Classified node", "node", bn.Name, "state", state.String())
91+
92+
switch state {
93+
case nodeStateIdle:
94+
rs.idle = append(rs.idle, bn)
95+
case nodeStateStaging:
96+
rs.staging = append(rs.staging, bn)
97+
case nodeStateStaged:
98+
rs.staged = append(rs.staged, bn)
99+
case nodeStateRebooting:
100+
rs.rebooting = append(rs.rebooting, bn)
101+
case nodeStateDegraded:
102+
rs.degraded = append(rs.degraded, bn)
103+
}
104+
}
105+
return rs
106+
}
107+
108+
// resolveMaxUnavailable computes the effective maxUnavailable value from the
109+
// pool's rollout spec. Defaults to 1 when unset. A value of 0 is allowed and
110+
// means no reboot slots are available (effectively paused). Returns an
111+
// invalidSpecError if the value is malformed.
112+
func resolveMaxUnavailable(pool *bootcv1alpha1.BootcNodePool, nodeCount int) (int, error) {
113+
if pool.Spec.Rollout != nil && pool.Spec.Rollout.Paused {
114+
return 0, nil
115+
}
116+
if pool.Spec.Rollout == nil || pool.Spec.Rollout.MaxUnavailable == nil {
117+
return 1, nil
32118
}
33119

34-
return nil
120+
// We roundUp here; this matches Deployments maxUnavailable for example
121+
v, err := intstr.GetScaledValueFromIntOrPercent(pool.Spec.Rollout.MaxUnavailable, nodeCount, true)
122+
if err != nil {
123+
return 0, newInvalidSpecError(fmt.Sprintf("invalid maxUnavailable %q: %v", pool.Spec.Rollout.MaxUnavailable.String(), err))
124+
}
125+
return v, nil
126+
}
127+
128+
// selectDrainCandidates picks Staged nodes that need the drain flow started or
129+
// restarted. Nodes that already have the in-reboot-slot annotation are always
130+
// included (e.g. they had a slot before a controller restart and need their
131+
// drain restarted). These nodes are already counted in occupiedSlots so they
132+
// don't consume availableSlots. Beyond those, up to availableSlots unslotted
133+
// nodes are appended, sorted alphabetically.
134+
func selectDrainCandidates(staged []*bootcv1alpha1.BootcNode, availableSlots int) []*bootcv1alpha1.BootcNode {
135+
if len(staged) == 0 {
136+
return nil
137+
}
138+
139+
// Partition into already-slotted vs new candidates.
140+
var slotted, unslotted []*bootcv1alpha1.BootcNode
141+
for _, bn := range staged {
142+
if metav1.HasAnnotation(bn.ObjectMeta, bootcv1alpha1.AnnotationInRebootSlot) {
143+
slotted = append(slotted, bn)
144+
} else {
145+
unslotted = append(unslotted, bn)
146+
}
147+
}
148+
slices.SortFunc(slotted, func(a, b *bootcv1alpha1.BootcNode) int {
149+
return strings.Compare(a.Name, b.Name)
150+
})
151+
slices.SortFunc(unslotted, func(a, b *bootcv1alpha1.BootcNode) int {
152+
return strings.Compare(a.Name, b.Name)
153+
})
154+
155+
// Always re-select slotted nodes. Fill remaining capacity with
156+
// unslotted nodes.
157+
result := slotted
158+
if availableSlots > 0 && len(unslotted) > 0 {
159+
n := min(availableSlots, len(unslotted))
160+
result = append(result, unslotted[:n]...)
161+
}
162+
163+
if len(result) == 0 {
164+
return nil
165+
}
166+
return result
167+
}
168+
169+
// nodeNames returns the names of the given BootcNodes for logging.
170+
func nodeNames(nodes []*bootcv1alpha1.BootcNode) []string {
171+
names := make([]string, len(nodes))
172+
for i, n := range nodes {
173+
names[i] = n.Name
174+
}
175+
return names
35176
}
36177

37178
// nodeState represents the effective state of a BootcNode as seen by

internal/controller/rollout_test.go

Lines changed: 144 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,150 @@ package controller
55
import (
66
"testing"
77

8+
"github.com/go-logr/logr"
89
. "github.com/onsi/gomega"
910
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11+
"k8s.io/apimachinery/pkg/util/intstr"
1012

1113
bootcv1alpha1 "github.com/jlebon/bootc-operator/api/v1alpha1"
14+
testutil "github.com/jlebon/bootc-operator/test/util"
1215
)
1316

17+
func TestBuildRolloutState(t *testing.T) {
18+
const (
19+
desiredImage = testImageDigestRefA
20+
otherDigest = testDigestB
21+
)
22+
23+
g := NewWithT(t)
24+
25+
// Classification of each state is tested in TestClassifyNode. This
26+
// test focuses more on aggregation: bucketing, slot counting, and
27+
// nodeCount.
28+
nodes := map[string]*bootcv1alpha1.BootcNode{
29+
"idle": testutil.NewNode("idle", desiredImage,
30+
testutil.WithBootedDigest(testDigestA),
31+
testutil.WithNodeCondition(bootcv1alpha1.NodeIdle, metav1.ConditionTrue, bootcv1alpha1.NodeReasonIdle)),
32+
"staged": testutil.NewNode("staged", desiredImage,
33+
testutil.WithBootedDigest(otherDigest),
34+
testutil.WithNodeCondition(bootcv1alpha1.NodeIdle, metav1.ConditionFalse, bootcv1alpha1.NodeReasonStaged)),
35+
"rebooting-1": testutil.NewNode("rebooting-1", desiredImage,
36+
testutil.WithBootedDigest(otherDigest),
37+
testutil.WithNodeCondition(bootcv1alpha1.NodeIdle, metav1.ConditionFalse, bootcv1alpha1.NodeReasonRebooting),
38+
testutil.WithNodeAnnotation(bootcv1alpha1.AnnotationInRebootSlot, "")),
39+
"rebooting-2": testutil.NewNode("rebooting-2", desiredImage,
40+
testutil.WithBootedDigest(otherDigest),
41+
testutil.WithNodeCondition(bootcv1alpha1.NodeIdle, metav1.ConditionFalse, bootcv1alpha1.NodeReasonRebooting),
42+
testutil.WithNodeAnnotation(bootcv1alpha1.AnnotationInRebootSlot, "")),
43+
}
44+
45+
rs := buildRolloutState(logr.Discard(), nodes)
46+
47+
g.Expect(rs.idle).To(HaveLen(1))
48+
g.Expect(rs.staged).To(HaveLen(1))
49+
g.Expect(rs.rebooting).To(HaveLen(2))
50+
g.Expect(rs.occupiedSlots).To(Equal(2))
51+
g.Expect(rs.unclassified).To(BeEmpty())
52+
g.Expect(rs.nodeCount()).To(Equal(4))
53+
}
54+
55+
func TestResolveMaxUnavailable(t *testing.T) {
56+
intstrPtr := func(v intstr.IntOrString) *intstr.IntOrString { return &v }
57+
58+
tests := []struct {
59+
name string
60+
maxUnavailable *intstr.IntOrString
61+
paused bool
62+
nodeCount int
63+
want int
64+
wantErr bool
65+
}{
66+
{
67+
name: "nil maxUnavailable defaults to 1",
68+
maxUnavailable: nil,
69+
nodeCount: 10,
70+
want: 1,
71+
},
72+
{
73+
name: "int value",
74+
maxUnavailable: intstrPtr(intstr.FromInt32(3)),
75+
nodeCount: 10,
76+
want: 3,
77+
},
78+
{
79+
name: "percentage rounds up",
80+
maxUnavailable: intstrPtr(intstr.FromString("25%")),
81+
nodeCount: 10,
82+
want: 3,
83+
},
84+
{
85+
name: "paused returns 0 regardless of maxUnavailable",
86+
maxUnavailable: intstrPtr(intstr.FromInt32(3)),
87+
paused: true,
88+
nodeCount: 10,
89+
want: 0,
90+
},
91+
{
92+
name: "invalid string returns error",
93+
maxUnavailable: intstrPtr(intstr.FromString("banana")),
94+
nodeCount: 10,
95+
wantErr: true,
96+
},
97+
}
98+
99+
for _, tt := range tests {
100+
t.Run(tt.name, func(t *testing.T) {
101+
g := NewWithT(t)
102+
var opts []testutil.PoolOption
103+
if tt.maxUnavailable != nil {
104+
opts = append(opts, testutil.WithMaxUnavailable(*tt.maxUnavailable))
105+
}
106+
if tt.paused {
107+
opts = append(opts, testutil.WithPaused(true))
108+
}
109+
pool := testutil.NewPool("p", testImageDigestRefA, opts...)
110+
got, err := resolveMaxUnavailable(pool, tt.nodeCount)
111+
if tt.wantErr {
112+
g.Expect(err).To(HaveOccurred())
113+
g.Expect(isInvalidSpecError(err)).To(BeTrue())
114+
return
115+
}
116+
g.Expect(err).NotTo(HaveOccurred())
117+
g.Expect(got).To(Equal(tt.want))
118+
})
119+
}
120+
}
121+
122+
func TestSelectDrainCandidates(t *testing.T) {
123+
g := NewWithT(t)
124+
125+
makeBN := func(name string, slotted bool) *bootcv1alpha1.BootcNode {
126+
bn := &bootcv1alpha1.BootcNode{}
127+
bn.Name = name
128+
if slotted {
129+
bn.Annotations = map[string]string{bootcv1alpha1.AnnotationInRebootSlot: ""}
130+
}
131+
return bn
132+
}
133+
134+
// Core behavior: slotted nodes always included and come first
135+
// (alphabetical), then unslotted up to availableSlots (alphabetical).
136+
staged := []*bootcv1alpha1.BootcNode{
137+
makeBN("node-c", false),
138+
makeBN("node-b", true),
139+
makeBN("node-a", true),
140+
makeBN("node-d", false),
141+
}
142+
candidates := selectDrainCandidates(staged, 1)
143+
g.Expect(candidates).To(HaveLen(3))
144+
g.Expect(candidates[0].Name).To(Equal("node-a")) // slotted, alphabetical
145+
g.Expect(candidates[1].Name).To(Equal("node-b")) // slotted, alphabetical
146+
g.Expect(candidates[2].Name).To(Equal("node-c")) // first unslotted by name
147+
148+
// Zero available slots with no slotted nodes returns nil.
149+
g.Expect(selectDrainCandidates([]*bootcv1alpha1.BootcNode{makeBN("x", false)}, 0)).To(BeNil())
150+
}
151+
14152
func TestClassifyNode(t *testing.T) {
15153
const (
16154
desiredImage = testImageDigestRefA
@@ -103,20 +241,14 @@ func TestClassifyNode(t *testing.T) {
103241
t.Run(tt.name, func(t *testing.T) {
104242
g := NewWithT(t)
105243

106-
bn := &bootcv1alpha1.BootcNode{
107-
Spec: bootcv1alpha1.BootcNodeSpec{
108-
DesiredImage: desiredImage,
109-
},
110-
Status: bootcv1alpha1.BootcNodeStatus{
111-
Conditions: tt.conditions,
112-
},
113-
}
114-
bn.Name = nodeName
244+
var opts []testutil.NodeOption
115245
if tt.bootedDigest != "" {
116-
bn.Status.Booted = &bootcv1alpha1.ImageInfo{
117-
ImageDigest: tt.bootedDigest,
118-
}
246+
opts = append(opts, testutil.WithBootedDigest(tt.bootedDigest))
247+
}
248+
for _, c := range tt.conditions {
249+
opts = append(opts, testutil.WithNodeCondition(c.Type, c.Status, c.Reason))
119250
}
251+
bn := testutil.NewNode(nodeName, desiredImage, opts...)
120252
got, err := classifyNode(bn)
121253
g.Expect(err).NotTo(HaveOccurred())
122254
g.Expect(got).To(Equal(tt.want))

0 commit comments

Comments
 (0)