Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions docs/ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -396,14 +396,16 @@ The controller determines whether a node is up to date by comparing
`spec.desiredImage` against `status.booted.imageDigest`. It does not
rely on the daemon's `Idle` condition for this.

The controller classifies each node into one of five effective states.
The controller classifies each node into one of six effective states.
The `Degraded` condition is checked first (takes priority over activity
state).

| Effective state | Determination | Reconciler action |
|-----------------|-----------------------------------------------------------|------------------------------------------------------|
| Degraded | BootcNode `Degraded=True` | Mark pool degraded |
| Idle | `desiredImage == booted` and `Idle=True` | If in reboot slot: free slot only once node is Ready |
| UpToDate | `desiredImage == booted` | If in reboot slot: free slot only once node is Ready |
| Pending | No booted status, or `desiredImage != booted` with no | Wait for daemon to report or react |
| | actionable Idle reason (daemon hasn't reported/reacted) | |
| Staging | `desiredImage != booted`, `Idle=False reason=Staging` | Wait (non-disruptive) |
| Staged | `desiredImage != booted`, `Idle=False reason=Staged` | If reboot slot available: assign slot; else wait |
| Rebooting | `desiredImage != booted`, `Idle=False reason=Rebooting` | Wait for node to come back |
Expand All @@ -414,9 +416,9 @@ governed by three rules:
1. **A node can only take a reboot slot when healthy** -- only Staged
nodes (not Degraded) are candidates for reboot slots.
2. **A node can only release a reboot slot when healthy** -- after
reboot, the slot is held until the node is Idle (`Idle=True`, not
Degraded) and the K8s Node is Ready. A node that is Degraded or
not Ready post-reboot holds its slot indefinitely.
reboot, the slot is held until the node is Idle (`desiredImage ==
booted`, not Degraded) and the K8s Node is Ready. A node that is
Degraded or not Ready post-reboot holds its slot indefinitely.
3. **2 unhealthy nodes in reboot slots stop the rollout** -- when 2 or
more nodes occupying reboot slots are unhealthy (Degraded or not
Ready), the controller stops assigning new slots. A single
Expand Down
15 changes: 9 additions & 6 deletions internal/controller/bootcnodepool_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,8 @@ func (r *BootcNodePoolReconciler) syncMembership(ctx context.Context, pool *boot
} else {
// New match: create BootcNode and label the node.
log.Info("Creating BootcNode for new match", "node", nodeName)
if err := r.createBootcNode(ctx, pool, node); err != nil {
bn, err := r.createBootcNode(ctx, pool, node)
if err != nil {
if !apierrors.IsAlreadyExists(err) {
return nil, fmt.Errorf("creating BootcNode for %s: %w", nodeName, err)
}
Expand All @@ -390,6 +391,8 @@ func (r *BootcNodePoolReconciler) syncMembership(ctx context.Context, pool *boot
conflicting[owner.Name] = true
}
}
} else {
ownedSet[nodeName] = bn
}
}
}
Expand Down Expand Up @@ -484,7 +487,7 @@ func desiredImageFromPool(pool *bootcv1alpha1.BootcNodePool) string {

// createBootcNode creates a BootcNode for a node joining the pool and
// labels the node as managed.
func (r *BootcNodePoolReconciler) createBootcNode(ctx context.Context, pool *bootcv1alpha1.BootcNodePool, node *corev1.Node) error {
func (r *BootcNodePoolReconciler) createBootcNode(ctx context.Context, pool *bootcv1alpha1.BootcNodePool, node *corev1.Node) (*bootcv1alpha1.BootcNode, error) {
bn := &bootcv1alpha1.BootcNode{
ObjectMeta: metav1.ObjectMeta{
Name: node.Name,
Expand All @@ -503,19 +506,19 @@ func (r *BootcNodePoolReconciler) createBootcNode(ctx context.Context, pool *boo
// Set ownerReference so the BootcNode is cleaned up if the pool is
// deleted and so the Owns() watch routes BootcNode events to this pool.
if err := controllerutil.SetControllerReference(pool, bn, r.Scheme); err != nil {
return fmt.Errorf("setting owner reference: %w", err)
return nil, fmt.Errorf("setting owner reference: %w", err)
}

if err := r.Create(ctx, bn); err != nil {
return fmt.Errorf("creating BootcNode: %w", err)
return nil, fmt.Errorf("creating BootcNode: %w", err)
}

// Label the node as managed.
if err := r.ensureManagedLabel(ctx, node, true); err != nil {
return fmt.Errorf("labeling node: %w", err)
return nil, fmt.Errorf("labeling node: %w", err)
}

return nil
return bn, nil
}

// ensureManagedLabel adds or removes the bootc.dev/managed label on a Node.
Expand Down
98 changes: 72 additions & 26 deletions internal/controller/rollout.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ import (
// pass.
type rolloutState struct {
// nodes are sorted into these buckets
idle []*bootcv1alpha1.BootcNode
upToDate []*bootcv1alpha1.BootcNode
pending []*bootcv1alpha1.BootcNode
staging []*bootcv1alpha1.BootcNode
staged []*bootcv1alpha1.BootcNode
rebooting []*bootcv1alpha1.BootcNode
Expand All @@ -42,7 +43,7 @@ type rolloutState struct {
// nodeCount returns the total number of nodes in the pool, including
// unclassified ones. Used for resolving percentage-based maxUnavailable.
func (rs *rolloutState) nodeCount() int {
return len(rs.idle) + len(rs.staging) + len(rs.staged) +
return len(rs.upToDate) + len(rs.pending) + len(rs.staging) + len(rs.staged) +
len(rs.rebooting) + len(rs.degraded) + len(rs.unclassified)
}

Expand All @@ -58,6 +59,13 @@ func (r *BootcNodePoolReconciler) driveRollout(ctx context.Context, pool *bootcv

rs := buildRolloutState(log, ownedBootcNodes)

// Free reboot slots for nodes that have successfully rebooted into
// the desired image. This runs before computing available slots so
// that freed capacity is immediately usable for new candidates.
if err := r.freeCompletedSlots(ctx, rs); err != nil {
return fmt.Errorf("freeing completed slots: %w", err)
}

maxUnavail, err := resolveMaxUnavailable(pool, rs.nodeCount())
if err != nil {
return err
Expand All @@ -67,7 +75,8 @@ func (r *BootcNodePoolReconciler) driveRollout(ctx context.Context, pool *bootcv
candidates := selectDrainCandidates(rs.staged, avail)

log.V(1).Info("Rollout state",
"idle", len(rs.idle),
"upToDate", len(rs.upToDate),
"pending", len(rs.pending),
"staging", len(rs.staging),
"staged", len(rs.staged),
"rebooting", len(rs.rebooting),
Expand Down Expand Up @@ -147,9 +156,40 @@ func (r *BootcNodePoolReconciler) assignRebootSlot(ctx context.Context, bn *boot
return nil
}

// freeCompletedSlots releases reboot slots for upToDate nodes that are Ready.
// It decrements rs.occupiedSlots for each freed slot so the caller's available
// slot count is accurate.
func (r *BootcNodePoolReconciler) freeCompletedSlots(ctx context.Context, rs *rolloutState) error {
log := logf.FromContext(ctx)

for _, bn := range rs.upToDate {
if !metav1.HasAnnotation(bn.ObjectMeta, bootcv1alpha1.AnnotationInRebootSlot) {
continue
}

var node corev1.Node
if err := r.Get(ctx, types.NamespacedName{Name: bn.Name}, &node); err != nil {
return fmt.Errorf("fetching node %s: %w", bn.Name, err)
}

if nodeReadyStatus(&node) != corev1.ConditionTrue {
log.V(1).Info("Node rebooted but not yet Ready, holding reboot slot", "node", bn.Name)
continue
}

log.Info("Freeing reboot slot: node healthy and Ready", "node", bn.Name)
if err := r.freeRebootSlot(ctx, bn, &node); err != nil {
return fmt.Errorf("freeing reboot slot for %s: %w", bn.Name, err)
}
rs.occupiedSlots--
}

return nil
}

// freeRebootSlot releases a node's reboot slot by restoring its prior cordon
// state and removing annotations from the BootcNode.
func (r *BootcNodePoolReconciler) freeRebootSlot(ctx context.Context, bn *bootcv1alpha1.BootcNode, node *corev1.Node) error { //nolint:unused // used by post-reboot handling
func (r *BootcNodePoolReconciler) freeRebootSlot(ctx context.Context, bn *bootcv1alpha1.BootcNode, node *corev1.Node) error {
if err := r.restoreCordonState(ctx, bn, node); err != nil {
return err
}
Expand Down Expand Up @@ -298,8 +338,10 @@ func buildRolloutState(log logr.Logger, ownedBootcNodes map[string]*bootcv1alpha
log.V(1).Info("Classified node", "node", bn.Name, "state", state.String())

switch state {
case nodeStateIdle:
rs.idle = append(rs.idle, bn)
case nodeStateUpToDate:
rs.upToDate = append(rs.upToDate, bn)
case nodeStatePending:
rs.pending = append(rs.pending, bn)
case nodeStateStaging:
rs.staging = append(rs.staging, bn)
case nodeStateStaged:
Expand Down Expand Up @@ -388,9 +430,13 @@ func nodeNames(nodes []*bootcv1alpha1.BootcNode) []string {
type nodeState int

const (
// nodeStateIdle means the node is running the desired image and the
// daemon has no active update cycle.
nodeStateIdle nodeState = iota
// nodeStateUpToDate means the node is running the desired image.
nodeStateUpToDate nodeState = iota

// nodeStatePending means the node's state is indeterminate: the
// daemon hasn't reported yet (no booted status), or hasn't reacted
// to a desiredImage change.
nodeStatePending

// nodeStateStaging means the daemon is pulling/staging the image.
nodeStateStaging
Expand All @@ -409,8 +455,10 @@ const (

func (s nodeState) String() string {
switch s {
case nodeStateIdle:
return "Idle"
case nodeStateUpToDate:
return "UpToDate"
case nodeStatePending:
return "Pending"
case nodeStateStaging:
return "Staging"
case nodeStateStaged:
Expand All @@ -432,12 +480,12 @@ func classifyNode(bn *bootcv1alpha1.BootcNode) (nodeState, error) {
}

if bn.Status.Booted == nil {
// The only way this can happen really is on a brand new BootcNode and
// the daemon is still being provisioned. Let's just treat this as Idle
// for now and it should resolve in a future reconciliation (though
// ideally eventually we can handle 'stuck' states like this... see
// The only way this can happen really is on a brand new
// BootcNode and the daemon is still being provisioned. It
// should resolve in a future reconciliation (though ideally
// eventually we can handle 'stuck' states like this... see
// related comment below).
return nodeStateIdle, nil
return nodeStatePending, nil
}

// We could pass in the pool here and use targetDigest instead to avoid
Expand All @@ -456,7 +504,7 @@ func classifyNode(bn *bootcv1alpha1.BootcNode) (nodeState, error) {
if digested.Digest().String() == bn.Status.Booted.ImageDigest {
// Image matches; nothing for the controller to act on
// regardless of whether the daemon has settled yet.
return nodeStateIdle, nil
return nodeStateUpToDate, nil
}

// OK, the node isn't yet booting the desired digest. Let's dig into where
Expand All @@ -474,14 +522,12 @@ func classifyNode(bn *bootcv1alpha1.BootcNode) (nodeState, error) {
}
}

// Image doesn't match and daemon is either Idle, has no conditions,
// or has an unrecognized Idle reason. Classify as Idle since the
// daemon should eventually react to the spec change.

// Hmm, daemon is idle... weird. It could just be that we're racing with the
// daemon reconciliation. Or something more broken might be happening (e.g.
// daemon not running at all). For now we don't try to detect 'stuck' nodes,
// but may in the future. It'll still show up as holding up the pool's
// Image doesn't match and daemon is either Idle, has no conditions, or
// has an unrecognized Idle reason. This likely means we're racing with
// the daemon reconciliation (it hasn't reacted to the spec change
// yet), or something more broken is happening (e.g. daemon not running
// at all). For now we don't try to detect 'stuck' nodes, but may in
// the future. It'll still show up as holding up the pool's
// `deployedDigest` field and the updatingCount stat.
return nodeStateIdle, nil
return nodeStatePending, nil
}
Loading