Skip to content

Commit bcb06a5

Browse files
committed
[feat]: implement maintenance watcher for linodeCluster
1 parent 2eaea1e commit bcb06a5

6 files changed

Lines changed: 685 additions & 1 deletion

File tree

clients/clients.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ type LinodeClient interface {
2424
LinodeFirewallClient
2525
LinodeTokenClient
2626
LinodeInterfacesClient
27-
27+
LinodeMaintenanceClient
2828
OnAfterResponse(m func(response *resty.Response) error)
2929
}
3030

@@ -134,6 +134,10 @@ type LinodeInterfacesClient interface {
134134
ListInterfaceFirewalls(ctx context.Context, linodeID int, interfaceID int, opts *linodego.ListOptions) ([]linodego.Firewall, error)
135135
}
136136

137+
// LinodeMaintenanceClient defines the methods that interact with Linode's
// account maintenance listing.
type LinodeMaintenanceClient interface {
138+
// ListMaintenances returns account maintenance entries; opts carries the
// list options (for example a filter) passed along with the request.
ListMaintenances(ctx context.Context, opts *linodego.ListOptions) ([]linodego.AccountMaintenance, error)
139+
}
140+
137141
// K8sClient embeds client.Client and declares no additional methods of its own.
type K8sClient interface {
138142
client.Client
139143
}

docs/src/topics/health-checking.md

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,96 @@ on the infrastructure provider.
2323

2424
Refer to the [Cluster API documentation](https://cluster-api.sigs.k8s.io/tasks/automated-machine-management/healthchecking)
2525
for further information on configuring and using `MachineHealthChecks`.
26+
27+
## Replacing Machines Scheduled for Maintenance
28+
29+
CAPL detects upcoming Linode infrastructure maintenance windows and sets a `MaintenanceScheduled` condition on
30+
the corresponding CAPI `Machine` objects. This condition can be used as a trigger for `MachineHealthCheck` to
31+
automatically replace machines before their maintenance window begins.
32+
33+
### How it works
34+
35+
During each `LinodeCluster` reconciliation, CAPL queries the Linode API for maintenance events scheduled within
36+
the next 72 hours. For each Linode instance that matches a `LinodeMachine` in the cluster, CAPL sets:
37+
38+
```
39+
condition:
40+
type: MaintenanceScheduled
41+
status: "True"
42+
```
43+
44+
on the owning CAPI `Machine` object. A `MachineHealthCheck` with `unhealthyMachineConditions` targeting this
45+
condition will then trigger remediation — replacing the machine before the maintenance window starts.
46+
47+
### Example MachineHealthCheck
48+
49+
The following `MachineHealthCheck` replaces worker machines when `MaintenanceScheduled=True` has been set for
50+
more than 1 hour:
51+
52+
```yaml
53+
apiVersion: cluster.x-k8s.io/v1beta2
54+
kind: MachineHealthCheck
55+
metadata:
56+
name: ${CLUSTER_NAME}-maintenance
57+
spec:
58+
clusterName: ${CLUSTER_NAME}
59+
selector:
60+
matchLabels:
61+
cluster.x-k8s.io/deployment-name: ${CLUSTER_NAME}
62+
checks:
63+
unhealthyMachineConditions:
64+
- type: MaintenanceScheduled
65+
status: "True"
66+
timeoutSeconds: 3600
67+
remediation:
68+
triggerIf:
69+
unhealthyLessThanOrEqualTo: 1
70+
```
71+
72+
For control plane machines managed by `KubeadmControlPlane`:
73+
74+
```yaml
75+
apiVersion: cluster.x-k8s.io/v1beta2
76+
kind: MachineHealthCheck
77+
metadata:
78+
name: ${CLUSTER_NAME}-cp-maintenance
79+
spec:
80+
clusterName: ${CLUSTER_NAME}
81+
selector:
82+
matchLabels:
83+
cluster.x-k8s.io/control-plane: ""
84+
checks:
85+
unhealthyMachineConditions:
86+
- type: MaintenanceScheduled
87+
status: "True"
88+
timeoutSeconds: 3600
89+
remediation:
90+
triggerIf:
91+
unhealthyLessThanOrEqualTo: 1
92+
```
93+
94+
### Field reference
95+
96+
| Field | Description |
97+
|-------|-------------|
98+
| `checks.unhealthyMachineConditions` | Conditions checked on the CAPI `Machine` object (not the Node). `MaintenanceScheduled` is set here by CAPL. |
99+
| `type: MaintenanceScheduled` | The condition type set by CAPL when a Linode maintenance event is scheduled within 72 hours. |
100+
| `status: "True"` | The condition status that indicates maintenance is scheduled. |
101+
| `timeoutSeconds` | How long the condition must be present before remediation is triggered. Set this to a value less than the expected lead time before the maintenance window starts. |
102+
| `remediation.triggerIf.unhealthyLessThanOrEqualTo` | Prevents remediation if too many machines are already unhealthy. For control plane machines, set this to `1` to avoid remediating multiple control plane nodes simultaneously and losing etcd quorum. |
103+
104+
### Choosing a timeout
105+
106+
CAPL sets `MaintenanceScheduled` up to 72 hours before the maintenance window. A `timeoutSeconds` of `3600`
107+
(1 hour) means remediation begins 71 hours before the window at the earliest. Adjust this value based on
108+
how much lead time your workloads require for graceful draining.
109+
110+
### Limitations
111+
112+
- Only machines owned by a `MachineSet` or `KubeadmControlPlane` can be remediated by a `MachineHealthCheck`.
113+
Standalone machines are not eligible.
114+
- The `MaintenanceScheduled` condition is never explicitly cleared by CAPL. Once set, it stays on the `Machine`
115+
until the `MachineHealthCheck` replaces that machine, which is the intended behavior.
116+
- Control plane remediation preserves etcd quorum: CAPI will not remediate a second control plane machine
117+
until the replacement for the first is healthy. Set `unhealthyLessThanOrEqualTo: 1` for control plane
118+
`MachineHealthChecks` to prevent simultaneous replacements.

internal/controller/linodecluster_controller.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,16 @@ import (
2424
"time"
2525

2626
"github.com/go-logr/logr"
27+
"github.com/linode/linodego"
2728
corev1 "k8s.io/api/core/v1"
2829
apierrors "k8s.io/apimachinery/pkg/api/errors"
2930
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031
utilerrors "k8s.io/apimachinery/pkg/util/errors"
3132
"k8s.io/client-go/tools/events"
3233
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
3334
kutil "sigs.k8s.io/cluster-api/util"
35+
"sigs.k8s.io/cluster-api/util/conditions"
36+
"sigs.k8s.io/cluster-api/util/patch"
3437
"sigs.k8s.io/cluster-api/util/paused"
3538
"sigs.k8s.io/cluster-api/util/predicates"
3639
ctrl "sigs.k8s.io/controller-runtime"
@@ -54,6 +57,7 @@ const (
5457
lbTypeNB string = "NodeBalancer"
5558
ConditionPreflightLinodeVPCReady string = "PreflightLinodeVPCReady"
5659
ConditionPreflightLinodeNBFirewallReady string = "PreflightLinodeNBFirewallReady"
60+
ConditionMaintenanceScheduled string = "MaintenanceScheduled"
5761
)
5862

5963
// LinodeClusterReconciler reconciles a LinodeCluster object
@@ -218,9 +222,82 @@ func (r *LinodeClusterReconciler) reconcile(
218222
return retryIfTransient(err, logger)
219223
}
220224

225+
if err := r.setMaintenanceConditions(ctx, clusterScope, logger); err != nil {
226+
return retryIfTransient(err, logger)
227+
}
228+
221229
return res, nil
222230
}
223231

232+
func (r *LinodeClusterReconciler) setMaintenanceConditions(ctx context.Context, clusterScope *scope.ClusterScope, logger logr.Logger) error {
233+
linodeMachines, err := r.collectMaintenanceInfo(ctx, clusterScope, logger)
234+
if err != nil {
235+
return err
236+
}
237+
var errs []error
238+
for i := range linodeMachines {
239+
capiMachine, err := kutil.GetOwnerMachine(ctx, clusterScope.Client, linodeMachines[i].ObjectMeta)
240+
if err != nil {
241+
errs = append(errs, fmt.Errorf("failed to get owner Machine for LinodeMachine %s: %w", linodeMachines[i].Name, err))
242+
continue
243+
}
244+
if capiMachine == nil {
245+
logger.Info("no owner Machine found for LinodeMachine, skipping", "LinodeMachine", linodeMachines[i].Name)
246+
continue
247+
}
248+
patchHelper, err := patch.NewHelper(capiMachine, clusterScope.Client)
249+
if err != nil {
250+
errs = append(errs, fmt.Errorf("failed to create patch helper for Machine %s: %w", capiMachine.Name, err))
251+
continue
252+
}
253+
conditions.Set(capiMachine, metav1.Condition{
254+
Type: ConditionMaintenanceScheduled,
255+
Status: metav1.ConditionTrue,
256+
LastTransitionTime: metav1.Now(),
257+
Reason: ConditionMaintenanceScheduled,
258+
})
259+
if err := patchHelper.Patch(ctx, capiMachine); err != nil {
260+
errs = append(errs, fmt.Errorf("failed to patch Machine %s: %w", capiMachine.Name, err))
261+
continue
262+
}
263+
}
264+
return utilerrors.NewAggregate(errs)
265+
}
266+
267+
func (r *LinodeClusterReconciler) collectMaintenanceInfo(ctx context.Context, clusterScope *scope.ClusterScope, logger logr.Logger) ([]infrav1alpha2.LinodeMachine, error) {
268+
// Fetch all maintenance information
269+
threeDaysLater := time.Now().Add(72 * time.Hour).UTC().Format("2006-01-02T15:04:05") // API doesn't like RFC3339
270+
f := linodego.Filter{}
271+
f.AddField(linodego.Eq, "status", "scheduled")
272+
f.AddField(linodego.Lte, "when", threeDaysLater)
273+
filter, err := f.MarshalJSON()
274+
if err != nil {
275+
return nil, fmt.Errorf("could not marshal filter: %w", err)
276+
}
277+
maintenances, err := clusterScope.LinodeClient.ListMaintenances(ctx, &linodego.ListOptions{Filter: string(filter)})
278+
if err != nil {
279+
logger.Error(err, "Failed to fetch maintenance information from Linode API")
280+
return nil, err
281+
}
282+
283+
maintenanceLabels := make(map[string]struct{}, len(maintenances))
284+
for _, maint := range maintenances {
285+
if maint.Entity.Type != "linode" {
286+
continue
287+
}
288+
maintenanceLabels[maint.Entity.Label] = struct{}{}
289+
}
290+
291+
var machinesForMaintenance []infrav1alpha2.LinodeMachine
292+
for _, lm := range clusterScope.LinodeMachines.Items {
293+
if _, ok := maintenanceLabels[lm.Name]; ok {
294+
logger.Info("Found maintenance information for", "LinodeMachine", lm.Name)
295+
machinesForMaintenance = append(machinesForMaintenance, lm)
296+
}
297+
}
298+
return machinesForMaintenance, nil
299+
}
300+
224301
func (r *LinodeClusterReconciler) performPreflightChecks(ctx context.Context, logger logr.Logger, clusterScope *scope.ClusterScope) (ctrl.Result, error) {
225302
// Check VPC configuration - either direct ID or reference
226303
if clusterScope.LinodeCluster.Spec.VPCID != nil || clusterScope.LinodeCluster.Spec.VPCRef != nil {

0 commit comments

Comments
 (0)