linode · tchinmai7 · May 15, 2026 · May 15, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/clients/clients.go b/clients/clients.go
@@ -24,7 +24,7 @@ type LinodeClient interface {
 	LinodeFirewallClient
 	LinodeTokenClient
 	LinodeInterfacesClient
-
+	LinodeMaintenanceClient
 	OnAfterResponse(m func(response *resty.Response) error)
 }
 
@@ -134,6 +134,10 @@ type LinodeInterfacesClient interface {
 	ListInterfaceFirewalls(ctx context.Context, linodeID int, interfaceID int, opts *linodego.ListOptions) ([]linodego.Firewall, error)
 }
 
+// LinodeMaintenanceClient is the subset of the Linode client used to look up
+// account maintenance events.
+type LinodeMaintenanceClient interface {
+	// ListMaintenances returns account maintenances. opts is handed to the
+	// underlying client; the cluster controller uses it to supply a filter.
+	ListMaintenances(ctx context.Context, opts *linodego.ListOptions) ([]linodego.AccountMaintenance, error)
+}
+
 type K8sClient interface {
 	client.Client
 }

diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
@@ -21,11 +21,27 @@ rules:
   - cluster.x-k8s.io
   resources:
   - clusters
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - cluster.x-k8s.io
+  resources:
   - machines
   verbs:
   - get
   - list
+  - patch
   - watch
+- apiGroups:
+  - cluster.x-k8s.io
+  resources:
+  - machines/status
+  verbs:
+  - get
+  - patch
+  - update
 - apiGroups:
   - infrastructure.cluster.x-k8s.io
   resources:

diff --git a/docs/src/topics/health-checking.md b/docs/src/topics/health-checking.md
@@ -23,3 +23,96 @@ on the infrastructure provider.
 
 Refer to the [Cluster API documentation](https://cluster-api.sigs.k8s.io/tasks/automated-machine-management/healthchecking)
 for further information on configuring and using `MachineHealthChecks`.
+
+## Replacing Machines Scheduled for Maintenance
+
+CAPL detects upcoming Linode infrastructure maintenance windows and sets a `MaintenanceScheduled` condition on
+the corresponding CAPI `Machine` objects. This condition can be used as a trigger for `MachineHealthCheck` to
+automatically replace machines before their maintenance window begins.
+
+### How it works
+
+During each `LinodeCluster` reconciliation, CAPL queries the Linode API for maintenance events scheduled within
+the next 72 hours. For each Linode instance that matches a `LinodeMachine` in the cluster, CAPL sets:
+
+```yaml
+condition:
+  type: MaintenanceScheduled
+  status: "True"
+```
+
+on the owning CAPI `Machine` object. A `MachineHealthCheck` with `unhealthyMachineConditions` targeting this
+condition will then trigger remediation — replacing the machine before the maintenance window starts.
+
+### Example MachineHealthCheck
+
+The following `MachineHealthCheck` replaces worker machines when `MaintenanceScheduled=True` has been set for
+more than 1 hour:
+
+```yaml
+apiVersion: cluster.x-k8s.io/v1beta2
+kind: MachineHealthCheck
+metadata:
+  name: ${CLUSTER_NAME}-maintenance
+spec:
+  clusterName: ${CLUSTER_NAME}
+  selector:
+    matchLabels:
+      cluster.x-k8s.io/deployment-name: ${CLUSTER_NAME}
+  checks:
+    unhealthyMachineConditions:
+      - type: MaintenanceScheduled
+        status: "True"
+        timeoutSeconds: 3600
+  remediation:
+    triggerIf:
+      unhealthyLessThanOrEqualTo: 1
+```
+
+For control plane machines managed by `KubeadmControlPlane`:
+
+```yaml
+apiVersion: cluster.x-k8s.io/v1beta2
+kind: MachineHealthCheck
+metadata:
+  name: ${CLUSTER_NAME}-cp-maintenance
+spec:
+  clusterName: ${CLUSTER_NAME}
+  selector:
+    matchLabels:
+      cluster.x-k8s.io/control-plane: ""
+  checks:
+    unhealthyMachineConditions:
+      - type: MaintenanceScheduled
+        status: "True"
+        timeoutSeconds: 3600
+  remediation:
+    triggerIf:
+      unhealthyLessThanOrEqualTo: 1
+```
+
+### Field reference
+
+| Field | Description |
+|-------|-------------|
+| `checks.unhealthyMachineConditions` | Conditions checked on the CAPI `Machine` object (not the Node). `MaintenanceScheduled` is set here by CAPL. |
+| `type: MaintenanceScheduled` | The condition type set by CAPL when a Linode maintenance event is scheduled within 72 hours. |
+| `status: "True"` | The condition status that indicates maintenance is scheduled. |
+| `timeoutSeconds` | How long the condition must be present before remediation is triggered. Set this to a value less than the expected lead time before the maintenance window starts. |
+| `remediation.triggerIf.unhealthyLessThanOrEqualTo` | Remediation is only triggered while the number of unhealthy machines is at or below this value, which prevents remediation when too many machines are already unhealthy. For control plane machines, set this to `1` to avoid remediating multiple control plane nodes simultaneously and losing etcd quorum. |
+
+### Choosing a timeout
+
+CAPL sets `MaintenanceScheduled` up to 72 hours before the maintenance window. With a `timeoutSeconds` of `3600`
+(1 hour), remediation can therefore begin as early as roughly 71 hours before the window. Adjust this value based on
+how much lead time your workloads require for graceful draining.
+
+### Limitations
+
+- Only machines owned by a `MachineSet` or `KubeadmControlPlane` can be remediated by a `MachineHealthCheck`.
+  Standalone machines are not eligible.
+- CAPL never clears the `MaintenanceScheduled` condition once it has been set. This is intentional: the
+  `MachineHealthCheck` is expected to replace the machine, so the condition goes away together with the old `Machine`.
+- Control plane remediation preserves etcd quorum: CAPI will not remediate a second control plane machine
+  until the replacement for the first is healthy. Set `unhealthyLessThanOrEqualTo: 1` for control plane
+  `MachineHealthChecks` to prevent simultaneous replacements.
diff --git a/internal/controller/linodecluster_controller.go b/internal/controller/linodecluster_controller.go
@@ -24,13 +24,16 @@ import (
 	"time"
 
 	"github.com/go-logr/logr"
+	"github.com/linode/linodego"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	utilerrors "k8s.io/apimachinery/pkg/util/errors"
 	"k8s.io/client-go/tools/events"
 	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
 	kutil "sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/conditions"
+	"sigs.k8s.io/cluster-api/util/patch"
 	"sigs.k8s.io/cluster-api/util/paused"
 	"sigs.k8s.io/cluster-api/util/predicates"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -54,8 +57,11 @@ const (
 	lbTypeNB                                string = "NodeBalancer"
 	ConditionPreflightLinodeVPCReady        string = "PreflightLinodeVPCReady"
 	ConditionPreflightLinodeNBFirewallReady string = "PreflightLinodeNBFirewallReady"
+	ConditionMaintenanceScheduled           string = "MaintenanceScheduled"
 )
 
+var threeDays = 72 * time.Hour // look-ahead window added to time.Now() when filtering scheduled maintenances
+
 // LinodeClusterReconciler reconciles a LinodeCluster object
 type LinodeClusterReconciler struct {
 	client.Client
@@ -69,7 +75,8 @@ type LinodeClusterReconciler struct {
 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=linodeclusters,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=linodeclusters/status,verbs=get;update;patch
 // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=linodeclusters/finalizers,verbs=update
-
+// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines,verbs=get;watch;list;patch
+// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines/status,verbs=get;update;patch
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.
 
@@ -218,9 +225,92 @@ func (r *LinodeClusterReconciler) reconcile(
 		return retryIfTransient(err, logger)
 	}
 
+	if err := r.setMaintenanceConditions(ctx, clusterScope, logger); err != nil {
+		return retryIfTransient(err, logger)
+	}
+
 	return res, nil
 }
 
+// setMaintenanceConditions sets the MaintenanceScheduled=True condition on the
+// CAPI Machine that owns each LinodeMachine returned by collectMaintenanceInfo.
+//
+// A failure while handling one machine does not stop the remaining machines
+// from being processed; every per-machine failure is collected and returned
+// as one aggregate error. An error from collectMaintenanceInfo is returned as is.
+func (r *LinodeClusterReconciler) setMaintenanceConditions(ctx context.Context, clusterScope *scope.ClusterScope, logger logr.Logger) error {
+	affected, err := r.collectMaintenanceInfo(ctx, clusterScope, logger)
+	if err != nil {
+		return err
+	}
+
+	var failures []error
+	for i := range affected {
+		if markErr := flagOwnerMachineForMaintenance(ctx, clusterScope, logger, &affected[i]); markErr != nil {
+			failures = append(failures, markErr)
+		}
+	}
+	return utilerrors.NewAggregate(failures)
+}
+
+// flagOwnerMachineForMaintenance patches the Machine owning linodeMachine so it
+// carries the MaintenanceScheduled=True condition. A LinodeMachine without an
+// owner Machine is logged and skipped without error.
+func flagOwnerMachineForMaintenance(ctx context.Context, clusterScope *scope.ClusterScope, logger logr.Logger, linodeMachine *infrav1alpha2.LinodeMachine) error {
+	owner, err := kutil.GetOwnerMachine(ctx, clusterScope.Client, linodeMachine.ObjectMeta)
+	if err != nil {
+		return fmt.Errorf("failed to get owner Machine for LinodeMachine %s: %w", linodeMachine.Name, err)
+	}
+	if owner == nil {
+		logger.Info("no owner Machine found for LinodeMachine, skipping", "LinodeMachine", linodeMachine.Name)
+		return nil
+	}
+
+	// The helper must be created before the object is mutated so the patch
+	// is computed against the unmodified Machine.
+	helper, err := patch.NewHelper(owner, clusterScope.Client)
+	if err != nil {
+		return fmt.Errorf("failed to create patch helper for Machine %s: %w", owner.Name, err)
+	}
+
+	scheduled := metav1.Condition{
+		Type:   ConditionMaintenanceScheduled,
+		Status: metav1.ConditionTrue,
+		Reason: ConditionMaintenanceScheduled,
+	}
+	conditions.Set(owner, scheduled)
+
+	if err := helper.Patch(ctx, owner); err != nil {
+		return fmt.Errorf("failed to patch Machine %s: %w", owner.Name, err)
+	}
+	return nil
+}
+
+// collectMaintenanceInfo returns the LinodeMachines of the cluster whose Linode
+// instance has a maintenance in "scheduled" status due within the threeDays window.
+//
+// It asks the Linode API for maintenances with status "scheduled" whose "when"
+// is at or before now+threeDays, keeps the entries whose entity type is
+// "linode", and matches their entity IDs against Spec.InstanceID of the
+// cluster's LinodeMachines. LinodeMachines without an InstanceID are ignored.
+//
+// The returned slice is nil when nothing matches. Errors from building the
+// filter, from the Linode API, or from listing LinodeMachines are returned.
+func (r *LinodeClusterReconciler) collectMaintenanceInfo(ctx context.Context, clusterScope *scope.ClusterScope, logger logr.Logger) ([]infrav1alpha2.LinodeMachine, error) {
+	// Upper bound for the "when" filter, formatted without a zone suffix
+	// because the API doesn't like RFC3339.
+	cutoff := time.Now().Add(threeDays).UTC().Format("2006-01-02T15:04:05")
+	f := linodego.Filter{}
+	f.AddField(linodego.Eq, "status", "scheduled")
+	f.AddField(linodego.Lte, "when", cutoff)
+	filter, err := f.MarshalJSON()
+	if err != nil {
+		return nil, fmt.Errorf("could not marshal filter: %w", err)
+	}
+	maintenances, err := clusterScope.LinodeClient.ListMaintenances(ctx, &linodego.ListOptions{Filter: string(filter)})
+	if err != nil {
+		logger.Error(err, "Failed to fetch maintenance information from Linode API")
+		return nil, err
+	}
+
+	// Set of Linode instance IDs that have a scheduled maintenance.
+	instanceIDs := make(map[int]struct{}, len(maintenances))
+	for _, maint := range maintenances {
+		// Maintenances without an entity, or for anything other than a
+		// Linode instance, cannot be matched to a LinodeMachine.
+		if maint.Entity == nil || maint.Entity.Type != "linode" {
+			continue
+		}
+		instanceIDs[maint.Entity.ID] = struct{}{}
+	}
+
+	var machinesForMaintenance []infrav1alpha2.LinodeMachine
+	if len(instanceIDs) == 0 {
+		// Nothing to match against, so skip listing the cluster's machines.
+		return machinesForMaintenance, nil
+	}
+
+	linodeMachines, err := util.GetLinodeMachinesForCluster(ctx, clusterScope.Client, clusterScope.Cluster)
+	if err != nil {
+		return nil, err
+	}
+
+	for i := range linodeMachines.Items {
+		lm := &linodeMachines.Items[i] // index access avoids copying each item just to inspect it
+		if lm.Spec.InstanceID == nil {
+			continue
+		}
+		if _, ok := instanceIDs[*lm.Spec.InstanceID]; ok {
+			logger.Info("Found maintenance information for", "LinodeMachine", lm.Name, "id", *lm.Spec.InstanceID)
+			machinesForMaintenance = append(machinesForMaintenance, *lm)
+		}
+	}
+	return machinesForMaintenance, nil
+}
+
 func (r *LinodeClusterReconciler) performPreflightChecks(ctx context.Context, logger logr.Logger, clusterScope *scope.ClusterScope) (ctrl.Result, error) {
 	// Check VPC configuration - either direct ID or reference
 	if clusterScope.LinodeCluster.Spec.VPCID != nil || clusterScope.LinodeCluster.Spec.VPCRef != nil {