linode
diff --git a/‎clients/clients.go‎
Lines changed: 5 additions & 1 deletion b/‎clients/clients.go‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎docs/src/topics/health-checking.md‎
Lines changed: 93 additions & 0 deletions b/‎docs/src/topics/health-checking.md‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎internal/controller/linodecluster_controller.go‎
Lines changed: 77 additions & 0 deletions b/‎internal/controller/linodecluster_controller.go‎
Lines changed: 77 additions & 0 deletions
@@ -24,7 +24,7 @@ type LinodeClient interface {
 	LinodeFirewallClient
 	LinodeTokenClient
 	LinodeInterfacesClient
-
+	LinodeMaintenanceClient
 	OnAfterResponse(m func(response *resty.Response) error)
 }
 
@@ -134,6 +134,10 @@ type LinodeInterfacesClient interface {
 	ListInterfaceFirewalls(ctx context.Context, linodeID int, interfaceID int, opts *linodego.ListOptions) ([]linodego.Firewall, error)
 }
 
+// LinodeMaintenanceClient defines the methods used to list maintenance
+// entries reported for the Linode account.
+type LinodeMaintenanceClient interface {
+	// ListMaintenances returns account maintenance entries; opts carries the
+	// list options (for example a filter) applied to the request.
+	ListMaintenances(ctx context.Context, opts *linodego.ListOptions) ([]linodego.AccountMaintenance, error)
+}
+
 type K8sClient interface {
 	client.Client
 }
 
@@ -23,3 +23,96 @@ on the infrastructure provider.
 
 Refer to the [Cluster API documentation](https://cluster-api.sigs.k8s.io/tasks/automated-machine-management/healthchecking)
 for further information on configuring and using `MachineHealthChecks`.
+
+## Replacing Machines Scheduled for Maintenance
+
+CAPL detects upcoming Linode infrastructure maintenance windows and sets a `MaintenanceScheduled` condition on
+the corresponding CAPI `Machine` objects. This condition can be used as a trigger for `MachineHealthCheck` to
+automatically replace machines before their maintenance window begins.
+
+### How it works
+
+During each `LinodeCluster` reconciliation, CAPL queries the Linode API for maintenance events in the `scheduled`
+state that are due within the next 72 hours. For each affected Linode instance whose label matches the name of a
+`LinodeMachine` in the cluster, CAPL sets:
+
+```yaml
+condition:
+  type: MaintenanceScheduled
+  status: "True"
+```
+
+on the owning CAPI `Machine` object. A `MachineHealthCheck` with `unhealthyMachineConditions` targeting this
+condition will then trigger remediation — replacing the machine before the maintenance window starts.
+
+### Example MachineHealthCheck
+
+The following `MachineHealthCheck` replaces worker machines when `MaintenanceScheduled=True` has been set for
+more than 1 hour:
+
+```yaml
+apiVersion: cluster.x-k8s.io/v1beta2
+kind: MachineHealthCheck
+metadata:
+  name: ${CLUSTER_NAME}-maintenance
+spec:
+  clusterName: ${CLUSTER_NAME}
+  selector:
+    matchLabels:
+      cluster.x-k8s.io/deployment-name: ${CLUSTER_NAME}
+  checks:
+    unhealthyMachineConditions:
+      - type: MaintenanceScheduled
+        status: "True"
+        timeoutSeconds: 3600
+  remediation:
+    triggerIf:
+      unhealthyLessThanOrEqualTo: 1
+```
+
+For control plane machines managed by `KubeadmControlPlane`:
+
+```yaml
+apiVersion: cluster.x-k8s.io/v1beta2
+kind: MachineHealthCheck
+metadata:
+  name: ${CLUSTER_NAME}-cp-maintenance
+spec:
+  clusterName: ${CLUSTER_NAME}
+  selector:
+    matchLabels:
+      cluster.x-k8s.io/control-plane: ""
+  checks:
+    unhealthyMachineConditions:
+      - type: MaintenanceScheduled
+        status: "True"
+        timeoutSeconds: 3600
+  remediation:
+    triggerIf:
+      unhealthyLessThanOrEqualTo: 1
+```
+
+### Field reference
+
+| Field | Description |
+|-------|-------------|
+| `checks.unhealthyMachineConditions` | Conditions checked on the CAPI `Machine` object (not the Node). `MaintenanceScheduled` is set here by CAPL. |
+| `type: MaintenanceScheduled` | The condition type set by CAPL when a Linode maintenance event is scheduled within 72 hours. |
+| `status: "True"` | The condition status that indicates maintenance is scheduled. |
+| `timeoutSeconds` | How long the condition must be present before remediation is triggered. Set this to a value less than the expected lead time before the maintenance window starts. |
+| `remediation.triggerIf.unhealthyLessThanOrEqualTo` | Prevents remediation if too many machines are already unhealthy. For control plane machines, set to `1` to avoid remediating multiple control plane nodes simultaneously and losing etcd quorum. |
+
+### Choosing a timeout
+
+CAPL sets `MaintenanceScheduled` up to 72 hours before the maintenance window. A `timeoutSeconds` of `3600`
+(1 hour) means remediation begins 71 hours before the window at the earliest. Adjust this value based on
+how much lead time your workloads require for graceful draining.
+
+### Limitations
+
+- Only machines owned by a `MachineSet` or `KubeadmControlPlane` can be remediated by a `MachineHealthCheck`.
+  Standalone machines are not eligible.
+- The `MaintenanceScheduled` condition is never cleared by CAPL: once set, it stays on the `Machine` until a
+  `MachineHealthCheck` replaces that machine, which is the intended behavior. The condition is not removed if the maintenance event is later cancelled or rescheduled.
+- Control plane remediation preserves etcd quorum: CAPI will not remediate a second control plane machine
+  until the replacement for the first is healthy. Set `unhealthyLessThanOrEqualTo: 1` for control plane
+  `MachineHealthChecks` to prevent simultaneous replacements.
@@ -24,13 +24,16 @@ import (
 	"time"
 
 	"github.com/go-logr/logr"
+	"github.com/linode/linodego"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	utilerrors "k8s.io/apimachinery/pkg/util/errors"
 	"k8s.io/client-go/tools/events"
 	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
 	kutil "sigs.k8s.io/cluster-api/util"
+	"sigs.k8s.io/cluster-api/util/conditions"
+	"sigs.k8s.io/cluster-api/util/patch"
 	"sigs.k8s.io/cluster-api/util/paused"
 	"sigs.k8s.io/cluster-api/util/predicates"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -54,6 +57,7 @@ const (
 	lbTypeNB                                string = "NodeBalancer"
 	ConditionPreflightLinodeVPCReady        string = "PreflightLinodeVPCReady"
 	ConditionPreflightLinodeNBFirewallReady string = "PreflightLinodeNBFirewallReady"
+	ConditionMaintenanceScheduled           string = "MaintenanceScheduled"
 )
 
 // LinodeClusterReconciler reconciles a LinodeCluster object
@@ -218,9 +222,82 @@ func (r *LinodeClusterReconciler) reconcile(
 		return retryIfTransient(err, logger)
 	}
 
+	if err := r.setMaintenanceConditions(ctx, clusterScope, logger); err != nil {
+		return retryIfTransient(err, logger)
+	}
+
 	return res, nil
 }
 
+// setMaintenanceConditions sets a MaintenanceScheduled=True condition on the
+// CAPI Machine that owns each LinodeMachine returned by collectMaintenanceInfo.
+//
+// A failure from collectMaintenanceInfo is returned immediately. After that,
+// machines are handled independently: a failure on one is recorded and the
+// loop moves on, and all recorded failures are returned as one aggregate
+// error. LinodeMachines without an owner Machine are logged and skipped.
+func (r *LinodeClusterReconciler) setMaintenanceConditions(ctx context.Context, clusterScope *scope.ClusterScope, logger logr.Logger) error {
+	affected, err := r.collectMaintenanceInfo(ctx, clusterScope, logger)
+	if err != nil {
+		return err
+	}
+
+	var failures []error
+	for idx := range affected {
+		lm := &affected[idx]
+
+		owner, ownerErr := kutil.GetOwnerMachine(ctx, clusterScope.Client, lm.ObjectMeta)
+		switch {
+		case ownerErr != nil:
+			failures = append(failures, fmt.Errorf("failed to get owner Machine for LinodeMachine %s: %w", lm.Name, ownerErr))
+			continue
+		case owner == nil:
+			logger.Info("no owner Machine found for LinodeMachine, skipping", "LinodeMachine", lm.Name)
+			continue
+		}
+
+		// The helper is created before the condition is set, so the later
+		// Patch call is given the Machine as it was before this change.
+		helper, helperErr := patch.NewHelper(owner, clusterScope.Client)
+		if helperErr != nil {
+			failures = append(failures, fmt.Errorf("failed to create patch helper for Machine %s: %w", owner.Name, helperErr))
+			continue
+		}
+
+		scheduled := metav1.Condition{
+			Type:               ConditionMaintenanceScheduled,
+			Status:             metav1.ConditionTrue,
+			LastTransitionTime: metav1.Now(),
+			Reason:             ConditionMaintenanceScheduled,
+		}
+		conditions.Set(owner, scheduled)
+
+		if patchErr := helper.Patch(ctx, owner); patchErr != nil {
+			failures = append(failures, fmt.Errorf("failed to patch Machine %s: %w", owner.Name, patchErr))
+		}
+	}
+	return utilerrors.NewAggregate(failures)
+}
+
+// collectMaintenanceInfo returns the LinodeMachines in clusterScope whose name
+// equals the label of a Linode instance that has a maintenance entry in
+// "scheduled" status with a "when" value no later than 72 hours from now.
+//
+// The status and time restrictions are sent to the API as a list filter.
+// Entries whose entity type is not "linode" are ignored. The result is nil
+// when nothing matches.
+//
+// Errors are returned wrapped with context and are not logged here, so each
+// failure is reported once by the caller.
+func (r *LinodeClusterReconciler) collectMaintenanceInfo(ctx context.Context, clusterScope *scope.ClusterScope, logger logr.Logger) ([]infrav1alpha2.LinodeMachine, error) {
+	const (
+		// maintenanceLookahead is how far ahead scheduled maintenance is considered.
+		maintenanceLookahead = 72 * time.Hour
+		// maintenanceTimeLayout omits the zone offset; the API doesn't like RFC3339.
+		maintenanceTimeLayout = "2006-01-02T15:04:05"
+	)
+	cutoff := time.Now().Add(maintenanceLookahead).UTC().Format(maintenanceTimeLayout)
+
+	f := linodego.Filter{}
+	f.AddField(linodego.Eq, "status", "scheduled")
+	f.AddField(linodego.Lte, "when", cutoff)
+	filter, err := f.MarshalJSON()
+	if err != nil {
+		return nil, fmt.Errorf("could not marshal filter: %w", err)
+	}
+
+	maintenances, err := clusterScope.LinodeClient.ListMaintenances(ctx, &linodego.ListOptions{Filter: string(filter)})
+	if err != nil {
+		return nil, fmt.Errorf("listing scheduled Linode maintenances: %w", err)
+	}
+
+	// Collect the labels of the Linode instances that have maintenance scheduled.
+	maintenanceLabels := make(map[string]struct{}, len(maintenances))
+	for i := range maintenances {
+		if maintenances[i].Entity.Type != "linode" {
+			continue
+		}
+		maintenanceLabels[maintenances[i].Entity.Label] = struct{}{}
+	}
+	if len(maintenanceLabels) == 0 {
+		return nil, nil
+	}
+
+	// Index into Items so a LinodeMachine is only copied when it matches.
+	var machinesForMaintenance []infrav1alpha2.LinodeMachine
+	items := clusterScope.LinodeMachines.Items
+	for i := range items {
+		if _, ok := maintenanceLabels[items[i].Name]; !ok {
+			continue
+		}
+		logger.Info("Found maintenance information for", "LinodeMachine", items[i].Name)
+		machinesForMaintenance = append(machinesForMaintenance, items[i])
+	}
+	return machinesForMaintenance, nil
+}
+
 func (r *LinodeClusterReconciler) performPreflightChecks(ctx context.Context, logger logr.Logger, clusterScope *scope.ClusterScope) (ctrl.Result, error) {
 	// Check VPC configuration - either direct ID or reference
 	if clusterScope.LinodeCluster.Spec.VPCID != nil || clusterScope.LinodeCluster.Spec.VPCRef != nil {
Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ type LinodeClient interface {`
`24`	`24`	`LinodeFirewallClient`
`25`	`25`	`LinodeTokenClient`
`26`	`26`	`LinodeInterfacesClient`
`27`		`-`
	`27`	`+ LinodeMaintenanceClient`
`28`	`28`	`OnAfterResponse(m func(response *resty.Response) error)`
`29`	`29`	`}`
`30`	`30`
`@@ -134,6 +134,10 @@ type LinodeInterfacesClient interface {`
`134`	`134`	`ListInterfaceFirewalls(ctx context.Context, linodeID int, interfaceID int, opts *linodego.ListOptions) ([]linodego.Firewall, error)`
`135`	`135`	`}`
`136`	`136`
	`137`	`+type LinodeMaintenanceClient interface {`
	`138`	`+ ListMaintenances(ctx context.Context, opts *linodego.ListOptions) ([]linodego.AccountMaintenance, error)`
	`139`	`+}`
	`140`	`+`
`137`	`141`	`type K8sClient interface {`
`138`	`142`	`client.Client`
`139`	`143`	`}`