diff --git a/clients/clients.go b/clients/clients.go index ce954fe8d..c321fb396 100644 --- a/clients/clients.go +++ b/clients/clients.go @@ -24,7 +24,7 @@ type LinodeClient interface { LinodeFirewallClient LinodeTokenClient LinodeInterfacesClient - + LinodeMaintenanceClient OnAfterResponse(m func(response *resty.Response) error) } @@ -134,6 +134,10 @@ type LinodeInterfacesClient interface { ListInterfaceFirewalls(ctx context.Context, linodeID int, interfaceID int, opts *linodego.ListOptions) ([]linodego.Firewall, error) } +type LinodeMaintenanceClient interface { + ListMaintenances(ctx context.Context, opts *linodego.ListOptions) ([]linodego.AccountMaintenance, error) +} + type K8sClient interface { client.Client } diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index a0aa974ef..8f5a5f8b2 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -21,11 +21,27 @@ rules: - cluster.x-k8s.io resources: - clusters + verbs: + - get + - list + - watch +- apiGroups: + - cluster.x-k8s.io + resources: - machines verbs: - get - list + - patch - watch +- apiGroups: + - cluster.x-k8s.io + resources: + - machines/status + verbs: + - get + - patch + - update - apiGroups: - infrastructure.cluster.x-k8s.io resources: diff --git a/docs/src/topics/health-checking.md b/docs/src/topics/health-checking.md index 254c6fee7..381ac7bf5 100644 --- a/docs/src/topics/health-checking.md +++ b/docs/src/topics/health-checking.md @@ -23,3 +23,96 @@ on the infrastructure provider. Refer to the [Cluster API documentation](https://cluster-api.sigs.k8s.io/tasks/automated-machine-management/healthchecking) for further information on configuring and using `MachineHealthChecks`. + +## Replacing Machines Scheduled for Maintenance + +CAPL detects upcoming Linode infrastructure maintenance windows and sets a `MaintenanceScheduled` condition on +the corresponding CAPI `Machine` objects. This condition can be used as a trigger for `MachineHealthCheck` to +automatically replace machines before their maintenance window begins. + +### How it works + +During each `LinodeCluster` reconciliation, CAPL queries the Linode API for maintenance events scheduled within +the next 72 hours. For each Linode instance that matches a `LinodeMachine` in the cluster, CAPL sets: + +``` +condition: + type: MaintenanceScheduled + status: "True" +``` + +on the owning CAPI `Machine` object. A `MachineHealthCheck` with `unhealthyMachineConditions` targeting this +condition will then trigger remediation — replacing the machine before the maintenance window starts. + +### Example MachineHealthCheck + +The following `MachineHealthCheck` replaces worker machines when `MaintenanceScheduled=True` has been set for +more than 1 hour: + +```yaml +apiVersion: cluster.x-k8s.io/v1beta2 +kind: MachineHealthCheck +metadata: + name: ${CLUSTER_NAME}-maintenance +spec: + clusterName: ${CLUSTER_NAME} + selector: + matchLabels: + cluster.x-k8s.io/deployment-name: ${CLUSTER_NAME} + checks: + unhealthyMachineConditions: + - type: MaintenanceScheduled + status: "True" + timeoutSeconds: 3600 + remediation: + triggerIf: + unhealthyLessThanOrEqualTo: 1 +``` + +For control plane machines managed by `KubeadmControlPlane`: + +```yaml +apiVersion: cluster.x-k8s.io/v1beta2 +kind: MachineHealthCheck +metadata: + name: ${CLUSTER_NAME}-cp-maintenance +spec: + clusterName: ${CLUSTER_NAME} + selector: + matchLabels: + cluster.x-k8s.io/control-plane: "" + checks: + unhealthyMachineConditions: + - type: MaintenanceScheduled + status: "True" + timeoutSeconds: 3600 + remediation: + triggerIf: + unhealthyLessThanOrEqualTo: 1 +``` + +### Field reference + +| Field | Description | +|-------|-------------| +| `checks.unhealthyMachineConditions` | Conditions checked on the CAPI `Machine` object (not the Node). `MaintenanceScheduled` is set here by CAPL. | +| `type: MaintenanceScheduled` | The condition type set by CAPL when a Linode maintenance event is scheduled within 72 hours. | +| `status: "True"` | The condition status that indicates maintenance is scheduled. | +| `timeoutSeconds` | How long the condition must be present before remediation is triggered. Set this to a value less than the expected lead time before the maintenance window starts. | +| `remediation.triggerIf.unhealthyLessThanOrEqualTo` | Prevents remediation if too many machines are already unhealthy. For control plane clusters, set to `1` to avoid remediating multiple control plane nodes simultaneously and losing etcd quorum. | + +### Choosing a timeout + +CAPL sets `MaintenanceScheduled` up to 72 hours before the maintenance window. A `timeoutSeconds` of `3600` +(1 hour) means remediation begins 71 hours before the window at the earliest. Adjust this value based on +how much lead time your workloads require for graceful draining. + +### Limitations + +- Only machines owned by a `MachineSet` or `KubeadmControlPlane` can be remediated by a `MachineHealthCheck`. + Standalone machines are not eligible. +- The `MaintenanceScheduled` condition is never explicitly cleared by CAPL. Machines will be replaced by the + `MachineHealthCheck` before the condition is removed, which is the intended behavior. +- Control plane remediation preserves etcd quorum: CAPI will not remediate a second control plane machine + until the replacement for the first is healthy. Set `unhealthyLessThanOrEqualTo: 1` for control plane + `MachineHealthChecks` to prevent simultaneous replacements. diff --git a/internal/controller/linodecluster_controller.go b/internal/controller/linodecluster_controller.go index b8473eebb..fa6bbcb91 100644 --- a/internal/controller/linodecluster_controller.go +++ b/internal/controller/linodecluster_controller.go @@ -24,6 +24,7 @@ import ( "time" "github.com/go-logr/logr" + "github.com/linode/linodego" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -31,6 +32,8 @@ import ( "k8s.io/client-go/tools/events" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" kutil "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/conditions" + "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/paused" "sigs.k8s.io/cluster-api/util/predicates" ctrl "sigs.k8s.io/controller-runtime" @@ -54,8 +57,11 @@ const ( lbTypeNB string = "NodeBalancer" ConditionPreflightLinodeVPCReady string = "PreflightLinodeVPCReady" ConditionPreflightLinodeNBFirewallReady string = "PreflightLinodeNBFirewallReady" + ConditionMaintenanceScheduled string = "MaintenanceScheduled" ) +var threeDays = 72 * time.Hour + // LinodeClusterReconciler reconciles a LinodeCluster object type LinodeClusterReconciler struct { client.Client @@ -69,7 +75,8 @@ type LinodeClusterReconciler struct { // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=linodeclusters,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=linodeclusters/status,verbs=get;update;patch // +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=linodeclusters/finalizers,verbs=update - +// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines,verbs=get;watch;list;patch +// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines/status,verbs=get;update;patch // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. @@ -218,9 +225,92 @@ func (r *LinodeClusterReconciler) reconcile( return retryIfTransient(err, logger) } + if err := r.setMaintenanceConditions(ctx, clusterScope, logger); err != nil { + return retryIfTransient(err, logger) + } + return res, nil } +func (r *LinodeClusterReconciler) setMaintenanceConditions(ctx context.Context, clusterScope *scope.ClusterScope, logger logr.Logger) error { + linodeMachines, err := r.collectMaintenanceInfo(ctx, clusterScope, logger) + if err != nil { + return err + } + var errs []error + for _, lm := range linodeMachines { + capiMachine, err := kutil.GetOwnerMachine(ctx, clusterScope.Client, lm.ObjectMeta) + if err != nil { + errs = append(errs, fmt.Errorf("failed to get owner Machine for LinodeMachine %s: %w", lm.Name, err)) + continue + } + if capiMachine == nil { + logger.Info("no owner Machine found for LinodeMachine, skipping", "LinodeMachine", lm.Name) + continue + } + patchHelper, err := patch.NewHelper(capiMachine, clusterScope.Client) + if err != nil { + errs = append(errs, fmt.Errorf("failed to create patch helper for Machine %s: %w", capiMachine.Name, err)) + continue + } + conditions.Set(capiMachine, metav1.Condition{ + Type: ConditionMaintenanceScheduled, + Status: metav1.ConditionTrue, + Reason: ConditionMaintenanceScheduled, + }) + if err := patchHelper.Patch(ctx, capiMachine); err != nil { + errs = append(errs, fmt.Errorf("failed to patch Machine %s: %w", capiMachine.Name, err)) + continue + } + } + return utilerrors.NewAggregate(errs) +} + +func (r *LinodeClusterReconciler) collectMaintenanceInfo(ctx context.Context, clusterScope *scope.ClusterScope, logger logr.Logger) ([]infrav1alpha2.LinodeMachine, error) { + // Fetch all maintenance information + threeDaysLater := time.Now().Add(threeDays).UTC().Format("2006-01-02T15:04:05") // API doesn't like RFC3339 + f := linodego.Filter{} + f.AddField(linodego.Eq, "status", "scheduled") + f.AddField(linodego.Lte, "when", threeDaysLater) + filter, err := f.MarshalJSON() + if err != nil { + return nil, fmt.Errorf("could not marshal filter: %w", err) + } + maintenances, err := clusterScope.LinodeClient.ListMaintenances(ctx, &linodego.ListOptions{Filter: string(filter)}) + if err != nil { + logger.Error(err, "Failed to fetch maintenance information from Linode API") + return nil, err + } + + maintenanceLabels := make(map[int]struct{}, len(maintenances)) + for _, maint := range maintenances { + if maint.Entity == nil { + continue + } + if maint.Entity.Type != "linode" { + continue + } + maintenanceLabels[maint.Entity.ID] = struct{}{} + } + + var machinesForMaintenance []infrav1alpha2.LinodeMachine + linodeMachines, err := util.GetLinodeMachinesForCluster(ctx, clusterScope.Client, clusterScope.Cluster) + if err != nil { + return nil, err + } + + for _, lm := range linodeMachines.Items { + if lm.Spec.InstanceID == nil { + continue + } + if _, ok := maintenanceLabels[*lm.Spec.InstanceID]; ok { + logger.Info("Found maintenance information for", "LinodeMachine", lm.Name, "id", *lm.Spec.InstanceID) + machinesForMaintenance = append(machinesForMaintenance, lm) + } + } + return machinesForMaintenance, nil +} + func (r *LinodeClusterReconciler) performPreflightChecks(ctx context.Context, logger logr.Logger, clusterScope *scope.ClusterScope) (ctrl.Result, error) { // Check VPC configuration - either direct ID or reference if clusterScope.LinodeCluster.Spec.VPCID != nil || clusterScope.LinodeCluster.Spec.VPCRef != nil { diff --git a/internal/controller/linodecluster_controller_maintenance_test.go b/internal/controller/linodecluster_controller_maintenance_test.go new file mode 100644 index 000000000..0b8b65ed5 --- /dev/null +++ b/internal/controller/linodecluster_controller_maintenance_test.go @@ -0,0 +1,473 @@ +/* +Copyright 2023 Akamai Technologies, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "errors" + "fmt" + "testing" + + "github.com/go-logr/logr/testr" + "github.com/linode/linodego" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" + "sigs.k8s.io/controller-runtime/pkg/client" + fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake" + + infrav1alpha2 "github.com/linode/cluster-api-provider-linode/api/v1alpha2" + "github.com/linode/cluster-api-provider-linode/cloud/scope" + "github.com/linode/cluster-api-provider-linode/mock" + "github.com/linode/cluster-api-provider-linode/util" +) + +const clusterLabelKey = "cluster.x-k8s.io/cluster-name" + +func testSchemeForMaintenance(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + require.NoError(t, clientgoscheme.AddToScheme(s)) + require.NoError(t, clusterv1.AddToScheme(s)) + require.NoError(t, infrav1alpha2.AddToScheme(s)) + return s +} + +func newLinodeMachineWithID(name, ns, clusterName string, instanceID int) infrav1alpha2.LinodeMachine { + return infrav1alpha2.LinodeMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + Labels: map[string]string{ + clusterLabelKey: clusterName, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Machine", + Name: name, + UID: types.UID("uid-" + name), + }, + }, + }, + Spec: infrav1alpha2.LinodeMachineSpec{ + InstanceID: util.Pointer(instanceID), + }, + } +} + +func maintenanceForID(id int) linodego.AccountMaintenance { + return linodego.AccountMaintenance{ + Entity: &linodego.Entity{ + Type: "linode", + ID: id, + }, + Status: "scheduled", + } +} + +func newCapiCluster(name, ns string) *clusterv1.Cluster { + return &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, + } +} + +// newCapiMachine UID must match the OwnerReference UID set by newLinodeMachineWithID. +func newCapiMachine(name, ns string) *clusterv1.Machine { + return &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + UID: types.UID("uid-" + name), + }, + } +} + +func TestCollectMaintenanceInfo(t *testing.T) { + t.Parallel() + + const clusterName = "test-cluster" + const ns = defaultNamespace + + scheme := testSchemeForMaintenance(t) + + tests := []struct { + name string + instanceIDs []int + apiMaintenances []linodego.AccountMaintenance + apiError error + expectedNames []string + expectError bool + }{ + { + name: "no maintenance scheduled", + instanceIDs: []int{101, 102}, + apiMaintenances: []linodego.AccountMaintenance{}, + expectedNames: nil, + }, + { + name: "one machine has maintenance", + instanceIDs: []int{101, 102}, + apiMaintenances: []linodego.AccountMaintenance{maintenanceForID(101)}, + expectedNames: []string{"machine-0"}, + }, + { + name: "multiple machines have maintenance", + instanceIDs: []int{101, 102, 103}, + apiMaintenances: []linodego.AccountMaintenance{ + maintenanceForID(101), + maintenanceForID(103), + }, + expectedNames: []string{"machine-0", "machine-2"}, + }, + { + name: "maintenance entity ID not matching any machine is ignored", + instanceIDs: []int{101}, + apiMaintenances: []linodego.AccountMaintenance{maintenanceForID(999)}, + expectedNames: nil, + }, + { + name: "non-linode entity type is ignored", + instanceIDs: []int{101}, + apiMaintenances: []linodego.AccountMaintenance{ + {Entity: &linodego.Entity{Type: "volume", ID: 101}, Status: "scheduled"}, + }, + expectedNames: nil, + }, + { + name: "nil entity is ignored", + instanceIDs: []int{101}, + apiMaintenances: []linodego.AccountMaintenance{ + {Entity: nil, Status: "scheduled"}, + }, + expectedNames: nil, + }, + { + name: "maintenance API call fails", + instanceIDs: []int{101}, + apiError: errors.New("linode API unavailable"), + expectError: true, + }, + { + name: "no LinodeMachines in cluster", + instanceIDs: []int{}, + apiMaintenances: []linodego.AccountMaintenance{maintenanceForID(101)}, + expectedNames: nil, + }, + } + + for _, tt := range tests { + tc := tt + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + mockLinodeClient := mock.NewMockLinodeClient(mockCtrl) + mockLinodeClient.EXPECT(). + ListMaintenances(gomock.Any(), gomock.Any()). + Return(tc.apiMaintenances, tc.apiError) + + cluster := newCapiCluster(clusterName, ns) + objs := []client.Object{cluster} + for i, id := range tc.instanceIDs { + lm := newLinodeMachineWithID(fmt.Sprintf("machine-%d", i), ns, clusterName, id) + objs = append(objs, &lm) + } + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() + + clusterScope := &scope.ClusterScope{ + Cluster: cluster, + LinodeCluster: &infrav1alpha2.LinodeCluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns}}, + LinodeClient: mockLinodeClient, + Client: fakeClient, + } + + result, err := (&LinodeClusterReconciler{}).collectMaintenanceInfo(context.Background(), clusterScope, testr.New(t)) + + if tc.expectError { + require.Error(t, err) + return + } + require.NoError(t, err) + + var resultNames []string + for _, lm := range result { + resultNames = append(resultNames, lm.Name) + } + assert.ElementsMatch(t, tc.expectedNames, resultNames) + }) + } +} + +func TestSetMaintenanceConditions(t *testing.T) { + t.Parallel() + + const clusterName = "test-cluster" + const ns = defaultNamespace + + scheme := testSchemeForMaintenance(t) + + t.Run("no maintenance — no conditions set", func(t *testing.T) { + t.Parallel() + mockCtrl := gomock.NewController(t) + ml := mock.NewMockLinodeClient(mockCtrl) + ml.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{}, nil) + + cluster := newCapiCluster(clusterName, ns) + lm := newLinodeMachineWithID("machine-1", ns, clusterName, 101) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithObjects(cluster, &lm).Build() + cs := &scope.ClusterScope{ + Cluster: cluster, + LinodeCluster: &infrav1alpha2.LinodeCluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns}}, + LinodeClient: ml, + Client: fakeClient, + } + require.NoError(t, (&LinodeClusterReconciler{}).setMaintenanceConditions(context.Background(), cs, testr.New(t))) + }) + + t.Run("one machine in maintenance — MaintenanceScheduled condition set", func(t *testing.T) { + t.Parallel() + mockCtrl := gomock.NewController(t) + ml := mock.NewMockLinodeClient(mockCtrl) + ml.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{ + maintenanceForID(101), + }, nil) + + cluster := newCapiCluster(clusterName, ns) + lm := newLinodeMachineWithID("machine-1", ns, clusterName, 101) + machine := newCapiMachine("machine-1", ns) + fakeClient := fakeclient.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cluster, &lm, machine). + WithStatusSubresource(machine). + Build() + cs := &scope.ClusterScope{ + Cluster: cluster, + LinodeCluster: &infrav1alpha2.LinodeCluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns}}, + LinodeClient: ml, + Client: fakeClient, + } + require.NoError(t, (&LinodeClusterReconciler{}).setMaintenanceConditions(context.Background(), cs, testr.New(t))) + + updated := &clusterv1.Machine{} + require.NoError(t, fakeClient.Get(context.Background(), client.ObjectKey{Name: "machine-1", Namespace: ns}, updated)) + var found bool + for _, c := range updated.Status.Conditions { + if c.Type == ConditionMaintenanceScheduled { + found = true + assert.Equal(t, metav1.ConditionTrue, c.Status) + } + } + assert.True(t, found, "expected MaintenanceScheduled condition to be set") + }) + + t.Run("two machines in maintenance — both get conditions set", func(t *testing.T) { + t.Parallel() + mockCtrl := gomock.NewController(t) + ml := mock.NewMockLinodeClient(mockCtrl) + ml.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{ + maintenanceForID(101), + maintenanceForID(102), + }, nil) + + cluster := newCapiCluster(clusterName, ns) + lm1 := newLinodeMachineWithID("machine-1", ns, clusterName, 101) + lm2 := newLinodeMachineWithID("machine-2", ns, clusterName, 102) + m1, m2 := newCapiMachine("machine-1", ns), newCapiMachine("machine-2", ns) + fakeClient := fakeclient.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cluster, &lm1, &lm2, m1, m2). + WithStatusSubresource(m1, m2). + Build() + cs := &scope.ClusterScope{ + Cluster: cluster, + LinodeCluster: &infrav1alpha2.LinodeCluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns}}, + LinodeClient: ml, + Client: fakeClient, + } + require.NoError(t, (&LinodeClusterReconciler{}).setMaintenanceConditions(context.Background(), cs, testr.New(t))) + + for _, name := range []string{"machine-1", "machine-2"} { + updated := &clusterv1.Machine{} + require.NoError(t, fakeClient.Get(context.Background(), client.ObjectKey{Name: name, Namespace: ns}, updated)) + var found bool + for _, c := range updated.Status.Conditions { + if c.Type == ConditionMaintenanceScheduled { + found = true + } + } + assert.Truef(t, found, "expected MaintenanceScheduled condition on %s", name) + } + }) + + t.Run("only machines matching maintenance are patched — other machines unaffected", func(t *testing.T) { + t.Parallel() + mockCtrl := gomock.NewController(t) + ml := mock.NewMockLinodeClient(mockCtrl) + ml.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{ + maintenanceForID(101), + }, nil) + + cluster := newCapiCluster(clusterName, ns) + lm1 := newLinodeMachineWithID("machine-1", ns, clusterName, 101) + lm2 := newLinodeMachineWithID("machine-2", ns, clusterName, 102) + m1, m2 := newCapiMachine("machine-1", ns), newCapiMachine("machine-2", ns) + fakeClient := fakeclient.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cluster, &lm1, &lm2, m1, m2). + WithStatusSubresource(m1, m2). + Build() + cs := &scope.ClusterScope{ + Cluster: cluster, + LinodeCluster: &infrav1alpha2.LinodeCluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns}}, + LinodeClient: ml, + Client: fakeClient, + } + require.NoError(t, (&LinodeClusterReconciler{}).setMaintenanceConditions(context.Background(), cs, testr.New(t))) + + m1Updated := &clusterv1.Machine{} + require.NoError(t, fakeClient.Get(context.Background(), client.ObjectKey{Name: "machine-1", Namespace: ns}, m1Updated)) + var m1HasCondition bool + for _, c := range m1Updated.Status.Conditions { + if c.Type == ConditionMaintenanceScheduled { + m1HasCondition = true + } + } + assert.True(t, m1HasCondition, "machine-1 should have MaintenanceScheduled condition") + + m2Updated := &clusterv1.Machine{} + require.NoError(t, fakeClient.Get(context.Background(), client.ObjectKey{Name: "machine-2", Namespace: ns}, m2Updated)) + for _, c := range m2Updated.Status.Conditions { + assert.NotEqual(t, ConditionMaintenanceScheduled, c.Type, "machine-2 should not have MaintenanceScheduled condition") + } + }) + + t.Run("LinodeMachine has no InstanceID — skipped without error", func(t *testing.T) { + t.Parallel() + mockCtrl := gomock.NewController(t) + ml := mock.NewMockLinodeClient(mockCtrl) + ml.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{ + maintenanceForID(101), + }, nil) + + cluster := newCapiCluster(clusterName, ns) + lmNoID := infrav1alpha2.LinodeMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-noid", + Namespace: ns, + Labels: map[string]string{clusterLabelKey: clusterName}, + }, + } + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithObjects(cluster, &lmNoID).Build() + cs := &scope.ClusterScope{ + Cluster: cluster, + LinodeCluster: &infrav1alpha2.LinodeCluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns}}, + LinodeClient: ml, + Client: fakeClient, + } + require.NoError(t, (&LinodeClusterReconciler{}).setMaintenanceConditions(context.Background(), cs, testr.New(t))) + }) + + t.Run("LinodeMachine has no owner reference — skipped without error", func(t *testing.T) { + t.Parallel() + mockCtrl := gomock.NewController(t) + ml := mock.NewMockLinodeClient(mockCtrl) + ml.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{ + maintenanceForID(101), + }, nil) + + cluster := newCapiCluster(clusterName, ns) + lmNoOwner := infrav1alpha2.LinodeMachine{ + ObjectMeta: metav1.ObjectMeta{ + Name: "machine-orphan", + Namespace: ns, + Labels: map[string]string{clusterLabelKey: clusterName}, + }, + Spec: infrav1alpha2.LinodeMachineSpec{ + InstanceID: util.Pointer(101), + }, + } + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithObjects(cluster, &lmNoOwner).Build() + cs := &scope.ClusterScope{ + Cluster: cluster, + LinodeCluster: &infrav1alpha2.LinodeCluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns}}, + LinodeClient: ml, + Client: fakeClient, + } + require.NoError(t, (&LinodeClusterReconciler{}).setMaintenanceConditions(context.Background(), cs, testr.New(t))) + }) + + t.Run("GetOwnerMachine fails — error aggregated", func(t *testing.T) { + t.Parallel() + mockCtrl := gomock.NewController(t) + ml := mock.NewMockLinodeClient(mockCtrl) + mk := mock.NewMockK8sClient(mockCtrl) + + lm := newLinodeMachineWithID("machine-1", ns, clusterName, 101) + lmList := infrav1alpha2.LinodeMachineList{Items: []infrav1alpha2.LinodeMachine{lm}} + + ml.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{ + maintenanceForID(101), + }, nil) + mk.EXPECT(). + List(gomock.Any(), gomock.AssignableToTypeOf(&infrav1alpha2.LinodeMachineList{}), gomock.Any()). + DoAndReturn(func(_ context.Context, list *infrav1alpha2.LinodeMachineList, _ ...client.ListOption) error { + *list = lmList + return nil + }) + mk.EXPECT(). + Get(gomock.Any(), gomock.Any(), gomock.AssignableToTypeOf(&clusterv1.Machine{}), gomock.Any()). + Return(errors.New("API server unavailable")) + + cs := &scope.ClusterScope{ + Cluster: newCapiCluster(clusterName, ns), + LinodeCluster: &infrav1alpha2.LinodeCluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns}}, + LinodeClient: ml, + Client: mk, + } + err := (&LinodeClusterReconciler{}).setMaintenanceConditions(context.Background(), cs, testr.New(t)) + require.Error(t, err) + assert.Contains(t, err.Error(), "failed to get owner Machine") + }) + + t.Run("ListMaintenances API error — returned immediately", func(t *testing.T) { + t.Parallel() + mockCtrl := gomock.NewController(t) + ml := mock.NewMockLinodeClient(mockCtrl) + ml.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return(nil, errors.New("API down")) + + cluster := newCapiCluster(clusterName, ns) + lm := newLinodeMachineWithID("machine-1", ns, clusterName, 101) + fakeClient := fakeclient.NewClientBuilder().WithScheme(scheme).WithObjects(cluster, &lm).Build() + cs := &scope.ClusterScope{ + Cluster: cluster, + LinodeCluster: &infrav1alpha2.LinodeCluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns}}, + LinodeClient: ml, + Client: fakeClient, + } + err := (&LinodeClusterReconciler{}).setMaintenanceConditions(context.Background(), cs, testr.New(t)) + require.Error(t, err) + assert.Contains(t, err.Error(), "API down") + }) +} diff --git a/internal/controller/linodecluster_controller_test.go b/internal/controller/linodecluster_controller_test.go index 7fcd50a70..2f4b204c9 100644 --- a/internal/controller/linodecluster_controller_test.go +++ b/internal/controller/linodecluster_controller_test.go @@ -114,6 +114,7 @@ var _ = Describe("cluster-lifecycle", Ordered, Label("cluster", "cluster-lifecyc reconciler.Recorder = mck.Recorder() Expect(k8sClient.Get(ctx, clusterKey, &linodeCluster)).To(Succeed()) + cScope.Cluster = &clusterv1.Cluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: defaultNamespace}} cScope.LinodeCluster = &linodeCluster // Create patch helper with latest state of resource. @@ -310,6 +311,7 @@ var _ = Describe("cluster-lifecycle", Ordered, Label("cluster", "cluster-lifecyc Call("cluster is created", func(ctx context.Context, mck Mock) { cScope.LinodeClient = mck.LinodeClient cScope.LinodeCluster.Spec.Network.ApiserverNodeBalancerConfigID = nil + mck.LinodeClient.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{}, nil).AnyTimes() mck.LinodeClient.EXPECT().ListNodeBalancerNodes(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return([]linodego.NodeBalancerNode{}, nil).AnyTimes() getNB := mck.LinodeClient.EXPECT().GetNodeBalancer(gomock.Any(), gomock.Any()). Return(&linodego.NodeBalancer{ @@ -499,6 +501,7 @@ var _ = Describe("cluster-lifecycle-dns", Ordered, Label("cluster", "cluster-lif reconciler.Recorder = mck.Recorder() Expect(k8sClient.Get(ctx, clusterKey, &linodeCluster)).To(Succeed()) + cScope.Cluster = &clusterv1.Cluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: defaultNamespace}} cScope.LinodeCluster = &linodeCluster // Create patch helper with latest state of resource. @@ -514,6 +517,7 @@ var _ = Describe("cluster-lifecycle-dns", Ordered, Label("cluster", "cluster-lif Call("cluster with dns loadbalancing is created", func(ctx context.Context, mck Mock) { cScope.LinodeClient = mck.LinodeClient cScope.LinodeDomainsClient = mck.LinodeClient + mck.LinodeClient.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{}, nil).AnyTimes() mck.LinodeClient.EXPECT().ListDomains(gomock.Any(), gomock.Any()).Return([]linodego.Domain{ { ID: 1, @@ -793,6 +797,7 @@ var _ = Describe("dns-override-endpoint", Ordered, Label("cluster", "dns-overrid cScope.LinodeClient = mck.LinodeClient cScope.LinodeDomainsClient = mck.LinodeClient cScope.AkamaiDomainsClient = mck.AkamEdgeDNSClient + mck.LinodeClient.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{}, nil).AnyTimes() linodeMachines := infrav1alpha2.LinodeMachineList{ Items: []infrav1alpha2.LinodeMachine{linodeMachine}, } @@ -879,6 +884,7 @@ var _ = Describe("cluster-with-direct-vpcid", Ordered, Label("cluster", "direct- reconciler.Recorder = mck.Recorder() Expect(k8sClient.Get(ctx, clusterKey, &linodeCluster)).To(Succeed()) + cScope.Cluster = &clusterv1.Cluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: defaultNamespace}} cScope.LinodeCluster = &linodeCluster // Create patch helper with latest state of resource. @@ -892,6 +898,7 @@ var _ = Describe("cluster-with-direct-vpcid", Ordered, Label("cluster", "direct- Path( Call("direct VPCID exists and has subnets", func(ctx context.Context, mck Mock) { cScope.LinodeClient = mck.LinodeClient + mck.LinodeClient.EXPECT().ListMaintenances(gomock.Any(), gomock.Any()).Return([]linodego.AccountMaintenance{}, nil).AnyTimes() // Mock GetVPC call to return a VPC with subnets mck.LinodeClient.EXPECT().GetVPC(gomock.Any(), gomock.Eq(12345)). diff --git a/mock/client.go b/mock/client.go index 62c7ef285..c1ea18c5f 100644 --- a/mock/client.go +++ b/mock/client.go @@ -772,6 +772,21 @@ func (mr *MockLinodeClientMockRecorder) ListInterfaces(ctx, linodeID, opts any) return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListInterfaces", reflect.TypeOf((*MockLinodeClient)(nil).ListInterfaces), ctx, linodeID, opts) } +// ListMaintenances mocks base method. +func (m *MockLinodeClient) ListMaintenances(ctx context.Context, opts *linodego.ListOptions) ([]linodego.AccountMaintenance, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ListMaintenances", ctx, opts) + ret0, _ := ret[0].([]linodego.AccountMaintenance) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ListMaintenances indicates an expected call of ListMaintenances. +func (mr *MockLinodeClientMockRecorder) ListMaintenances(ctx, opts any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListMaintenances", reflect.TypeOf((*MockLinodeClient)(nil).ListMaintenances), ctx, opts) +} + // ListNodeBalancerNodes mocks base method. func (m *MockLinodeClient) ListNodeBalancerNodes(ctx context.Context, nodebalancerID, configID int, opts *linodego.ListOptions) ([]linodego.NodeBalancerNode, error) { m.ctrl.T.Helper() @@ -2291,6 +2306,45 @@ func (mr *MockLinodeInterfacesClientMockRecorder) ListInterfaces(ctx, linodeID, return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListInterfaces", reflect.TypeOf((*MockLinodeInterfacesClient)(nil).ListInterfaces), ctx, linodeID, opts) } +// MockLinodeMaintenanceClient is a mock of LinodeMaintenanceClient interface. +type MockLinodeMaintenanceClient struct { + ctrl *gomock.Controller + recorder *MockLinodeMaintenanceClientMockRecorder + isgomock struct{} +} + +// MockLinodeMaintenanceClientMockRecorder is the mock recorder for MockLinodeMaintenanceClient. +type MockLinodeMaintenanceClientMockRecorder struct { + mock *MockLinodeMaintenanceClient +} + +// NewMockLinodeMaintenanceClient creates a new mock instance. +func NewMockLinodeMaintenanceClient(ctrl *gomock.Controller) *MockLinodeMaintenanceClient { + mock := &MockLinodeMaintenanceClient{ctrl: ctrl} + mock.recorder = &MockLinodeMaintenanceClientMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockLinodeMaintenanceClient) EXPECT() *MockLinodeMaintenanceClientMockRecorder { + return m.recorder +} + +// ListMaintenances mocks base method. +func (m *MockLinodeMaintenanceClient) ListMaintenances(ctx context.Context, opts *linodego.ListOptions) ([]linodego.AccountMaintenance, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ListMaintenances", ctx, opts) + ret0, _ := ret[0].([]linodego.AccountMaintenance) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ListMaintenances indicates an expected call of ListMaintenances. +func (mr *MockLinodeMaintenanceClientMockRecorder) ListMaintenances(ctx, opts any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListMaintenances", reflect.TypeOf((*MockLinodeMaintenanceClient)(nil).ListMaintenances), ctx, opts) +} + // MockK8sClient is a mock of K8sClient interface. type MockK8sClient struct { ctrl *gomock.Controller diff --git a/observability/wrappers/linodeclient/linodeclient.gen.go b/observability/wrappers/linodeclient/linodeclient.gen.go index f2bc15fa3..b21678987 100644 --- a/observability/wrappers/linodeclient/linodeclient.gen.go +++ b/observability/wrappers/linodeclient/linodeclient.gen.go @@ -1273,6 +1273,31 @@ func (_d LinodeClientWithTracing) ListInterfaces(ctx context.Context, linodeID i return _d.LinodeClient.ListInterfaces(ctx, linodeID, opts) } +// ListMaintenances implements _sourceClients.LinodeClient +func (_d LinodeClientWithTracing) ListMaintenances(ctx context.Context, opts *linodego.ListOptions) (aa1 []linodego.AccountMaintenance, err error) { + ctx, _span := tracing.Start(ctx, "_sourceClients.LinodeClient.ListMaintenances") + defer func() { + if _d._spanDecorator != nil { + _d._spanDecorator(_span, map[string]interface{}{ + "ctx": ctx, + "opts": opts}, map[string]interface{}{ + "aa1": aa1, + "err": err}) + } + + if err != nil { + _span.RecordError(err) + _span.SetAttributes( + attribute.String("event", "error"), + attribute.String("message", err.Error()), + ) + } + + _span.End() + }() + return _d.LinodeClient.ListMaintenances(ctx, opts) +} + // ListNodeBalancerNodes implements _sourceClients.LinodeClient func (_d LinodeClientWithTracing) ListNodeBalancerNodes(ctx context.Context, nodebalancerID int, configID int, opts *linodego.ListOptions) (na1 []linodego.NodeBalancerNode, err error) { ctx, _span := tracing.Start(ctx, "_sourceClients.LinodeClient.ListNodeBalancerNodes") diff --git a/util/helpers.go b/util/helpers.go index b7affd879..1e2d4a122 100644 --- a/util/helpers.go +++ b/util/helpers.go @@ -3,6 +3,7 @@ package util import ( "context" "errors" + "fmt" "io" "net" "net/http" @@ -11,7 +12,9 @@ import ( "strings" "github.com/linode/linodego" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/types" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/controller-runtime/pkg/client" @@ -137,3 +140,18 @@ func SetOwnerReferenceToLinodeCluster(ctx context.Context, k8sclient client.Clie return nil } + +func GetLinodeMachinesForCluster(ctx context.Context, k8sclient client.Client, cluster *clusterv1.Cluster) (*infrav1alpha2.LinodeMachineList, error) { + clusterReq, err := labels.NewRequirement("cluster.x-k8s.io/cluster-name", selection.Equals, []string{cluster.Name}) + if err != nil { + return nil, fmt.Errorf("building label selector: %w", err) + } + + selector := labels.NewSelector() + selector = selector.Add(*clusterReq) + var linodeMachineList infrav1alpha2.LinodeMachineList + if err := k8sclient.List(ctx, &linodeMachineList, &client.ListOptions{Namespace: cluster.Namespace, LabelSelector: selector}); err != nil { + return nil, fmt.Errorf("listing linode machines %w", err) + } + return &linodeMachineList, nil +}