diff --git a/internal/controller/cloudscalecluster_controller.go b/internal/controller/cloudscalecluster_controller.go index 5d7da24..cec7a9c 100644 --- a/internal/controller/cloudscalecluster_controller.go +++ b/internal/controller/cloudscalecluster_controller.go @@ -35,6 +35,7 @@ import ( "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/predicates" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -189,6 +190,17 @@ func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clust provisioned := r.isInfrastructureProvisioned(clusterScope) clusterScope.CloudscaleCluster.Status.Initialization.Provisioned = new(provisioned) + // The load balancer health monitor is created only once the control plane is + // initialized, which itself requires the infrastructure to be provisioned. + // Running this last lets us poll for ControlPlaneInitialized without deadlocking the control plane we wait on. + result, err = r.reconcileLBHealthMonitor(ctx, clusterScope) + if err != nil { + return ctrl.Result{}, fmt.Errorf("reconciling load balancer health monitor: %w", err) + } + if !result.IsZero() { + return result, nil + } + return ctrl.Result{}, nil } @@ -328,6 +340,12 @@ func (r *CloudscaleClusterReconciler) SetupWithManager(ctx context.Context, mgr Watches( &clusterv1.Cluster{}, handler.EnqueueRequestsFromMapFunc(util.ClusterToInfrastructureMapFunc(ctx, infrastructurev1beta2.SchemeGroupVersion.WithKind("CloudscaleCluster"), mgr.GetClient(), &infrastructurev1beta2.CloudscaleCluster{})), + builder.WithPredicates(predicates.Any(r.Scheme, logger, + // Resume reconciliation when the Cluster is paused/unpaused, and when + // the control plane initializes so the LB health monitor gets added. + predicates.ClusterPausedTransitions(r.Scheme, logger), + predicates.ClusterControlPlaneInitialized(r.Scheme, logger), + )), ). Watches( &infrastructurev1beta2.CloudscaleMachine{}, @@ -348,7 +366,7 @@ func (r *CloudscaleClusterReconciler) cloudscaleMachineToCluster(ctx context.Con } // Only care about control plane machines - if _, ok := machine.Labels[clusterv1.MachineControlPlaneNameLabel]; !ok { + if _, ok := machine.Labels[clusterv1.MachineControlPlaneLabel]; !ok { return nil } diff --git a/internal/controller/cloudscalecluster_controller_test.go b/internal/controller/cloudscalecluster_controller_test.go index 10f01e9..d51fd1b 100644 --- a/internal/controller/cloudscalecluster_controller_test.go +++ b/internal/controller/cloudscalecluster_controller_test.go @@ -29,8 +29,43 @@ import ( infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/testutils" ) +// TestCloudscaleMachineToCluster verifies the CloudscaleMachine watch maps a +// control-plane machine to its owning CloudscaleCluster. +func TestCloudscaleMachineToCluster(t *testing.T) { + cluster := &infrastructurev1beta2.CloudscaleCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + Labels: map[string]string{clusterv1.ClusterNameLabel: "test-cluster"}, + }, + } + + t.Run("control-plane machine maps to its cluster", func(t *testing.T) { + g := NewWithT(t) + r := newTestReconciler(cluster) + mapFn := r.cloudscaleMachineToCluster(context.Background(), r.Client) + + requests := mapFn(context.Background(), testutils.NewControlPlaneMachine("cp-0", "srv-0")) + + g.Expect(requests).To(HaveLen(1)) + g.Expect(requests[0].NamespacedName).To(Equal(types.NamespacedName{Name: "test-cluster", Namespace: "default"})) + }) + + t.Run("non control-plane machine maps to nothing", func(t *testing.T) { + g := NewWithT(t) + r := newTestReconciler(cluster) + mapFn := r.cloudscaleMachineToCluster(context.Background(), r.Client) + + worker := testutils.NewControlPlaneMachine("worker-0", "srv-1") + delete(worker.Labels, clusterv1.MachineControlPlaneLabel) + + g.Expect(mapFn(context.Background(), worker)).To(BeEmpty()) + }) +} + // TestCloudscaleClusterReconciler_Reconcile_EntryPoint covers the cheap // early-exit paths of Reconcile against the envtest API server. func TestCloudscaleClusterReconciler_Reconcile_EntryPoint(t *testing.T) { diff --git a/internal/controller/cloudscalecluster_loadbalancer.go b/internal/controller/cloudscalecluster_loadbalancer.go index c9cfc80..b76b01c 100644 --- a/internal/controller/cloudscalecluster_loadbalancer.go +++ b/internal/controller/cloudscalecluster_loadbalancer.go @@ -142,21 +142,14 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, return result, nil } - // 4. Reconcile the health monitor - if result, err := r.reconcileLBHealthMonitor(ctx, clusterScope); err != nil { - return ctrl.Result{}, fmt.Errorf("reconciling load balancer health monitor: %w", err) - } else if !result.IsZero() { - return result, nil - } - - // 5. Reconcile the members + // 4. Reconcile the members if result, err := r.reconcileLBMembers(ctx, clusterScope); err != nil { return ctrl.Result{}, fmt.Errorf("reconciling load balancer members: %w", err) } else if !result.IsZero() { return result, nil } - // 6. Set the control plane endpoint from the VIP + // 5. Set the control plane endpoint from the VIP if clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Host == "" { if clusterScope.CloudscaleCluster.Spec.FloatingIP != nil { // Floating IP is configured — the FIP reconciler will set the endpoint. @@ -172,9 +165,21 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, } } + // The LB is not running (degraded/error). + // We poll for the LB status since we don't have a way to watch the loadbalancer status otherwise. + if lbPending { + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + return ctrl.Result{}, nil } +// controlPlaneInitialized denotes when the control plane is functional enough to accept requests. +// Never goes back to false once true. +func controlPlaneInitialized(clusterScope *scope.ClusterScope) bool { + return ptr.Deref(clusterScope.Cluster.Status.Initialization.ControlPlaneInitialized, false) +} + // reconcileLB ensures the load balancer exists. func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { _, id, err := ensureResource(ctx, clusterScope, @@ -338,9 +343,24 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c return ctrl.Result{}, nil } -// reconcileLBHealthMonitor ensures the load balancer health monitor exists. +// reconcileLBHealthMonitor ensures the load balancer health monitor exists once the control plane is initialized. +// +// Without a health monitor, all members are marked as `no_monitor` and traffic will be routed to them regardless +// if they respond or not. // The health monitor performs TCP health checks on the API server port. +// This is called as the last phase of reconcileNormal, after the cluster is marked provisioned. func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { + // External control plane: no load balancer, so no health monitor. + if !ptr.Deref(clusterScope.CloudscaleCluster.Spec.ControlPlaneLoadBalancer.Enabled, true) { + return ctrl.Result{}, nil + } + + // Defer the initial creation until the control plane is initialized. + if clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID == "" && !controlPlaneInitialized(clusterScope) { + clusterScope.Info("Waiting for control-plane initialized before creating health monitor") + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + _, id, err := ensureResource(ctx, clusterScope, clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID, "load balancer health monitor", @@ -406,7 +426,7 @@ func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, cl return ctrl.Result{}, fmt.Errorf("failed to get desired load balancer members: %w", err) } - clusterScope.V(2).Info("reconcileLBMembers", "currentMembers", currentMembers, "desiredMembers", desiredMembers) + clusterScope.V(2).Info("Reconciling LB Members", "currentMembers", currentMembers, "desiredMembers", desiredMembers) // Build maps keyed by member name for comparison currentByName := make(map[string]cloudscalesdk.LoadBalancerPoolMember, len(currentMembers)) diff --git a/internal/controller/cloudscalecluster_loadbalancer_test.go b/internal/controller/cloudscalecluster_loadbalancer_test.go index 182c0d7..d751848 100644 --- a/internal/controller/cloudscalecluster_loadbalancer_test.go +++ b/internal/controller/cloudscalecluster_loadbalancer_test.go @@ -252,6 +252,7 @@ func TestReconcileLBHealthMonitor(t *testing.T) { } clusterScope := newLBClusterScope( + testutils.WithControlPlaneInitialized(true), testutils.WithHMService(healthMonitorService), testutils.WithHealthMonitorParams(tc.delayS, tc.timeoutS, tc.upThreshold, tc.downThresh), ) @@ -333,7 +334,7 @@ func TestReconcileLoadBalancer_ErrorStatusProceedsWithMemberReconciliation(t *te result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(BeZero(), "error status should not block member reconciliation") + g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "error status should requeue to re-check the LB after member reconciliation") // LoadBalancerReadyCondition should be False because LB is not running cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.LoadBalancerReadyCondition) @@ -371,7 +372,7 @@ func TestReconcileLoadBalancer_DegradedStatusProceedsWithMemberReconciliation(t result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) - g.Expect(result.RequeueAfter).To(BeZero(), "degraded status should not block member reconciliation") + g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "degraded status should requeue to re-check the LB after member reconciliation") // Stale member should have been removed g.Expect(deletedMemberUUID).To(Equal("stale-uuid")) @@ -433,17 +434,80 @@ func TestReconcileLoadBalancer_SetsControlPlaneEndpoint(t *testing.T) { clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = testListenerUUID - clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = testHMUUID r := newTestReconciler() - _, err := r.reconcileLoadBalancer(context.Background(), clusterScope) + result, err := r.reconcileLoadBalancer(context.Background(), clusterScope) g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.IsZero()).To(BeTrue(), "reconcileLoadBalancer must not requeue for the health monitor") g.Expect(clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Host).To(Equal("203.0.113.10")) g.Expect(clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Port).To(Equal(int32(6443))) } +// hmServiceWithCreate returns a health-monitor mock whose List is empty and +// whose Create flips *created and returns a fixed UUID. +func hmServiceWithCreate(created *bool) *testutils.MockLoadBalancerHealthMonitorService { + return &testutils.MockLoadBalancerHealthMonitorService{ + ListFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerHealthMonitor, error) { + return nil, nil + }, + CreateFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerHealthMonitorRequest) (*cloudscalesdk.LoadBalancerHealthMonitor, error) { + *created = true + return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: "hm-uuid"}, nil + }, + } +} + +func TestReconcileLBHealthMonitor_RequeuesUntilControlPlaneInitialized(t *testing.T) { + g := NewWithT(t) + + var created bool + clusterScope := newLBClusterScope(testutils.WithHMService(hmServiceWithCreate(&created))) + clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID + + result, err := newTestReconciler().reconcileLBHealthMonitor(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "must poll until the control plane is initialized") + g.Expect(created).To(BeFalse(), "health monitor must not be created before the control plane is initialized") + g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(BeEmpty()) +} + +func TestReconcileLBHealthMonitor_CreatesWhenControlPlaneInitialized(t *testing.T) { + g := NewWithT(t) + + var created bool + clusterScope := newLBClusterScope( + testutils.WithControlPlaneInitialized(true), + testutils.WithHMService(hmServiceWithCreate(&created)), + ) + clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID + + result, err := newTestReconciler().reconcileLBHealthMonitor(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.IsZero()).To(BeTrue()) + g.Expect(created).To(BeTrue(), "health monitor must be created once the control plane is initialized") + g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("hm-uuid")) +} + +func TestReconcileLBHealthMonitor_SkipsWhenLBDisabled(t *testing.T) { + g := NewWithT(t) + + var created bool + clusterScope := newLBClusterScope( + testutils.WithLBEnabled(false), + testutils.WithHMService(hmServiceWithCreate(&created)), + ) + + result, err := newTestReconciler().reconcileLBHealthMonitor(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.IsZero()).To(BeTrue()) + g.Expect(created).To(BeFalse(), "external control plane has no health monitor") +} + // ============================================================================ // Tests for deleteLoadBalancer // ============================================================================ diff --git a/internal/controller/cloudscalecluster_reconcile_test.go b/internal/controller/cloudscalecluster_reconcile_test.go index b89a0aa..facc983 100644 --- a/internal/controller/cloudscalecluster_reconcile_test.go +++ b/internal/controller/cloudscalecluster_reconcile_test.go @@ -26,6 +26,7 @@ package controller import ( "context" "testing" + "time" cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" @@ -126,6 +127,32 @@ func TestReconcileNormal_FullyProvisionedCluster(t *testing.T) { g.Expect(readyCond.Status).To(Equal(metav1.ConditionTrue)) } +// TestReconcileNormal_RequeuesForHealthMonitorWithoutBlockingProvisioning ensures a provisioned cluster will poll +// for control plane initialized (in reconcile health monitor). +func TestReconcileNormal_RequeuesForHealthMonitorWithoutBlockingProvisioning(t *testing.T) { + g := NewWithT(t) + + clusterScope := defaultProvisionedScope() + clusterScope.CloudscaleCluster.Status.Networks = []infrastructurev1beta2.NetworkStatus{ + {Name: "test", NetworkID: "net-123", SubnetID: "subnet-123", Managed: true}, + } + clusterScope.CloudscaleCluster.Status.LoadBalancerID = "lb-123" + clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = "pool-123" + clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = "listener-123" + + r := newTestReconciler() + + result, err := r.reconcileNormal(context.Background(), clusterScope) + + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "must keep polling for control-plane initialization") + + // provisioning and controlPlaneEndpoint should be set, even though we requeue due to the logic in reconciling the health monitor. + g.Expect(clusterScope.CloudscaleCluster.Status.Initialization).ToNot(BeNil()) + g.Expect(*clusterScope.CloudscaleCluster.Status.Initialization.Provisioned).To(BeTrue()) + g.Expect(clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Host).ToNot(BeEmpty()) +} + // TestReconcileDelete_SuccessfulDeletion is the top-level happy-path smoke test // for reconcileDelete: deletes the LB and the network, clears all status IDs, // removes the finalizer, and sets the Deleting/Ready conditions. diff --git a/internal/testutils/cluster_scope.go b/internal/testutils/cluster_scope.go index 9fd3390..6a295ff 100644 --- a/internal/testutils/cluster_scope.go +++ b/internal/testutils/cluster_scope.go @@ -163,6 +163,15 @@ func WithGeneration(gen int64) ClusterScopeOption { } } +// WithControlPlaneInitialized sets the CAPI Cluster ControlPlaneInitialized flag. +func WithControlPlaneInitialized(v bool) ClusterScopeOption { + return func(cs *scope.ClusterScope) { + cs.Cluster.Status.Initialization = clusterv1.ClusterInitializationStatus{ + ControlPlaneInitialized: new(v), + } + } +} + // MachineScopeOption configures a MachineScope for testing. type MachineScopeOption func(*scope.MachineScope) diff --git a/internal/testutils/fixtures.go b/internal/testutils/fixtures.go index 4803137..0fa19e4 100644 --- a/internal/testutils/fixtures.go +++ b/internal/testutils/fixtures.go @@ -10,8 +10,8 @@ import ( ) // NewControlPlaneMachine builds a control-plane CloudscaleMachine fixture in -// the "default" namespace, labelled as control-plane of "test-cluster" and -// with the given ServerID. Used heavily by the FIP and LB controller tests. +// the "default" namespace, labeled as control-plane of "test-cluster" and +// with the given ServerID. func NewControlPlaneMachine(name, serverID string) *infrastructurev1beta2.CloudscaleMachine { m := &infrastructurev1beta2.CloudscaleMachine{ ObjectMeta: metav1.ObjectMeta{