Skip to content

Commit a737632

Browse files
committed
cluster(lb): improve initial bootstrap behavior
Changes how the loadbalancer is handled in a cluster. Previously, it could happen that a loadbalancer stayed `degraded` or `error` because until all control-plane nodes were available, it took too long and we didn't requeue the cluster when loadbalancer was not marked running. 1. requeue every 10s when lbPending (degraded, error) status on lb, similar to the `changing` status. 2. only add health monitor once control plane is marked initialized. This is done once at least one control plane node is marked initialized by the control plane provider. 3. refactor health monitor reconciliation to be last after lb and floating IP because it depends on both being set. Additionally, the change fixes a bug with the cloudscaleMachineToCluster watch function - we filtered on the wrong label unfortunately.
1 parent 6fa20fe commit a737632

7 files changed

Lines changed: 191 additions & 18 deletions

internal/controller/cloudscalecluster_controller.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
"sigs.k8s.io/cluster-api/util/conditions"
3636
"sigs.k8s.io/cluster-api/util/predicates"
3737
ctrl "sigs.k8s.io/controller-runtime"
38+
"sigs.k8s.io/controller-runtime/pkg/builder"
3839
"sigs.k8s.io/controller-runtime/pkg/client"
3940
"sigs.k8s.io/controller-runtime/pkg/controller"
4041
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
@@ -189,6 +190,17 @@ func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clust
189190
provisioned := r.isInfrastructureProvisioned(clusterScope)
190191
clusterScope.CloudscaleCluster.Status.Initialization.Provisioned = new(provisioned)
191192

193+
// The load balancer health monitor is created only once the control plane is
194+
// initialized, which itself requires the infrastructure to be provisioned.
195+
// Running this last lets us poll for ControlPlaneInitialized without deadlocking the control plane we wait on.
196+
result, err = r.reconcileLBHealthMonitor(ctx, clusterScope)
197+
if err != nil {
198+
return ctrl.Result{}, fmt.Errorf("reconciling load balancer health monitor: %w", err)
199+
}
200+
if !result.IsZero() {
201+
return result, nil
202+
}
203+
192204
return ctrl.Result{}, nil
193205
}
194206

@@ -328,6 +340,12 @@ func (r *CloudscaleClusterReconciler) SetupWithManager(ctx context.Context, mgr
328340
Watches(
329341
&clusterv1.Cluster{},
330342
handler.EnqueueRequestsFromMapFunc(util.ClusterToInfrastructureMapFunc(ctx, infrastructurev1beta2.SchemeGroupVersion.WithKind("CloudscaleCluster"), mgr.GetClient(), &infrastructurev1beta2.CloudscaleCluster{})),
343+
builder.WithPredicates(predicates.Any(r.Scheme, logger,
344+
// Resume reconciliation when the Cluster is paused/unpaused, and when
345+
// the control plane initializes so the LB health monitor gets added.
346+
predicates.ClusterPausedTransitions(r.Scheme, logger),
347+
predicates.ClusterControlPlaneInitialized(r.Scheme, logger),
348+
)),
331349
).
332350
Watches(
333351
&infrastructurev1beta2.CloudscaleMachine{},
@@ -348,7 +366,7 @@ func (r *CloudscaleClusterReconciler) cloudscaleMachineToCluster(ctx context.Con
348366
}
349367

350368
// Only care about control plane machines
351-
if _, ok := machine.Labels[clusterv1.MachineControlPlaneNameLabel]; !ok {
369+
if _, ok := machine.Labels[clusterv1.MachineControlPlaneLabel]; !ok {
352370
return nil
353371
}
354372

internal/controller/cloudscalecluster_controller_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,43 @@ import (
2929

3030
infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2"
3131
"github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope"
32+
"github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/testutils"
3233
)
3334

35+
// TestCloudscaleMachineToCluster verifies the CloudscaleMachine watch maps a
36+
// control-plane machine to its owning CloudscaleCluster.
37+
func TestCloudscaleMachineToCluster(t *testing.T) {
38+
cluster := &infrastructurev1beta2.CloudscaleCluster{
39+
ObjectMeta: metav1.ObjectMeta{
40+
Name: "test-cluster",
41+
Namespace: "default",
42+
Labels: map[string]string{clusterv1.ClusterNameLabel: "test-cluster"},
43+
},
44+
}
45+
46+
t.Run("control-plane machine maps to its cluster", func(t *testing.T) {
47+
g := NewWithT(t)
48+
r := newTestReconciler(cluster)
49+
mapFn := r.cloudscaleMachineToCluster(context.Background(), r.Client)
50+
51+
requests := mapFn(context.Background(), testutils.NewControlPlaneMachine("cp-0", "srv-0"))
52+
53+
g.Expect(requests).To(HaveLen(1))
54+
g.Expect(requests[0].NamespacedName).To(Equal(types.NamespacedName{Name: "test-cluster", Namespace: "default"}))
55+
})
56+
57+
t.Run("non control-plane machine maps to nothing", func(t *testing.T) {
58+
g := NewWithT(t)
59+
r := newTestReconciler(cluster)
60+
mapFn := r.cloudscaleMachineToCluster(context.Background(), r.Client)
61+
62+
worker := testutils.NewControlPlaneMachine("worker-0", "srv-1")
63+
delete(worker.Labels, clusterv1.MachineControlPlaneLabel)
64+
65+
g.Expect(mapFn(context.Background(), worker)).To(BeEmpty())
66+
})
67+
}
68+
3469
// TestCloudscaleClusterReconciler_Reconcile_EntryPoint covers the cheap
3570
// early-exit paths of Reconcile against the envtest API server.
3671
func TestCloudscaleClusterReconciler_Reconcile_EntryPoint(t *testing.T) {

internal/controller/cloudscalecluster_loadbalancer.go

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -142,21 +142,14 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context,
142142
return result, nil
143143
}
144144

145-
// 4. Reconcile the health monitor
146-
if result, err := r.reconcileLBHealthMonitor(ctx, clusterScope); err != nil {
147-
return ctrl.Result{}, fmt.Errorf("reconciling load balancer health monitor: %w", err)
148-
} else if !result.IsZero() {
149-
return result, nil
150-
}
151-
152-
// 5. Reconcile the members
145+
// 4. Reconcile the members
153146
if result, err := r.reconcileLBMembers(ctx, clusterScope); err != nil {
154147
return ctrl.Result{}, fmt.Errorf("reconciling load balancer members: %w", err)
155148
} else if !result.IsZero() {
156149
return result, nil
157150
}
158151

159-
// 6. Set the control plane endpoint from the VIP
152+
// 5. Set the control plane endpoint from the VIP
160153
if clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Host == "" {
161154
if clusterScope.CloudscaleCluster.Spec.FloatingIP != nil {
162155
// Floating IP is configured — the FIP reconciler will set the endpoint.
@@ -172,9 +165,21 @@ func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context,
172165
}
173166
}
174167

168+
// The LB is not running (degraded/error).
169+
// We poll for the LB status since we don't have a way to watch the loadbalancer status otherwise.
170+
if lbPending {
171+
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
172+
}
173+
175174
return ctrl.Result{}, nil
176175
}
177176

177+
// controlPlaneInitialized denotes when the control plane is functional enough to accept requests.
178+
// Never goes back to false once true.
179+
func controlPlaneInitialized(clusterScope *scope.ClusterScope) bool {
180+
return ptr.Deref(clusterScope.Cluster.Status.Initialization.ControlPlaneInitialized, false)
181+
}
182+
178183
// reconcileLB ensures the load balancer exists.
179184
func (r *CloudscaleClusterReconciler) reconcileLB(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) {
180185
_, id, err := ensureResource(ctx, clusterScope,
@@ -338,9 +343,24 @@ func (r *CloudscaleClusterReconciler) reconcileLBListener(ctx context.Context, c
338343
return ctrl.Result{}, nil
339344
}
340345

341-
// reconcileLBHealthMonitor ensures the load balancer health monitor exists.
346+
// reconcileLBHealthMonitor ensures the load balancer health monitor exists once the control plane is initialized.
347+
//
348+
// Without a health monitor, all members are marked as `no_monitor` and traffic will be routed to them regardless
349+
// if they respond or not.
342350
// The health monitor performs TCP health checks on the API server port.
351+
// This is called as the last phase of reconcileNormal, after the cluster is marked provisioned.
343352
func (r *CloudscaleClusterReconciler) reconcileLBHealthMonitor(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) {
353+
// External control plane: no load balancer, so no health monitor.
354+
if !ptr.Deref(clusterScope.CloudscaleCluster.Spec.ControlPlaneLoadBalancer.Enabled, true) {
355+
return ctrl.Result{}, nil
356+
}
357+
358+
// Defer the initial creation until the control plane is initialized.
359+
if clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID == "" && !controlPlaneInitialized(clusterScope) {
360+
clusterScope.Info("Waiting for control-plane initialized before creating health monitor")
361+
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
362+
}
363+
344364
_, id, err := ensureResource(ctx, clusterScope,
345365
clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID,
346366
"load balancer health monitor",
@@ -406,7 +426,7 @@ func (r *CloudscaleClusterReconciler) reconcileLBMembers(ctx context.Context, cl
406426
return ctrl.Result{}, fmt.Errorf("failed to get desired load balancer members: %w", err)
407427
}
408428

409-
clusterScope.V(2).Info("reconcileLBMembers", "currentMembers", currentMembers, "desiredMembers", desiredMembers)
429+
clusterScope.V(2).Info("Reconciling LB Members", "currentMembers", currentMembers, "desiredMembers", desiredMembers)
410430

411431
// Build maps keyed by member name for comparison
412432
currentByName := make(map[string]cloudscalesdk.LoadBalancerPoolMember, len(currentMembers))

internal/controller/cloudscalecluster_loadbalancer_test.go

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ func TestReconcileLBHealthMonitor(t *testing.T) {
252252
}
253253

254254
clusterScope := newLBClusterScope(
255+
testutils.WithControlPlaneInitialized(true),
255256
testutils.WithHMService(healthMonitorService),
256257
testutils.WithHealthMonitorParams(tc.delayS, tc.timeoutS, tc.upThreshold, tc.downThresh),
257258
)
@@ -333,7 +334,7 @@ func TestReconcileLoadBalancer_ErrorStatusProceedsWithMemberReconciliation(t *te
333334
result, err := r.reconcileLoadBalancer(context.Background(), clusterScope)
334335

335336
g.Expect(err).ToNot(HaveOccurred())
336-
g.Expect(result.RequeueAfter).To(BeZero(), "error status should not block member reconciliation")
337+
g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "error status should requeue to re-check the LB after member reconciliation")
337338

338339
// LoadBalancerReadyCondition should be False because LB is not running
339340
cond := conditions.Get(clusterScope.CloudscaleCluster, infrastructurev1beta2.LoadBalancerReadyCondition)
@@ -371,7 +372,7 @@ func TestReconcileLoadBalancer_DegradedStatusProceedsWithMemberReconciliation(t
371372
result, err := r.reconcileLoadBalancer(context.Background(), clusterScope)
372373

373374
g.Expect(err).ToNot(HaveOccurred())
374-
g.Expect(result.RequeueAfter).To(BeZero(), "degraded status should not block member reconciliation")
375+
g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "degraded status should requeue to re-check the LB after member reconciliation")
375376

376377
// Stale member should have been removed
377378
g.Expect(deletedMemberUUID).To(Equal("stale-uuid"))
@@ -433,17 +434,80 @@ func TestReconcileLoadBalancer_SetsControlPlaneEndpoint(t *testing.T) {
433434
clusterScope.CloudscaleCluster.Status.LoadBalancerID = testLBUUID
434435
clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID
435436
clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = testListenerUUID
436-
clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID = testHMUUID
437437

438438
r := newTestReconciler()
439439

440-
_, err := r.reconcileLoadBalancer(context.Background(), clusterScope)
440+
result, err := r.reconcileLoadBalancer(context.Background(), clusterScope)
441441

442442
g.Expect(err).ToNot(HaveOccurred())
443+
g.Expect(result.IsZero()).To(BeTrue(), "reconcileLoadBalancer must not requeue for the health monitor")
443444
g.Expect(clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Host).To(Equal("203.0.113.10"))
444445
g.Expect(clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Port).To(Equal(int32(6443)))
445446
}
446447

448+
// hmServiceWithCreate returns a health-monitor mock whose List is empty and
449+
// whose Create flips *created and returns a fixed UUID.
450+
func hmServiceWithCreate(created *bool) *testutils.MockLoadBalancerHealthMonitorService {
451+
return &testutils.MockLoadBalancerHealthMonitorService{
452+
ListFn: func(ctx context.Context, modifiers ...cloudscalesdk.ListRequestModifier) ([]cloudscalesdk.LoadBalancerHealthMonitor, error) {
453+
return nil, nil
454+
},
455+
CreateFn: func(ctx context.Context, req *cloudscalesdk.LoadBalancerHealthMonitorRequest) (*cloudscalesdk.LoadBalancerHealthMonitor, error) {
456+
*created = true
457+
return &cloudscalesdk.LoadBalancerHealthMonitor{UUID: "hm-uuid"}, nil
458+
},
459+
}
460+
}
461+
462+
func TestReconcileLBHealthMonitor_RequeuesUntilControlPlaneInitialized(t *testing.T) {
463+
g := NewWithT(t)
464+
465+
var created bool
466+
clusterScope := newLBClusterScope(testutils.WithHMService(hmServiceWithCreate(&created)))
467+
clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID
468+
469+
result, err := newTestReconciler().reconcileLBHealthMonitor(context.Background(), clusterScope)
470+
471+
g.Expect(err).ToNot(HaveOccurred())
472+
g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "must poll until the control plane is initialized")
473+
g.Expect(created).To(BeFalse(), "health monitor must not be created before the control plane is initialized")
474+
g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(BeEmpty())
475+
}
476+
477+
func TestReconcileLBHealthMonitor_CreatesWhenControlPlaneInitialized(t *testing.T) {
478+
g := NewWithT(t)
479+
480+
var created bool
481+
clusterScope := newLBClusterScope(
482+
testutils.WithControlPlaneInitialized(true),
483+
testutils.WithHMService(hmServiceWithCreate(&created)),
484+
)
485+
clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = testPoolUUID
486+
487+
result, err := newTestReconciler().reconcileLBHealthMonitor(context.Background(), clusterScope)
488+
489+
g.Expect(err).ToNot(HaveOccurred())
490+
g.Expect(result.IsZero()).To(BeTrue())
491+
g.Expect(created).To(BeTrue(), "health monitor must be created once the control plane is initialized")
492+
g.Expect(clusterScope.CloudscaleCluster.Status.LoadBalancerHealthMonitorID).To(Equal("hm-uuid"))
493+
}
494+
495+
func TestReconcileLBHealthMonitor_SkipsWhenLBDisabled(t *testing.T) {
496+
g := NewWithT(t)
497+
498+
var created bool
499+
clusterScope := newLBClusterScope(
500+
testutils.WithLBEnabled(false),
501+
testutils.WithHMService(hmServiceWithCreate(&created)),
502+
)
503+
504+
result, err := newTestReconciler().reconcileLBHealthMonitor(context.Background(), clusterScope)
505+
506+
g.Expect(err).ToNot(HaveOccurred())
507+
g.Expect(result.IsZero()).To(BeTrue())
508+
g.Expect(created).To(BeFalse(), "external control plane has no health monitor")
509+
}
510+
447511
// ============================================================================
448512
// Tests for deleteLoadBalancer
449513
// ============================================================================

internal/controller/cloudscalecluster_reconcile_test.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ package controller
2626
import (
2727
"context"
2828
"testing"
29+
"time"
2930

3031
cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9"
3132
. "github.com/onsi/gomega"
@@ -126,6 +127,32 @@ func TestReconcileNormal_FullyProvisionedCluster(t *testing.T) {
126127
g.Expect(readyCond.Status).To(Equal(metav1.ConditionTrue))
127128
}
128129

130+
// TestReconcileNormal_RequeuesForHealthMonitorWithoutBlockingProvisioning ensures a provisioned cluster will poll
131+
// for control plane initialized (in reconcile health monitor).
132+
func TestReconcileNormal_RequeuesForHealthMonitorWithoutBlockingProvisioning(t *testing.T) {
133+
g := NewWithT(t)
134+
135+
clusterScope := defaultProvisionedScope()
136+
clusterScope.CloudscaleCluster.Status.Networks = []infrastructurev1beta2.NetworkStatus{
137+
{Name: "test", NetworkID: "net-123", SubnetID: "subnet-123", Managed: true},
138+
}
139+
clusterScope.CloudscaleCluster.Status.LoadBalancerID = "lb-123"
140+
clusterScope.CloudscaleCluster.Status.LoadBalancerPoolID = "pool-123"
141+
clusterScope.CloudscaleCluster.Status.LoadBalancerListenerID = "listener-123"
142+
143+
r := newTestReconciler()
144+
145+
result, err := r.reconcileNormal(context.Background(), clusterScope)
146+
147+
g.Expect(err).ToNot(HaveOccurred())
148+
g.Expect(result.RequeueAfter).To(Equal(10*time.Second), "must keep polling for control-plane initialization")
149+
150+
// provisioning and controlPlaneEndpoint should be set, even though we requeue due to the logic in reconciling the health monitor.
151+
g.Expect(clusterScope.CloudscaleCluster.Status.Initialization).ToNot(BeNil())
152+
g.Expect(*clusterScope.CloudscaleCluster.Status.Initialization.Provisioned).To(BeTrue())
153+
g.Expect(clusterScope.CloudscaleCluster.Spec.ControlPlaneEndpoint.Host).ToNot(BeEmpty())
154+
}
155+
129156
// TestReconcileDelete_SuccessfulDeletion is the top-level happy-path smoke test
130157
// for reconcileDelete: deletes the LB and the network, clears all status IDs,
131158
// removes the finalizer, and sets the Deleting/Ready conditions.

internal/testutils/cluster_scope.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,15 @@ func WithGeneration(gen int64) ClusterScopeOption {
163163
}
164164
}
165165

166+
// WithControlPlaneInitialized sets the CAPI Cluster ControlPlaneInitialized flag.
167+
func WithControlPlaneInitialized(v bool) ClusterScopeOption {
168+
return func(cs *scope.ClusterScope) {
169+
cs.Cluster.Status.Initialization = clusterv1.ClusterInitializationStatus{
170+
ControlPlaneInitialized: new(v),
171+
}
172+
}
173+
}
174+
166175
// MachineScopeOption configures a MachineScope for testing.
167176
type MachineScopeOption func(*scope.MachineScope)
168177

internal/testutils/fixtures.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ import (
1010
)
1111

1212
// NewControlPlaneMachine builds a control-plane CloudscaleMachine fixture in
13-
// the "default" namespace, labelled as control-plane of "test-cluster" and
14-
// with the given ServerID. Used heavily by the FIP and LB controller tests.
13+
// the "default" namespace, labeled as control-plane of "test-cluster" and
14+
// with the given ServerID.
1515
func NewControlPlaneMachine(name, serverID string) *infrastructurev1beta2.CloudscaleMachine {
1616
m := &infrastructurev1beta2.CloudscaleMachine{
1717
ObjectMeta: metav1.ObjectMeta{

0 commit comments

Comments
 (0)