Skip to content

Commit 59420ee

Browse files
moshevaynerCopilot
andauthored
feat: add jitter to reconciler request delays for improved error handling (#1069)
* fix: add .vscode to .gitignore Signed-off-by: Moshe Vayner <moshe@vayner.me> * feat: add jitter to reconciler request delays for improved error handling Also, removed `DefaultVPCControllerWaitForHasNodesDelay` from defaults, since it isn't used anywhere (dead code) Signed-off-by: Moshe Vayner <moshe@vayner.me> Co-authored-by: Copilot <copilot@github.com> Signed-off-by: Moshe Vayner <moshe@vayner.me> * test(bucket): verify reconciler requeues with jitter on finalizer failures Signed-off-by: Moshe Vayner <moshe@vayner.me> --------- Signed-off-by: Moshe Vayner <moshe@vayner.me> Co-authored-by: Copilot <copilot@github.com>
1 parent 96dc43f commit 59420ee

17 files changed

Lines changed: 192 additions & 84 deletions

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,6 @@ release/*
1010
templates/cluster-template*.yaml
1111
infrastructure-*-linode/*
1212
.envrc
13-
vendor/
13+
vendor/
14+
.vscode/
15+
.tool-versions

internal/controller/linodecluster_controller.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ func (r *LinodeClusterReconciler) reconcile(
167167
if !reconciler.HasStaleCondition(clusterScope.LinodeCluster.GetCondition(string(clusterv1.ReadyCondition)),
168168
reconciler.DefaultTimeout(r.ReconcileTimeout, reconciler.DefaultClusterControllerReconcileTimeout)) {
169169
logger.Info("re-queuing cluster/nb deletion")
170-
return ctrl.Result{RequeueAfter: reconciler.DefaultClusterControllerReconcileDelay}, nil
170+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultClusterControllerReconcileDelay)}, nil
171171
}
172172
return res, err
173173
}
@@ -190,7 +190,7 @@ func (r *LinodeClusterReconciler) reconcile(
190190
if !reconciler.HasStaleCondition(clusterScope.LinodeCluster.GetCondition(clusterv1.ReadyCondition),
191191
reconciler.DefaultTimeout(r.ReconcileTimeout, reconciler.DefaultClusterControllerReconcileTimeout)) {
192192
logger.Info("re-queuing cluster/load-balancer creation")
193-
return ctrl.Result{RequeueAfter: reconciler.DefaultClusterControllerReconcileDelay}, nil
193+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultClusterControllerReconcileDelay)}, nil
194194
}
195195
return res, err
196196
}
@@ -212,7 +212,7 @@ func (r *LinodeClusterReconciler) reconcile(
212212
if err := addMachineToLB(ctx, clusterScope); err != nil {
213213
if errors.Is(err, util.ErrReconcileAgain) {
214214
logger.Info("re-queuing adding machine to loadbalancer")
215-
return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil
215+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultMachineControllerRetryDelay)}, nil
216216
}
217217
logger.Error(err, "Failed to add Linode machine to loadbalancer option")
218218
return retryIfTransient(err, logger)
@@ -259,7 +259,7 @@ func (r *LinodeClusterReconciler) reconcilePreflightLinodeFirewallCheck(ctx cont
259259
Reason: util.CreateError,
260260
Message: err.Error(),
261261
})
262-
return ctrl.Result{RequeueAfter: reconciler.DefaultClusterControllerReconcileDelay}, nil
262+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultClusterControllerReconcileDelay)}, nil
263263
}
264264
clusterScope.LinodeCluster.SetCondition(metav1.Condition{
265265
Type: ConditionPreflightLinodeNBFirewallReady,
@@ -302,7 +302,7 @@ func (r *LinodeClusterReconciler) reconcilePreflightLinodeFirewallCheck(ctx cont
302302
Reason: "LinodeFirewallNotYetAvailable", // We have to set the reason to not fail object patching
303303
})
304304

305-
return ctrl.Result{RequeueAfter: reconciler.DefaultClusterControllerReconcileDelay}, nil
305+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultClusterControllerReconcileDelay)}, nil
306306
}
307307
if linodeFirewall.Spec.FirewallID == nil {
308308
logger.Info("Linode firewall not yet available")
@@ -312,7 +312,7 @@ func (r *LinodeClusterReconciler) reconcilePreflightLinodeFirewallCheck(ctx cont
312312
Reason: "LinodeFirewallNotYetAvailable", // We have to set the reason to not fail object patching
313313
})
314314

315-
return ctrl.Result{RequeueAfter: reconciler.DefaultClusterControllerReconcileDelay}, nil
315+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultClusterControllerReconcileDelay)}, nil
316316
}
317317

318318
// Only set to true if there was no error
@@ -338,7 +338,7 @@ func (r *LinodeClusterReconciler) reconcilePreflightLinodeVPCCheck(ctx context.C
338338
Reason: util.CreateError,
339339
Message: fmt.Sprintf("VPC with ID %d not found: %v", vpcID, err),
340340
})
341-
return ctrl.Result{RequeueAfter: reconciler.DefaultClusterControllerReconcileDelay}, nil
341+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultClusterControllerReconcileDelay)}, nil
342342
}
343343
// VPC exists, verify it has at least one subnet
344344
if len(vpc.Subnets) == 0 {
@@ -350,7 +350,7 @@ func (r *LinodeClusterReconciler) reconcilePreflightLinodeVPCCheck(ctx context.C
350350
Reason: util.CreateError,
351351
Message: err.Error(),
352352
})
353-
return ctrl.Result{RequeueAfter: reconciler.DefaultClusterControllerReconcileDelay}, nil
353+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultClusterControllerReconcileDelay)}, nil
354354
}
355355

356356
// Only set to true if there was no error
@@ -392,15 +392,15 @@ func (r *LinodeClusterReconciler) reconcilePreflightLinodeVPCCheck(ctx context.C
392392
Status: metav1.ConditionFalse,
393393
Reason: "LinodeVPCNotYetAvailable", // We have to set the reason to not fail object patching
394394
})
395-
return ctrl.Result{RequeueAfter: reconciler.DefaultClusterControllerReconcileDelay}, nil
395+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultClusterControllerReconcileDelay)}, nil
396396
} else if !linodeVPC.Status.Ready {
397397
logger.Info("LinodeVPC is not yet available")
398398
clusterScope.LinodeCluster.SetCondition(metav1.Condition{
399399
Type: ConditionPreflightLinodeVPCReady,
400400
Status: metav1.ConditionFalse,
401401
Reason: "LinodeVPCNotYetAvailable", // We have to set the reason to not fail object patching
402402
})
403-
return ctrl.Result{RequeueAfter: reconciler.DefaultClusterControllerReconcileDelay}, nil
403+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultClusterControllerReconcileDelay)}, nil
404404
}
405405

406406
// Only set to true if there was no error

internal/controller/linodecluster_controller_test.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package controller
1919
import (
2020
"context"
2121
"errors"
22+
"time"
2223

2324
"github.com/go-logr/logr"
2425
"github.com/linode/linodego"
@@ -216,7 +217,8 @@ var _ = Describe("cluster-lifecycle", Ordered, Label("cluster", "cluster-lifecyc
216217
reconciler.Client = k8sClient
217218
res, err := reconciler.reconcile(ctx, cScope, mck.Logger())
218219
Expect(err).NotTo(HaveOccurred())
219-
Expect(res.RequeueAfter).To(Equal(rec.DefaultClusterControllerReconcileDelay))
220+
Expect(res.RequeueAfter).To(BeNumerically(">=", rec.DefaultClusterControllerReconcileDelay))
221+
Expect(res.RequeueAfter).To(BeNumerically("<=", rec.DefaultClusterControllerReconcileDelay+time.Duration(float64(rec.DefaultClusterControllerReconcileDelay)*rec.RetryJitterFraction)))
220222
Expect(mck.Logs()).To(Or(
221223
ContainSubstring("re-queuing cluster/load-balancer creation"),
222224
ContainSubstring("failed to ensure nodebalancer"),
@@ -237,7 +239,8 @@ var _ = Describe("cluster-lifecycle", Ordered, Label("cluster", "cluster-lifecyc
237239
reconciler.Client = k8sClient
238240
res, err := reconciler.reconcile(ctx, cScope, mck.Logger())
239241
Expect(err).NotTo(HaveOccurred())
240-
Expect(res.RequeueAfter).To(Equal(rec.DefaultClusterControllerReconcileDelay))
242+
Expect(res.RequeueAfter).To(BeNumerically(">=", rec.DefaultClusterControllerReconcileDelay))
243+
Expect(res.RequeueAfter).To(BeNumerically("<=", rec.DefaultClusterControllerReconcileDelay+time.Duration(float64(rec.DefaultClusterControllerReconcileDelay)*rec.RetryJitterFraction)))
241244
Expect(mck.Logs()).To(ContainSubstring("re-queuing cluster/load-balancer creation"))
242245
})),
243246
),
@@ -260,7 +263,8 @@ var _ = Describe("cluster-lifecycle", Ordered, Label("cluster", "cluster-lifecyc
260263
reconciler.Client = k8sClient
261264
res, err := reconciler.reconcile(ctx, cScope, mck.Logger())
262265
Expect(err).NotTo(HaveOccurred())
263-
Expect(res.RequeueAfter).To(Equal(rec.DefaultClusterControllerReconcileDelay))
266+
Expect(res.RequeueAfter).To(BeNumerically(">=", rec.DefaultClusterControllerReconcileDelay))
267+
Expect(res.RequeueAfter).To(BeNumerically("<=", rec.DefaultClusterControllerReconcileDelay+time.Duration(float64(rec.DefaultClusterControllerReconcileDelay)*rec.RetryJitterFraction)))
264268
Expect(mck.Logs()).To(ContainSubstring("re-queuing cluster/load-balancer creation"))
265269
})),
266270
),
@@ -283,7 +287,8 @@ var _ = Describe("cluster-lifecycle", Ordered, Label("cluster", "cluster-lifecyc
283287
reconciler.Client = k8sClient
284288
res, err := reconciler.reconcile(ctx, cScope, mck.Logger())
285289
Expect(err).NotTo(HaveOccurred())
286-
Expect(res.RequeueAfter).To(Equal(rec.DefaultClusterControllerReconcileDelay))
290+
Expect(res.RequeueAfter).To(BeNumerically(">=", rec.DefaultClusterControllerReconcileDelay))
291+
Expect(res.RequeueAfter).To(BeNumerically("<=", rec.DefaultClusterControllerReconcileDelay+time.Duration(float64(rec.DefaultClusterControllerReconcileDelay)*rec.RetryJitterFraction)))
287292
Expect(mck.Logs()).To(ContainSubstring("re-queuing cluster/load-balancer creation"))
288293
})),
289294
),

internal/controller/linodefirewall_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ func (r *LinodeFirewallReconciler) reconcile(
248248
reconciler.DefaultTimeout(r.ReconcileTimeout, reconciler.DefaultFWControllerReconcileTimeout)):
249249
logger.Info(fmt.Sprintf("%s failed, requeuing", action))
250250

251-
return ctrl.Result{RequeueAfter: reconciler.DefaultFWControllerReconcilerDelay}, nil
251+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultFWControllerReconcilerDelay)}, nil
252252
}
253253

254254
return ctrl.Result{}, err
@@ -285,7 +285,7 @@ func (r *LinodeFirewallReconciler) reconcileDelete(
285285
if fwScope.LinodeFirewall.ObjectMeta.DeletionTimestamp.Add(reconciler.DefaultTimeout(r.ReconcileTimeout, reconciler.DefaultFWControllerReconcileTimeout)).After(time.Now()) {
286286
logger.Info("DeleteFirewall failed, requeuing")
287287

288-
return ctrl.Result{RequeueAfter: reconciler.DefaultFWControllerReconcilerDelay}, nil
288+
return ctrl.Result{RequeueAfter: reconciler.WithJitter(reconciler.DefaultFWControllerReconcilerDelay)}, nil
289289
}
290290

291291
return ctrl.Result{}, err

internal/controller/linodefirewall_controller_test.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,8 @@ var _ = Describe("lifecycle", Ordered, Label("firewalls", "lifecycle"), func() {
160160
Path(Result("create requeues", func(ctx context.Context, mck Mock) {
161161
res, err := reconciler.reconcile(ctx, mck.Logger(), &fwScope)
162162
Expect(err).NotTo(HaveOccurred())
163-
Expect(res.RequeueAfter).To(Equal(rec.DefaultFWControllerReconcilerDelay))
163+
Expect(res.RequeueAfter).To(BeNumerically(">=", rec.DefaultFWControllerReconcilerDelay))
164+
Expect(res.RequeueAfter).To(BeNumerically("<=", rec.DefaultFWControllerReconcilerDelay+time.Duration(float64(rec.DefaultFWControllerReconcilerDelay)*rec.RetryJitterFraction)))
164165
Expect(mck.Logs()).To(ContainSubstring("failed, requeuing"))
165166
})),
166167
Path(Result("timeout error", func(ctx context.Context, mck Mock) {
@@ -242,15 +243,17 @@ var _ = Describe("lifecycle", Ordered, Label("firewalls", "lifecycle"), func() {
242243
mck.LinodeClient.EXPECT().UpdateFirewallRules(ctx, 1, gomock.Any()).Return(nil, &linodego.Error{Code: http.StatusInternalServerError})
243244
res, err := reconciler.reconcile(ctx, mck.Logger(), &fwScope)
244245
Expect(err).NotTo(HaveOccurred())
245-
Expect(res.RequeueAfter).To(Equal(rec.DefaultFWControllerReconcilerDelay))
246+
Expect(res.RequeueAfter).To(BeNumerically(">=", rec.DefaultFWControllerReconcilerDelay))
247+
Expect(res.RequeueAfter).To(BeNumerically("<=", rec.DefaultFWControllerReconcilerDelay+time.Duration(float64(rec.DefaultFWControllerReconcilerDelay)*rec.RetryJitterFraction)))
246248
Expect(mck.Logs()).To(ContainSubstring("failed, requeuing"))
247249
})),
248250
Path(Result("update requeues for update error", func(ctx context.Context, mck Mock) {
249251
mck.LinodeClient.EXPECT().UpdateFirewallRules(ctx, 1, gomock.Any()).Return(nil, nil)
250252
mck.LinodeClient.EXPECT().UpdateFirewall(ctx, 1, gomock.Any()).Return(nil, &linodego.Error{Code: http.StatusInternalServerError})
251253
res, err := reconciler.reconcile(ctx, mck.Logger(), &fwScope)
252254
Expect(err).NotTo(HaveOccurred())
253-
Expect(res.RequeueAfter).To(Equal(rec.DefaultFWControllerReconcilerDelay))
255+
Expect(res.RequeueAfter).To(BeNumerically(">=", rec.DefaultFWControllerReconcilerDelay))
256+
Expect(res.RequeueAfter).To(BeNumerically("<=", rec.DefaultFWControllerReconcilerDelay+time.Duration(float64(rec.DefaultFWControllerReconcilerDelay)*rec.RetryJitterFraction)))
254257
Expect(mck.Logs()).To(ContainSubstring("failed, requeuing"))
255258
})),
256259
Path(Result("timeout error", func(ctx context.Context, mck Mock) {
@@ -306,7 +309,8 @@ var _ = Describe("lifecycle", Ordered, Label("firewalls", "lifecycle"), func() {
306309
Path(Result("deletes are requeued", func(ctx context.Context, mck Mock) {
307310
res, err := reconciler.reconcile(ctx, mck.Logger(), &fwScope)
308311
Expect(err).NotTo(HaveOccurred())
309-
Expect(res.RequeueAfter).To(Equal(rec.DefaultFWControllerReconcilerDelay))
312+
Expect(res.RequeueAfter).To(BeNumerically(">=", rec.DefaultFWControllerReconcilerDelay))
313+
Expect(res.RequeueAfter).To(BeNumerically("<=", rec.DefaultFWControllerReconcilerDelay+time.Duration(float64(rec.DefaultFWControllerReconcilerDelay)*rec.RetryJitterFraction)))
310314
Expect(mck.Logs()).To(ContainSubstring("failed, requeuing"))
311315
})),
312316
Path(Result("timeout error", func(ctx context.Context, mck Mock) {

0 commit comments

Comments
 (0)