diff --git a/internal/controller/linodecluster_controller.go b/internal/controller/linodecluster_controller.go index 156314b83..b566844cd 100644 --- a/internal/controller/linodecluster_controller.go +++ b/internal/controller/linodecluster_controller.go @@ -207,7 +207,7 @@ func (r *LinodeClusterReconciler) reconcile( if err := addMachineToLB(ctx, clusterScope); err != nil { logger.Error(err, "Failed to add Linode machine to loadbalancer option") - return retryIfTransient(err, logger) + return retryIfTransient(err) } return res, nil diff --git a/internal/controller/linodemachine_controller.go b/internal/controller/linodemachine_controller.go index 8da8b9ab9..b8dca8fed 100644 --- a/internal/controller/linodemachine_controller.go +++ b/internal/controller/linodemachine_controller.go @@ -178,8 +178,13 @@ func (r *LinodeMachineReconciler) reconcile(ctx context.Context, logger logr.Log //nolint:dupl // Code duplication is simplicity in this case. defer func() { if err != nil { - // Only set failure reason if the error is not retryable. - if linodego.ErrHasStatus(err, http.StatusBadRequest) { + // Set specific failure reason for authentication errors + if util.IsAuthenticationError(err) { + failureReason = util.CredentialError + } + + // Set failure status for terminal errors (400, 401, 403, 404) + if util.IsTerminalError(err) { machineScope.LinodeMachine.Status.FailureReason = util.Pointer(failureReason) machineScope.LinodeMachine.Status.FailureMessage = util.Pointer(err.Error()) machineScope.LinodeMachine.SetCondition(metav1.Condition{ @@ -535,7 +540,7 @@ func (r *LinodeMachineReconciler) reconcilePreflightMetadataSupportConfigure(ctx _, err := machineScope.LinodeClient.GetRegion(ctx, machineScope.LinodeMachine.Spec.Region) if err != nil { logger.Error(err, fmt.Sprintf("Failed to fetch region %s", machineScope.LinodeMachine.Spec.Region)) - return retryIfTransient(err, logger) + return retryIfTransient(err) } imageName := reconciler.DefaultMachineControllerLinodeImage if machineScope.LinodeMachine.Spec.Image != "" { @@ -544,7 +549,7 @@ func (r *LinodeMachineReconciler) reconcilePreflightMetadataSupportConfigure(ctx _, err = machineScope.LinodeClient.GetImage(ctx, imageName) if err != nil { logger.Error(err, fmt.Sprintf("Failed to fetch image %s", imageName)) - return retryIfTransient(err, logger) + return retryIfTransient(err) } machineScope.LinodeMachine.SetCondition(metav1.Condition{ Type: ConditionPreflightMetadataSupportConfigured, @@ -559,7 +564,7 @@ func (r *LinodeMachineReconciler) reconcilePreflightCreate(ctx context.Context, createOpts, err := newCreateConfig(ctx, machineScope, r.GzipCompressionEnabled, logger) if err != nil { logger.Error(err, "Failed to create Linode machine InstanceCreateOptions") - return retryIfTransient(err, logger) + return retryIfTransient(err) } linodeInstance, retryAfter, err := createInstance(ctx, logger, machineScope, createOpts) @@ -585,7 +590,7 @@ func (r *LinodeMachineReconciler) reconcilePreflightCreate(ctx context.Context, Reason: util.CreateError, Message: err.Error(), }) - return retryIfTransient(err, logger) + return retryIfTransient(err) } machineScope.LinodeMachine.SetCondition(metav1.Condition{ @@ -643,11 +648,11 @@ func (r *LinodeMachineReconciler) reconcilePreflightConfigure(ctx context.Contex instanceConfig, err := getDefaultInstanceConfig(ctx, machineScope, instanceID) if err != nil { logger.Error(err, "Failed to get default instance configuration") - return retryIfTransient(err, logger) + return retryIfTransient(err) } if _, err := machineScope.LinodeClient.UpdateInstanceConfig(ctx, instanceID, instanceConfig.ID, configData); err != nil { logger.Error(err, "Failed to update default instance configuration", "configuration", configData) - return retryIfTransient(err, logger) + return retryIfTransient(err) } } @@ -729,7 +734,7 @@ func (r *LinodeMachineReconciler) reconcileUpdate(ctx context.Context, logger lo var linodeInstance *linodego.Instance if linodeInstance, err = machineScope.LinodeClient.GetInstance(ctx, instanceID); err != nil { - return retryIfTransient(err, logger) + return retryIfTransient(err) } // update the status machineScope.LinodeMachine.Status.InstanceState = &linodeInstance.Status diff --git a/internal/controller/linodemachine_controller_helpers.go b/internal/controller/linodemachine_controller_helpers.go index a34768c90..927ee9b1e 100644 --- a/internal/controller/linodemachine_controller_helpers.go +++ b/internal/controller/linodemachine_controller_helpers.go @@ -68,15 +68,18 @@ var ( errNoPublicIPv6SLAACAddrs = errors.New("no public SLAAC address set") ) -func retryIfTransient(err error, logger logr.Logger) (ctrl.Result, error) { +func retryIfTransient(err error) (ctrl.Result, error) { + // For known retryable errors (429, 5xx), use application-managed requeue + // to avoid log pollution and control retry timing if util.IsRetryableError(err) { if linodego.ErrHasStatus(err, http.StatusTooManyRequests) { return ctrl.Result{RequeueAfter: reconciler.DefaultLinodeTooManyRequestsErrorRetryDelay}, nil } return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil } - logger.Error(err, "unknown Linode API error") - return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil + // For all other errors (terminal or unknown), return the error + // and let controller-runtime handle requeue with exponential backoff + return ctrl.Result{}, err } func fillCreateConfig(createConfig *linodego.InstanceCreateOptions, machineScope *scope.MachineScope) { diff --git a/internal/controller/linodemachine_controller_helpers_test.go b/internal/controller/linodemachine_controller_helpers_test.go index 8c54b02d7..324675457 100644 --- a/internal/controller/linodemachine_controller_helpers_test.go +++ b/internal/controller/linodemachine_controller_helpers_test.go @@ -6,7 +6,9 @@ import ( "context" "crypto/rand" b64 "encoding/base64" + "errors" "fmt" + "net/http" "slices" "testing" @@ -2727,3 +2729,108 @@ func TestBuildInstanceAddrs(t *testing.T) { }) } } + +func TestRetryIfTransient(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + expectRequeue bool + expectError bool + expectErrorMsg string + }{ + { + name: "rate limit error (429) - should requeue without error", + err: &linodego.Error{ + Code: http.StatusTooManyRequests, + Message: "rate limited", + }, + expectRequeue: true, + expectError: false, + }, + { + name: "internal server error (500) - should requeue without error", + err: &linodego.Error{ + Code: http.StatusInternalServerError, + Message: "internal error", + }, + expectRequeue: true, + expectError: false, + }, + { + name: "bad gateway error (502) - should requeue without error", + err: &linodego.Error{ + Code: http.StatusBadGateway, + Message: "bad gateway", + }, + expectRequeue: true, + expectError: false, + }, + { + name: "unauthorized error (401) - should return error", + err: &linodego.Error{ + Code: http.StatusUnauthorized, + Message: "Invalid Token", + }, + expectRequeue: false, + expectError: true, + expectErrorMsg: "Invalid Token", + }, + { + name: "forbidden error (403) - should return error", + err: &linodego.Error{ + Code: http.StatusForbidden, + Message: "Forbidden", + }, + expectRequeue: false, + expectError: true, + expectErrorMsg: "Forbidden", + }, + { + name: "bad request error (400) - should return error", + err: &linodego.Error{ + Code: http.StatusBadRequest, + Message: "bad request", + }, + expectRequeue: false, + expectError: true, + expectErrorMsg: "bad request", + }, + { + name: "not found error (404) - should return error", + err: &linodego.Error{ + Code: http.StatusNotFound, + Message: "not found", + }, + expectRequeue: false, + expectError: true, + expectErrorMsg: "not found", + }, + { + name: "generic error - should return error", + err: errors.New("some generic error"), + expectRequeue: false, + expectError: true, + expectErrorMsg: "some generic error", + }, + } + + for _, tt := range tests { + testcase := tt + t.Run(testcase.name, func(t *testing.T) { + t.Parallel() + + result, err := retryIfTransient(testcase.err) + + if testcase.expectError { + require.Error(t, err, "expected error but got nil") + assert.Contains(t, err.Error(), testcase.expectErrorMsg) + assert.True(t, result.IsZero(), "expected zero result for terminal errors") + } else { + require.NoError(t, err, "expected no error but got one") + assert.Positive(t, result.RequeueAfter, "expected requeue delay for retryable errors") + } + }) + } +} diff --git a/internal/controller/linodemachine_controller_test.go b/internal/controller/linodemachine_controller_test.go index 722866731..08b386934 100644 --- a/internal/controller/linodemachine_controller_test.go +++ b/internal/controller/linodemachine_controller_test.go @@ -670,8 +670,10 @@ var _ = Describe("create", Label("machine", "create"), func() { reconciler.ReconcileTimeout = time.Nanosecond res, err := reconciler.reconcileCreate(ctx, logger, &mScope) - Expect(res.RequeueAfter).To(Equal(rutil.DefaultMachineControllerRetryDelay)) - Expect(err).NotTo(HaveOccurred()) + // Error is returned so controller-runtime handles requeue with exponential backoff + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("time is up")) + Expect(res.RequeueAfter).To(Equal(time.Duration(0))) Expect(linodeMachine.GetCondition(ConditionPreflightMetadataSupportConfigured).Status).To(Equal(metav1.ConditionTrue)) Expect(linodeMachine.GetCondition(ConditionPreflightCreated).Status).To(Equal(metav1.ConditionFalse)) diff --git a/internal/controller/linodeobjectstoragebucket_controller.go b/internal/controller/linodeobjectstoragebucket_controller.go index de210cdf8..65457c0da 100644 --- a/internal/controller/linodeobjectstoragebucket_controller.go +++ b/internal/controller/linodeobjectstoragebucket_controller.go @@ -140,7 +140,7 @@ func (r *LinodeObjectStorageBucketReconciler) reconcile(ctx context.Context, bSc // Handle deleted buckets if !bScope.Bucket.DeletionTimestamp.IsZero() { if err := r.reconcileDelete(ctx, bScope); err != nil { - return retryIfTransient(err, bScope.Logger) + return retryIfTransient(err) } return res, nil } diff --git a/internal/controller/linodeobjectstoragebucket_controller_test.go b/internal/controller/linodeobjectstoragebucket_controller_test.go index dc07ce78e..8a626d1a0 100644 --- a/internal/controller/linodeobjectstoragebucket_controller_test.go +++ b/internal/controller/linodeobjectstoragebucket_controller_test.go @@ -266,8 +266,8 @@ var _ = Describe("lifecycle", Ordered, Label("bucket", "lifecycle"), func() { OneOf( Path(Result("cannot purge bucket", func(ctx context.Context, mck Mock) { _, err := reconciler.reconcile(ctx, &bScope) - Expect(err).NotTo(HaveOccurred()) - Expect(mck.Logs()).To(ContainSubstring("failed to purge all objects")) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to purge all objects")) })), ), ), diff --git a/util/errors.go b/util/errors.go index b3a70f7a6..e99d96610 100644 --- a/util/errors.go +++ b/util/errors.go @@ -29,8 +29,9 @@ var ( // List of failure reasons to use in the status fields of our resources var ( - CreateError = "CreateError" - DeleteError = "DeleteError" - UpdateError = "UpdateError" - UnknownError = "UnknownError" + CreateError = "CreateError" + DeleteError = "DeleteError" + UpdateError = "UpdateError" + UnknownError = "UnknownError" + CredentialError = "CredentialError" // #nosec G101 -- false positive ) diff --git a/util/helpers.go b/util/helpers.go index 65949f574..a21785b2f 100644 --- a/util/helpers.go +++ b/util/helpers.go @@ -61,6 +61,24 @@ func IsRetryableError(err error) bool { linodego.ErrorFromError) || errors.Is(err, http.ErrHandlerTimeout) || errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, io.ErrUnexpectedEOF) } +// IsAuthenticationError determines if the error is an authentication or authorization error (401/403) +// These errors are terminal and should not be retried without user intervention +func IsAuthenticationError(err error) bool { + return linodego.ErrHasStatus(err, http.StatusUnauthorized, http.StatusForbidden) +} + +// IsTerminalError determines if the error is terminal and should not be retried. +// Terminal errors include bad requests (400), authentication errors (401/403), and not found (404) +func IsTerminalError(err error) bool { + return linodego.ErrHasStatus( + err, + http.StatusBadRequest, + http.StatusUnauthorized, + http.StatusForbidden, + http.StatusNotFound, + ) +} + // GetInstanceID determines the instance ID from the ProviderID func GetInstanceID(providerID *string) (int, error) { if providerID == nil { diff --git a/util/helpers_test.go b/util/helpers_test.go index 542eecc29..478c09e06 100644 --- a/util/helpers_test.go +++ b/util/helpers_test.go @@ -121,6 +121,138 @@ func TestIsRetryableError(t *testing.T) { } } +func TestIsAuthenticationError(t *testing.T) { + t.Parallel() + tests := []struct { + name string + err error + want bool + }{{ + name: "unauthorized error (401)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusUnauthorized, + Message: "Invalid Token", + }, + want: true, + }, { + name: "forbidden error (403)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusForbidden, + Message: "Forbidden", + }, + want: true, + }, { + name: "bad request error (400)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusBadRequest, + Message: "bad request", + }, + want: false, + }, { + name: "not found error (404)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusNotFound, + Message: "not found", + }, + want: false, + }, { + name: "internal server error (500)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusInternalServerError, + Message: "internal error", + }, + want: false, + }, { + name: "non-Linode error", + err: errors.New("random error"), + want: false, + }} + for _, tt := range tests { + testcase := tt + t.Run(testcase.name, func(t *testing.T) { + t.Parallel() + if testcase.want != IsAuthenticationError(testcase.err) { + t.Errorf("IsAuthenticationError() = %v, want %v", IsAuthenticationError(testcase.err), testcase.want) + } + }) + } +} + +func TestIsTerminalError(t *testing.T) { + t.Parallel() + tests := []struct { + name string + err error + want bool + }{{ + name: "bad request error (400)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusBadRequest, + Message: "bad request", + }, + want: true, + }, { + name: "unauthorized error (401)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusUnauthorized, + Message: "Invalid Token", + }, + want: true, + }, { + name: "forbidden error (403)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusForbidden, + Message: "Forbidden", + }, + want: true, + }, { + name: "not found error (404)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusNotFound, + Message: "not found", + }, + want: true, + }, { + name: "internal server error (500)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusInternalServerError, + Message: "internal error", + }, + want: false, + }, { + name: "too many requests (429)", + err: &linodego.Error{ + Response: nil, + Code: http.StatusTooManyRequests, + Message: "rate limited", + }, + want: false, + }, { + name: "non-Linode error", + err: errors.New("random error"), + want: false, + }} + for _, tt := range tests { + testcase := tt + t.Run(testcase.name, func(t *testing.T) { + t.Parallel() + if testcase.want != IsTerminalError(testcase.err) { + t.Errorf("IsTerminalError() = %v, want %v", IsTerminalError(testcase.err), testcase.want) + } + }) + } +} + func TestGetInstanceID(t *testing.T) { t.Parallel() tests := []struct {