Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion internal/controller/linodecluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ func (r *LinodeClusterReconciler) reconcile(

if err := addMachineToLB(ctx, clusterScope); err != nil {
logger.Error(err, "Failed to add Linode machine to loadbalancer option")
return retryIfTransient(err, logger)
return retryIfTransient(err)
}

return res, nil
Expand Down
23 changes: 14 additions & 9 deletions internal/controller/linodemachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,13 @@ func (r *LinodeMachineReconciler) reconcile(ctx context.Context, logger logr.Log
//nolint:dupl // Code duplication is simplicity in this case.
defer func() {
if err != nil {
// Only set failure reason if the error is not retryable.
if linodego.ErrHasStatus(err, http.StatusBadRequest) {
// Set specific failure reason for authentication errors
if util.IsAuthenticationError(err) {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want to use a Ready=false condition with a message over setting the failure reason

failureReason = util.CredentialError
}

// Set failure status for terminal errors (400, 401, 403, 404)
if util.IsTerminalError(err) {
machineScope.LinodeMachine.Status.FailureReason = util.Pointer(failureReason)
machineScope.LinodeMachine.Status.FailureMessage = util.Pointer(err.Error())
machineScope.LinodeMachine.SetCondition(metav1.Condition{
Expand Down Expand Up @@ -535,7 +540,7 @@ func (r *LinodeMachineReconciler) reconcilePreflightMetadataSupportConfigure(ctx
_, err := machineScope.LinodeClient.GetRegion(ctx, machineScope.LinodeMachine.Spec.Region)
if err != nil {
logger.Error(err, fmt.Sprintf("Failed to fetch region %s", machineScope.LinodeMachine.Spec.Region))
return retryIfTransient(err, logger)
return retryIfTransient(err)
}
imageName := reconciler.DefaultMachineControllerLinodeImage
if machineScope.LinodeMachine.Spec.Image != "" {
Expand All @@ -544,7 +549,7 @@ func (r *LinodeMachineReconciler) reconcilePreflightMetadataSupportConfigure(ctx
_, err = machineScope.LinodeClient.GetImage(ctx, imageName)
if err != nil {
logger.Error(err, fmt.Sprintf("Failed to fetch image %s", imageName))
return retryIfTransient(err, logger)
return retryIfTransient(err)
}
machineScope.LinodeMachine.SetCondition(metav1.Condition{
Type: ConditionPreflightMetadataSupportConfigured,
Expand All @@ -559,7 +564,7 @@ func (r *LinodeMachineReconciler) reconcilePreflightCreate(ctx context.Context,
createOpts, err := newCreateConfig(ctx, machineScope, r.GzipCompressionEnabled, logger)
if err != nil {
logger.Error(err, "Failed to create Linode machine InstanceCreateOptions")
return retryIfTransient(err, logger)
return retryIfTransient(err)
}

linodeInstance, retryAfter, err := createInstance(ctx, logger, machineScope, createOpts)
Expand All @@ -585,7 +590,7 @@ func (r *LinodeMachineReconciler) reconcilePreflightCreate(ctx context.Context,
Reason: util.CreateError,
Message: err.Error(),
})
return retryIfTransient(err, logger)
return retryIfTransient(err)
}

machineScope.LinodeMachine.SetCondition(metav1.Condition{
Expand Down Expand Up @@ -643,11 +648,11 @@ func (r *LinodeMachineReconciler) reconcilePreflightConfigure(ctx context.Contex
instanceConfig, err := getDefaultInstanceConfig(ctx, machineScope, instanceID)
if err != nil {
logger.Error(err, "Failed to get default instance configuration")
return retryIfTransient(err, logger)
return retryIfTransient(err)
}
if _, err := machineScope.LinodeClient.UpdateInstanceConfig(ctx, instanceID, instanceConfig.ID, configData); err != nil {
logger.Error(err, "Failed to update default instance configuration", "configuration", configData)
return retryIfTransient(err, logger)
return retryIfTransient(err)
}
}

Expand Down Expand Up @@ -729,7 +734,7 @@ func (r *LinodeMachineReconciler) reconcileUpdate(ctx context.Context, logger lo

var linodeInstance *linodego.Instance
if linodeInstance, err = machineScope.LinodeClient.GetInstance(ctx, instanceID); err != nil {
return retryIfTransient(err, logger)
return retryIfTransient(err)
}
// update the status
machineScope.LinodeMachine.Status.InstanceState = &linodeInstance.Status
Expand Down
9 changes: 6 additions & 3 deletions internal/controller/linodemachine_controller_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,18 @@ var (
errNoPublicIPv6SLAACAddrs = errors.New("no public SLAAC address set")
)

func retryIfTransient(err error, logger logr.Logger) (ctrl.Result, error) {
func retryIfTransient(err error) (ctrl.Result, error) {
// For known retryable errors (429, 5xx), use application-managed requeue
// to avoid log pollution and control retry timing
if util.IsRetryableError(err) {
if linodego.ErrHasStatus(err, http.StatusTooManyRequests) {
return ctrl.Result{RequeueAfter: reconciler.DefaultLinodeTooManyRequestsErrorRetryDelay}, nil
}
return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil
}
logger.Error(err, "unknown Linode API error")
return ctrl.Result{RequeueAfter: reconciler.DefaultMachineControllerRetryDelay}, nil
// For all other errors (terminal or unknown), return the error
// and let controller-runtime handle requeue with exponential backoff
return ctrl.Result{}, err
}

func fillCreateConfig(createConfig *linodego.InstanceCreateOptions, machineScope *scope.MachineScope) {
Expand Down
107 changes: 107 additions & 0 deletions internal/controller/linodemachine_controller_helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ import (
"context"
"crypto/rand"
b64 "encoding/base64"
"errors"
"fmt"
"net/http"
"slices"
"testing"

Expand Down Expand Up @@ -2727,3 +2729,108 @@ func TestBuildInstanceAddrs(t *testing.T) {
})
}
}

func TestRetryIfTransient(t *testing.T) {
t.Parallel()

tests := []struct {
name string
err error
expectRequeue bool
expectError bool
expectErrorMsg string
}{
{
name: "rate limit error (429) - should requeue without error",
err: &linodego.Error{
Code: http.StatusTooManyRequests,
Message: "rate limited",
},
expectRequeue: true,
expectError: false,
},
{
name: "internal server error (500) - should requeue without error",
err: &linodego.Error{
Code: http.StatusInternalServerError,
Message: "internal error",
},
expectRequeue: true,
expectError: false,
},
{
name: "bad gateway error (502) - should requeue without error",
err: &linodego.Error{
Code: http.StatusBadGateway,
Message: "bad gateway",
},
expectRequeue: true,
expectError: false,
},
{
name: "unauthorized error (401) - should return error",
err: &linodego.Error{
Code: http.StatusUnauthorized,
Message: "Invalid Token",
},
expectRequeue: false,
expectError: true,
expectErrorMsg: "Invalid Token",
},
{
name: "forbidden error (403) - should return error",
err: &linodego.Error{
Code: http.StatusForbidden,
Message: "Forbidden",
},
expectRequeue: false,
expectError: true,
expectErrorMsg: "Forbidden",
},
{
name: "bad request error (400) - should return error",
err: &linodego.Error{
Code: http.StatusBadRequest,
Message: "bad request",
},
expectRequeue: false,
expectError: true,
expectErrorMsg: "bad request",
},
{
name: "not found error (404) - should return error",
err: &linodego.Error{
Code: http.StatusNotFound,
Message: "not found",
},
expectRequeue: false,
expectError: true,
expectErrorMsg: "not found",
},
{
name: "generic error - should return error",
err: errors.New("some generic error"),
expectRequeue: false,
expectError: true,
expectErrorMsg: "some generic error",
},
}

for _, tt := range tests {
testcase := tt
t.Run(testcase.name, func(t *testing.T) {
t.Parallel()

result, err := retryIfTransient(testcase.err)

if testcase.expectError {
require.Error(t, err, "expected error but got nil")
assert.Contains(t, err.Error(), testcase.expectErrorMsg)
assert.True(t, result.IsZero(), "expected zero result for terminal errors")
} else {
require.NoError(t, err, "expected no error but got one")
assert.Positive(t, result.RequeueAfter, "expected requeue delay for retryable errors")
}
})
}
}
6 changes: 4 additions & 2 deletions internal/controller/linodemachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -670,8 +670,10 @@ var _ = Describe("create", Label("machine", "create"), func() {
reconciler.ReconcileTimeout = time.Nanosecond

res, err := reconciler.reconcileCreate(ctx, logger, &mScope)
Expect(res.RequeueAfter).To(Equal(rutil.DefaultMachineControllerRetryDelay))
Expect(err).NotTo(HaveOccurred())
// Error is returned so controller-runtime handles requeue with exponential backoff
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("time is up"))
Expect(res.RequeueAfter).To(Equal(time.Duration(0)))

Expect(linodeMachine.GetCondition(ConditionPreflightMetadataSupportConfigured).Status).To(Equal(metav1.ConditionTrue))
Expect(linodeMachine.GetCondition(ConditionPreflightCreated).Status).To(Equal(metav1.ConditionFalse))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ func (r *LinodeObjectStorageBucketReconciler) reconcile(ctx context.Context, bSc
// Handle deleted buckets
if !bScope.Bucket.DeletionTimestamp.IsZero() {
if err := r.reconcileDelete(ctx, bScope); err != nil {
return retryIfTransient(err, bScope.Logger)
return retryIfTransient(err)
}
return res, nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,8 @@ var _ = Describe("lifecycle", Ordered, Label("bucket", "lifecycle"), func() {
OneOf(
Path(Result("cannot purge bucket", func(ctx context.Context, mck Mock) {
_, err := reconciler.reconcile(ctx, &bScope)
Expect(err).NotTo(HaveOccurred())
Expect(mck.Logs()).To(ContainSubstring("failed to purge all objects"))
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("failed to purge all objects"))
})),
),
),
Expand Down
9 changes: 5 additions & 4 deletions util/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ var (

// List of failure reasons to use in the status fields of our resources
var (
CreateError = "CreateError"
DeleteError = "DeleteError"
UpdateError = "UpdateError"
UnknownError = "UnknownError"
CreateError = "CreateError"
DeleteError = "DeleteError"
UpdateError = "UpdateError"
UnknownError = "UnknownError"
CredentialError = "CredentialError" // #nosec G101 -- false positive
)
18 changes: 18 additions & 0 deletions util/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,24 @@ func IsRetryableError(err error) bool {
linodego.ErrorFromError) || errors.Is(err, http.ErrHandlerTimeout) || errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, io.ErrUnexpectedEOF)
}

// IsAuthenticationError determines if the error is an authentication or authorization error (401/403)
// These errors are terminal and should not be retried without user intervention
func IsAuthenticationError(err error) bool {
return linodego.ErrHasStatus(err, http.StatusUnauthorized, http.StatusForbidden)
}

// IsTerminalError determines if the error is terminal and should not be retried.
// Terminal errors include bad requests (400), authentication errors (401/403), and not found (404)
func IsTerminalError(err error) bool {
return linodego.ErrHasStatus(
err,
http.StatusBadRequest,
http.StatusUnauthorized,
http.StatusForbidden,
http.StatusNotFound,
)
}

// GetInstanceID determines the instance ID from the ProviderID
func GetInstanceID(providerID *string) (int, error) {
if providerID == nil {
Expand Down
Loading
Loading