Skip to content

Commit 6c78c67

Browse files
committed
fix(controller): enhance error handling and retry logic in DeployKey reconciler
1 parent ef2eb5f commit 6c78c67

1 file changed

Lines changed: 82 additions & 37 deletions

File tree

internal/controller/deploykey_controller.go

Lines changed: 82 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,26 @@ func (r *DeployKeyReconciler) HandleError(deploykey *authv1alpha1.DeployKey, err
7373
return ctrl.Result{}, err
7474
}
7575
r.logger.Errorw("Failed to reconcile deploykey", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
76-
if deploykey.Status.CurrentRetries >= MaxRetries {
76+
77+
// Determine classification of error (rate limit vs other) early
78+
var errMsg string
79+
if err != nil {
80+
errMsg = err.Error()
81+
}
82+
isRateLimit := strings.Contains(errMsg, "rate limit") || strings.Contains(errMsg, "403")
83+
84+
// Decide next retry count: don't increment on rate limit (we want long backoff but not exhaust retries)
85+
nextRetries := deploykey.Status.CurrentRetries
86+
if !isRateLimit {
87+
nextRetries = deploykey.Status.CurrentRetries + 1
88+
}
89+
90+
if nextRetries >= MaxRetries {
7791
deploykey.Status = authv1alpha1.DeployKeyStatus{
7892
CrStatus: authv1alpha1.CrStatus{
7993
State: authv1alpha1.StateFailed,
8094
LastAction: deploykey.Status.LastAction,
81-
LastMessage: fmt.Sprintf("Failed to reconcile deploykey after %d retries: %v", MaxRetries, err),
95+
LastMessage: fmt.Sprintf("Failed to reconcile deploykey after %d attempts: %v", MaxRetries, err),
8296
LastReconcileTime: time.Now().Format(time.RFC3339),
8397
CurrentRetries: 0,
8498
},
@@ -92,39 +106,46 @@ func (r *DeployKeyReconciler) HandleError(deploykey *authv1alpha1.DeployKey, err
92106
LastAction: deploykey.Status.LastAction,
93107
LastMessage: fmt.Sprintf("Failed to reconcile deploykey: %v", err),
94108
LastReconcileTime: time.Now().Format(time.RFC3339),
95-
CurrentRetries: deploykey.Status.CurrentRetries + 1,
109+
CurrentRetries: nextRetries,
96110
},
97111
SecretRef: deploykey.Status.SecretRef,
98112
Created: deploykey.Status.Created,
99113
}
100114
}
101115
updateErr := r.Status().Update(context.Background(), deploykey)
102116
if updateErr != nil {
103-
r.logger.Errorw("Failed to update deploykey status", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", updateErr)
104-
return ctrl.Result{RequeueAfter: 1 * time.Minute}, err
117+
if apierrors.IsConflict(updateErr) || apierrors.IsNotFound(updateErr) {
118+
// Ignore conflicts/notfound for error status updates; we'll pick up latest state next reconcile
119+
r.logger.Debugw("Status update conflict/notfound recording error; proceeding with backoff", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", updateErr)
120+
} else {
121+
// Log but continue to apply backoff with original error classification
122+
r.logger.Errorw("Failed to update deploykey status (non-terminal)", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", updateErr)
123+
}
105124
}
106125

107-
if deploykey.Status.CurrentRetries >= MaxRetries {
126+
if nextRetries >= MaxRetries {
108127
r.logger.Errorw("Max retries reached for deploykey", "name", deploykey.Name, "namespace", deploykey.Namespace)
109128
return ctrl.Result{}, fmt.Errorf("max retries reached for deploykey %s/%s: %v", deploykey.Namespace, deploykey.Name, err)
110129
}
111130

112-
// Implement exponential backoff with jitter
131+
// Backoff calculation uses nextRetries (the number after potential increment) for non-rate-limit
113132
var requeueDelay time.Duration
114-
if strings.Contains(err.Error(), "rate limit") || strings.Contains(err.Error(), "403") {
115-
// For rate limiting, use longer delays
116-
requeueDelay = time.Duration(RateLimitBaseDelay.Minutes()+float64(deploykey.Status.CurrentRetries*10)) * time.Minute
117-
r.logger.Warnw("Rate limit detected, using extended backoff", "name", deploykey.Name, "namespace", deploykey.Namespace, "delay", requeueDelay)
133+
if isRateLimit {
134+
// For rate limiting, keep retries stable and use base + additive component (still bounded)
135+
requeueDelay = time.Duration(RateLimitBaseDelay.Minutes()+float64(nextRetries*10)) * time.Minute
136+
if requeueDelay > MaxRequeueDelay {
137+
requeueDelay = MaxRequeueDelay
138+
}
139+
r.logger.Warnw("Rate limit detected, using extended backoff", "name", deploykey.Name, "namespace", deploykey.Namespace, "delay", requeueDelay, "retries", nextRetries)
118140
} else {
119-
// Exponential backoff for other errors: 5m, 10m, 20m, 40m...
120-
requeueDelay = time.Duration(5*(1<<deploykey.Status.CurrentRetries)) * time.Minute
141+
requeueDelay = time.Duration(5*(1<<nextRetries)) * time.Minute
121142
if requeueDelay > MaxRequeueDelay {
122-
requeueDelay = MaxRequeueDelay // Cap at 1 hour
143+
requeueDelay = MaxRequeueDelay
123144
}
124145
}
125146

126-
r.logger.Infow("Requeue deploykey with backoff", "name", deploykey.Name, "namespace", deploykey.Namespace, "delay", requeueDelay, "retries", deploykey.Status.CurrentRetries)
127-
return ctrl.Result{RequeueAfter: requeueDelay}, err
147+
r.logger.Infow("Requeue deploykey with backoff", "name", deploykey.Name, "namespace", deploykey.Namespace, "delay", requeueDelay, "retries", nextRetries)
148+
return ctrl.Result{RequeueAfter: requeueDelay}, nil
128149
}
129150

130151
// +kubebuilder:rbac:groups=auth.github.odit.services,resources=deploykeys,verbs=get;list;watch;create;update;patch;delete
@@ -163,12 +184,16 @@ func (r *DeployKeyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
163184
}
164185
}
165186

166-
// Only update status if it's actually changing
167-
if deploykey.Status.State != authv1alpha1.StateReconciling && deploykey.DeletionTimestamp == nil {
187+
// Only update status to Reconciling early for non-create/non-delete flows to reduce conflicts
188+
if deploykey.DeletionTimestamp == nil && deploykey.Status.Created && deploykey.Status.State != authv1alpha1.StateReconciling {
168189
deploykey.Status.State = authv1alpha1.StateReconciling
169190
if err := r.Status().Update(ctx, deploykey); err != nil {
170-
r.logger.Errorw("Failed to update DeployKey status to Reconciling", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
171-
return r.HandleError(deploykey, err)
191+
if apierrors.IsConflict(err) {
192+
r.logger.Debugw("Status update conflict setting Reconciling (existing resource); skipping", "name", deploykey.Name, "namespace", deploykey.Namespace)
193+
} else {
194+
r.logger.Errorw("Failed to update DeployKey status to Reconciling", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
195+
return r.HandleError(deploykey, err)
196+
}
172197
}
173198
}
174199

@@ -178,8 +203,12 @@ func (r *DeployKeyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
178203

179204
deploykey.Status.LastAction = authv1alpha1.ActionDelete
180205
if err := r.Status().Update(ctx, deploykey); err != nil {
181-
r.logger.Errorw("Failed to update DeployKey status to Reconciling", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
182-
return r.HandleError(deploykey, err)
206+
if apierrors.IsConflict(err) {
207+
r.logger.Debugw("Status update conflict writing delete LastAction; continuing", "name", deploykey.Name, "namespace", deploykey.Namespace)
208+
} else {
209+
r.logger.Errorw("Failed to update DeployKey status to Reconciling", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
210+
return r.HandleError(deploykey, err)
211+
}
183212
}
184213

185214
// Only attempt to delete from GitHub if we have a valid key ID
@@ -264,8 +293,12 @@ func (r *DeployKeyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
264293
r.logger.Infow("Creating deploy key", "name", deploykey.Name, "namespace", deploykey.Namespace)
265294
deploykey.Status.LastAction = authv1alpha1.ActionCreate
266295
if err := r.Status().Update(ctx, deploykey); err != nil {
267-
r.logger.Errorw("Failed to update DeployKey status to Reconciling", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
268-
return r.HandleError(deploykey, err)
296+
if apierrors.IsConflict(err) {
297+
r.logger.Debugw("Status update conflict setting create LastAction; continuing", "name", deploykey.Name, "namespace", deploykey.Namespace)
298+
} else {
299+
r.logger.Errorw("Failed to update DeployKey status to Reconciling", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
300+
return r.HandleError(deploykey, err)
301+
}
269302
}
270303

271304
var pubKey, privKey string
@@ -309,8 +342,12 @@ func (r *DeployKeyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
309342

310343
deploykey.Status.SecretRef = secret.Name
311344
if err := r.Status().Update(ctx, deploykey); err != nil {
312-
r.logger.Errorw("Failed to update DeployKey status secretref", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
313-
return r.HandleError(deploykey, err)
345+
if apierrors.IsConflict(err) {
346+
r.logger.Debugw("Status update conflict writing SecretRef; skipping error handling", "name", deploykey.Name, "namespace", deploykey.Namespace)
347+
} else {
348+
r.logger.Errorw("Failed to update DeployKey status secretref", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
349+
return r.HandleError(deploykey, err)
350+
}
314351
}
315352

316353
readOnly := deploykey.Spec.Permission == authv1alpha1.ReadOnly
@@ -320,9 +357,9 @@ func (r *DeployKeyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
320357
ReadOnly: &readOnly,
321358
}
322359

323-
keyresponse, resp, err := r.ghclient.Repositories.CreateKey(ctx, deploykey.Spec.Owner, deploykey.Spec.Repository, keyrequest)
324-
if err != nil {
325-
r.logger.Errorw("Failed to create deploy key on GitHub", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
360+
keyresponse, resp, ghErr := r.ghclient.Repositories.CreateKey(ctx, deploykey.Spec.Owner, deploykey.Spec.Repository, keyrequest)
361+
if ghErr != nil {
362+
r.logger.Errorw("Failed to create deploy key on GitHub", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", ghErr)
326363

327364
// Log rate limit information if available
328365
if resp != nil && resp.Header != nil {
@@ -336,12 +373,12 @@ func (r *DeployKeyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
336373
}
337374
}
338375

339-
err = DeleteSecret(ctx, r.Client, deploykey.Namespace, secret.Name)
340-
if err != nil {
341-
r.logger.Errorw("Failed to delete secret after GitHub key creation failure", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
376+
// Attempt to cleanup the secret, but preserve the original GitHub error for HandleError
377+
if delErr := DeleteSecret(ctx, r.Client, deploykey.Namespace, secret.Name); delErr != nil {
378+
r.logger.Errorw("Failed to delete secret after GitHub key creation failure", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", delErr)
342379
}
343380

344-
return r.HandleError(deploykey, err)
381+
return r.HandleError(deploykey, ghErr)
345382
}
346383

347384
deploykey.Status.Created = true
@@ -352,8 +389,12 @@ func (r *DeployKeyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
352389
deploykey.Status.GitHubKeyID = keyresponse.GetID()
353390

354391
if err := r.Status().Update(ctx, deploykey); err != nil {
355-
r.logger.Errorw("Failed to update DeployKey status after creation", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
356-
return r.HandleError(deploykey, err)
392+
if apierrors.IsConflict(err) {
393+
r.logger.Debugw("Status update conflict after creation; skipping error handling", "name", deploykey.Name, "namespace", deploykey.Namespace)
394+
} else {
395+
r.logger.Errorw("Failed to update DeployKey status after creation", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
396+
return r.HandleError(deploykey, err)
397+
}
357398
}
358399

359400
r.logger.Infow("Deploy key created", "name", deploykey.Name, "namespace", deploykey.Namespace)
@@ -368,8 +409,12 @@ func (r *DeployKeyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
368409
deploykey.Status.LastMessage = "Deploy key validated successfully"
369410
deploykey.Status.LastReconcileTime = time.Now().Format(time.RFC3339)
370411
if err := r.Status().Update(ctx, deploykey); err != nil {
371-
r.logger.Errorw("Failed to update DeployKey status", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
372-
return r.HandleError(deploykey, err)
412+
if apierrors.IsConflict(err) {
413+
r.logger.Debugw("Status update conflict validating existing key; skipping error handling", "name", deploykey.Name, "namespace", deploykey.Namespace)
414+
} else {
415+
r.logger.Errorw("Failed to update DeployKey status", "name", deploykey.Name, "namespace", deploykey.Namespace, "error", err)
416+
return r.HandleError(deploykey, err)
417+
}
373418
}
374419
}
375420
}

0 commit comments

Comments
 (0)