Skip to content

Commit d26a5d6

Browse files
🌱 Do not reconcile on irrecoverable errors (#1887)
Do not reconcile on irrecoverable errors. Do not reconcile or remediate on irrecoverable errors like `invalid_input` and `resource_unavailable`, as reconciling again won't change the result for these errors and would only get stuck in a loop. Signed-off-by: Dhairya Arora <dhairya.arora@syself.com>
1 parent 774ec79 commit d26a5d6

3 files changed

Lines changed: 30 additions & 0 deletions

File tree

api/v1beta1/conditions_const.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ const (
4848
ServerTypeNotFoundReason = "ServerTypeNotFound"
4949
// ServerCreateFailedReason indicates that the server could not be created.
5050
ServerCreateFailedReason = "ServerCreateFailedReason"
51+
// ServerCreateFailedIrrecoverableErrorReason indicates that server creation failed with an irrecoverable error.
52+
ServerCreateFailedIrrecoverableErrorReason = "ServerCreateFailedIrrecoverableError"
5153
)
5254

5355
const (

controllers/hcloudremediation_controller.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,19 @@ func (r *HCloudRemediationReconciler) Reconcile(ctx context.Context, req reconci
109109

110110
log = log.WithValues("HCloudMachine", klog.KObj(hcloudMachine))
111111

112+
// Skip remediation for machines that failed to create with irrecoverable errors (e.g. invalid_input, resource_unavailable).
113+
// These errors cannot be fixed by rebooting or replacing the machine.
114+
// We return without error so the MHC does not keep retrying remediation.
115+
if conditions.IsFalse(hcloudMachine, infrav1.ServerCreateSucceededCondition) &&
116+
conditions.GetReason(hcloudMachine, infrav1.ServerCreateSucceededCondition) == infrav1.ServerCreateFailedIrrecoverableErrorReason {
117+
log.Info("Skipping remediation for machine with irrecoverable creation failure",
118+
"reason", conditions.GetMessage(hcloudMachine, infrav1.ServerCreateSucceededCondition),
119+
)
120+
121+
// signal remediation done.
122+
return reconcile.Result{}, nil
123+
}
124+
112125
// Fetch the Cluster.
113126
cluster, err := util.GetClusterFromMetadata(ctx, r.Client, machine.ObjectMeta)
114127
if err != nil {

pkg/services/hcloud/server/server.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,21 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro
142142
// otherwise create server.
143143
server, err = s.createServer(ctx)
144144
if err != nil {
145+
// Terminal errors like invalid_input (e.g. unsupported location for server type)
146+
// or resource_unavailable (e.g. server location disabled) will never succeed on retry.
147+
// Mark the machine as irrecoverably failed and stop reconciling.
148+
if hcloud.IsError(err, hcloud.ErrorCodeInvalidInput) || hcloud.IsError(err, hcloud.ErrorCodeResourceUnavailable) {
149+
conditions.MarkFalse(
150+
s.scope.HCloudMachine,
151+
infrav1.ServerCreateSucceededCondition,
152+
infrav1.ServerCreateFailedIrrecoverableErrorReason,
153+
clusterv1.ConditionSeverityError,
154+
"%s",
155+
err.Error(),
156+
)
157+
return reconcile.Result{}, nil
158+
}
159+
145160
if errors.Is(err, errServerCreateNotPossible) {
146161
return reconcile.Result{RequeueAfter: 5 * time.Minute}, nil
147162
}

0 commit comments

Comments
 (0)