Skip to content

Commit a2a7e34

Browse files
committed
fix(etcd-recovery): OCPBUGS-84577: handle transient errors when checking etcd statefulset health
When r.Get() for the etcd StatefulSet fails with a non-NotFound error, return the error instead of silently falling through to the failure condition. This prevents transient API errors from incorrectly marking a healthy cluster as needing manual intervention.

Signed-off-by: Vimal Solanki <vsolanki@redhat.com>
1 parent 9355084 commit a2a7e34

1 file changed

Lines changed: 16 additions & 15 deletions

File tree

hypershift-operator/controllers/hostedcluster/etcd_recovery.go

Lines changed: 16 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -58,23 +58,24 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
5858
if !jobStatus.successful {
5959
// Check if etcd has recovered despite the job failing
6060
etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS)
61-
if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err == nil {
62-
if etcdStatefulSet.Status.ReadyReplicas == 3 && etcdStatefulSet.Status.AvailableReplicas == 3 {
63-
log.Info("etcd recovered despite failed recovery job, cleaning up")
64-
if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
65-
return nil, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
66-
}
67-
etcdRecoveryActiveCondition.Status = metav1.ConditionFalse
68-
etcdRecoveryActiveCondition.Reason = hyperv1.AsExpectedReason
69-
etcdRecoveryActiveCondition.LastTransitionTime = r.now()
70-
meta.SetStatusCondition(&hcluster.Status.Conditions, etcdRecoveryActiveCondition)
71-
if err := r.Client.Status().Update(ctx, hcluster); err != nil {
72-
return nil, fmt.Errorf("failed to update etcd recovery job condition: %w", err)
73-
}
74-
return nil, nil
61+
if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err != nil {
62+
if !apierrors.IsNotFound(err) {
63+
return nil, fmt.Errorf("failed to get etcd statefulset: %w", err)
7564
}
65+
} else if etcdStatefulSet.Status.ReadyReplicas == 3 && etcdStatefulSet.Status.AvailableReplicas == 3 {
66+
log.Info("etcd recovered despite failed recovery job, cleaning up")
67+
if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
68+
return nil, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
69+
}
70+
etcdRecoveryActiveCondition.Status = metav1.ConditionFalse
71+
etcdRecoveryActiveCondition.Reason = hyperv1.AsExpectedReason
72+
etcdRecoveryActiveCondition.LastTransitionTime = r.now()
73+
meta.SetStatusCondition(&hcluster.Status.Conditions, etcdRecoveryActiveCondition)
74+
if err := r.Client.Status().Update(ctx, hcluster); err != nil {
75+
return nil, fmt.Errorf("failed to update etcd recovery job condition: %w", err)
76+
}
77+
return nil, nil
7678
}
77-
7879
etcdRecoveryActiveCondition.Status = metav1.ConditionFalse
7980
etcdRecoveryActiveCondition.Reason = hyperv1.EtcdRecoveryJobFailedReason
8081
etcdRecoveryActiveCondition.Message = "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."

0 commit comments

Comments
 (0)