
Commit d34dc0d

fix(etcd-recovery): clear stale EtcdRecoveryActive failure condition when etcd is healthy
When the etcd recovery job fails but etcd self-heals, the EtcdRecoveryActive condition kept its stale EtcdRecoveryJobFailed reason and was never cleared. This caused the OpenShift Console to display a stale error message even when the cluster was fully healthy.

This fix adds two checks:

- When a failed recovery job exists but the etcd StatefulSet is fully available (3/3), clean up the job and clear the condition. Transient API errors are propagated instead of silently falling through to the failure path.
- When no failing etcd pods exist and etcd is healthy, clear any stale EtcdRecoveryJobFailed condition.

Signed-off-by: Vimal Solanki <vsolanki@redhat.com>
1 parent 96a2bcb commit d34dc0d
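
The root cause is visible in the `setEtcdRecoveryCondition` hunk below: the old guard compared only the condition's `Status`, so a transition from reason `EtcdRecoveryJobFailed` to `AsExpected` (both `ConditionFalse`) never triggered a status update. The following is a minimal, self-contained sketch of the pre-fix and post-fix guards, illustrative only and not part of this commit; the string reasons are stand-ins for the `hyperv1` constants used in the diff.

```go
// Illustrative sketch (not part of this commit): why comparing only Status
// left the failure Reason in place. Both conditions below are ConditionFalse,
// so the pre-fix guard would skip the update entirely.
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	stale := metav1.Condition{
		Type:   "EtcdRecoveryActive",
		Status: metav1.ConditionFalse,
		Reason: "EtcdRecoveryJobFailed",
	}
	desired := metav1.Condition{
		Type:   "EtcdRecoveryActive",
		Status: metav1.ConditionFalse,
		Reason: "AsExpected",
	}

	// Pre-fix guard: update only when Status differs.
	oldGuard := stale.Status != desired.Status
	// Post-fix guard: also update when Reason differs.
	newGuard := stale.Status != desired.Status || stale.Reason != desired.Reason

	fmt.Println(oldGuard, newGuard) // prints: false true
}
```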

2 files changed

Lines changed: 226 additions & 1 deletion


hypershift-operator/controllers/hostedcluster/etcd_recovery.go

Lines changed: 34 additions & 1 deletion
@@ -29,6 +29,12 @@ import (
 	"github.com/go-logr/logr"
 )
 
+func isEtcdStatefulSetHealthy(sts *appsv1.StatefulSet) bool {
+	return sts.Spec.Replicas != nil &&
+		sts.Status.ReadyReplicas == *sts.Spec.Replicas &&
+		sts.Status.AvailableReplicas == *sts.Spec.Replicas
+}
+
 type etcdJobStatus struct {
 	exists   bool
 	finished bool
@@ -62,12 +68,30 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
 // It returns (done, err) where done=true means the caller should return immediately
 // without falling through to detectAndTriggerEtcdRecovery.
 func (r *HostedClusterReconciler) handleExistingEtcdRecoveryJob(ctx context.Context, log logr.Logger, hcluster *hyperv1.HostedCluster, recoveryJob *batchv1.Job, jobStatus *etcdJobStatus) (bool, error) {
+	hcpNS := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name)
+
 	if !jobStatus.finished {
 		log.Info("waiting for etcd recovery job to complete")
 		return true, nil
 	}
 
 	if !jobStatus.successful {
+		etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS)
+		if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err != nil {
+			if !apierrors.IsNotFound(err) {
+				return false, fmt.Errorf("failed to get etcd statefulset: %w", err)
+			}
+		} else if isEtcdStatefulSetHealthy(etcdStatefulSet) {
+			log.Info("etcd recovered despite failed recovery job, cleaning up")
+			if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
+				return false, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
+			}
+			if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD recovered despite failed recovery job."); err != nil {
+				return false, err
+			}
+			return true, nil
+		}
+
 		if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.EtcdRecoveryJobFailedReason, "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."); err != nil {
 			return false, err
 		}
@@ -99,7 +123,7 @@ func (r *HostedClusterReconciler) setEtcdRecoveryCondition(ctx context.Context,
 	}
 
 	oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
-	if oldCondition == nil || oldCondition.Status != condition.Status {
+	if oldCondition == nil || oldCondition.Status != condition.Status || oldCondition.Reason != condition.Reason {
 		meta.SetStatusCondition(&hcluster.Status.Conditions, condition)
 		if err := r.Client.Status().Update(ctx, hcluster); err != nil {
 			return fmt.Errorf("failed to update etcd recovery job condition: %w", err)
@@ -131,6 +155,15 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Conte
 	if !fullyAvailable {
 		return &requeueAfter, nil
 	}
+
+	oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+	if oldCondition != nil && oldCondition.Reason == hyperv1.EtcdRecoveryJobFailedReason {
+		log.Info("etcd is healthy but EtcdRecoveryActive has stale failure condition, clearing it")
+		if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD cluster is healthy."); err != nil {
+			return nil, err
+		}
+	}
+
 	return nil, nil
 }
 
hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go

Lines changed: 192 additions & 0 deletions
@@ -28,6 +28,7 @@ import (
 	hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
 	hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
 	"github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
+	etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
 	kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
 	"github.com/openshift/hypershift/support/api"
 	"github.com/openshift/hypershift/support/azureutil"
@@ -48,9 +49,11 @@ import (
 	configv1 "github.com/openshift/api/config/v1"
 
 	appsv1 "k8s.io/api/apps/v1"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	errors2 "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -6734,3 +6737,192 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
 		})
 	}
 }
+
+func TestReconcileETCDMemberRecovery(t *testing.T) {
+	hcpNS := "clusters-test-hc"
+
+	healthyEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "etcd",
+							State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	recoveredEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:         "etcd",
+							RestartCount: 3,
+							State:        corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	initEtcdStatefulSet := func(specReplicas, readyReplicas, availableReplicas int32) *appsv1.StatefulSet {
+		return &appsv1.StatefulSet{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "etcd",
+				Namespace: hcpNS,
+			},
+			Spec: appsv1.StatefulSetSpec{
+				Replicas: ptr.To[int32](specReplicas),
+			},
+			Status: appsv1.StatefulSetStatus{
+				ReadyReplicas:     readyReplicas,
+				AvailableReplicas: availableReplicas,
+			},
+		}
+	}
+
+	healthyStatefulSet := initEtcdStatefulSet(3, 3, 3)
+	unhealthyStatefulSet := initEtcdStatefulSet(3, 2, 2)
+
+	staleCondition := metav1.Condition{
+		Type:               string(hyperv1.EtcdRecoveryActive),
+		Status:             metav1.ConditionFalse,
+		Reason:             hyperv1.EtcdRecoveryJobFailedReason,
+		Message:            "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
+		LastTransitionTime: metav1.Now(),
+	}
+
+	failedJob := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+	failedJob.Status = batchv1.JobStatus{
+		Conditions: []batchv1.JobCondition{
+			{
+				Type:   batchv1.JobFailed,
+				Status: corev1.ConditionTrue,
+			},
+		},
+	}
+
+	testCases := []struct {
+		name             string
+		objects          []crclient.Object
+		conditions       []metav1.Condition
+		expectedReason   string
+		conditionExists  bool
+		expectJobDeleted bool
+	}{
+		{
+			name:            "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one",
+			conditions:      []metav1.Condition{},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			conditionExists: false,
+		},
+		{
+			name:             "When failed job exists but etcd recovered it should cleanup job and clear condition",
+			conditions:       []metav1.Condition{staleCondition},
+			objects:          append(healthyEtcdPods(), healthyStatefulSet, failedJob),
+			expectedReason:   hyperv1.AsExpectedReason,
+			conditionExists:  true,
+			expectJobDeleted: true,
+		},
+		{
+			name:            "When etcd pods have restarted but recovered it should clear the stale condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(recoveredEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When failed job exists and etcd is still unhealthy it should keep the failure condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), unhealthyStatefulSet, failedJob),
+			expectedReason:  hyperv1.EtcdRecoveryJobFailedReason,
+			conditionExists: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			g := NewGomegaWithT(t)
+
+			hcluster := &hyperv1.HostedCluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-hc",
+					Namespace: "clusters",
+				},
+				Spec: hyperv1.HostedClusterSpec{
+					Etcd: hyperv1.EtcdSpec{
+						ManagementType: hyperv1.Managed,
+					},
+					ControllerAvailabilityPolicy: hyperv1.HighlyAvailable,
+				},
+				Status: hyperv1.HostedClusterStatus{
+					Conditions: tc.conditions,
+				},
+			}
+
+			objects := append([]crclient.Object{hcluster}, tc.objects...)
+			client := fake.NewClientBuilder().
+				WithScheme(api.Scheme).
+				WithObjects(objects...).
+				WithStatusSubresource(hcluster).
+				Build()
+
+			r := &HostedClusterReconciler{
+				Client:             client,
+				now:                metav1.Now,
+				EnableEtcdRecovery: true,
+			}
+
+			_, err := r.reconcileETCDMemberRecovery(
+				ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))),
+				hcluster,
+				upsert.New(false).CreateOrUpdate,
+			)
+			g.Expect(err).ToNot(HaveOccurred())
+
+			updatedHC := &hyperv1.HostedCluster{}
+			g.Expect(client.Get(t.Context(), crclient.ObjectKeyFromObject(hcluster), updatedHC)).To(Succeed())
+
+			condition := meta.FindStatusCondition(updatedHC.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+			if tc.conditionExists {
+				g.Expect(condition).ToNot(BeNil())
+				g.Expect(condition.Reason).To(Equal(tc.expectedReason))
+			} else {
+				g.Expect(condition).To(BeNil())
+			}
+			if tc.expectJobDeleted {
+				job := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+				err := client.Get(t.Context(), crclient.ObjectKeyFromObject(job), job)
+				g.Expect(errors2.IsNotFound(err)).To(BeTrue(), "expected failed recovery job to be deleted")
+			}
+		})
+	}
+}
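
To exercise just the new table test locally (assuming a standard checkout of the repository with a working Go toolchain), an invocation along these lines should work:

```
go test ./hypershift-operator/controllers/hostedcluster/ -run TestReconcileETCDMemberRecovery -v
```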
