Commit 3e6a857
fix(etcd-recovery): OCPBUGS-84577: clear stale EtcdRecoveryActive failure condition when etcd is healthy
When the etcd recovery job fails but etcd self-heals, the EtcdRecoveryJobFailed condition was never cleared. This caused the OpenShift Console to display a stale error message even when the cluster was fully healthy. This fix adds two checks:

- When a failed recovery job exists but the etcd StatefulSet is fully available (3/3), clean up the job and clear the condition.
- When no failing etcd pods exist and etcd is healthy, clear any stale EtcdRecoveryJobFailed condition.

Signed-off-by: Vimal Solanki <vsolanki@redhat.com>
1 parent 68106f0 commit 3e6a857

2 files changed

Lines changed: 200 additions & 0 deletions


hypershift-operator/controllers/hostedcluster/etcd_recovery.go

Lines changed: 35 additions & 0 deletions
@@ -56,6 +56,25 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
 	}
 
 	if !jobStatus.successful {
+		// Check if etcd has recovered despite the job failing
+		etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS)
+		if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err == nil {
+			if etcdStatefulSet.Status.ReadyReplicas == 3 && etcdStatefulSet.Status.AvailableReplicas == 3 {
+				log.Info("etcd recovered despite failed recovery job, cleaning up")
+				if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
+					return nil, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
+				}
+				etcdRecoveryActiveCondition.Status = metav1.ConditionFalse
+				etcdRecoveryActiveCondition.Reason = hyperv1.AsExpectedReason
+				etcdRecoveryActiveCondition.LastTransitionTime = r.now()
+				meta.SetStatusCondition(&hcluster.Status.Conditions, etcdRecoveryActiveCondition)
+				if err := r.Client.Status().Update(ctx, hcluster); err != nil {
+					return nil, fmt.Errorf("failed to update etcd recovery job condition: %w", err)
+				}
+				return nil, nil
+			}
+		}
+
 		etcdRecoveryActiveCondition.Status = metav1.ConditionFalse
 		etcdRecoveryActiveCondition.Reason = hyperv1.EtcdRecoveryJobFailedReason
 		etcdRecoveryActiveCondition.Message = "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."
@@ -137,6 +156,22 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
 	if !fullyAvailable {
 		return &requeueAfter, nil
 	}
+
+	// Clear stale EtcdRecoveryJobFailed condition if etcd is healthy
+	oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+	if oldCondition != nil && oldCondition.Reason == hyperv1.EtcdRecoveryJobFailedReason {
+		log.Info("etcd is healthy but EtcdRecoveryActive has stale failure condition, clearing it")
+		meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{
+			Type:               string(hyperv1.EtcdRecoveryActive),
+			Status:             metav1.ConditionFalse,
+			Reason:             hyperv1.AsExpectedReason,
+			ObservedGeneration: hcluster.Generation,
+		})
+		if err := r.Client.Status().Update(ctx, hcluster); err != nil {
+			return nil, fmt.Errorf("failed to clear stale etcd recovery condition: %w", err)
+		}
+	}
+
 	return nil, nil
 }
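For reference, both hunks manipulate the HostedCluster's conditions through the condition helpers in k8s.io/apimachinery/pkg/api/meta. Below is a minimal standalone sketch, not part of this commit, of how meta.SetStatusCondition matches on Type and overwrites an existing condition of the same type in place, which is what lets the fix flip a stale EtcdRecoveryJobFailed reason to AsExpected with a single call; the string literals stand in for the hyperv1 constants used in the diff above.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// A stale failure condition, as a failed recovery job would leave behind.
	conditions := []metav1.Condition{{
		Type:               "EtcdRecoveryActive", // stands in for hyperv1.EtcdRecoveryActive
		Status:             metav1.ConditionFalse,
		Reason:             "EtcdRecoveryJobFailed", // stands in for hyperv1.EtcdRecoveryJobFailedReason
		Message:            "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
		LastTransitionTime: metav1.Now(),
	}}

	// SetStatusCondition finds the existing condition by Type and updates its
	// Status, Reason, and Message in place, so the stale message is cleared too.
	meta.SetStatusCondition(&conditions, metav1.Condition{
		Type:   "EtcdRecoveryActive",
		Status: metav1.ConditionFalse,
		Reason: "AsExpected", // stands in for hyperv1.AsExpectedReason
	})

	got := meta.FindStatusCondition(conditions, "EtcdRecoveryActive")
	fmt.Println(got.Reason) // AsExpected
}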

hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go

Lines changed: 165 additions & 0 deletions
@@ -28,6 +28,7 @@ import (
 	hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
 	hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
 	"github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
+	etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
 	kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
 	"github.com/openshift/hypershift/support/api"
 	"github.com/openshift/hypershift/support/azureutil"
@@ -47,9 +48,11 @@ import (
 	configv1 "github.com/openshift/api/config/v1"
 
 	appsv1 "k8s.io/api/apps/v1"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	errors2 "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -6568,3 +6571,165 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
 	})
 }
+
+func TestReconcileETCDMemberRecovery(t *testing.T) {
+	hcpNS := "clusters-test-hc"
+
+	healthyEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "etcd",
+							State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	healthyStatefulSet := &appsv1.StatefulSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "etcd",
+			Namespace: hcpNS,
+		},
+		Status: appsv1.StatefulSetStatus{
+			ReadyReplicas:     3,
+			AvailableReplicas: 3,
+		},
+	}
+
+	unhealthyStatefulSet := &appsv1.StatefulSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "etcd",
+			Namespace: hcpNS,
+		},
+		Status: appsv1.StatefulSetStatus{
+			ReadyReplicas:     2,
+			AvailableReplicas: 2,
+		},
+	}
+
+	staleCondition := metav1.Condition{
+		Type:               string(hyperv1.EtcdRecoveryActive),
+		Status:             metav1.ConditionFalse,
+		Reason:             hyperv1.EtcdRecoveryJobFailedReason,
+		Message:            "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
+		LastTransitionTime: metav1.Now(),
+	}
+
+	failedJob := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+	failedJob.Status = batchv1.JobStatus{
+		Conditions: []batchv1.JobCondition{
+			{
+				Type:   batchv1.JobFailed,
+				Status: corev1.ConditionTrue,
+			},
+		},
+	}
+
+	testCases := []struct {
+		name             string
+		objects          []crclient.Object
+		conditions       []metav1.Condition
+		expectedReason   string
+		conditionExists  bool
+		expectJobDeleted bool
+	}{
+		{
+			name:            "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one",
+			conditions:      []metav1.Condition{},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			conditionExists: false,
+		},
+		{
+			name:             "When failed job exists but etcd recovered it should cleanup job and clear condition",
+			conditions:       []metav1.Condition{staleCondition},
+			objects:          append(healthyEtcdPods(), healthyStatefulSet, failedJob),
+			expectedReason:   hyperv1.AsExpectedReason,
+			conditionExists:  true,
+			expectJobDeleted: true,
+		},
+		{
+			name:            "When failed job exists and etcd is still unhealthy it should keep the failure condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), unhealthyStatefulSet, failedJob),
+			expectedReason:  hyperv1.EtcdRecoveryJobFailedReason,
+			conditionExists: true,
+		},
+	}

+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			g := NewGomegaWithT(t)
+
+			hcluster := &hyperv1.HostedCluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-hc",
+					Namespace: "clusters",
+				},
+				Spec: hyperv1.HostedClusterSpec{
+					Etcd: hyperv1.EtcdSpec{
+						ManagementType: hyperv1.Managed,
+					},
+					ControllerAvailabilityPolicy: hyperv1.HighlyAvailable,
+				},
+				Status: hyperv1.HostedClusterStatus{
+					Conditions: tc.conditions,
+				},
+			}
+
+			objects := append([]crclient.Object{hcluster}, tc.objects...)
+			client := fake.NewClientBuilder().
+				WithScheme(api.Scheme).
+				WithObjects(objects...).
+				WithStatusSubresource(hcluster).
+				Build()
+
+			r := &HostedClusterReconciler{
+				Client:             client,
+				now:                metav1.Now,
+				EnableEtcdRecovery: true,
+			}
+
+			_, err := r.reconcileETCDMemberRecovery(
+				ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))),
+				hcluster,
+				upsert.New(false).CreateOrUpdate,
+			)
+			g.Expect(err).ToNot(HaveOccurred())
+
+			updatedHC := &hyperv1.HostedCluster{}
+			g.Expect(client.Get(t.Context(), crclient.ObjectKeyFromObject(hcluster), updatedHC)).To(Succeed())
+
+			condition := meta.FindStatusCondition(updatedHC.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+			if tc.conditionExists {
+				g.Expect(condition).ToNot(BeNil())
+				g.Expect(condition.Reason).To(Equal(tc.expectedReason))
+			} else {
+				g.Expect(condition).To(BeNil())
+			}
+			if tc.expectJobDeleted {
+				job := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+				err := client.Get(t.Context(), crclient.ObjectKeyFromObject(job), job)
+				g.Expect(errors2.IsNotFound(err)).To(BeTrue(), "expected failed recovery job to be deleted")
+			}
+		})
+	}
+}
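Since the test uses t.Context(), it assumes a Go 1.24+ toolchain; the new cases can be run in isolation from the repository root with go test -run TestReconcileETCDMemberRecovery ./hypershift-operator/controllers/hostedcluster/.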
