Commit 3e6a857
fix(etcd-recovery): OCPBUGS-84577: clear stale EtcdRecoveryActive failure condition when etcd is healthy
When the etcd recovery job fails but etcd self-heals, the EtcdRecoveryJobFailed condition was never cleared. This caused the OpenShift Console to display a stale error message even when the cluster was fully healthy. This fix adds two checks:

- When a failed recovery job exists but the etcd StatefulSet is fully available (3/3), clean up the job and clear the condition.
- When no failing etcd pods exist and etcd is healthy, clear any stale EtcdRecoveryJobFailed condition.

Signed-off-by: Vimal Solanki <vsolanki@redhat.com>
1 parent 68106f0 commit 3e6a857

2 files changed

Lines changed: 200 additions & 0 deletions


hypershift-operator/controllers/hostedcluster/etcd_recovery.go

Lines changed: 35 additions & 0 deletions
@@ -56,6 +56,25 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
 	}
 
 	if !jobStatus.successful {
+		// Check if etcd has recovered despite the job failing
+		etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS)
+		if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err == nil {
+			if etcdStatefulSet.Status.ReadyReplicas == 3 && etcdStatefulSet.Status.AvailableReplicas == 3 {
+				log.Info("etcd recovered despite failed recovery job, cleaning up")
+				if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
+					return nil, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
+				}
+				etcdRecoveryActiveCondition.Status = metav1.ConditionFalse
+				etcdRecoveryActiveCondition.Reason = hyperv1.AsExpectedReason
+				etcdRecoveryActiveCondition.LastTransitionTime = r.now()
+				meta.SetStatusCondition(&hcluster.Status.Conditions, etcdRecoveryActiveCondition)
+				if err := r.Client.Status().Update(ctx, hcluster); err != nil {
+					return nil, fmt.Errorf("failed to update etcd recovery job condition: %w", err)
+				}
+				return nil, nil
+			}
+		}
+
 		etcdRecoveryActiveCondition.Status = metav1.ConditionFalse
 		etcdRecoveryActiveCondition.Reason = hyperv1.EtcdRecoveryJobFailedReason
 		etcdRecoveryActiveCondition.Message = "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."
@@ -137,6 +156,22 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
 	if !fullyAvailable {
 		return &requeueAfter, nil
 	}
+
+	// Clear stale EtcdRecoveryJobFailed condition if etcd is healthy
+	oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+	if oldCondition != nil && oldCondition.Reason == hyperv1.EtcdRecoveryJobFailedReason {
+		log.Info("etcd is healthy but EtcdRecoveryActive has stale failure condition, clearing it")
+		meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{
+			Type:               string(hyperv1.EtcdRecoveryActive),
+			Status:             metav1.ConditionFalse,
+			Reason:             hyperv1.AsExpectedReason,
+			ObservedGeneration: hcluster.Generation,
+		})
+		if err := r.Client.Status().Update(ctx, hcluster); err != nil {
+			return nil, fmt.Errorf("failed to clear stale etcd recovery condition: %w", err)
+		}
+	}
+
 	return nil, nil
 }
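For reference, both hunks manipulate the HostedCluster's conditions through the condition helpers in k8s.io/apimachinery/pkg/api/meta. Below is a minimal standalone sketch, not part of this commit, of how meta.SetStatusCondition matches on Type and overwrites an existing condition of the same type in place, which is what lets the fix flip a stale EtcdRecoveryJobFailed reason to AsExpected with a single call; the string literals stand in for the hyperv1 constants used in the diff above.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// A stale failure condition, as a failed recovery job would leave behind.
	conditions := []metav1.Condition{{
		Type:               "EtcdRecoveryActive", // stands in for hyperv1.EtcdRecoveryActive
		Status:             metav1.ConditionFalse,
		Reason:             "EtcdRecoveryJobFailed", // stands in for hyperv1.EtcdRecoveryJobFailedReason
		Message:            "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
		LastTransitionTime: metav1.Now(),
	}}

	// SetStatusCondition finds the existing condition by Type and updates its
	// Status, Reason, and Message in place, so the stale message is cleared too.
	meta.SetStatusCondition(&conditions, metav1.Condition{
		Type:   "EtcdRecoveryActive",
		Status: metav1.ConditionFalse,
		Reason: "AsExpected", // stands in for hyperv1.AsExpectedReason
	})

	got := meta.FindStatusCondition(conditions, "EtcdRecoveryActive")
	fmt.Println(got.Reason) // AsExpected
}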

hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go

Lines changed: 165 additions & 0 deletions
@@ -28,6 +28,7 @@ import (
 	hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
 	hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
 	"github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
+	etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
 	kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
 	"github.com/openshift/hypershift/support/api"
 	"github.com/openshift/hypershift/support/azureutil"
@@ -47,9 +48,11 @@ import (
 	configv1 "github.com/openshift/api/config/v1"
 
 	appsv1 "k8s.io/api/apps/v1"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	errors2 "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -6568,3 +6571,165 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
 	})
 }
+
+func TestReconcileETCDMemberRecovery(t *testing.T) {
+	hcpNS := "clusters-test-hc"
+
+	healthyEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "etcd",
+							State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	healthyStatefulSet := &appsv1.StatefulSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "etcd",
+			Namespace: hcpNS,
+		},
+		Status: appsv1.StatefulSetStatus{
+			ReadyReplicas:     3,
+			AvailableReplicas: 3,
+		},
+	}
+
+	unhealthyStatefulSet := &appsv1.StatefulSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "etcd",
+			Namespace: hcpNS,
+		},
+		Status: appsv1.StatefulSetStatus{
+			ReadyReplicas:     2,
+			AvailableReplicas: 2,
+		},
+	}
+
+	staleCondition := metav1.Condition{
+		Type:               string(hyperv1.EtcdRecoveryActive),
+		Status:             metav1.ConditionFalse,
+		Reason:             hyperv1.EtcdRecoveryJobFailedReason,
+		Message:            "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
+		LastTransitionTime: metav1.Now(),
+	}
+
+	failedJob := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+	failedJob.Status = batchv1.JobStatus{
+		Conditions: []batchv1.JobCondition{
+			{
+				Type:   batchv1.JobFailed,
+				Status: corev1.ConditionTrue,
+			},
+		},
+	}
+
+	testCases := []struct {
+		name             string
+		objects          []crclient.Object
+		conditions       []metav1.Condition
+		expectedReason   string
+		conditionExists  bool
+		expectJobDeleted bool
+	}{
+		{
+			name:            "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one",
+			conditions:      []metav1.Condition{},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			conditionExists: false,
+		},
+		{
+			name:             "When failed job exists but etcd recovered it should cleanup job and clear condition",
+			conditions:       []metav1.Condition{staleCondition},
+			objects:          append(healthyEtcdPods(), healthyStatefulSet, failedJob),
+			expectedReason:   hyperv1.AsExpectedReason,
+			conditionExists:  true,
+			expectJobDeleted: true,
+		},
+		{
+			name:            "When failed job exists and etcd is still unhealthy it should keep the failure condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), unhealthyStatefulSet, failedJob),
+			expectedReason:  hyperv1.EtcdRecoveryJobFailedReason,
+			conditionExists: true,
+		},
+	}

+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			g := NewGomegaWithT(t)
+
+			hcluster := &hyperv1.HostedCluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-hc",
+					Namespace: "clusters",
+				},
+				Spec: hyperv1.HostedClusterSpec{
+					Etcd: hyperv1.EtcdSpec{
+						ManagementType: hyperv1.Managed,
+					},
+					ControllerAvailabilityPolicy: hyperv1.HighlyAvailable,
+				},
+				Status: hyperv1.HostedClusterStatus{
+					Conditions: tc.conditions,
+				},
+			}
+
+			objects := append([]crclient.Object{hcluster}, tc.objects...)
+			client := fake.NewClientBuilder().
+				WithScheme(api.Scheme).
+				WithObjects(objects...).
+				WithStatusSubresource(hcluster).
+				Build()
+
+			r := &HostedClusterReconciler{
+				Client:             client,
+				now:                metav1.Now,
+				EnableEtcdRecovery: true,
+			}
+
+			_, err := r.reconcileETCDMemberRecovery(
+				ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))),
+				hcluster,
+				upsert.New(false).CreateOrUpdate,
+			)
+			g.Expect(err).ToNot(HaveOccurred())
+
+			updatedHC := &hyperv1.HostedCluster{}
+			g.Expect(client.Get(t.Context(), crclient.ObjectKeyFromObject(hcluster), updatedHC)).To(Succeed())
+
+			condition := meta.FindStatusCondition(updatedHC.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+			if tc.conditionExists {
+				g.Expect(condition).ToNot(BeNil())
+				g.Expect(condition.Reason).To(Equal(tc.expectedReason))
+			} else {
+				g.Expect(condition).To(BeNil())
+			}
+			if tc.expectJobDeleted {
+				job := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+				err := client.Get(t.Context(), crclient.ObjectKeyFromObject(job), job)
+				g.Expect(errors2.IsNotFound(err)).To(BeTrue(), "expected failed recovery job to be deleted")
+			}
+		})
+	}
+}
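Since the test uses t.Context(), it assumes a Go 1.24+ toolchain; the new cases can be run in isolation from the repository root with go test -run TestReconcileETCDMemberRecovery ./hypershift-operator/controllers/hostedcluster/.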
