Commit 7788d6b
fix(etcd-recovery): clear stale EtcdRecoveryActive failure condition when etcd is healthy
When the etcd recovery job fails but etcd self-heals, the EtcdRecoveryJobFailed condition was never cleared. This caused the OpenShift Console to display a stale error message even when the cluster was fully healthy.

This fix adds two checks:

- When a failed recovery job exists but the etcd StatefulSet is fully available (3/3), clean up the job and clear the condition. Transient API errors are propagated instead of silently falling through to the failure path.
- When no failing etcd pods exist and etcd is healthy, clear any stale EtcdRecoveryJobFailed condition.

Signed-off-by: Vimal Solanki <vsolanki@redhat.com>
1 parent a62eefd
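Both checks lean on the condition helpers in k8s.io/apimachinery/pkg/api/meta, which upsert a condition keyed by its Type. A minimal, self-contained sketch of that behavior, with literal strings standing in for the hyperv1 constants used in the real code:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Start from the stale failure condition this commit targets.
	conditions := []metav1.Condition{{
		Type:    "EtcdRecoveryActive",
		Status:  metav1.ConditionFalse,
		Reason:  "EtcdRecoveryJobFailed",
		Message: "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
	}}

	// SetStatusCondition matches on Type and overwrites the existing entry,
	// which is how the fix replaces the failure reason with AsExpected.
	meta.SetStatusCondition(&conditions, metav1.Condition{
		Type:    "EtcdRecoveryActive",
		Status:  metav1.ConditionFalse,
		Reason:  "AsExpected",
		Message: "ETCD cluster is healthy.",
	})

	fmt.Println(meta.FindStatusCondition(conditions, "EtcdRecoveryActive").Reason) // AsExpected
}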

2 files changed

Lines changed: 233 additions & 0 deletions


hypershift-operator/controllers/hostedcluster/etcd_recovery.go

Lines changed: 38 additions & 0 deletions
@@ -56,6 +56,27 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
 	}
 
 	if !jobStatus.successful {
+		// Check if etcd has recovered despite the job failing
+		etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS)
+		if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err != nil {
+			if !apierrors.IsNotFound(err) {
+				return nil, fmt.Errorf("failed to get etcd statefulset: %w", err)
+			}
+		} else if etcdStatefulSet.Status.ReadyReplicas == 3 && etcdStatefulSet.Status.AvailableReplicas == 3 {
+			log.Info("etcd recovered despite failed recovery job, cleaning up")
+			if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
+				return nil, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
+			}
+			etcdRecoveryActiveCondition.Status = metav1.ConditionFalse
+			etcdRecoveryActiveCondition.Reason = hyperv1.AsExpectedReason
+			etcdRecoveryActiveCondition.Message = "ETCD recovered despite failed recovery job."
+			etcdRecoveryActiveCondition.LastTransitionTime = r.now()
+			meta.SetStatusCondition(&hcluster.Status.Conditions, etcdRecoveryActiveCondition)
+			if err := r.Client.Status().Update(ctx, hcluster); err != nil {
+				return nil, fmt.Errorf("failed to update etcd recovery job condition: %w", err)
+			}
+			return nil, nil
+		}
 		etcdRecoveryActiveCondition.Status = metav1.ConditionFalse
 		etcdRecoveryActiveCondition.Reason = hyperv1.EtcdRecoveryJobFailedReason
 		etcdRecoveryActiveCondition.Message = "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."
@@ -137,6 +158,23 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
 	if !fullyAvailable {
 		return &requeueAfter, nil
 	}
+
+	// Clear stale EtcdRecoveryJobFailed condition if etcd is healthy
+	oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+	if oldCondition != nil && oldCondition.Reason == hyperv1.EtcdRecoveryJobFailedReason {
+		log.Info("etcd is healthy but EtcdRecoveryActive has stale failure condition, clearing it")
+		meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{
+			Type:               string(hyperv1.EtcdRecoveryActive),
+			Status:             metav1.ConditionFalse,
+			Reason:             hyperv1.AsExpectedReason,
+			Message:            "ETCD cluster is healthy.",
+			ObservedGeneration: hcluster.Generation,
+		})
+		if err := r.Client.Status().Update(ctx, hcluster); err != nil {
+			return nil, fmt.Errorf("failed to clear stale etcd recovery condition: %w", err)
+		}
+	}
+
 	return nil, nil
 }
 
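Before moving to the tests, the shape of the first check is worth isolating. etcdSelfHealed below is a hypothetical condensation written for this writeup, not code from the commit; the hardcoded replica count of 3 mirrors the HighlyAvailable topology the controller assumes:

package etcdrecovery

import (
	"context"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	crclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// etcdSelfHealed reports whether the etcd StatefulSet is fully available.
// A missing StatefulSet means "not recovered" but is not an error; any other
// Get failure is propagated, so a transient API error never falls through to
// the permanent-failure condition.
func etcdSelfHealed(ctx context.Context, c crclient.Client, hcpNS string) (bool, error) {
	sts := &appsv1.StatefulSet{ObjectMeta: metav1.ObjectMeta{Name: "etcd", Namespace: hcpNS}}
	if err := c.Get(ctx, crclient.ObjectKeyFromObject(sts), sts); err != nil {
		if apierrors.IsNotFound(err) {
			return false, nil
		}
		return false, fmt.Errorf("failed to get etcd statefulset: %w", err)
	}
	return sts.Status.ReadyReplicas == 3 && sts.Status.AvailableReplicas == 3, nil
}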

hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go

Lines changed: 195 additions & 0 deletions
@@ -28,6 +28,7 @@ import (
 	hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
 	hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
 	"github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
+	etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
 	kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
 	"github.com/openshift/hypershift/support/api"
 	"github.com/openshift/hypershift/support/azureutil"
@@ -48,9 +49,11 @@ import (
 	configv1 "github.com/openshift/api/config/v1"
 
 	appsv1 "k8s.io/api/apps/v1"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	errors2 "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -6569,3 +6572,195 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
 		})
 	}
 }
+
+func TestReconcileETCDMemberRecovery(t *testing.T) {
+	hcpNS := "clusters-test-hc"
+
+	healthyEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "etcd",
+							State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	recoveredEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:         "etcd",
+							RestartCount: 3,
+							State:        corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	healthyStatefulSet := &appsv1.StatefulSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "etcd",
+			Namespace: hcpNS,
+		},
+		Status: appsv1.StatefulSetStatus{
+			ReadyReplicas:     3,
+			AvailableReplicas: 3,
+		},
+	}
+
+	unhealthyStatefulSet := &appsv1.StatefulSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "etcd",
+			Namespace: hcpNS,
+		},
+		Status: appsv1.StatefulSetStatus{
+			ReadyReplicas:     2,
+			AvailableReplicas: 2,
+		},
+	}
+
+	staleCondition := metav1.Condition{
+		Type:               string(hyperv1.EtcdRecoveryActive),
+		Status:             metav1.ConditionFalse,
+		Reason:             hyperv1.EtcdRecoveryJobFailedReason,
+		Message:            "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
+		LastTransitionTime: metav1.Now(),
+	}
+
+	failedJob := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+	failedJob.Status = batchv1.JobStatus{
+		Conditions: []batchv1.JobCondition{
+			{
+				Type:   batchv1.JobFailed,
+				Status: corev1.ConditionTrue,
+			},
+		},
+	}
+
+	testCases := []struct {
+		name             string
+		objects          []crclient.Object
+		conditions       []metav1.Condition
+		expectedReason   string
+		conditionExists  bool
+		expectJobDeleted bool
+	}{
+		{
+			name:            "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one",
+			conditions:      []metav1.Condition{},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			conditionExists: false,
+		},
+		{
+			name:             "When failed job exists but etcd recovered it should cleanup job and clear condition",
+			conditions:       []metav1.Condition{staleCondition},
+			objects:          append(healthyEtcdPods(), healthyStatefulSet, failedJob),
+			expectedReason:   hyperv1.AsExpectedReason,
+			conditionExists:  true,
+			expectJobDeleted: true,
+		},
+		{
+			name:            "When etcd pods have restarted but recovered it should clear the stale condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(recoveredEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When failed job exists and etcd is still unhealthy it should keep the failure condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), unhealthyStatefulSet, failedJob),
+			expectedReason:  hyperv1.EtcdRecoveryJobFailedReason,
+			conditionExists: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			g := NewGomegaWithT(t)
+
+			hcluster := &hyperv1.HostedCluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-hc",
+					Namespace: "clusters",
+				},
+				Spec: hyperv1.HostedClusterSpec{
+					Etcd: hyperv1.EtcdSpec{
+						ManagementType: hyperv1.Managed,
+					},
+					ControllerAvailabilityPolicy: hyperv1.HighlyAvailable,
+				},
+				Status: hyperv1.HostedClusterStatus{
+					Conditions: tc.conditions,
+				},
+			}
+
+			objects := append([]crclient.Object{hcluster}, tc.objects...)
+			client := fake.NewClientBuilder().
+				WithScheme(api.Scheme).
+				WithObjects(objects...).
+				WithStatusSubresource(hcluster).
+				Build()
+
+			r := &HostedClusterReconciler{
+				Client:             client,
+				now:                metav1.Now,
+				EnableEtcdRecovery: true,
+			}
+
+			_, err := r.reconcileETCDMemberRecovery(
+				ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))),
+				hcluster,
+				upsert.New(false).CreateOrUpdate,
+			)
+			g.Expect(err).ToNot(HaveOccurred())
+
+			updatedHC := &hyperv1.HostedCluster{}
+			g.Expect(client.Get(t.Context(), crclient.ObjectKeyFromObject(hcluster), updatedHC)).To(Succeed())
+
+			condition := meta.FindStatusCondition(updatedHC.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+			if tc.conditionExists {
+				g.Expect(condition).ToNot(BeNil())
+				g.Expect(condition.Reason).To(Equal(tc.expectedReason))
+			} else {
+				g.Expect(condition).To(BeNil())
+			}
+			if tc.expectJobDeleted {
+				job := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+				err := client.Get(t.Context(), crclient.ObjectKeyFromObject(job), job)
+				g.Expect(errors2.IsNotFound(err)).To(BeTrue(), "expected failed recovery job to be deleted")
+			}
+		})
+	}
+}
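A note on running it: the new cases are plain table-driven Go tests against controller-runtime's fake client, so something like go test ./hypershift-operator/controllers/hostedcluster/ -run TestReconcileETCDMemberRecovery (package path taken from the file headers above) should exercise them in isolation.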
