Commit 6e9192b

fix(etcd-recovery): clear stale EtcdRecoveryActive failure condition when etcd is healthy
When the etcd recovery job fails but etcd self-heals, the EtcdRecoveryJobFailed condition was never cleared. This caused the OpenShift Console to display a stale error message even when the cluster was fully healthy.

This fix adds two checks:

- When a failed recovery job exists but the etcd StatefulSet is fully available (3/3), clean up the job and clear the condition. Transient API errors are propagated instead of silently falling through to the failure path.
- When no failing etcd pods exist and etcd is healthy, clear any stale EtcdRecoveryJobFailed condition.

Signed-off-by: Vimal Solanki <vsolanki@redhat.com>
1 parent 96a2bcb commit 6e9192b

2 files changed: 222 additions & 0 deletions

hypershift-operator/controllers/hostedcluster/etcd_recovery.go

Lines changed: 27 additions & 0 deletions
@@ -62,12 +62,30 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Context
 // It returns (done, err) where done=true means the caller should return immediately
 // without falling through to detectAndTriggerEtcdRecovery.
 func (r *HostedClusterReconciler) handleExistingEtcdRecoveryJob(ctx context.Context, log logr.Logger, hcluster *hyperv1.HostedCluster, recoveryJob *batchv1.Job, jobStatus *etcdJobStatus) (bool, error) {
+	hcpNS := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name)
+
 	if !jobStatus.finished {
 		log.Info("waiting for etcd recovery job to complete")
 		return true, nil
 	}
 
 	if !jobStatus.successful {
+		etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS)
+		if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err != nil {
+			if !apierrors.IsNotFound(err) {
+				return false, fmt.Errorf("failed to get etcd statefulset: %w", err)
+			}
+		} else if etcdStatefulSet.Status.ReadyReplicas == 3 && etcdStatefulSet.Status.AvailableReplicas == 3 {
+			log.Info("etcd recovered despite failed recovery job, cleaning up")
+			if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
+				return false, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
+			}
+			if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD recovered despite failed recovery job."); err != nil {
+				return false, err
+			}
+			return true, nil
+		}
+
 		if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.EtcdRecoveryJobFailedReason, "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."); err != nil {
 			return false, err
 		}
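For readers skimming the hunk above: the new branch only fires when a failed recovery Job is present and the etcd StatefulSet reports all three members ready and available. Below is a minimal standalone sketch of that readiness check, assuming a controller-runtime client and a StatefulSet named "etcd" in the hosted control plane namespace; the helper name is hypothetical, and the actual code builds the object via etcdrecoverymanifests.EtcdStatefulSet.

package example

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	crclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// etcdFullyRecovered is a hypothetical helper (not part of this diff) that mirrors
// the 3/3 readiness check added above. A missing StatefulSet means "not recovered";
// any other Get error is returned so a transient API failure is not mistaken for an
// unrecovered cluster.
func etcdFullyRecovered(ctx context.Context, c crclient.Client, hcpNS string) (bool, error) {
	sts := &appsv1.StatefulSet{ObjectMeta: metav1.ObjectMeta{Name: "etcd", Namespace: hcpNS}}
	if err := c.Get(ctx, crclient.ObjectKeyFromObject(sts), sts); err != nil {
		if apierrors.IsNotFound(err) {
			return false, nil
		}
		return false, err
	}
	return sts.Status.ReadyReplicas == 3 && sts.Status.AvailableReplicas == 3, nil
}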
@@ -131,6 +149,15 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Context
 	if !fullyAvailable {
 		return &requeueAfter, nil
 	}
+
+	oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+	if oldCondition != nil && oldCondition.Reason == hyperv1.EtcdRecoveryJobFailedReason {
+		log.Info("etcd is healthy but EtcdRecoveryActive has stale failure condition, clearing it")
+		if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD cluster is healthy."); err != nil {
+			return nil, err
+		}
+	}
+
 	return nil, nil
 }
 
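The second hunk goes through the reconciler's setEtcdRecoveryCondition helper, whose body is not part of this commit. As a rough sketch of what clearing the stale condition amounts to with the k8s.io/apimachinery condition helpers (the function name and the hyperv1 import path are assumptions, not taken from this diff):

package example

import (
	hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" // import path assumed

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// clearStaleEtcdRecoveryCondition is a hypothetical helper: if EtcdRecoveryActive
// still carries the EtcdRecoveryJobFailed reason, it is rewritten with the AsExpected
// reason. It returns true when the condition list actually changed; the caller would
// then persist the HostedCluster status (e.g. via the controller's status writer).
func clearStaleEtcdRecoveryCondition(hcluster *hyperv1.HostedCluster) bool {
	old := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
	if old == nil || old.Reason != hyperv1.EtcdRecoveryJobFailedReason {
		return false // nothing stale to clear
	}
	return meta.SetStatusCondition(&hcluster.Status.Conditions, metav1.Condition{
		Type:               string(hyperv1.EtcdRecoveryActive),
		Status:             metav1.ConditionFalse,
		Reason:             hyperv1.AsExpectedReason,
		Message:            "ETCD cluster is healthy.",
		ObservedGeneration: hcluster.Generation,
	})
}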

hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go

Lines changed: 195 additions & 0 deletions
@@ -28,6 +28,7 @@ import (
 	hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
 	hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
 	"github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
+	etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
 	kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
 	"github.com/openshift/hypershift/support/api"
 	"github.com/openshift/hypershift/support/azureutil"
@@ -48,9 +49,11 @@ import (
 	configv1 "github.com/openshift/api/config/v1"
 
 	appsv1 "k8s.io/api/apps/v1"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	errors2 "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -6734,3 +6737,195 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
 		})
 	}
 }
+
+func TestReconcileETCDMemberRecovery(t *testing.T) {
+	hcpNS := "clusters-test-hc"
+
+	healthyEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "etcd",
+							State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	recoveredEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:         "etcd",
+							RestartCount: 3,
+							State:        corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	healthyStatefulSet := &appsv1.StatefulSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "etcd",
+			Namespace: hcpNS,
+		},
+		Status: appsv1.StatefulSetStatus{
+			ReadyReplicas:     3,
+			AvailableReplicas: 3,
+		},
+	}
+
+	unhealthyStatefulSet := &appsv1.StatefulSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "etcd",
+			Namespace: hcpNS,
+		},
+		Status: appsv1.StatefulSetStatus{
+			ReadyReplicas:     2,
+			AvailableReplicas: 2,
+		},
+	}
+
+	staleCondition := metav1.Condition{
+		Type:               string(hyperv1.EtcdRecoveryActive),
+		Status:             metav1.ConditionFalse,
+		Reason:             hyperv1.EtcdRecoveryJobFailedReason,
+		Message:            "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
+		LastTransitionTime: metav1.Now(),
+	}
+
+	failedJob := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+	failedJob.Status = batchv1.JobStatus{
+		Conditions: []batchv1.JobCondition{
+			{
+				Type:   batchv1.JobFailed,
+				Status: corev1.ConditionTrue,
+			},
+		},
+	}
+
+	testCases := []struct {
+		name             string
+		objects          []crclient.Object
+		conditions       []metav1.Condition
+		expectedReason   string
+		conditionExists  bool
+		expectJobDeleted bool
+	}{
+		{
+			name:            "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one",
+			conditions:      []metav1.Condition{},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			conditionExists: false,
+		},
+		{
+			name:             "When failed job exists but etcd recovered it should cleanup job and clear condition",
+			conditions:       []metav1.Condition{staleCondition},
+			objects:          append(healthyEtcdPods(), healthyStatefulSet, failedJob),
+			expectedReason:   hyperv1.AsExpectedReason,
+			conditionExists:  true,
+			expectJobDeleted: true,
+		},
+		{
+			name:            "When etcd pods have restarted but recovered it should clear the stale condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(recoveredEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When failed job exists and etcd is still unhealthy it should keep the failure condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), unhealthyStatefulSet, failedJob),
+			expectedReason:  hyperv1.EtcdRecoveryJobFailedReason,
+			conditionExists: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			g := NewGomegaWithT(t)
+
+			hcluster := &hyperv1.HostedCluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-hc",
+					Namespace: "clusters",
+				},
+				Spec: hyperv1.HostedClusterSpec{
+					Etcd: hyperv1.EtcdSpec{
+						ManagementType: hyperv1.Managed,
+					},
+					ControllerAvailabilityPolicy: hyperv1.HighlyAvailable,
+				},
+				Status: hyperv1.HostedClusterStatus{
+					Conditions: tc.conditions,
+				},
+			}
+
+			objects := append([]crclient.Object{hcluster}, tc.objects...)
+			client := fake.NewClientBuilder().
+				WithScheme(api.Scheme).
+				WithObjects(objects...).
+				WithStatusSubresource(hcluster).
+				Build()
+
+			r := &HostedClusterReconciler{
+				Client:             client,
+				now:                metav1.Now,
+				EnableEtcdRecovery: true,
+			}
+
+			_, err := r.reconcileETCDMemberRecovery(
+				ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))),
+				hcluster,
+				upsert.New(false).CreateOrUpdate,
+			)
+			g.Expect(err).ToNot(HaveOccurred())
+
+			updatedHC := &hyperv1.HostedCluster{}
+			g.Expect(client.Get(t.Context(), crclient.ObjectKeyFromObject(hcluster), updatedHC)).To(Succeed())
+
+			condition := meta.FindStatusCondition(updatedHC.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+			if tc.conditionExists {
+				g.Expect(condition).ToNot(BeNil())
+				g.Expect(condition.Reason).To(Equal(tc.expectedReason))
+			} else {
+				g.Expect(condition).To(BeNil())
+			}
+			if tc.expectJobDeleted {
+				job := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+				err := client.Get(t.Context(), crclient.ObjectKeyFromObject(job), job)
+				g.Expect(errors2.IsNotFound(err)).To(BeTrue(), "expected failed recovery job to be deleted")
+			}
+		})
+	}
+}
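For completeness, a typical way to run just this new table-driven test locally with the standard Go tooling (package path taken from the file header above) would be something like:

go test ./hypershift-operator/controllers/hostedcluster/... -run TestReconcileETCDMemberRecovery -v

Note that the test uses t.Context(), which requires Go 1.24 or newer.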
