
Commit d34dc0d

fix(etcd-recovery): clear stale EtcdRecoveryActive failure condition when etcd is healthy
When the etcd recovery job fails but etcd self-heals, the EtcdRecoveryActive condition kept its stale EtcdRecoveryJobFailed reason and was never cleared. This caused the OpenShift Console to display a stale error message even when the cluster was fully healthy.

This fix adds two checks:

- When a failed recovery job exists but the etcd StatefulSet is fully available (3/3), clean up the job and clear the condition. Transient API errors are propagated instead of silently falling through to the failure path.
- When no failing etcd pods exist and etcd is healthy, clear any stale EtcdRecoveryJobFailed condition.

Signed-off-by: Vimal Solanki <vsolanki@redhat.com>
1 parent 96a2bcb commit d34dc0d
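
The root cause is visible in the `setEtcdRecoveryCondition` hunk below: the old guard compared only the condition's `Status`, so a transition from reason `EtcdRecoveryJobFailed` to `AsExpected` (both `ConditionFalse`) never triggered a status update. The following is a minimal, self-contained sketch of the pre-fix and post-fix guards, illustrative only and not part of this commit; the string reasons are stand-ins for the `hyperv1` constants used in the diff.

```go
// Illustrative sketch (not part of this commit): why comparing only Status
// left the failure Reason in place. Both conditions below are ConditionFalse,
// so the pre-fix guard would skip the update entirely.
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	stale := metav1.Condition{
		Type:   "EtcdRecoveryActive",
		Status: metav1.ConditionFalse,
		Reason: "EtcdRecoveryJobFailed",
	}
	desired := metav1.Condition{
		Type:   "EtcdRecoveryActive",
		Status: metav1.ConditionFalse,
		Reason: "AsExpected",
	}

	// Pre-fix guard: update only when Status differs.
	oldGuard := stale.Status != desired.Status
	// Post-fix guard: also update when Reason differs.
	newGuard := stale.Status != desired.Status || stale.Reason != desired.Reason

	fmt.Println(oldGuard, newGuard) // prints: false true
}
```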

2 files changed

Lines changed: 226 additions & 1 deletion


hypershift-operator/controllers/hostedcluster/etcd_recovery.go

Lines changed: 34 additions & 1 deletion
@@ -29,6 +29,12 @@ import (
 	"github.com/go-logr/logr"
 )
 
+func isEtcdStatefulSetHealthy(sts *appsv1.StatefulSet) bool {
+	return sts.Spec.Replicas != nil &&
+		sts.Status.ReadyReplicas == *sts.Spec.Replicas &&
+		sts.Status.AvailableReplicas == *sts.Spec.Replicas
+}
+
 type etcdJobStatus struct {
 	exists   bool
 	finished bool
@@ -62,12 +68,30 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Contex
 // It returns (done, err) where done=true means the caller should return immediately
 // without falling through to detectAndTriggerEtcdRecovery.
 func (r *HostedClusterReconciler) handleExistingEtcdRecoveryJob(ctx context.Context, log logr.Logger, hcluster *hyperv1.HostedCluster, recoveryJob *batchv1.Job, jobStatus *etcdJobStatus) (bool, error) {
+	hcpNS := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name)
+
 	if !jobStatus.finished {
 		log.Info("waiting for etcd recovery job to complete")
 		return true, nil
 	}
 
 	if !jobStatus.successful {
+		etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS)
+		if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err != nil {
+			if !apierrors.IsNotFound(err) {
+				return false, fmt.Errorf("failed to get etcd statefulset: %w", err)
+			}
+		} else if isEtcdStatefulSetHealthy(etcdStatefulSet) {
+			log.Info("etcd recovered despite failed recovery job, cleaning up")
+			if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
+				return false, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
+			}
+			if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD recovered despite failed recovery job."); err != nil {
+				return false, err
+			}
+			return true, nil
+		}
+
 		if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.EtcdRecoveryJobFailedReason, "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."); err != nil {
 			return false, err
 		}
@@ -99,7 +123,7 @@ func (r *HostedClusterReconciler) setEtcdRecoveryCondition(ctx context.Context,
 	}
 
 	oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
-	if oldCondition == nil || oldCondition.Status != condition.Status {
+	if oldCondition == nil || oldCondition.Status != condition.Status || oldCondition.Reason != condition.Reason {
 		meta.SetStatusCondition(&hcluster.Status.Conditions, condition)
 		if err := r.Client.Status().Update(ctx, hcluster); err != nil {
 			return fmt.Errorf("failed to update etcd recovery job condition: %w", err)
@@ -131,6 +155,15 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Conte
 	if !fullyAvailable {
 		return &requeueAfter, nil
 	}
+
+	oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+	if oldCondition != nil && oldCondition.Reason == hyperv1.EtcdRecoveryJobFailedReason {
+		log.Info("etcd is healthy but EtcdRecoveryActive has stale failure condition, clearing it")
+		if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD cluster is healthy."); err != nil {
+			return nil, err
+		}
+	}
+
 	return nil, nil
 }
 
hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go

Lines changed: 192 additions & 0 deletions
@@ -28,6 +28,7 @@ import (
 	hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
 	hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
 	"github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
+	etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
 	kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
 	"github.com/openshift/hypershift/support/api"
 	"github.com/openshift/hypershift/support/azureutil"
@@ -48,9 +49,11 @@ import (
 	configv1 "github.com/openshift/api/config/v1"
 
 	appsv1 "k8s.io/api/apps/v1"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
 	errors2 "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -6734,3 +6737,192 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
 		})
 	}
 }
+
+func TestReconcileETCDMemberRecovery(t *testing.T) {
+	hcpNS := "clusters-test-hc"
+
+	healthyEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:  "etcd",
+							State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	recoveredEtcdPods := func() []crclient.Object {
+		var pods []crclient.Object
+		for i := 0; i < 3; i++ {
+			pods = append(pods, &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      fmt.Sprintf("etcd-%d", i),
+					Namespace: hcpNS,
+					Labels:    map[string]string{"app": "etcd"},
+				},
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name:         "etcd",
+							RestartCount: 3,
+							State:        corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+						},
+					},
+				},
+			})
+		}
+		return pods
+	}
+
+	initEtcdStatefulSet := func(specReplicas, readyReplicas, availableReplicas int32) *appsv1.StatefulSet {
+		return &appsv1.StatefulSet{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "etcd",
+				Namespace: hcpNS,
+			},
+			Spec: appsv1.StatefulSetSpec{
+				Replicas: ptr.To[int32](specReplicas),
+			},
+			Status: appsv1.StatefulSetStatus{
+				ReadyReplicas:     readyReplicas,
+				AvailableReplicas: availableReplicas,
+			},
+		}
+	}
+
+	healthyStatefulSet := initEtcdStatefulSet(3, 3, 3)
+	unhealthyStatefulSet := initEtcdStatefulSet(3, 2, 2)
+
+	staleCondition := metav1.Condition{
+		Type:               string(hyperv1.EtcdRecoveryActive),
+		Status:             metav1.ConditionFalse,
+		Reason:             hyperv1.EtcdRecoveryJobFailedReason,
+		Message:            "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
+		LastTransitionTime: metav1.Now(),
+	}
+
+	failedJob := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+	failedJob.Status = batchv1.JobStatus{
+		Conditions: []batchv1.JobCondition{
+			{
+				Type:   batchv1.JobFailed,
+				Status: corev1.ConditionTrue,
+			},
+		},
+	}
+
+	testCases := []struct {
+		name             string
+		objects          []crclient.Object
+		conditions       []metav1.Condition
+		expectedReason   string
+		conditionExists  bool
+		expectJobDeleted bool
+	}{
+		{
+			name:            "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one",
+			conditions:      []metav1.Condition{},
+			objects:         append(healthyEtcdPods(), healthyStatefulSet),
+			conditionExists: false,
+		},
+		{
+			name:             "When failed job exists but etcd recovered it should cleanup job and clear condition",
+			conditions:       []metav1.Condition{staleCondition},
+			objects:          append(healthyEtcdPods(), healthyStatefulSet, failedJob),
+			expectedReason:   hyperv1.AsExpectedReason,
+			conditionExists:  true,
+			expectJobDeleted: true,
+		},
+		{
+			name:            "When etcd pods have restarted but recovered it should clear the stale condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(recoveredEtcdPods(), healthyStatefulSet),
+			expectedReason:  hyperv1.AsExpectedReason,
+			conditionExists: true,
+		},
+		{
+			name:            "When failed job exists and etcd is still unhealthy it should keep the failure condition",
+			conditions:      []metav1.Condition{staleCondition},
+			objects:         append(healthyEtcdPods(), unhealthyStatefulSet, failedJob),
+			expectedReason:  hyperv1.EtcdRecoveryJobFailedReason,
+			conditionExists: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			g := NewGomegaWithT(t)
+
+			hcluster := &hyperv1.HostedCluster{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-hc",
+					Namespace: "clusters",
+				},
+				Spec: hyperv1.HostedClusterSpec{
+					Etcd: hyperv1.EtcdSpec{
+						ManagementType: hyperv1.Managed,
+					},
+					ControllerAvailabilityPolicy: hyperv1.HighlyAvailable,
+				},
+				Status: hyperv1.HostedClusterStatus{
+					Conditions: tc.conditions,
+				},
+			}
+
+			objects := append([]crclient.Object{hcluster}, tc.objects...)
+			client := fake.NewClientBuilder().
+				WithScheme(api.Scheme).
+				WithObjects(objects...).
+				WithStatusSubresource(hcluster).
+				Build()
+
+			r := &HostedClusterReconciler{
+				Client:             client,
+				now:                metav1.Now,
+				EnableEtcdRecovery: true,
+			}
+
+			_, err := r.reconcileETCDMemberRecovery(
+				ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))),
+				hcluster,
+				upsert.New(false).CreateOrUpdate,
+			)
+			g.Expect(err).ToNot(HaveOccurred())
+
+			updatedHC := &hyperv1.HostedCluster{}
+			g.Expect(client.Get(t.Context(), crclient.ObjectKeyFromObject(hcluster), updatedHC)).To(Succeed())
+
+			condition := meta.FindStatusCondition(updatedHC.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
+			if tc.conditionExists {
+				g.Expect(condition).ToNot(BeNil())
+				g.Expect(condition.Reason).To(Equal(tc.expectedReason))
+			} else {
+				g.Expect(condition).To(BeNil())
+			}
+			if tc.expectJobDeleted {
+				job := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
+				err := client.Get(t.Context(), crclient.ObjectKeyFromObject(job), job)
+				g.Expect(errors2.IsNotFound(err)).To(BeTrue(), "expected failed recovery job to be deleted")
+			}
+		})
+	}
+}
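
To exercise just the new table test locally (assuming a standard checkout of the repository with a working Go toolchain), an invocation along these lines should work:

```
go test ./hypershift-operator/controllers/hostedcluster/ -run TestReconcileETCDMemberRecovery -v
```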
