35 changes: 34 additions & 1 deletion hypershift-operator/controllers/hostedcluster/etcd_recovery.go
@@ -29,6 +29,12 @@ import (
"github.com/go-logr/logr"
)

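// isEtcdStatefulSetHealthy reports whether the etcd StatefulSet has all desired replicas ready and available.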
func isEtcdStatefulSetHealthy(sts *appsv1.StatefulSet) bool {
return sts.Spec.Replicas != nil &&
sts.Status.ReadyReplicas == *sts.Spec.Replicas &&
sts.Status.AvailableReplicas == *sts.Spec.Replicas
}

type etcdJobStatus struct {
exists bool
finished bool
@@ -62,12 +68,30 @@ func (r *HostedClusterReconciler) reconcileETCDMemberRecovery(ctx context.Context
// It returns (done, err) where done=true means the caller should return immediately
// without falling through to detectAndTriggerEtcdRecovery.
func (r *HostedClusterReconciler) handleExistingEtcdRecoveryJob(ctx context.Context, log logr.Logger, hcluster *hyperv1.HostedCluster, recoveryJob *batchv1.Job, jobStatus *etcdJobStatus) (bool, error) {
hcpNS := manifests.HostedControlPlaneNamespace(hcluster.Namespace, hcluster.Name)

if !jobStatus.finished {
log.Info("waiting for etcd recovery job to complete")
return true, nil
}

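// The recovery job failed. Before surfacing a failure condition, check whether etcd
// recovered on its own; if it did, clean up the recovery objects and clear the condition instead.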
if !jobStatus.successful {
etcdStatefulSet := etcdrecoverymanifests.EtcdStatefulSet(hcpNS)
if err := r.Get(ctx, crclient.ObjectKeyFromObject(etcdStatefulSet), etcdStatefulSet); err != nil {
if !apierrors.IsNotFound(err) {
return false, fmt.Errorf("failed to get etcd statefulset: %w", err)
}
} else if isEtcdStatefulSetHealthy(etcdStatefulSet) {
log.Info("etcd recovered despite failed recovery job, cleaning up")
if err := r.cleanupEtcdRecoveryObjects(ctx, hcluster); err != nil {
return false, fmt.Errorf("failed to cleanup etcd recovery job: %w", err)
}
if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD recovered despite failed recovery job."); err != nil {
return false, err
}
return true, nil
}

if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.EtcdRecoveryJobFailedReason, "Error in Etcd Recovery job: the Etcd cluster requires manual intervention."); err != nil {
return false, err
}
@@ -99,7 +123,7 @@ func (r *HostedClusterReconciler) setEtcdRecoveryCondition(ctx context.Context,
}

oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
if oldCondition == nil || oldCondition.Status != condition.Status {
if oldCondition == nil || oldCondition.Status != condition.Status || oldCondition.Reason != condition.Reason {
meta.SetStatusCondition(&hcluster.Status.Conditions, condition)
if err := r.Client.Status().Update(ctx, hcluster); err != nil {
return fmt.Errorf("failed to update etcd recovery job condition: %w", err)
@@ -131,6 +155,15 @@ func (r *HostedClusterReconciler) detectAndTriggerEtcdRecovery(ctx context.Context
if !fullyAvailable {
return &requeueAfter, nil
}

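// etcd is fully available again; if a previous failed recovery attempt left a stale
// EtcdRecoveryJobFailed reason on the EtcdRecoveryActive condition, clear it.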
oldCondition := meta.FindStatusCondition(hcluster.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
if oldCondition != nil && oldCondition.Reason == hyperv1.EtcdRecoveryJobFailedReason {
log.Info("etcd is healthy but EtcdRecoveryActive has stale failure condition, clearing it")
if err := r.setEtcdRecoveryCondition(ctx, hcluster, metav1.ConditionFalse, hyperv1.AsExpectedReason, "ETCD cluster is healthy."); err != nil {
return nil, err
}
}

return nil, nil
}

hypershift-operator/controllers/hostedcluster/hostedcluster_controller_test.go
@@ -28,6 +28,7 @@ import (
hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
"github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
"github.com/openshift/hypershift/support/api"
"github.com/openshift/hypershift/support/azureutil"
@@ -48,9 +49,11 @@ import (
configv1 "github.com/openshift/api/config/v1"

appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
errors2 "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
@@ -6734,3 +6737,192 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
})
}
}

func TestReconcileETCDMemberRecovery(t *testing.T) {
hcpNS := "clusters-test-hc"

healthyEtcdPods := func() []crclient.Object {
var pods []crclient.Object
for i := 0; i < 3; i++ {
pods = append(pods, &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Contributor: Could you add a case where RestartCount is > 0 and the container is in a Running state, to cover the case when the pod recovers?

Contributor (author): Thanks, I have made all the changes suggested on the etcd_recovery.go & hostedcluster_controller_test.go files.

Name: fmt.Sprintf("etcd-%d", i),
Namespace: hcpNS,
Labels: map[string]string{"app": "etcd"},
},
Status: corev1.PodStatus{
ContainerStatuses: []corev1.ContainerStatus{
{
Name: "etcd",
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
},
},
},
})
}
return pods
}

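// recoveredEtcdPods returns etcd pods that are Running but report a non-zero RestartCount,
// covering the case where a pod crashed and then recovered.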
recoveredEtcdPods := func() []crclient.Object {
var pods []crclient.Object
for i := 0; i < 3; i++ {
pods = append(pods, &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("etcd-%d", i),
Namespace: hcpNS,
Labels: map[string]string{"app": "etcd"},
},
Status: corev1.PodStatus{
ContainerStatuses: []corev1.ContainerStatus{
{
Name: "etcd",
RestartCount: 3,
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
},
},
},
})
}
return pods
}

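// initEtcdStatefulSet builds an etcd StatefulSet in the hosted control plane namespace
// with the given spec and status replica counts.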
initEtcdStatefulSet := func(specReplicas, readyReplicas, availableReplicas int32) *appsv1.StatefulSet {
return &appsv1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{
Name: "etcd",
Namespace: hcpNS,
},
Spec: appsv1.StatefulSetSpec{
Replicas: ptr.To[int32](specReplicas),
},
Status: appsv1.StatefulSetStatus{
ReadyReplicas: readyReplicas,
AvailableReplicas: availableReplicas,
},
}
}

healthyStatefulSet := initEtcdStatefulSet(3, 3, 3)
unhealthyStatefulSet := initEtcdStatefulSet(3, 2, 2)

staleCondition := metav1.Condition{
Type: string(hyperv1.EtcdRecoveryActive),
Status: metav1.ConditionFalse,
Reason: hyperv1.EtcdRecoveryJobFailedReason,
Message: "Error in Etcd Recovery job: the Etcd cluster requires manual intervention.",
LastTransitionTime: metav1.Now(),
}

failedJob := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
failedJob.Status = batchv1.JobStatus{
Conditions: []batchv1.JobCondition{
{
Type: batchv1.JobFailed,
Status: corev1.ConditionTrue,
},
},
}

testCases := []struct {
name string
objects []crclient.Object
conditions []metav1.Condition
expectedReason string
conditionExists bool
expectJobDeleted bool
}{
{
name: "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition",
conditions: []metav1.Condition{staleCondition},
objects: append(healthyEtcdPods(), healthyStatefulSet),
expectedReason: hyperv1.AsExpectedReason,
conditionExists: true,
},
{
name: "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one",
conditions: []metav1.Condition{},
objects: append(healthyEtcdPods(), healthyStatefulSet),
conditionExists: false,
},
{
name: "When failed job exists but etcd recovered it should cleanup job and clear condition",
conditions: []metav1.Condition{staleCondition},
objects: append(healthyEtcdPods(), healthyStatefulSet, failedJob),
expectedReason: hyperv1.AsExpectedReason,
conditionExists: true,
expectJobDeleted: true,
},
{
name: "When etcd pods have restarted but recovered it should clear the stale condition",
conditions: []metav1.Condition{staleCondition},
objects: append(recoveredEtcdPods(), healthyStatefulSet),
expectedReason: hyperv1.AsExpectedReason,
conditionExists: true,
},
{
name: "When failed job exists and etcd is still unhealthy it should keep the failure condition",
conditions: []metav1.Condition{staleCondition},
objects: append(healthyEtcdPods(), unhealthyStatefulSet, failedJob),
expectedReason: hyperv1.EtcdRecoveryJobFailedReason,
conditionExists: true,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := NewGomegaWithT(t)

hcluster := &hyperv1.HostedCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "test-hc",
Namespace: "clusters",
},
Spec: hyperv1.HostedClusterSpec{
Etcd: hyperv1.EtcdSpec{
ManagementType: hyperv1.Managed,
},
ControllerAvailabilityPolicy: hyperv1.HighlyAvailable,
},
Status: hyperv1.HostedClusterStatus{
Conditions: tc.conditions,
},
}

objects := append([]crclient.Object{hcluster}, tc.objects...)
client := fake.NewClientBuilder().
WithScheme(api.Scheme).
WithObjects(objects...).
WithStatusSubresource(hcluster).
Build()

r := &HostedClusterReconciler{
Client: client,
now: metav1.Now,
EnableEtcdRecovery: true,
}

_, err := r.reconcileETCDMemberRecovery(
ctrl.LoggerInto(t.Context(), zap.New(zap.UseDevMode(true))),
hcluster,
upsert.New(false).CreateOrUpdate,
)
g.Expect(err).ToNot(HaveOccurred())

updatedHC := &hyperv1.HostedCluster{}
g.Expect(client.Get(t.Context(), crclient.ObjectKeyFromObject(hcluster), updatedHC)).To(Succeed())

condition := meta.FindStatusCondition(updatedHC.Status.Conditions, string(hyperv1.EtcdRecoveryActive))
if tc.conditionExists {
g.Expect(condition).ToNot(BeNil())
g.Expect(condition.Reason).To(Equal(tc.expectedReason))
} else {
g.Expect(condition).To(BeNil())
}
if tc.expectJobDeleted {
job := etcdrecoverymanifests.EtcdRecoveryJob(hcpNS)
err := client.Get(t.Context(), crclient.ObjectKeyFromObject(job), job)
g.Expect(errors2.IsNotFound(err)).To(BeTrue(), "expected failed recovery job to be deleted")
}
})
}
}