@@ -28,6 +28,7 @@ import (
2828 hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
2929 hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
3030 "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
31+ etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
3132 kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
3233 "github.com/openshift/hypershift/support/api"
3334 "github.com/openshift/hypershift/support/azureutil"
@@ -48,9 +49,11 @@ import (
4849 configv1 "github.com/openshift/api/config/v1"
4950
5051 appsv1 "k8s.io/api/apps/v1"
52+ batchv1 "k8s.io/api/batch/v1"
5153 corev1 "k8s.io/api/core/v1"
5254 "k8s.io/apimachinery/pkg/api/equality"
5355 errors2 "k8s.io/apimachinery/pkg/api/errors"
56+ "k8s.io/apimachinery/pkg/api/meta"
5457 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
5558 "k8s.io/apimachinery/pkg/types"
5659 "k8s.io/apimachinery/pkg/util/intstr"
@@ -6734,3 +6737,192 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
67346737 })
67356738 }
67366739}
6740+
6741+ func TestReconcileETCDMemberRecovery (t * testing.T ) {
6742+ hcpNS := "clusters-test-hc"
6743+
6744+ healthyEtcdPods := func () []crclient.Object {
6745+ var pods []crclient.Object
6746+ for i := 0 ; i < 3 ; i ++ {
6747+ pods = append (pods , & corev1.Pod {
6748+ ObjectMeta : metav1.ObjectMeta {
6749+ Name : fmt .Sprintf ("etcd-%d" , i ),
6750+ Namespace : hcpNS ,
6751+ Labels : map [string ]string {"app" : "etcd" },
6752+ },
6753+ Status : corev1.PodStatus {
6754+ ContainerStatuses : []corev1.ContainerStatus {
6755+ {
6756+ Name : "etcd" ,
6757+ State : corev1.ContainerState {Running : & corev1.ContainerStateRunning {}},
6758+ },
6759+ },
6760+ },
6761+ })
6762+ }
6763+ return pods
6764+ }
6765+
6766+ recoveredEtcdPods := func () []crclient.Object {
6767+ var pods []crclient.Object
6768+ for i := 0 ; i < 3 ; i ++ {
6769+ pods = append (pods , & corev1.Pod {
6770+ ObjectMeta : metav1.ObjectMeta {
6771+ Name : fmt .Sprintf ("etcd-%d" , i ),
6772+ Namespace : hcpNS ,
6773+ Labels : map [string ]string {"app" : "etcd" },
6774+ },
6775+ Status : corev1.PodStatus {
6776+ ContainerStatuses : []corev1.ContainerStatus {
6777+ {
6778+ Name : "etcd" ,
6779+ RestartCount : 3 ,
6780+ State : corev1.ContainerState {Running : & corev1.ContainerStateRunning {}},
6781+ },
6782+ },
6783+ },
6784+ })
6785+ }
6786+ return pods
6787+ }
6788+
6789+ initEtcdStatefulSet := func (specReplicas , readyReplicas , availableReplicas int32 ) * appsv1.StatefulSet {
6790+ return & appsv1.StatefulSet {
6791+ ObjectMeta : metav1.ObjectMeta {
6792+ Name : "etcd" ,
6793+ Namespace : hcpNS ,
6794+ },
6795+ Spec : appsv1.StatefulSetSpec {
6796+ Replicas : ptr.To [int32 ](specReplicas ),
6797+ },
6798+ Status : appsv1.StatefulSetStatus {
6799+ ReadyReplicas : readyReplicas ,
6800+ AvailableReplicas : availableReplicas ,
6801+ },
6802+ }
6803+ }
6804+
6805+ healthyStatefulSet := initEtcdStatefulSet (3 , 3 , 3 )
6806+ unhealthyStatefulSet := initEtcdStatefulSet (3 , 2 , 2 )
6807+
6808+ staleCondition := metav1.Condition {
6809+ Type : string (hyperv1 .EtcdRecoveryActive ),
6810+ Status : metav1 .ConditionFalse ,
6811+ Reason : hyperv1 .EtcdRecoveryJobFailedReason ,
6812+ Message : "Error in Etcd Recovery job: the Etcd cluster requires manual intervention." ,
6813+ LastTransitionTime : metav1 .Now (),
6814+ }
6815+
6816+ failedJob := etcdrecoverymanifests .EtcdRecoveryJob (hcpNS )
6817+ failedJob .Status = batchv1.JobStatus {
6818+ Conditions : []batchv1.JobCondition {
6819+ {
6820+ Type : batchv1 .JobFailed ,
6821+ Status : corev1 .ConditionTrue ,
6822+ },
6823+ },
6824+ }
6825+
6826+ testCases := []struct {
6827+ name string
6828+ objects []crclient.Object
6829+ conditions []metav1.Condition
6830+ expectedReason string
6831+ conditionExists bool
6832+ expectJobDeleted bool
6833+ }{
6834+ {
6835+ name : "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition" ,
6836+ conditions : []metav1.Condition {staleCondition },
6837+ objects : append (healthyEtcdPods (), healthyStatefulSet ),
6838+ expectedReason : hyperv1 .AsExpectedReason ,
6839+ conditionExists : true ,
6840+ },
6841+ {
6842+ name : "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one" ,
6843+ conditions : []metav1.Condition {},
6844+ objects : append (healthyEtcdPods (), healthyStatefulSet ),
6845+ conditionExists : false ,
6846+ },
6847+ {
6848+ name : "When failed job exists but etcd recovered it should cleanup job and clear condition" ,
6849+ conditions : []metav1.Condition {staleCondition },
6850+ objects : append (healthyEtcdPods (), healthyStatefulSet , failedJob ),
6851+ expectedReason : hyperv1 .AsExpectedReason ,
6852+ conditionExists : true ,
6853+ expectJobDeleted : true ,
6854+ },
6855+ {
6856+ name : "When etcd pods have restarted but recovered it should clear the stale condition" ,
6857+ conditions : []metav1.Condition {staleCondition },
6858+ objects : append (recoveredEtcdPods (), healthyStatefulSet ),
6859+ expectedReason : hyperv1 .AsExpectedReason ,
6860+ conditionExists : true ,
6861+ },
6862+ {
6863+ name : "When failed job exists and etcd is still unhealthy it should keep the failure condition" ,
6864+ conditions : []metav1.Condition {staleCondition },
6865+ objects : append (healthyEtcdPods (), unhealthyStatefulSet , failedJob ),
6866+ expectedReason : hyperv1 .EtcdRecoveryJobFailedReason ,
6867+ conditionExists : true ,
6868+ },
6869+ }
6870+
6871+ for _ , tc := range testCases {
6872+ t .Run (tc .name , func (t * testing.T ) {
6873+ g := NewGomegaWithT (t )
6874+
6875+ hcluster := & hyperv1.HostedCluster {
6876+ ObjectMeta : metav1.ObjectMeta {
6877+ Name : "test-hc" ,
6878+ Namespace : "clusters" ,
6879+ },
6880+ Spec : hyperv1.HostedClusterSpec {
6881+ Etcd : hyperv1.EtcdSpec {
6882+ ManagementType : hyperv1 .Managed ,
6883+ },
6884+ ControllerAvailabilityPolicy : hyperv1 .HighlyAvailable ,
6885+ },
6886+ Status : hyperv1.HostedClusterStatus {
6887+ Conditions : tc .conditions ,
6888+ },
6889+ }
6890+
6891+ objects := append ([]crclient.Object {hcluster }, tc .objects ... )
6892+ client := fake .NewClientBuilder ().
6893+ WithScheme (api .Scheme ).
6894+ WithObjects (objects ... ).
6895+ WithStatusSubresource (hcluster ).
6896+ Build ()
6897+
6898+ r := & HostedClusterReconciler {
6899+ Client : client ,
6900+ now : metav1 .Now ,
6901+ EnableEtcdRecovery : true ,
6902+ }
6903+
6904+ _ , err := r .reconcileETCDMemberRecovery (
6905+ ctrl .LoggerInto (t .Context (), zap .New (zap .UseDevMode (true ))),
6906+ hcluster ,
6907+ upsert .New (false ).CreateOrUpdate ,
6908+ )
6909+ g .Expect (err ).ToNot (HaveOccurred ())
6910+
6911+ updatedHC := & hyperv1.HostedCluster {}
6912+ g .Expect (client .Get (t .Context (), crclient .ObjectKeyFromObject (hcluster ), updatedHC )).To (Succeed ())
6913+
6914+ condition := meta .FindStatusCondition (updatedHC .Status .Conditions , string (hyperv1 .EtcdRecoveryActive ))
6915+ if tc .conditionExists {
6916+ g .Expect (condition ).ToNot (BeNil ())
6917+ g .Expect (condition .Reason ).To (Equal (tc .expectedReason ))
6918+ } else {
6919+ g .Expect (condition ).To (BeNil ())
6920+ }
6921+ if tc .expectJobDeleted {
6922+ job := etcdrecoverymanifests .EtcdRecoveryJob (hcpNS )
6923+ err := client .Get (t .Context (), crclient .ObjectKeyFromObject (job ), job )
6924+ g .Expect (errors2 .IsNotFound (err )).To (BeTrue (), "expected failed recovery job to be deleted" )
6925+ }
6926+ })
6927+ }
6928+ }
0 commit comments