@@ -28,6 +28,7 @@ import (
2828 hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
2929 hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
3030 "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
31+ etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
3132 kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
3233 "github.com/openshift/hypershift/support/api"
3334 "github.com/openshift/hypershift/support/azureutil"
@@ -48,9 +49,11 @@ import (
4849 configv1 "github.com/openshift/api/config/v1"
4950
5051 appsv1 "k8s.io/api/apps/v1"
52+ batchv1 "k8s.io/api/batch/v1"
5153 corev1 "k8s.io/api/core/v1"
5254 "k8s.io/apimachinery/pkg/api/equality"
5355 errors2 "k8s.io/apimachinery/pkg/api/errors"
56+ "k8s.io/apimachinery/pkg/api/meta"
5457 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
5558 "k8s.io/apimachinery/pkg/types"
5659 "k8s.io/apimachinery/pkg/util/intstr"
@@ -6734,3 +6737,195 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
67346737 })
67356738 }
67366739}
6740+
6741+ func TestReconcileETCDMemberRecovery (t * testing.T ) {
6742+ hcpNS := "clusters-test-hc"
6743+
6744+ healthyEtcdPods := func () []crclient.Object {
6745+ var pods []crclient.Object
6746+ for i := 0 ; i < 3 ; i ++ {
6747+ pods = append (pods , & corev1.Pod {
6748+ ObjectMeta : metav1.ObjectMeta {
6749+ Name : fmt .Sprintf ("etcd-%d" , i ),
6750+ Namespace : hcpNS ,
6751+ Labels : map [string ]string {"app" : "etcd" },
6752+ },
6753+ Status : corev1.PodStatus {
6754+ ContainerStatuses : []corev1.ContainerStatus {
6755+ {
6756+ Name : "etcd" ,
6757+ State : corev1.ContainerState {Running : & corev1.ContainerStateRunning {}},
6758+ },
6759+ },
6760+ },
6761+ })
6762+ }
6763+ return pods
6764+ }
6765+
6766+ recoveredEtcdPods := func () []crclient.Object {
6767+ var pods []crclient.Object
6768+ for i := 0 ; i < 3 ; i ++ {
6769+ pods = append (pods , & corev1.Pod {
6770+ ObjectMeta : metav1.ObjectMeta {
6771+ Name : fmt .Sprintf ("etcd-%d" , i ),
6772+ Namespace : hcpNS ,
6773+ Labels : map [string ]string {"app" : "etcd" },
6774+ },
6775+ Status : corev1.PodStatus {
6776+ ContainerStatuses : []corev1.ContainerStatus {
6777+ {
6778+ Name : "etcd" ,
6779+ RestartCount : 3 ,
6780+ State : corev1.ContainerState {Running : & corev1.ContainerStateRunning {}},
6781+ },
6782+ },
6783+ },
6784+ })
6785+ }
6786+ return pods
6787+ }
6788+
6789+ healthyStatefulSet := & appsv1.StatefulSet {
6790+ ObjectMeta : metav1.ObjectMeta {
6791+ Name : "etcd" ,
6792+ Namespace : hcpNS ,
6793+ },
6794+ Status : appsv1.StatefulSetStatus {
6795+ ReadyReplicas : 3 ,
6796+ AvailableReplicas : 3 ,
6797+ },
6798+ }
6799+
6800+ unhealthyStatefulSet := & appsv1.StatefulSet {
6801+ ObjectMeta : metav1.ObjectMeta {
6802+ Name : "etcd" ,
6803+ Namespace : hcpNS ,
6804+ },
6805+ Status : appsv1.StatefulSetStatus {
6806+ ReadyReplicas : 2 ,
6807+ AvailableReplicas : 2 ,
6808+ },
6809+ }
6810+
6811+ staleCondition := metav1.Condition {
6812+ Type : string (hyperv1 .EtcdRecoveryActive ),
6813+ Status : metav1 .ConditionFalse ,
6814+ Reason : hyperv1 .EtcdRecoveryJobFailedReason ,
6815+ Message : "Error in Etcd Recovery job: the Etcd cluster requires manual intervention." ,
6816+ LastTransitionTime : metav1 .Now (),
6817+ }
6818+
6819+ failedJob := etcdrecoverymanifests .EtcdRecoveryJob (hcpNS )
6820+ failedJob .Status = batchv1.JobStatus {
6821+ Conditions : []batchv1.JobCondition {
6822+ {
6823+ Type : batchv1 .JobFailed ,
6824+ Status : corev1 .ConditionTrue ,
6825+ },
6826+ },
6827+ }
6828+
6829+ testCases := []struct {
6830+ name string
6831+ objects []crclient.Object
6832+ conditions []metav1.Condition
6833+ expectedReason string
6834+ conditionExists bool
6835+ expectJobDeleted bool
6836+ }{
6837+ {
6838+ name : "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition" ,
6839+ conditions : []metav1.Condition {staleCondition },
6840+ objects : append (healthyEtcdPods (), healthyStatefulSet ),
6841+ expectedReason : hyperv1 .AsExpectedReason ,
6842+ conditionExists : true ,
6843+ },
6844+ {
6845+ name : "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one" ,
6846+ conditions : []metav1.Condition {},
6847+ objects : append (healthyEtcdPods (), healthyStatefulSet ),
6848+ conditionExists : false ,
6849+ },
6850+ {
6851+ name : "When failed job exists but etcd recovered it should cleanup job and clear condition" ,
6852+ conditions : []metav1.Condition {staleCondition },
6853+ objects : append (healthyEtcdPods (), healthyStatefulSet , failedJob ),
6854+ expectedReason : hyperv1 .AsExpectedReason ,
6855+ conditionExists : true ,
6856+ expectJobDeleted : true ,
6857+ },
6858+ {
6859+ name : "When etcd pods have restarted but recovered it should clear the stale condition" ,
6860+ conditions : []metav1.Condition {staleCondition },
6861+ objects : append (recoveredEtcdPods (), healthyStatefulSet ),
6862+ expectedReason : hyperv1 .AsExpectedReason ,
6863+ conditionExists : true ,
6864+ },
6865+ {
6866+ name : "When failed job exists and etcd is still unhealthy it should keep the failure condition" ,
6867+ conditions : []metav1.Condition {staleCondition },
6868+ objects : append (healthyEtcdPods (), unhealthyStatefulSet , failedJob ),
6869+ expectedReason : hyperv1 .EtcdRecoveryJobFailedReason ,
6870+ conditionExists : true ,
6871+ },
6872+ }
6873+
6874+ for _ , tc := range testCases {
6875+ t .Run (tc .name , func (t * testing.T ) {
6876+ g := NewGomegaWithT (t )
6877+
6878+ hcluster := & hyperv1.HostedCluster {
6879+ ObjectMeta : metav1.ObjectMeta {
6880+ Name : "test-hc" ,
6881+ Namespace : "clusters" ,
6882+ },
6883+ Spec : hyperv1.HostedClusterSpec {
6884+ Etcd : hyperv1.EtcdSpec {
6885+ ManagementType : hyperv1 .Managed ,
6886+ },
6887+ ControllerAvailabilityPolicy : hyperv1 .HighlyAvailable ,
6888+ },
6889+ Status : hyperv1.HostedClusterStatus {
6890+ Conditions : tc .conditions ,
6891+ },
6892+ }
6893+
6894+ objects := append ([]crclient.Object {hcluster }, tc .objects ... )
6895+ client := fake .NewClientBuilder ().
6896+ WithScheme (api .Scheme ).
6897+ WithObjects (objects ... ).
6898+ WithStatusSubresource (hcluster ).
6899+ Build ()
6900+
6901+ r := & HostedClusterReconciler {
6902+ Client : client ,
6903+ now : metav1 .Now ,
6904+ EnableEtcdRecovery : true ,
6905+ }
6906+
6907+ _ , err := r .reconcileETCDMemberRecovery (
6908+ ctrl .LoggerInto (t .Context (), zap .New (zap .UseDevMode (true ))),
6909+ hcluster ,
6910+ upsert .New (false ).CreateOrUpdate ,
6911+ )
6912+ g .Expect (err ).ToNot (HaveOccurred ())
6913+
6914+ updatedHC := & hyperv1.HostedCluster {}
6915+ g .Expect (client .Get (t .Context (), crclient .ObjectKeyFromObject (hcluster ), updatedHC )).To (Succeed ())
6916+
6917+ condition := meta .FindStatusCondition (updatedHC .Status .Conditions , string (hyperv1 .EtcdRecoveryActive ))
6918+ if tc .conditionExists {
6919+ g .Expect (condition ).ToNot (BeNil ())
6920+ g .Expect (condition .Reason ).To (Equal (tc .expectedReason ))
6921+ } else {
6922+ g .Expect (condition ).To (BeNil ())
6923+ }
6924+ if tc .expectJobDeleted {
6925+ job := etcdrecoverymanifests .EtcdRecoveryJob (hcpNS )
6926+ err := client .Get (t .Context (), crclient .ObjectKeyFromObject (job ), job )
6927+ g .Expect (errors2 .IsNotFound (err )).To (BeTrue (), "expected failed recovery job to be deleted" )
6928+ }
6929+ })
6930+ }
6931+ }
0 commit comments