@@ -28,6 +28,7 @@ import (
2828 hcmetrics "github.com/openshift/hypershift/hypershift-operator/controllers/hostedcluster/metrics"
2929 hcpmanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests"
3030 "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator"
31+ etcdrecoverymanifests "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/etcdrecovery"
3132 kvinfra "github.com/openshift/hypershift/kubevirtexternalinfra"
3233 "github.com/openshift/hypershift/support/api"
3334 "github.com/openshift/hypershift/support/azureutil"
@@ -48,9 +49,11 @@ import (
4849 configv1 "github.com/openshift/api/config/v1"
4950
5051 appsv1 "k8s.io/api/apps/v1"
52+ batchv1 "k8s.io/api/batch/v1"
5153 corev1 "k8s.io/api/core/v1"
5254 "k8s.io/apimachinery/pkg/api/equality"
5355 errors2 "k8s.io/apimachinery/pkg/api/errors"
56+ "k8s.io/apimachinery/pkg/api/meta"
5457 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
5558 "k8s.io/apimachinery/pkg/types"
5659 "k8s.io/apimachinery/pkg/util/intstr"
@@ -6569,3 +6572,195 @@ func TestComputeEndpointServiceCondition(t *testing.T) {
65696572 })
65706573 }
65716574}
6575+
6576+ func TestReconcileETCDMemberRecovery (t * testing.T ) {
6577+ hcpNS := "clusters-test-hc"
6578+
6579+ healthyEtcdPods := func () []crclient.Object {
6580+ var pods []crclient.Object
6581+ for i := 0 ; i < 3 ; i ++ {
6582+ pods = append (pods , & corev1.Pod {
6583+ ObjectMeta : metav1.ObjectMeta {
6584+ Name : fmt .Sprintf ("etcd-%d" , i ),
6585+ Namespace : hcpNS ,
6586+ Labels : map [string ]string {"app" : "etcd" },
6587+ },
6588+ Status : corev1.PodStatus {
6589+ ContainerStatuses : []corev1.ContainerStatus {
6590+ {
6591+ Name : "etcd" ,
6592+ State : corev1.ContainerState {Running : & corev1.ContainerStateRunning {}},
6593+ },
6594+ },
6595+ },
6596+ })
6597+ }
6598+ return pods
6599+ }
6600+
6601+ recoveredEtcdPods := func () []crclient.Object {
6602+ var pods []crclient.Object
6603+ for i := 0 ; i < 3 ; i ++ {
6604+ pods = append (pods , & corev1.Pod {
6605+ ObjectMeta : metav1.ObjectMeta {
6606+ Name : fmt .Sprintf ("etcd-%d" , i ),
6607+ Namespace : hcpNS ,
6608+ Labels : map [string ]string {"app" : "etcd" },
6609+ },
6610+ Status : corev1.PodStatus {
6611+ ContainerStatuses : []corev1.ContainerStatus {
6612+ {
6613+ Name : "etcd" ,
6614+ RestartCount : 3 ,
6615+ State : corev1.ContainerState {Running : & corev1.ContainerStateRunning {}},
6616+ },
6617+ },
6618+ },
6619+ })
6620+ }
6621+ return pods
6622+ }
6623+
6624+ healthyStatefulSet := & appsv1.StatefulSet {
6625+ ObjectMeta : metav1.ObjectMeta {
6626+ Name : "etcd" ,
6627+ Namespace : hcpNS ,
6628+ },
6629+ Status : appsv1.StatefulSetStatus {
6630+ ReadyReplicas : 3 ,
6631+ AvailableReplicas : 3 ,
6632+ },
6633+ }
6634+
6635+ unhealthyStatefulSet := & appsv1.StatefulSet {
6636+ ObjectMeta : metav1.ObjectMeta {
6637+ Name : "etcd" ,
6638+ Namespace : hcpNS ,
6639+ },
6640+ Status : appsv1.StatefulSetStatus {
6641+ ReadyReplicas : 2 ,
6642+ AvailableReplicas : 2 ,
6643+ },
6644+ }
6645+
6646+ staleCondition := metav1.Condition {
6647+ Type : string (hyperv1 .EtcdRecoveryActive ),
6648+ Status : metav1 .ConditionFalse ,
6649+ Reason : hyperv1 .EtcdRecoveryJobFailedReason ,
6650+ Message : "Error in Etcd Recovery job: the Etcd cluster requires manual intervention." ,
6651+ LastTransitionTime : metav1 .Now (),
6652+ }
6653+
6654+ failedJob := etcdrecoverymanifests .EtcdRecoveryJob (hcpNS )
6655+ failedJob .Status = batchv1.JobStatus {
6656+ Conditions : []batchv1.JobCondition {
6657+ {
6658+ Type : batchv1 .JobFailed ,
6659+ Status : corev1 .ConditionTrue ,
6660+ },
6661+ },
6662+ }
6663+
6664+ testCases := []struct {
6665+ name string
6666+ objects []crclient.Object
6667+ conditions []metav1.Condition
6668+ expectedReason string
6669+ conditionExists bool
6670+ expectJobDeleted bool
6671+ }{
6672+ {
6673+ name : "When etcd is healthy and stale EtcdRecoveryJobFailed condition exists it should clear the condition" ,
6674+ conditions : []metav1.Condition {staleCondition },
6675+ objects : append (healthyEtcdPods (), healthyStatefulSet ),
6676+ expectedReason : hyperv1 .AsExpectedReason ,
6677+ conditionExists : true ,
6678+ },
6679+ {
6680+ name : "When etcd is healthy and no EtcdRecoveryActive condition exists it should not add one" ,
6681+ conditions : []metav1.Condition {},
6682+ objects : append (healthyEtcdPods (), healthyStatefulSet ),
6683+ conditionExists : false ,
6684+ },
6685+ {
6686+ name : "When failed job exists but etcd recovered it should cleanup job and clear condition" ,
6687+ conditions : []metav1.Condition {staleCondition },
6688+ objects : append (healthyEtcdPods (), healthyStatefulSet , failedJob ),
6689+ expectedReason : hyperv1 .AsExpectedReason ,
6690+ conditionExists : true ,
6691+ expectJobDeleted : true ,
6692+ },
6693+ {
6694+ name : "When etcd pods have restarted but recovered it should clear the stale condition" ,
6695+ conditions : []metav1.Condition {staleCondition },
6696+ objects : append (recoveredEtcdPods (), healthyStatefulSet ),
6697+ expectedReason : hyperv1 .AsExpectedReason ,
6698+ conditionExists : true ,
6699+ },
6700+ {
6701+ name : "When failed job exists and etcd is still unhealthy it should keep the failure condition" ,
6702+ conditions : []metav1.Condition {staleCondition },
6703+ objects : append (healthyEtcdPods (), unhealthyStatefulSet , failedJob ),
6704+ expectedReason : hyperv1 .EtcdRecoveryJobFailedReason ,
6705+ conditionExists : true ,
6706+ },
6707+ }
6708+
6709+ for _ , tc := range testCases {
6710+ t .Run (tc .name , func (t * testing.T ) {
6711+ g := NewGomegaWithT (t )
6712+
6713+ hcluster := & hyperv1.HostedCluster {
6714+ ObjectMeta : metav1.ObjectMeta {
6715+ Name : "test-hc" ,
6716+ Namespace : "clusters" ,
6717+ },
6718+ Spec : hyperv1.HostedClusterSpec {
6719+ Etcd : hyperv1.EtcdSpec {
6720+ ManagementType : hyperv1 .Managed ,
6721+ },
6722+ ControllerAvailabilityPolicy : hyperv1 .HighlyAvailable ,
6723+ },
6724+ Status : hyperv1.HostedClusterStatus {
6725+ Conditions : tc .conditions ,
6726+ },
6727+ }
6728+
6729+ objects := append ([]crclient.Object {hcluster }, tc .objects ... )
6730+ client := fake .NewClientBuilder ().
6731+ WithScheme (api .Scheme ).
6732+ WithObjects (objects ... ).
6733+ WithStatusSubresource (hcluster ).
6734+ Build ()
6735+
6736+ r := & HostedClusterReconciler {
6737+ Client : client ,
6738+ now : metav1 .Now ,
6739+ EnableEtcdRecovery : true ,
6740+ }
6741+
6742+ _ , err := r .reconcileETCDMemberRecovery (
6743+ ctrl .LoggerInto (t .Context (), zap .New (zap .UseDevMode (true ))),
6744+ hcluster ,
6745+ upsert .New (false ).CreateOrUpdate ,
6746+ )
6747+ g .Expect (err ).ToNot (HaveOccurred ())
6748+
6749+ updatedHC := & hyperv1.HostedCluster {}
6750+ g .Expect (client .Get (t .Context (), crclient .ObjectKeyFromObject (hcluster ), updatedHC )).To (Succeed ())
6751+
6752+ condition := meta .FindStatusCondition (updatedHC .Status .Conditions , string (hyperv1 .EtcdRecoveryActive ))
6753+ if tc .conditionExists {
6754+ g .Expect (condition ).ToNot (BeNil ())
6755+ g .Expect (condition .Reason ).To (Equal (tc .expectedReason ))
6756+ } else {
6757+ g .Expect (condition ).To (BeNil ())
6758+ }
6759+ if tc .expectJobDeleted {
6760+ job := etcdrecoverymanifests .EtcdRecoveryJob (hcpNS )
6761+ err := client .Get (t .Context (), crclient .ObjectKeyFromObject (job ), job )
6762+ g .Expect (errors2 .IsNotFound (err )).To (BeTrue (), "expected failed recovery job to be deleted" )
6763+ }
6764+ })
6765+ }
6766+ }
0 commit comments