Skip to content

Commit f272628

Browse files
committed
improve dcm tests to reduce flakiness in endpoints
Addressing a race when patching endpoints resource. This update adds a polling to ensure the endpoints resource was created. Also, dump all endpoints and endpointSlice in the test namespace in case of a test failure, which helps to correlate the current state and what should be expected in the test scenario. Example of deployments state log: ``` deployment state: replicas=4 pods=route-scale-in-5b7b4f6b8c-9ncjg/Running/10.128.1.27 // route-scale-in-5b7b4f6b8c-ck45n/Running/10.128.1.28 // route-scale-in-5b7b4f6b8c-srffr/Running/10.128.1.29 // route-scale-in-5b7b4f6b8c-v9hm2/Running/10.128.1.26 ``` Example of Endpoints and EndpointSlice resources listed if the test fails: ``` Endpoints: NAME ADDRESSES NOT READY ADDRESSES PORTS route-scale-in 10.128.1.26,10.128.1.27,10.128.1.28,10.128.1.29 9376 route-scale-in-khbh5 10.128.1.26 9376 EndpointSlices: NAME SERVICE ADDRESSES NOT READY ADDRESSES PORTS route-scale-in-khbh5-8ncmh route-scale-in-khbh5 10.128.1.26 9376 route-scale-in-rrhzv route-scale-in 10.128.1.27,10.128.1.28,10.128.1.29,10.128.1.26 9376 ``` https://redhat.atlassian.net/browse/OCPBUGS-85426
1 parent 38c4fba commit f272628

2 files changed

Lines changed: 126 additions & 9 deletions

File tree

test/extended/router/config_manager_ingress.go

Lines changed: 66 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
5353
ctx := context.Background()
5454
oc := exutil.NewCLIWithPodSecurityLevel("router-dcm-ingress", api.LevelPrivileged).AsAdmin()
5555
kubeClient := oc.AdminKubeClient()
56+
routeClient := oc.AdminRouteClient()
5657

5758
// variables updated on every new test
5859
var (
@@ -62,6 +63,20 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
6263
)
6364

6465
g.AfterEach(func() {
66+
if g.CurrentSpecReport().Failed() {
67+
routes, _ := routeClient.RouteV1().Routes(oc.Namespace()).List(ctx, metav1.ListOptions{})
68+
if routes != nil {
69+
outputIngress(routes.Items...)
70+
}
71+
endpoints, _ := kubeClient.CoreV1().Endpoints(oc.Namespace()).List(ctx, metav1.ListOptions{})
72+
if endpoints != nil {
73+
outputEndpoints(endpoints.Items...)
74+
}
75+
epsList, _ := kubeClient.DiscoveryV1().EndpointSlices(oc.Namespace()).List(ctx, metav1.ListOptions{})
76+
if epsList != nil {
77+
outputEndpointSlice(epsList.Items...)
78+
}
79+
}
6580
if controller.Name != "" {
6681
err := oc.AdminOperatorClient().OperatorV1().IngressControllers(controller.Namespace).Delete(ctx, controller.Name, *metav1.NewDeleteOptions(1))
6782
o.Expect(err).NotTo(o.HaveOccurred())
@@ -73,7 +88,7 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
7388
nsOperator := "openshift-ingress-operator"
7489
controllerName := names.SimpleNameGenerator.GenerateName("e2e-dcm-")
7590

76-
// ... and its router is created on router's namespace
91+
// ... and its router and service are created in router's namespace
7792
nsRouter := "openshift-ingress"
7893
svcName := "router-internal-" + controllerName
7994

@@ -471,6 +486,7 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
471486

472487
// ... k8s recreates it and we wait it to be fully functional
473488
err = builder.waitDeployment(replicas, dcmIngressTimeout)
489+
builder.printDeploymentState(ctx)
474490
o.Expect(err).NotTo(o.HaveOccurred())
475491
}
476492

@@ -778,7 +794,12 @@ func (r *routeStackBuilder) createDeploymentStack(ctx context.Context, routetype
778794
if err = r.waitDeployment(replicas, timeout); err != nil {
779795
return nil, err
780796
}
781-
return r.exposeDeployment(ctx)
797+
backendServers, err = r.exposeDeployment(ctx)
798+
if err != nil {
799+
return nil, err
800+
}
801+
r.printDeploymentState(ctx)
802+
return backendServers, nil
782803
}
783804

784805
// scaleDeployment scales-in/out the common deployment to the specified replicas. It waits for all the pods to be created and returns their names.
@@ -796,6 +817,7 @@ func (r *routeStackBuilder) scaleDeployment(ctx context.Context, replicas int, t
796817
}
797818
return len(backendServers) == replicas, nil
798819
})
820+
r.printDeploymentState(ctx)
799821
return backendServers, err
800822
}
801823

@@ -824,7 +846,14 @@ func (r *routeStackBuilder) createDetachedService(ctx context.Context) (serviceN
824846
}
825847

826848
// we also need the deprecated Endpoints API, since router still uses it depending on the ROUTER_WATCH_ENDPOINTS envvar
827-
epCurrent, err := r.kubeClient.CoreV1().Endpoints(svcCurrent.Namespace).Get(ctx, svcCurrent.Name, metav1.GetOptions{})
849+
var epCurrent *corev1.Endpoints
850+
err = wait.PollUntilContextTimeout(ctx, time.Second, 5*time.Second, false, func(ctx context.Context) (done bool, err error) {
851+
epCurrent, err = r.kubeClient.CoreV1().Endpoints(svcCurrent.Namespace).Get(ctx, svcCurrent.Name, metav1.GetOptions{})
852+
if err != nil {
853+
framework.Logf("error fetching Endpoints: %s", err.Error())
854+
}
855+
return err == nil, nil
856+
})
828857
if err != nil {
829858
return "", err
830859
}
@@ -910,7 +939,7 @@ func (r *routeStackBuilder) scaleInEndpoints(ctx context.Context, detachedServic
910939
if err != nil {
911940
return err
912941
}
913-
// deleting addresses, from all subnets, whose IP address is not found in the patched `eps`
942+
// deleting addresses, from all subsets, whose IP address is not found in the patched `eps`
914943
for i := range ep.Subsets {
915944
ss := &ep.Subsets[i]
916945
ss.Addresses = slices.DeleteFunc(ss.Addresses, func(addr corev1.EndpointAddress) bool {
@@ -953,6 +982,25 @@ func (r *routeStackBuilder) exposeDeployment(ctx context.Context) (backendServer
953982
return r.fetchServiceReplicas(ctx)
954983
}
955984

985+
// printDeploymentState outputs the pod names, status, and their IP addresses. Best effort, it outputs the error instead in case it happens.
986+
// It requires that `exposeDeployment()` was already called.
987+
func (r *routeStackBuilder) printDeploymentState(ctx context.Context) {
988+
pods, err := r.fetchPods(ctx)
989+
if err != nil {
990+
framework.Logf("deployment state: error reading deployment pods: %v", err)
991+
return
992+
}
993+
var podDescription []string
994+
for _, pod := range pods {
995+
var podIPs []string
996+
for _, ip := range pod.Status.PodIPs {
997+
podIPs = append(podIPs, ip.IP)
998+
}
999+
podDescription = append(podDescription, fmt.Sprintf("%s/%s/%s", pod.Name, pod.Status.Phase, strings.Join(podIPs, ",")))
1000+
}
1001+
framework.Logf("deployment state: replicas=%d pods=%s", len(pods), strings.Join(podDescription, " // "))
1002+
}
1003+
9561004
// fetchEndpointSlice fetches the EndpointSlice of the provided service name. It currently supports only one EndpointSlice instance for simplicity.
9571005
func (r *routeStackBuilder) fetchEndpointSlice(ctx context.Context, serviceName string) (*discoveryv1.EndpointSlice, error) {
9581006
listOpts := metav1.ListOptions{LabelSelector: discoveryv1.LabelServiceName + "=" + serviceName}
@@ -967,8 +1015,8 @@ func (r *routeStackBuilder) fetchEndpointSlice(ctx context.Context, serviceName
9671015
return &epsList.Items[0], nil
9681016
}
9691017

970-
// fetchServiceReplicas fetches the pod names from the exposed common deployment. It requires that `exposeDeployment()` was already called.
971-
func (r *routeStackBuilder) fetchServiceReplicas(ctx context.Context) ([]string, error) {
1018+
// fetchPods fetches the pods from the exposed common deployment. It requires that `exposeDeployment()` was already called.
1019+
func (r *routeStackBuilder) fetchPods(ctx context.Context) ([]corev1.Pod, error) {
9721020
svc, err := r.kubeClient.CoreV1().Services(r.namespace).Get(ctx, r.resourceName, metav1.GetOptions{})
9731021
if err != nil {
9741022
return nil, err
@@ -978,9 +1026,18 @@ func (r *routeStackBuilder) fetchServiceReplicas(ctx context.Context) ([]string,
9781026
if err != nil {
9791027
return nil, err
9801028
}
981-
backendServers := make([]string, len(pods.Items))
982-
for i := range pods.Items {
983-
backendServers[i] = pods.Items[i].Name
1029+
return pods.Items, nil
1030+
}
1031+
1032+
// fetchServiceReplicas fetches the pod names from the exposed common deployment. It requires that `exposeDeployment()` was already called.
1033+
func (r *routeStackBuilder) fetchServiceReplicas(ctx context.Context) ([]string, error) {
1034+
pods, err := r.fetchPods(ctx)
1035+
if err != nil {
1036+
return nil, err
1037+
}
1038+
backendServers := make([]string, len(pods))
1039+
for i := range pods {
1040+
backendServers[i] = pods[i].Name
9841041
}
9851042
return backendServers, nil
9861043
}

test/extended/router/stress.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"bytes"
55
"context"
66
"fmt"
7+
"strconv"
78
"strings"
89
"text/tabwriter"
910
"time"
@@ -16,6 +17,7 @@ import (
1617

1718
appsv1 "k8s.io/api/apps/v1"
1819
corev1 "k8s.io/api/core/v1"
20+
discoveryv1 "k8s.io/api/discovery/v1"
1921
rbacv1 "k8s.io/api/rbac/v1"
2022
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2123
"k8s.io/apimachinery/pkg/runtime"
@@ -656,6 +658,64 @@ func outputIngress(routes ...routev1.Route) {
656658
e2e.Logf("Routes:\n%s", b.String())
657659
}
658660

661+
func outputEndpoints(endpoints ...corev1.Endpoints) {
662+
b := &bytes.Buffer{}
663+
w := tabwriter.NewWriter(b, 0, 0, 2, ' ', 0)
664+
fmt.Fprintf(w, "NAME\tADDRESSES\tNOT READY ADDRESSES\tPORTS\n")
665+
for _, ep := range endpoints {
666+
for _, ss := range ep.Subsets {
667+
resumeAddrs := func(addrs []corev1.EndpointAddress) string {
668+
var addrList []string
669+
for _, addr := range addrs {
670+
val := "-"
671+
if addr.IP != "" {
672+
val = addr.IP
673+
} else if addr.Hostname != "" {
674+
val = addr.Hostname
675+
}
676+
addrList = append(addrList, val)
677+
}
678+
return strings.Join(addrList, ",")
679+
}
680+
var portList []string
681+
for _, port := range ss.Ports {
682+
portList = append(portList, strconv.Itoa(int(port.Port)))
683+
}
684+
fmt.Fprintf(w, "%s\t%s\t%s\t%s\n", ep.Name, resumeAddrs(ss.Addresses), resumeAddrs(ss.NotReadyAddresses), strings.Join(portList, ","))
685+
}
686+
}
687+
w.Flush()
688+
e2e.Logf("Endpoints:\n%s", b.String())
689+
}
690+
691+
func outputEndpointSlice(epss ...discoveryv1.EndpointSlice) {
692+
b := &bytes.Buffer{}
693+
w := tabwriter.NewWriter(b, 0, 0, 2, ' ', 0)
694+
fmt.Fprintf(w, "NAME\tSERVICE\tADDRESSES\tNOT READY ADDRESSES\tPORTS\n")
695+
for _, eps := range epss {
696+
var addrList, notReadyAddrList []string
697+
for _, ep := range eps.Endpoints {
698+
addrs := strings.Join(ep.Addresses, "+")
699+
if ready := ep.Conditions.Ready; ready == nil || *ready == true {
700+
addrList = append(addrList, addrs)
701+
} else {
702+
notReadyAddrList = append(notReadyAddrList, addrs)
703+
}
704+
}
705+
var portList []string
706+
for _, port := range eps.Ports {
707+
val := "-"
708+
if port.Port != nil {
709+
val = strconv.Itoa(int(*port.Port))
710+
}
711+
portList = append(portList, val)
712+
}
713+
fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", eps.Name, eps.Labels[discoveryv1.LabelServiceName], strings.Join(addrList, ","), strings.Join(notReadyAddrList, ","), strings.Join(portList, ","))
714+
}
715+
w.Flush()
716+
e2e.Logf("EndpointSlices:\n%s", b.String())
717+
}
718+
659719
// findMostRecentConditionTime returns the time of the most recent condition.
660720
func findMostRecentConditionTime(conditions []routev1.RouteIngressCondition) time.Time {
661721
var recent time.Time

0 commit comments

Comments
 (0)