Skip to content

Commit 3dcf0fe

Browse files
committed
improve dcm tests to reduce flakiness
Addressing some flakes in the DCM e2e tests: Race patching the endpoints resource: add polling to ensure the endpoints resource has been created. Also, dump all Endpoints and EndpointSlice resources in the test namespace in case of a test failure. Timeout calling the router: increase the timeout, and read all the short timeouts from a single const. Also, dump the router logs, which should help to correlate failures when calling its service. Router container not found: the router pod should be reporting its container as running and healthy, but it was not running. Attach the router log, which should help to explain why the router container was not found. Example of the testing deployment state log, added whenever it is scaled: ``` deployment state: replicas=4 pods=route-scale-in-5b7b4f6b8c-9ncjg/Running/10.128.1.27 // route-scale-in-5b7b4f6b8c-ck45n/Running/10.128.1.28 // route-scale-in-5b7b4f6b8c-srffr/Running/10.128.1.29 // route-scale-in-5b7b4f6b8c-v9hm2/Running/10.128.1.26 ``` Example of Endpoints and EndpointSlice resources listed if the test fails: ``` Endpoints: NAME ADDRESSES NOT READY ADDRESSES PORTS route-scale-in 10.128.1.26,10.128.1.27,10.128.1.28,10.128.1.29 9376 route-scale-in-khbh5 10.128.1.26 9376 EndpointSlices: NAME SERVICE ADDRESSES NOT READY ADDRESSES PORTS route-scale-in-khbh5-8ncmh route-scale-in-khbh5 10.128.1.26 9376 route-scale-in-rrhzv route-scale-in 10.128.1.27,10.128.1.28,10.128.1.29,10.128.1.26 9376 ``` https://redhat.atlassian.net/browse/OCPBUGS-85426
1 parent d41fea2 commit 3dcf0fe

3 files changed

Lines changed: 140 additions & 17 deletions

File tree

test/extended/router/config_manager.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
)
3232

3333
const timeoutSeconds = 3 * 60
34+
const fastTimeoutSeconds = 10
3435

3536
var _ = g.Describe("[sig-network][Feature:Router][apigroup:route.openshift.io]", func() {
3637
defer g.GinkgoRecover()
@@ -862,7 +863,7 @@ func readURL(ns, execPodName, host, abspath, ipaddr string) (string, error) {
862863
return output, nil
863864
}
864865

865-
func waitForRouteToRespond(ns, execPodName, proto, host, abspath, ipaddr string, port int) error {
866+
func waitForRouteToRespond(ns, execPodName, proto, host, abspath, ipaddr string, _ int) error {
866867
execPod := execPodRef{
867868
NamespacedName: types.NamespacedName{
868869
Namespace: ns,

test/extended/router/config_manager_ingress.go

Lines changed: 78 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,32 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
5353
ctx := context.Background()
5454
oc := exutil.NewCLIWithPodSecurityLevel("router-dcm-ingress", api.LevelPrivileged).AsAdmin()
5555
kubeClient := oc.AdminKubeClient()
56+
routeClient := oc.AdminRouteClient()
5657

5758
// variables updated on every new test
5859
var (
5960
execPod execPodRef
6061
controller types.NamespacedName
62+
routerPod types.NamespacedName
6163
routeSelectorSet labels.Set
6264
)
6365

6466
g.AfterEach(func() {
67+
if g.CurrentSpecReport().Failed() {
68+
routes, _ := routeClient.RouteV1().Routes(oc.Namespace()).List(ctx, metav1.ListOptions{})
69+
if routes != nil {
70+
outputIngress(routes.Items...)
71+
}
72+
endpoints, _ := kubeClient.CoreV1().Endpoints(oc.Namespace()).List(ctx, metav1.ListOptions{})
73+
if endpoints != nil {
74+
outputEndpoints(endpoints.Items...)
75+
}
76+
epsList, _ := kubeClient.DiscoveryV1().EndpointSlices(oc.Namespace()).List(ctx, metav1.ListOptions{})
77+
if epsList != nil {
78+
outputEndpointSlice(epsList.Items...)
79+
}
80+
exutil.DumpPodLogsStartingWithInNamespace(routerPod.Name, routerPod.Namespace, oc)
81+
}
6582
if controller.Name != "" {
6683
err := oc.AdminOperatorClient().OperatorV1().IngressControllers(controller.Namespace).Delete(ctx, controller.Name, *metav1.NewDeleteOptions(1))
6784
o.Expect(err).NotTo(o.HaveOccurred())
@@ -73,7 +90,7 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
7390
nsOperator := "openshift-ingress-operator"
7491
controllerName := names.SimpleNameGenerator.GenerateName("e2e-dcm-")
7592

76-
// ... and its router is created on router's namespace
93+
// ... and its router and service are created in router's namespace
7794
nsRouter := "openshift-ingress"
7895
svcName := "router-internal-" + controllerName
7996

@@ -134,12 +151,15 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
134151
if utilnet.IsIPv6String(pods.Items[0].Status.PodIP) {
135152
loopback = "::1"
136153
}
154+
155+
routerPod = types.NamespacedName{
156+
Namespace: pods.Items[0].Namespace,
157+
Name: pods.Items[0].Name,
158+
}
159+
137160
execPod = execPodRef{
138-
NamespacedName: types.NamespacedName{
139-
Namespace: pods.Items[0].Namespace,
140-
Name: pods.Items[0].Name,
141-
},
142-
ipAddress: loopback,
161+
NamespacedName: routerPod,
162+
ipAddress: loopback,
143163
}
144164
})
145165

@@ -471,6 +491,7 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
471491

472492
// ... k8s recreates it and we wait for it to be fully functional
473493
err = builder.waitDeployment(replicas, dcmIngressTimeout)
494+
builder.printDeploymentState(ctx)
474495
o.Expect(err).NotTo(o.HaveOccurred())
475496
}
476497

@@ -682,7 +703,7 @@ func execPodReadURL(execPod execPodRef, host string, secure bool, abspath string
682703
port = 443
683704
}
684705
uri := fmt.Sprintf("%s://%s:%d%s", proto, host, port, abspath)
685-
cmd := fmt.Sprintf("curl -ksS -m 5 -w '\n%%{http_code}' --resolve %s:%d:%s %q", host, port, execPod.ipAddress, uri)
706+
cmd := fmt.Sprintf("curl -ksS --max-time %d -w '\n%%{http_code}' --resolve %s:%d:%s %q", fastTimeoutSeconds, host, port, execPod.ipAddress, uri)
686707
output, err = e2eoutput.RunHostCmd(execPod.Namespace, execPod.Name, cmd)
687708

688709
// Checking for curl's "(52) empty response from server", this means a FIN or RST from the server side.
@@ -778,7 +799,12 @@ func (r *routeStackBuilder) createDeploymentStack(ctx context.Context, routetype
778799
if err = r.waitDeployment(replicas, timeout); err != nil {
779800
return nil, err
780801
}
781-
return r.exposeDeployment(ctx)
802+
backendServers, err = r.exposeDeployment(ctx)
803+
if err != nil {
804+
return nil, err
805+
}
806+
r.printDeploymentState(ctx)
807+
return backendServers, nil
782808
}
783809

784810
// scaleDeployment scales-in/out the common deployment to the specified replicas. It waits for all the pods to be created and returns their names.
@@ -796,6 +822,7 @@ func (r *routeStackBuilder) scaleDeployment(ctx context.Context, replicas int, t
796822
}
797823
return len(backendServers) == replicas, nil
798824
})
825+
r.printDeploymentState(ctx)
799826
return backendServers, err
800827
}
801828

@@ -824,7 +851,14 @@ func (r *routeStackBuilder) createDetachedService(ctx context.Context) (serviceN
824851
}
825852

826853
// we also need the deprecated Endpoints API, since router still uses it depending on the ROUTER_WATCH_ENDPOINTS envvar
827-
epCurrent, err := r.kubeClient.CoreV1().Endpoints(svcCurrent.Namespace).Get(ctx, svcCurrent.Name, metav1.GetOptions{})
854+
var epCurrent *corev1.Endpoints
855+
err = wait.PollUntilContextTimeout(ctx, time.Second, fastTimeoutSeconds*time.Second, false, func(ctx context.Context) (done bool, err error) {
856+
epCurrent, err = r.kubeClient.CoreV1().Endpoints(svcCurrent.Namespace).Get(ctx, svcCurrent.Name, metav1.GetOptions{})
857+
if err != nil {
858+
framework.Logf("error fetching Endpoints: %s", err.Error())
859+
}
860+
return err == nil, nil
861+
})
828862
if err != nil {
829863
return "", err
830864
}
@@ -841,7 +875,7 @@ func (r *routeStackBuilder) createDetachedService(ctx context.Context) (serviceN
841875
}
842876

843877
// The EndpointSlice is usually created as soon as the Endpoints resource is created. Let's wait for it, and create it ourselves in case it is missing
844-
err = wait.PollUntilContextTimeout(ctx, time.Second, 5*time.Second, false, func(ctx context.Context) (done bool, err error) {
878+
err = wait.PollUntilContextTimeout(ctx, time.Second, fastTimeoutSeconds*time.Second, false, func(ctx context.Context) (done bool, err error) {
845879
_, err = r.fetchEndpointSlice(ctx, serviceName)
846880
if err != nil {
847881
framework.Logf("error fetching EndpointSlice: %s", err.Error())
@@ -910,7 +944,7 @@ func (r *routeStackBuilder) scaleInEndpoints(ctx context.Context, detachedServic
910944
if err != nil {
911945
return err
912946
}
913-
// deleting addresses, from all subnets, whose IP address is not found in the patched `eps`
947+
// deleting addresses, from all subsets, whose IP address is not found in the patched `eps`
914948
for i := range ep.Subsets {
915949
ss := &ep.Subsets[i]
916950
ss.Addresses = slices.DeleteFunc(ss.Addresses, func(addr corev1.EndpointAddress) bool {
@@ -953,6 +987,25 @@ func (r *routeStackBuilder) exposeDeployment(ctx context.Context) (backendServer
953987
return r.fetchServiceReplicas(ctx)
954988
}
955989

990+
// printDeploymentState outputs the pod names, status, and their IP addresses. Best effort, it outputs the error instead in case it happens.
991+
// It requires that `exposeDeployment()` was already called.
992+
func (r *routeStackBuilder) printDeploymentState(ctx context.Context) {
993+
pods, err := r.fetchPods(ctx)
994+
if err != nil {
995+
framework.Logf("deployment state: error reading deployment pods: %v", err)
996+
return
997+
}
998+
var podDescription []string
999+
for _, pod := range pods {
1000+
var podIPs []string
1001+
for _, ip := range pod.Status.PodIPs {
1002+
podIPs = append(podIPs, ip.IP)
1003+
}
1004+
podDescription = append(podDescription, fmt.Sprintf("%s/%s/%s", pod.Name, pod.Status.Phase, strings.Join(podIPs, ",")))
1005+
}
1006+
framework.Logf("deployment state: replicas=%d pods=%s", len(pods), strings.Join(podDescription, " // "))
1007+
}
1008+
9561009
// fetchEndpointSlice fetches the EndpointSlice of the provided service name. It currently supports only one EndpointSlice instance for simplicity.
9571010
func (r *routeStackBuilder) fetchEndpointSlice(ctx context.Context, serviceName string) (*discoveryv1.EndpointSlice, error) {
9581011
listOpts := metav1.ListOptions{LabelSelector: discoveryv1.LabelServiceName + "=" + serviceName}
@@ -967,8 +1020,8 @@ func (r *routeStackBuilder) fetchEndpointSlice(ctx context.Context, serviceName
9671020
return &epsList.Items[0], nil
9681021
}
9691022

970-
// fetchServiceReplicas fetches the pod names from the exposed common deployment. It requires that `exposeDeployment()` was already called.
971-
func (r *routeStackBuilder) fetchServiceReplicas(ctx context.Context) ([]string, error) {
1023+
// fetchPods fetches the pods from the exposed common deployment. It requires that `exposeDeployment()` was already called.
1024+
func (r *routeStackBuilder) fetchPods(ctx context.Context) ([]corev1.Pod, error) {
9721025
svc, err := r.kubeClient.CoreV1().Services(r.namespace).Get(ctx, r.resourceName, metav1.GetOptions{})
9731026
if err != nil {
9741027
return nil, err
@@ -978,9 +1031,18 @@ func (r *routeStackBuilder) fetchServiceReplicas(ctx context.Context) ([]string,
9781031
if err != nil {
9791032
return nil, err
9801033
}
981-
backendServers := make([]string, len(pods.Items))
982-
for i := range pods.Items {
983-
backendServers[i] = pods.Items[i].Name
1034+
return pods.Items, nil
1035+
}
1036+
1037+
// fetchServiceReplicas fetches the pod names from the exposed common deployment. It requires that `exposeDeployment()` was already called.
1038+
func (r *routeStackBuilder) fetchServiceReplicas(ctx context.Context) ([]string, error) {
1039+
pods, err := r.fetchPods(ctx)
1040+
if err != nil {
1041+
return nil, err
1042+
}
1043+
backendServers := make([]string, len(pods))
1044+
for i := range pods {
1045+
backendServers[i] = pods[i].Name
9841046
}
9851047
return backendServers, nil
9861048
}

test/extended/router/stress.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"bytes"
55
"context"
66
"fmt"
7+
"strconv"
78
"strings"
89
"text/tabwriter"
910
"time"
@@ -16,6 +17,7 @@ import (
1617

1718
appsv1 "k8s.io/api/apps/v1"
1819
corev1 "k8s.io/api/core/v1"
20+
discoveryv1 "k8s.io/api/discovery/v1"
1921
rbacv1 "k8s.io/api/rbac/v1"
2022
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2123
"k8s.io/apimachinery/pkg/runtime"
@@ -656,6 +658,64 @@ func outputIngress(routes ...routev1.Route) {
656658
e2e.Logf("Routes:\n%s", b.String())
657659
}
658660

661+
func outputEndpoints(endpoints ...corev1.Endpoints) {
662+
b := &bytes.Buffer{}
663+
w := tabwriter.NewWriter(b, 0, 0, 2, ' ', 0)
664+
fmt.Fprintf(w, "NAME\tADDRESSES\tNOT READY ADDRESSES\tPORTS\n")
665+
for _, ep := range endpoints {
666+
for _, ss := range ep.Subsets {
667+
resumeAddrs := func(addrs []corev1.EndpointAddress) string {
668+
var addrList []string
669+
for _, addr := range addrs {
670+
val := "-"
671+
if addr.IP != "" {
672+
val = addr.IP
673+
} else if addr.Hostname != "" {
674+
val = addr.Hostname
675+
}
676+
addrList = append(addrList, val)
677+
}
678+
return strings.Join(addrList, ",")
679+
}
680+
var portList []string
681+
for _, port := range ss.Ports {
682+
portList = append(portList, strconv.Itoa(int(port.Port)))
683+
}
684+
fmt.Fprintf(w, "%s\t%s\t%s\t%s\n", ep.Name, resumeAddrs(ss.Addresses), resumeAddrs(ss.NotReadyAddresses), strings.Join(portList, ","))
685+
}
686+
}
687+
w.Flush()
688+
e2e.Logf("Endpoints:\n%s", b.String())
689+
}
690+
691+
func outputEndpointSlice(epss ...discoveryv1.EndpointSlice) {
692+
b := &bytes.Buffer{}
693+
w := tabwriter.NewWriter(b, 0, 0, 2, ' ', 0)
694+
fmt.Fprintf(w, "NAME\tSERVICE\tADDRESSES\tNOT READY ADDRESSES\tPORTS\n")
695+
for _, eps := range epss {
696+
var addrList, notReadyAddrList []string
697+
for _, ep := range eps.Endpoints {
698+
addrs := strings.Join(ep.Addresses, "+")
699+
if ready := ep.Conditions.Ready; ready == nil || *ready == true {
700+
addrList = append(addrList, addrs)
701+
} else {
702+
notReadyAddrList = append(notReadyAddrList, addrs)
703+
}
704+
}
705+
var portList []string
706+
for _, port := range eps.Ports {
707+
val := "-"
708+
if port.Port != nil {
709+
val = strconv.Itoa(int(*port.Port))
710+
}
711+
portList = append(portList, val)
712+
}
713+
fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", eps.Name, eps.Labels[discoveryv1.LabelServiceName], strings.Join(addrList, ","), strings.Join(notReadyAddrList, ","), strings.Join(portList, ","))
714+
}
715+
w.Flush()
716+
e2e.Logf("EndpointSlices:\n%s", b.String())
717+
}
718+
659719
// findMostRecentConditionTime returns the time of the most recent condition.
660720
func findMostRecentConditionTime(conditions []routev1.RouteIngressCondition) time.Time {
661721
var recent time.Time

0 commit comments

Comments
 (0)