Skip to content

Commit 7ae081b

Browse files
committed
improve dcm tests to reduce flakiness in endpoints
Addressing a race when patching endpoints resource. This update adds a polling to ensure the endpoints resource was created. Also, dump all endpoints and endpointSlice in the test namespace in case of a test failure, which helps to correlate the current state and what should be expected in the test scenario. Example of deployments state log: ``` deployment state: replicas=4 pods=route-scale-in-5b7b4f6b8c-9ncjg/Running/10.128.1.27 // route-scale-in-5b7b4f6b8c-ck45n/Running/10.128.1.28 // route-scale-in-5b7b4f6b8c-srffr/Running/10.128.1.29 // route-scale-in-5b7b4f6b8c-v9hm2/Running/10.128.1.26 ``` Example of Endpoints and EndpointSlice resources listed if the test fails: ``` Endpoints: NAME ADDRESSES NOT READY ADDRESSES PORTS route-scale-in 10.128.1.26,10.128.1.27,10.128.1.28,10.128.1.29 9376 route-scale-in-khbh5 10.128.1.26 9376 EndpointSlices: NAME SERVICE ADDRESSES NOT READY ADDRESSES PORTS route-scale-in-khbh5-8ncmh route-scale-in-khbh5 10.128.1.26 9376 route-scale-in-rrhzv route-scale-in 10.128.1.27,10.128.1.28,10.128.1.29,10.128.1.26 9376 ``` https://redhat.atlassian.net/browse/OCPBUGS-85426
1 parent 50759b5 commit 7ae081b

2 files changed

Lines changed: 149 additions & 19 deletions

File tree

test/extended/router/config_manager_ingress.go

Lines changed: 89 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
exutil "github.com/openshift/origin/test/extended/util"
4040
)
4141

42+
// execPodRef defines the attributes of the router pod used to execute local HTTP requests
4243
type execPodRef struct {
4344
types.NamespacedName
4445
ipAddress string
@@ -47,21 +48,46 @@ type execPodRef struct {
4748
var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.io][OCPFeatureGate:IngressControllerDynamicConfigurationManager]", func() {
4849
defer g.GinkgoRecover()
4950

51+
// dcmIngressTimeout defines the maximum amount of time to wait for test operations to complete.
5052
const dcmIngressTimeout = 2 * time.Minute
53+
// maxDynamicServers defines the number of empty dynamic servers to be allocated when a reload happens.
54+
// Some tests use this value as a premise to limit the number of new servers on scale-out operations.
5155
const maxDynamicServers = 4
5256

5357
ctx := context.Background()
5458
oc := exutil.NewCLIWithPodSecurityLevel("router-dcm-ingress", api.LevelPrivileged).AsAdmin()
5559
kubeClient := oc.AdminKubeClient()
60+
routeClient := oc.AdminRouteClient()
5661

57-
// variables updated on every new test
58-
var (
59-
execPod execPodRef
60-
controller types.NamespacedName
61-
routeSelectorSet labels.Set
62-
)
62+
// execPod is the pod used to run requests against the router.
63+
var execPod execPodRef
64+
// controller is the fully qualified name of the ingress controller resource used in the current test.
65+
var controller types.NamespacedName
66+
// routeSelectorSet is the label key/value pair that the ingress controller uses to filter route resources.
67+
// All route resources should add this label, so the router can find and process them.
68+
var routeSelectorSet labels.Set
6369

6470
g.AfterEach(func() {
71+
if g.CurrentSpecReport().Failed() {
72+
routes, err := routeClient.RouteV1().Routes(oc.Namespace()).List(ctx, metav1.ListOptions{})
73+
if err != nil {
74+
framework.Logf("failed to list Routes in namespace %q: %v", oc.Namespace(), err)
75+
} else {
76+
outputIngress(routes.Items...)
77+
}
78+
endpoints, err := kubeClient.CoreV1().Endpoints(oc.Namespace()).List(ctx, metav1.ListOptions{})
79+
if err != nil {
80+
framework.Logf("failed to list Endpoints in namespace %q: %v", oc.Namespace(), err)
81+
} else {
82+
outputEndpoints(endpoints.Items...)
83+
}
84+
epsList, err := kubeClient.DiscoveryV1().EndpointSlices(oc.Namespace()).List(ctx, metav1.ListOptions{})
85+
if err != nil {
86+
framework.Logf("failed to list EndpointSlices in namespace %q: %v", oc.Namespace(), err)
87+
} else {
88+
outputEndpointSlice(epsList.Items...)
89+
}
90+
}
6591
if controller.Name != "" {
6692
err := oc.AdminOperatorClient().OperatorV1().IngressControllers(controller.Namespace).Delete(ctx, controller.Name, *metav1.NewDeleteOptions(1))
6793
o.Expect(err).NotTo(o.HaveOccurred())
@@ -73,7 +99,7 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
7399
nsOperator := "openshift-ingress-operator"
74100
controllerName := names.SimpleNameGenerator.GenerateName("e2e-dcm-")
75101

76-
// ... and its router is created on router's namespace
102+
// ... and its router and service are created in router's namespace
77103
nsRouter := "openshift-ingress"
78104
svcName := "router-internal-" + controllerName
79105

@@ -189,12 +215,12 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
189215
// scaling-out to 4 replicas, one at a time
190216
for replicas := initReplicas + 1; replicas <= 4; replicas++ {
191217

192-
g.By(fmt.Sprintf("scaling-out to %d replicas", replicas))
218+
framework.Logf("scaling-out to %d replicas", replicas)
193219

194220
currentServers, err := builder.scaleDeployment(ctx, replicas, dcmIngressTimeout)
195221
o.Expect(err).NotTo(o.HaveOccurred())
196222

197-
g.By("waiting router to add all the backend servers to the load balance")
223+
framework.Logf("waiting router to add all the backend servers to the load balance")
198224

199225
// router should eventually reach all the known replicas
200226
eventuallyRouteAllServers(execPod, hostname, false, currentServers, dcmIngressTimeout)
@@ -235,7 +261,7 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
235261
// instead of HAProxy removing it from the balance due to health-check starting to fail.
236262
for replicas := initReplicas - 1; replicas >= 1; replicas-- {
237263

238-
g.By(fmt.Sprintf("scaling-in to %d replicas", replicas))
264+
framework.Logf("scaling-in to %d replicas", replicas)
239265

240266
currentServers, err := builder.scaleInEndpoints(ctx, serviceName, replicas)
241267
o.Expect(err).NotTo(o.HaveOccurred())
@@ -471,6 +497,7 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
471497

472498
// ... k8s recreates it and we wait it to be fully functional
473499
err = builder.waitDeployment(replicas, dcmIngressTimeout)
500+
builder.printDeploymentState(ctx)
474501
o.Expect(err).NotTo(o.HaveOccurred())
475502
}
476503

@@ -637,7 +664,7 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift.
637664
// Iterates over a number of scaling operations, always checking if the change is being applied.
638665
for i, replicas := range changingReplicas {
639666

640-
g.By(fmt.Sprintf("iteration %d, scaling from %d to %d replicas", i+1, prevReplicas, replicas))
667+
framework.Logf("iteration %d, scaling from %d to %d replicas", i+1, prevReplicas, replicas)
641668

642669
currentServers, err := builder.scaleDeployment(ctx, replicas, dcmIngressTimeout)
643670
o.Expect(err).NotTo(o.HaveOccurred())
@@ -778,7 +805,12 @@ func (r *routeStackBuilder) createDeploymentStack(ctx context.Context, routetype
778805
if err = r.waitDeployment(replicas, timeout); err != nil {
779806
return nil, err
780807
}
781-
return r.exposeDeployment(ctx)
808+
backendServers, err = r.exposeDeployment(ctx)
809+
if err != nil {
810+
return nil, err
811+
}
812+
r.printDeploymentState(ctx)
813+
return backendServers, nil
782814
}
783815

784816
// scaleDeployment scales-in/out the common deployment to the specified replicas. It waits for all the pods to be created and returns their names.
@@ -796,6 +828,7 @@ func (r *routeStackBuilder) scaleDeployment(ctx context.Context, replicas int, t
796828
}
797829
return len(backendServers) == replicas, nil
798830
})
831+
r.printDeploymentState(ctx)
799832
return backendServers, err
800833
}
801834

@@ -824,7 +857,14 @@ func (r *routeStackBuilder) createDetachedService(ctx context.Context) (serviceN
824857
}
825858

826859
// we also need the deprecated Endpoints API, since router still uses it depending on the ROUTER_WATCH_ENDPOINTS envvar
827-
epCurrent, err := r.kubeClient.CoreV1().Endpoints(svcCurrent.Namespace).Get(ctx, svcCurrent.Name, metav1.GetOptions{})
860+
var epCurrent *corev1.Endpoints
861+
err = wait.PollUntilContextTimeout(ctx, time.Second, 5*time.Second, false, func(ctx context.Context) (done bool, err error) {
862+
epCurrent, err = r.kubeClient.CoreV1().Endpoints(svcCurrent.Namespace).Get(ctx, svcCurrent.Name, metav1.GetOptions{})
863+
if err != nil {
864+
framework.Logf("error fetching Endpoints: %s", err.Error())
865+
}
866+
return err == nil, nil
867+
})
828868
if err != nil {
829869
return "", err
830870
}
@@ -910,7 +950,7 @@ func (r *routeStackBuilder) scaleInEndpoints(ctx context.Context, detachedServic
910950
if err != nil {
911951
return err
912952
}
913-
// deleting addresses, from all subnets, whose IP address is not found in the patched `eps`
953+
// deleting addresses, from all subsets, whose IP address is not found in the patched `eps`
914954
for i := range ep.Subsets {
915955
ss := &ep.Subsets[i]
916956
ss.Addresses = slices.DeleteFunc(ss.Addresses, func(addr corev1.EndpointAddress) bool {
@@ -939,6 +979,8 @@ func (r *routeStackBuilder) createServeHostnameDeployment(replicas int) error {
939979

940980
// createDeployment creates the deployment resource. It should be called just once, since it uses the OC namespace and the common resource name.
941981
func (r *routeStackBuilder) createDeployment(image string, replicas, port int, cmd ...string) error {
982+
// This deployment is safe under Single Node OpenShift (SNO): although it can create more replicas,
983+
// those replicas does not configure anti-affinity, so all of them can run in the same node.
942984
runArgs := []string{"deployment", r.resourceName, "--image", image, "--replicas", strconv.Itoa(replicas), "--port", strconv.Itoa(port), "--"}
943985
runArgs = append(runArgs, cmd...)
944986
return r.oc.Run("create").Args(runArgs...).Execute()
@@ -953,6 +995,25 @@ func (r *routeStackBuilder) exposeDeployment(ctx context.Context) (backendServer
953995
return r.fetchServiceReplicas(ctx)
954996
}
955997

998+
// printDeploymentState outputs the pod names, status, and their IP addresses. Best effort, it outputs the error instead in case it happens.
999+
// It requires that `exposeDeployment()` was already called.
1000+
func (r *routeStackBuilder) printDeploymentState(ctx context.Context) {
1001+
pods, err := r.fetchPods(ctx)
1002+
if err != nil {
1003+
framework.Logf("deployment state: error reading deployment pods: %v", err)
1004+
return
1005+
}
1006+
var podDescription []string
1007+
for _, pod := range pods {
1008+
var podIPs []string
1009+
for _, ip := range pod.Status.PodIPs {
1010+
podIPs = append(podIPs, ip.IP)
1011+
}
1012+
podDescription = append(podDescription, fmt.Sprintf("%s/%s/%s", pod.Name, pod.Status.Phase, strings.Join(podIPs, ",")))
1013+
}
1014+
framework.Logf("deployment state: replicas=%d pods=%s", len(pods), strings.Join(podDescription, " // "))
1015+
}
1016+
9561017
// fetchEndpointSlice fetches the EndpointSlice of the provided service name. It currently supports only one EndpointSlice instance for simplicity.
9571018
func (r *routeStackBuilder) fetchEndpointSlice(ctx context.Context, serviceName string) (*discoveryv1.EndpointSlice, error) {
9581019
listOpts := metav1.ListOptions{LabelSelector: discoveryv1.LabelServiceName + "=" + serviceName}
@@ -967,8 +1028,8 @@ func (r *routeStackBuilder) fetchEndpointSlice(ctx context.Context, serviceName
9671028
return &epsList.Items[0], nil
9681029
}
9691030

970-
// fetchServiceReplicas fetches the pod names from the exposed common deployment. It requires that `exposeDeployment()` was already called.
971-
func (r *routeStackBuilder) fetchServiceReplicas(ctx context.Context) ([]string, error) {
1031+
// fetchPods fetches the pods from the exposed common deployment. It requires that `exposeDeployment()` was already called.
1032+
func (r *routeStackBuilder) fetchPods(ctx context.Context) ([]corev1.Pod, error) {
9721033
svc, err := r.kubeClient.CoreV1().Services(r.namespace).Get(ctx, r.resourceName, metav1.GetOptions{})
9731034
if err != nil {
9741035
return nil, err
@@ -978,9 +1039,18 @@ func (r *routeStackBuilder) fetchServiceReplicas(ctx context.Context) ([]string,
9781039
if err != nil {
9791040
return nil, err
9801041
}
981-
backendServers := make([]string, len(pods.Items))
982-
for i := range pods.Items {
983-
backendServers[i] = pods.Items[i].Name
1042+
return pods.Items, nil
1043+
}
1044+
1045+
// fetchServiceReplicas fetches the pod names from the exposed common deployment. It requires that `exposeDeployment()` was already called.
1046+
func (r *routeStackBuilder) fetchServiceReplicas(ctx context.Context) ([]string, error) {
1047+
pods, err := r.fetchPods(ctx)
1048+
if err != nil {
1049+
return nil, err
1050+
}
1051+
backendServers := make([]string, len(pods))
1052+
for i := range pods {
1053+
backendServers[i] = pods[i].Name
9841054
}
9851055
return backendServers, nil
9861056
}

test/extended/router/stress.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"bytes"
55
"context"
66
"fmt"
7+
"strconv"
78
"strings"
89
"text/tabwriter"
910
"time"
@@ -16,6 +17,7 @@ import (
1617

1718
appsv1 "k8s.io/api/apps/v1"
1819
corev1 "k8s.io/api/core/v1"
20+
discoveryv1 "k8s.io/api/discovery/v1"
1921
rbacv1 "k8s.io/api/rbac/v1"
2022
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2123
"k8s.io/apimachinery/pkg/runtime"
@@ -656,6 +658,64 @@ func outputIngress(routes ...routev1.Route) {
656658
e2e.Logf("Routes:\n%s", b.String())
657659
}
658660

661+
func outputEndpoints(endpoints ...corev1.Endpoints) {
662+
b := &bytes.Buffer{}
663+
w := tabwriter.NewWriter(b, 0, 0, 2, ' ', 0)
664+
fmt.Fprintf(w, "NAME\tADDRESSES\tNOT READY ADDRESSES\tPORTS\n")
665+
for _, ep := range endpoints {
666+
for _, ss := range ep.Subsets {
667+
resumeAddrs := func(addrs []corev1.EndpointAddress) string {
668+
var addrList []string
669+
for _, addr := range addrs {
670+
val := "-"
671+
if addr.IP != "" {
672+
val = addr.IP
673+
} else if addr.Hostname != "" {
674+
val = addr.Hostname
675+
}
676+
addrList = append(addrList, val)
677+
}
678+
return strings.Join(addrList, ",")
679+
}
680+
var portList []string
681+
for _, port := range ss.Ports {
682+
portList = append(portList, strconv.Itoa(int(port.Port)))
683+
}
684+
fmt.Fprintf(w, "%s\t%s\t%s\t%s\n", ep.Name, resumeAddrs(ss.Addresses), resumeAddrs(ss.NotReadyAddresses), strings.Join(portList, ","))
685+
}
686+
}
687+
w.Flush()
688+
e2e.Logf("Endpoints:\n%s", b.String())
689+
}
690+
691+
func outputEndpointSlice(epss ...discoveryv1.EndpointSlice) {
692+
b := &bytes.Buffer{}
693+
w := tabwriter.NewWriter(b, 0, 0, 2, ' ', 0)
694+
fmt.Fprintf(w, "NAME\tSERVICE\tADDRESSES\tNOT READY ADDRESSES\tPORTS\n")
695+
for _, eps := range epss {
696+
var addrList, notReadyAddrList []string
697+
for _, ep := range eps.Endpoints {
698+
addrs := strings.Join(ep.Addresses, "+")
699+
if ready := ep.Conditions.Ready; ready == nil || *ready == true {
700+
addrList = append(addrList, addrs)
701+
} else {
702+
notReadyAddrList = append(notReadyAddrList, addrs)
703+
}
704+
}
705+
var portList []string
706+
for _, port := range eps.Ports {
707+
val := "-"
708+
if port.Port != nil {
709+
val = strconv.Itoa(int(*port.Port))
710+
}
711+
portList = append(portList, val)
712+
}
713+
fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", eps.Name, eps.Labels[discoveryv1.LabelServiceName], strings.Join(addrList, ","), strings.Join(notReadyAddrList, ","), strings.Join(portList, ","))
714+
}
715+
w.Flush()
716+
e2e.Logf("EndpointSlices:\n%s", b.String())
717+
}
718+
659719
// findMostRecentConditionTime returns the time of the most recent condition.
660720
func findMostRecentConditionTime(conditions []routev1.RouteIngressCondition) time.Time {
661721
var recent time.Time

0 commit comments

Comments
 (0)