Skip to content

Commit 7d0c97a

Browse files
authored
test: address e2e test flakiness (#1222)
- Increase Load Balancer creation timeout to prevent early test abort (previously decreased: bccaddb#diff-8d48bac6a439e62ea3b0740b26d2d6b9c93e1aeebc67aa9cc562abac695b13d5L291)
- If we can't cleanly destroy the namespace after three minutes, we can info-log the error and rely on the Servers being destroyed
- Clean up all remaining Load Balancers when stopping the cluster
1 parent b550fa5 commit 7d0c97a

2 files changed

Lines changed: 64 additions & 20 deletions

File tree

tests/e2e/cloud_test.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,6 @@ func TestServiceLoadBalancersMinimalSetup(t *testing.T) {
8181
}
8282

8383
func TestServiceLoadBalancersHTTPS(t *testing.T) {
84-
t.Parallel()
85-
8684
lbTest := lbTestHelper{
8785
t: t,
8886
podName: "loadbalancer-https",
@@ -112,8 +110,6 @@ func TestServiceLoadBalancersHTTPS(t *testing.T) {
112110
}
113111

114112
func TestServiceLoadBalancersHTTPSWithManagedCertificate(t *testing.T) {
115-
t.Parallel()
116-
117113
if testCluster.certDomain == "" {
118114
t.Skip("Skipping because CERT_DOMAIN is not set")
119115
}

tests/e2e/helper_test.go

Lines changed: 64 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ import (
1111
"net/http"
1212
"os"
1313
"strconv"
14+
"strings"
1415
"testing"
1516
"time"
1617

17-
"github.com/stretchr/testify/require"
1818
hrobot "github.com/syself/hrobot-go"
1919
corev1 "k8s.io/api/core/v1"
2020
k8serrors "k8s.io/apimachinery/pkg/api/errors"
@@ -24,11 +24,37 @@ import (
2424
"k8s.io/client-go/kubernetes"
2525
"k8s.io/client-go/tools/clientcmd"
2626

27+
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/annotation"
28+
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops"
2729
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport"
2830
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/utils"
2931
"github.com/hetznercloud/hcloud-go/v2/hcloud"
3032
)
3133

34+
// Load Balancer creation timeouts. Services that involve certificate
35+
// provisioning take longer to become ready.
36+
const (
37+
lbCreateTimeoutDefault = 8 * time.Minute
38+
lbCreateTimeoutCert = 16 * time.Minute
39+
)
40+
41+
// lbCreateTimeoutFor returns the timeout to use when waiting for the given
42+
// Load Balancer service to become ready.
43+
func lbCreateTimeoutFor(svc *corev1.Service) time.Duration {
44+
certAnnotations := []annotation.Name{
45+
annotation.LBSvcHTTPCertificates,
46+
annotation.LBSvcHTTPCertificateType,
47+
annotation.LBSvcHTTPManagedCertificateName,
48+
annotation.LBSvcHTTPManagedCertificateDomains,
49+
}
50+
for _, a := range certAnnotations {
51+
if _, ok := svc.Annotations[string(a)]; ok {
52+
return lbCreateTimeoutCert
53+
}
54+
}
55+
return lbCreateTimeoutDefault
56+
}
57+
3258
var rng *rand.Rand
3359

3460
func init() {
@@ -53,7 +79,7 @@ type TestCluster struct {
5379
certDomain string
5480

5581
certificates *utils.SyncSet[int64]
56-
loadBalancers *utils.SyncSet[int64]
82+
loadBalancers *utils.SyncSet[string]
5783
}
5884

5985
func (tc *TestCluster) Start() error {
@@ -104,7 +130,7 @@ func (tc *TestCluster) Start() error {
104130
tc.certDomain = os.Getenv("CERT_DOMAIN")
105131

106132
tc.certificates = utils.NewSyncSet[int64]()
107-
tc.loadBalancers = utils.NewSyncSet[int64]()
133+
tc.loadBalancers = utils.NewSyncSet[string]()
108134

109135
return nil
110136
}
@@ -113,14 +139,26 @@ func (tc *TestCluster) Stop() error {
113139
errs := make([]error, 0, tc.loadBalancers.Size()+tc.certificates.Size())
114140
ctx := context.Background()
115141

116-
for _, item := range tc.loadBalancers.All() {
117-
fmt.Printf("deleting load balancer %d\n", item)
118-
if _, err := tc.hcloud.LoadBalancer.Delete(ctx, &hcloud.LoadBalancer{ID: item}); err != nil {
119-
errs = append(errs, fmt.Errorf("delete load balancer %d failed: %w", item, err))
142+
uids := tc.loadBalancers.All()
143+
selector := fmt.Sprintf("%s in (%s)", hcops.LabelServiceUID, strings.Join(uids, ","))
144+
lbs, err := tc.hcloud.LoadBalancer.AllWithOpts(ctx, hcloud.LoadBalancerListOpts{
145+
ListOpts: hcloud.ListOpts{
146+
LabelSelector: selector,
147+
},
148+
})
149+
if err != nil {
150+
return fmt.Errorf("fetching load balancers via selector %s: %w", selector, err)
151+
}
152+
for _, lb := range lbs {
153+
// Stop is called after `m.Run()`, so we can't use t.Log anymore.
154+
fmt.Printf("force-deleting leaked load balancer %d (%s)\n", lb.ID, lb.Name)
155+
if _, err := tc.hcloud.LoadBalancer.Delete(ctx, lb); err != nil {
156+
errs = append(errs, fmt.Errorf("delete leaked load balancer %d failed: %w", lb.ID, err))
120157
}
121158
}
122159

123160
for _, item := range tc.certificates.All() {
161+
// Stop is called after `m.Run()`, so we can't use t.Log anymore.
124162
fmt.Printf("deleting certificate %d\n", item)
125163
if _, err := tc.hcloud.Certificate.Delete(ctx, &hcloud.Certificate{ID: item}); err != nil {
126164
errs = append(errs, fmt.Errorf("delete certificate %d failed: %w", item, err))
@@ -229,7 +267,7 @@ func (l *lbTestHelper) DeployTestPod() (*corev1.Pod, error) {
229267
return nil, fmt.Errorf("could not create test pod: %w", err)
230268
}
231269

232-
err = wait.PollUntilContextTimeout(ctx, 1*time.Second, 1*time.Minute, false, func(ctx context.Context) (done bool, err error) {
270+
err = wait.PollUntilContextTimeout(ctx, 1*time.Second, 2*time.Minute, false, func(ctx context.Context) (done bool, err error) {
233271
p, err := testCluster.k8sClient.CoreV1().Pods(l.namespace).Get(ctx, podName, metav1.GetOptions{})
234272
if err != nil {
235273
return false, err
@@ -243,7 +281,7 @@ func (l *lbTestHelper) DeployTestPod() (*corev1.Pod, error) {
243281
return false, nil
244282
})
245283
if err != nil {
246-
return nil, fmt.Errorf("pod %s did not come up after 1 minute: %w", podName, err)
284+
return nil, fmt.Errorf("pod %s did not come up after 2 minutes: %w", podName, err)
247285
}
248286

249287
return pod, nil
@@ -280,16 +318,21 @@ func (l *lbTestHelper) ServiceDefinition(pod *corev1.Pod, annotations map[string
280318
}
281319

282320
// CreateService creates a k8s service based on the given service definition
283-
// and waits until it is "ready".
321+
// and waits until it is "ready". The wait timeout is derived from the service
322+
// annotations: services that provision certificates get a longer timeout.
284323
func (l *lbTestHelper) CreateService(lbSvc *corev1.Service) (*corev1.Service, error) {
285324
l.t.Helper()
286325

326+
timeout := lbCreateTimeoutFor(lbSvc)
327+
287328
lbSvc, err := testCluster.k8sClient.CoreV1().Services(l.namespace).Create(l.t.Context(), lbSvc, metav1.CreateOptions{})
288329
if err != nil {
289330
return nil, fmt.Errorf("could not create service: %w", err)
290331
}
291332

292-
ctx, cancel := context.WithTimeout(l.t.Context(), 4*time.Minute)
333+
testCluster.loadBalancers.Add(string(lbSvc.UID))
334+
335+
ctx, cancel := context.WithTimeout(l.t.Context(), timeout)
293336
defer cancel()
294337

295338
retries := 0
@@ -324,19 +367,24 @@ func (l *lbTestHelper) TearDown() {
324367

325368
// Use context.Background() rather than t.Context(): cleanup must run to
326369
// completion even when the test has already been cancelled or failed.
327-
err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) {
370+
ctx := context.Background()
371+
372+
err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) {
328373
err := testCluster.k8sClient.CoreV1().Namespaces().Delete(ctx, l.namespace, metav1.DeleteOptions{})
329374
if err != nil && !k8serrors.IsNotFound(err) {
330375
return false, err
331376
}
332377
return k8serrors.IsNotFound(err), nil
333378
})
334-
require.NoError(l.t, err)
379+
if err != nil {
380+
// The cluster is deleted afterward, so we can info log this error
381+
l.t.Logf("error tearing down test namespace: %v", err)
382+
}
335383
}
336384

337385
// WaitForHTTPAvailable tries to connect to the given IP via HTTP or HTTPS
338386
// (controlled by useHTTPS). It uses exponential backoff starting at 1s and
339-
// capping at 30s, waiting up to 6 minutes for a successful HTTP 200 response.
387+
// capping at 30s, waiting up to 8 minutes for a successful HTTP 200 response.
340388
// Each individual request has a 5s timeout.
341389
func (l *lbTestHelper) WaitForHTTPAvailable(ingressIP string, useHTTPS bool) error {
342390
l.t.Helper()
@@ -354,7 +402,7 @@ func (l *lbTestHelper) WaitForHTTPAvailable(ingressIP string, useHTTPS bool) err
354402
proto = "https"
355403
}
356404

357-
ctx, cancel := context.WithTimeout(l.t.Context(), 6*time.Minute)
405+
ctx, cancel := context.WithTimeout(l.t.Context(), 8*time.Minute)
358406
defer cancel()
359407

360408
retries := 0
@@ -376,7 +424,7 @@ func (l *lbTestHelper) WaitForHTTPAvailable(ingressIP string, useHTTPS bool) err
376424

377425
select {
378426
case <-ctx.Done():
379-
return fmt.Errorf("timed out after 6m waiting for %s to be available", ingressIP)
427+
return fmt.Errorf("timed out after 8m waiting for %s to be available", ingressIP)
380428
case <-time.After(pollBackoff(retries)):
381429
retries++
382430
}

0 commit comments

Comments (0)