Commit f79d89c

test: ensure cleanup of LBs

1 parent ed1782f · commit f79d89c

1 file changed: tests/e2e/helper_test.go (83 additions & 10 deletions)

@@ -23,6 +23,7 @@ import (
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/clientcmd"
 
+	"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops"
 	"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport"
 	"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/utils"
 	"github.com/hetznercloud/hcloud-go/v2/hcloud"
@@ -112,10 +113,21 @@ func (tc *TestCluster) Stop() error {
 	errs := make([]error, 0, tc.loadBalancers.Size()+tc.certificates.Size())
 	ctx := context.Background()
 
-	for _, item := range tc.loadBalancers.All() {
-		fmt.Printf("deleting load balancer %d\n", item)
-		if _, err := tc.hcloud.LoadBalancer.Delete(ctx, &hcloud.LoadBalancer{ID: item}); err != nil {
-			errs = append(errs, fmt.Errorf("delete load balancer %d failed: %w", item, err))
+	// Leak sweep: any registered Load Balancer still present here means the
+	// hccm finalizer did not release it during namespace teardown. This can
+	// cause issues when deleting the Private Network afterward.
+	for _, id := range tc.loadBalancers.All() {
+		lb, _, err := tc.hcloud.LoadBalancer.GetByID(ctx, id)
+		if err != nil {
+			errs = append(errs, fmt.Errorf("checking load balancer %d for leak: %w", id, err))
+			continue
+		}
+		if lb == nil {
+			continue // released by hccm finalizer, nothing to do
+		}
+		fmt.Printf("force-deleting leaked load balancer %d (%s)\n", id, lb.Name)
+		if _, err := tc.hcloud.LoadBalancer.Delete(ctx, lb); err != nil {
+			errs = append(errs, fmt.Errorf("delete leaked load balancer %d failed: %w", id, err))
 		}
 	}
 
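
Note: the sweep treats tc.loadBalancers as a set of hcloud Load Balancer IDs; only its Size, All, and Add methods appear in this diff. For reference, a minimal concurrency-safe registry with that surface could look like the sketch below — the type name and package are assumptions, not the repo's actual implementation.

// Hypothetical sketch of an ID registry matching the Size/All/Add calls
// in the diff; the real type lives elsewhere in the test suite.
package testsupport

import "sync"

type IDRegistry struct {
	mu  sync.Mutex
	ids map[int64]struct{}
}

func NewIDRegistry() *IDRegistry {
	return &IDRegistry{ids: make(map[int64]struct{})}
}

// Add records an ID; duplicates are collapsed.
func (r *IDRegistry) Add(id int64) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.ids[id] = struct{}{}
}

func (r *IDRegistry) Size() int {
	r.mu.Lock()
	defer r.mu.Unlock()
	return len(r.ids)
}

// All returns a snapshot of the registered IDs.
func (r *IDRegistry) All() []int64 {
	r.mu.Lock()
	defer r.mu.Unlock()
	out := make([]int64, 0, len(r.ids))
	for id := range r.ids {
		out = append(out, id)
	}
	return out
}
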
@@ -228,7 +240,7 @@ func (l *lbTestHelper) DeployTestPod() (*corev1.Pod, error) {
 		return nil, fmt.Errorf("could not create test pod: %w", err)
 	}
 
-	err = wait.PollUntilContextTimeout(ctx, 1*time.Second, 1*time.Minute, false, func(ctx context.Context) (done bool, err error) {
+	err = wait.PollUntilContextTimeout(ctx, 1*time.Second, 2*time.Minute, false, func(ctx context.Context) (done bool, err error) {
 		p, err := testCluster.k8sClient.CoreV1().Pods(l.namespace).Get(ctx, podName, metav1.GetOptions{})
 		if err != nil {
 			return false, err
@@ -242,7 +254,7 @@ func (l *lbTestHelper) DeployTestPod() (*corev1.Pod, error) {
 		return false, nil
 	})
 	if err != nil {
-		return nil, fmt.Errorf("pod %s did not come up after 1 minute: %w", podName, err)
+		return nil, fmt.Errorf("pod %s did not come up after 2 minutes: %w", podName, err)
 	}
 
 	return pod, nil
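
Note: wait.PollUntilContextTimeout comes from k8s.io/apimachinery/pkg/util/wait. It runs the condition every interval until the condition returns true or an error, or until the timeout expires; the fourth argument controls whether the first check runs immediately or only after one interval. A self-contained usage sketch with an illustrative condition:

package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	start := time.Now()
	// Poll every second, give up after 10s; false means the first check
	// happens after one interval rather than immediately.
	err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 10*time.Second, false,
		func(ctx context.Context) (bool, error) {
			// Illustrative condition: done once 3 seconds have passed.
			return time.Since(start) > 3*time.Second, nil
		})
	fmt.Println("done:", err) // nil if the condition returned true in time
}
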
@@ -299,6 +311,9 @@ func (l *lbTestHelper) CreateService(lbSvc *corev1.Service) (*corev1.Service, error) {
 		}
 
 		if len(svc.Status.LoadBalancer.Ingress) > 0 {
+			if err := testCluster.registerServiceLoadBalancers(ctx, svc); err != nil {
+				return nil, err
+			}
 			return svc, nil
 		}
 
@@ -311,6 +326,24 @@ func (l *lbTestHelper) CreateService(lbSvc *corev1.Service) (*corev1.Service, error) {
 	}
 }
 
+// registerServiceLoadBalancers looks up the hcloud Load Balancers that hccm
+// created for svc (identified by the service-uid label) and tracks their IDs
+// so TestCluster.Stop can detect finalizer leaks after the suite runs.
+func (tc *TestCluster) registerServiceLoadBalancers(ctx context.Context, svc *corev1.Service) error {
+	lbs, err := tc.hcloud.LoadBalancer.AllWithOpts(ctx, hcloud.LoadBalancerListOpts{
+		ListOpts: hcloud.ListOpts{
+			LabelSelector: fmt.Sprintf("%s=%s", hcops.LabelServiceUID, svc.UID),
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("listing hcloud load balancers for service %s/%s: %w", svc.Namespace, svc.Name, err)
+	}
+	for _, lb := range lbs {
+		tc.loadBalancers.Add(lb.ID)
+	}
+	return nil
+}
+
 // TearDown deletes the created pod and service.
 func (l *lbTestHelper) TearDown() {
 	l.t.Helper()
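
Note: the lookup works because hccm labels every Load Balancer it creates with the owning Service's UID, under the key exported as hcops.LabelServiceUID. A standalone sketch of the same list-by-label pattern with hcloud-go v2; the token handling and the literal label key/value here are illustrative, not taken from the repo:

package main

import (
	"context"
	"fmt"
	"os"

	"github.com/hetznercloud/hcloud-go/v2/hcloud"
)

func main() {
	client := hcloud.NewClient(hcloud.WithToken(os.Getenv("HCLOUD_TOKEN")))

	// List only Load Balancers carrying the given label; the key/value shown
	// here is illustrative — the test code builds it from hcops.LabelServiceUID
	// and the Service's UID.
	lbs, err := client.LoadBalancer.AllWithOpts(context.Background(), hcloud.LoadBalancerListOpts{
		ListOpts: hcloud.ListOpts{
			LabelSelector: "example.com/service-uid=2f9a0e1c-example",
		},
	})
	if err != nil {
		panic(err)
	}
	for _, lb := range lbs {
		fmt.Printf("%d\t%s\n", lb.ID, lb.Name)
	}
}
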
@@ -323,7 +356,15 @@ func (l *lbTestHelper) TearDown() {
 
 	// Use context.Background() rather than t.Context(): cleanup must run to
 	// completion even when the test has already been cancelled or failed.
-	err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) {
+	ctx := context.Background()
+
+	// Delete LoadBalancer Services explicitly before the namespace. If the
+	// hccm finalizer is stuck releasing Hetzner resources, the error surfaces
+	// here as an attributable Service-delete timeout instead of a generic
+	// namespace-delete timeout.
+	l.deleteLoadBalancerServices(ctx)
+
+	err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) {
 		err := testCluster.k8sClient.CoreV1().Namespaces().Delete(ctx, l.namespace, metav1.DeleteOptions{})
 		if err != nil && !k8serrors.IsNotFound(err) {
 			return false, err
@@ -336,9 +377,41 @@ func (l *lbTestHelper) TearDown() {
 	}
 }
 
+func (l *lbTestHelper) deleteLoadBalancerServices(ctx context.Context) {
+	svcClient := testCluster.k8sClient.CoreV1().Services(l.namespace)
+
+	svcList, err := svcClient.List(ctx, metav1.ListOptions{})
+	if err != nil {
+		if !k8serrors.IsNotFound(err) {
+			l.t.Logf("error listing services in namespace %s: %v", l.namespace, err)
+		}
+		return
+	}
+
+	for _, svc := range svcList.Items {
+		if svc.Spec.Type != corev1.ServiceTypeLoadBalancer {
+			continue
+		}
+		if err := svcClient.Delete(ctx, svc.Name, metav1.DeleteOptions{}); err != nil && !k8serrors.IsNotFound(err) {
+			l.t.Logf("error deleting service %s/%s: %v", l.namespace, svc.Name, err)
+			continue
+		}
+		err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
+			_, err := svcClient.Get(ctx, svc.Name, metav1.GetOptions{})
+			if k8serrors.IsNotFound(err) {
+				return true, nil
+			}
+			return false, err
+		})
+		if err != nil {
+			l.t.Logf("service %s/%s did not delete within 2m (hccm finalizer may be stuck): %v", l.namespace, svc.Name, err)
+		}
+	}
+}
+
 // WaitForHTTPAvailable tries to connect to the given IP via HTTP or HTTPS
 // (controlled by useHTTPS). It uses exponential backoff starting at 1s and
-// capping at 30s, waiting up to 6 minutes for a successful HTTP 200 response.
+// capping at 30s, waiting up to 8 minutes for a successful HTTP 200 response.
 // Each individual request has a 5s timeout.
 func (l *lbTestHelper) WaitForHTTPAvailable(ingressIP string, useHTTPS bool) error {
 	l.t.Helper()
@@ -356,7 +429,7 @@ func (l *lbTestHelper) WaitForHTTPAvailable(ingressIP string, useHTTPS bool) error {
 		proto = "https"
 	}
 
-	ctx, cancel := context.WithTimeout(l.t.Context(), 6*time.Minute)
+	ctx, cancel := context.WithTimeout(l.t.Context(), 8*time.Minute)
 	defer cancel()
 
 	retries := 0
@@ -378,7 +451,7 @@ func (l *lbTestHelper) WaitForHTTPAvailable(ingressIP string, useHTTPS bool) error {
 
 		select {
 		case <-ctx.Done():
-			return fmt.Errorf("timed out after 6m waiting for %s to be available", ingressIP)
+			return fmt.Errorf("timed out after 8m waiting for %s to be available", ingressIP)
 		case <-time.After(pollBackoff(retries)):
 			retries++
 		}
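
Note: pollBackoff itself is not touched by this commit, so its body is not shown. Going only by the doc comment on WaitForHTTPAvailable (exponential backoff starting at 1s, capped at 30s), a hypothetical implementation consistent with that description:

// Hypothetical reconstruction of pollBackoff, based only on the doc comment
// above: exponential backoff starting at 1s, doubling each retry, capped at
// 30s. Assumes the surrounding file's existing "time" import.
func pollBackoff(retries int) time.Duration {
	if retries > 5 { // 1s << 5 = 32s, already past the cap
		return 30 * time.Second
	}
	d := time.Second << retries // 1s, 2s, 4s, 8s, 16s, 32s
	if d > 30*time.Second {
		d = 30 * time.Second
	}
	return d
}
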
