From 86d25959daa8113c9bac4a3d2dad0c3c874cf10e Mon Sep 17 00:00:00 2001 From: david-yu Date: Fri, 10 Apr 2026 13:56:42 -0700 Subject: [PATCH 1/2] fix: wait for cert-manager webhook before decommissioning integration test The TestIntegrationStatefulSetDecommissioner test intermittently fails with "no endpoints available for service cert-manager-webhook" because helm install runs before the cert-manager webhook pod has ready endpoints. Add testutil.WaitForCertManagerWebhook() in SetupSuite before any helm operations, matching the pattern used by vcluster, k3d, helmtest, and acceptance test setups. Also add webhook error retry to installChart, matching the existing retry in upgradeChart. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../statefulset_decommissioner_test.go | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/operator/internal/controller/decommissioning/statefulset_decommissioner_test.go b/operator/internal/controller/decommissioning/statefulset_decommissioner_test.go index 7ca6b6a64..d1767743d 100644 --- a/operator/internal/controller/decommissioning/statefulset_decommissioner_test.go +++ b/operator/internal/controller/decommissioning/statefulset_decommissioner_test.go @@ -240,6 +240,15 @@ func (s *StatefulSetDecommissionerSuite) SetupSuite() { // pods can schedule and cert-manager webhook stays available. s.cleanupStaleState() + // Wait for cert-manager webhook to have ready endpoints before + // proceeding. Without this, helm installs that create Certificate + // resources can fail with "no endpoints available for service + // cert-manager-webhook" under parallel test load. 
+ s.Require().NoError( + testutil.WaitForCertManagerWebhook(s.ctx, s.client, 2*time.Minute), + "cert-manager webhook not ready", + ) + s.env.SetupManager(s.setupRBAC(), func(mcmgr multicluster.Manager) error { mgr := mcmgr.GetLocalManager() helmClient, err := helm.New(helm.Options{ @@ -310,12 +319,23 @@ func (s *StatefulSetDecommissionerSuite) installChart(name string, overrides map values = functional.MergeMaps(values, overrides) } - release, err := s.helm.Install(s.ctx, redpandaChartPath, helm.InstallOptions{ - CreateNamespace: true, - Name: name, - Namespace: s.env.Namespace(), - Values: values, - }) + // Retry on transient cert-manager webhook errors, matching the + // pattern in upgradeChart. + var release helm.Release + var err error + for attempt := range 3 { + release, err = s.helm.Install(s.ctx, redpandaChartPath, helm.InstallOptions{ + CreateNamespace: true, + Name: name, + Namespace: s.env.Namespace(), + Values: values, + }) + if err == nil || !strings.Contains(err.Error(), "webhook") { + break + } + s.T().Logf("helm install attempt %d failed with webhook error, retrying: %v", attempt+1, err) + time.Sleep(10 * time.Second) + } s.Require().NoError(err) c := &chart{ From 553c34d4352def078a39025e7108d4365798fdeb Mon Sep 17 00:00:00 2001 From: david-yu Date: Fri, 10 Apr 2026 14:29:48 -0700 Subject: [PATCH 2/2] fix: align cert-manager versions, pre-import images, and auto-recreate stale k3d clusters Root cause: CI agents persist k3d clusters across builds. When the cert-manager version in the startup manifest changes, loadCluster patches the HelmChart which triggers an in-place upgrade. This disrupts the running webhook during transition, causing "cert-manager webhook not ready" timeouts. Fixes: 1. Align cert-manager to v1.17.2 (pre-pulled by CI) in both pkg/k3d/cert-manager.yaml and pkg/testutil/testutil.go. 2. Pre-import cert-manager images into k3d containerd in waitForJobs() so the helm controller doesn't need to pull from the internet. 3. 
Auto-recreate stale clusters: if loadCluster fails (e.g. webhook never becomes ready after manifest upgrade), GetOrCreate deletes the unhealthy cluster and creates a fresh one. This handles the case where a CI agent has a k3d cluster from a previous build with an incompatible cert-manager version. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../statefulset_decommissioner_test.go | 57 ++++++- pkg/k3d/cert-manager.yaml | 2 +- pkg/k3d/k3d.go | 141 +++++++++++++++++- pkg/testutil/testutil.go | 2 +- 4 files changed, 191 insertions(+), 11 deletions(-) diff --git a/operator/internal/controller/decommissioning/statefulset_decommissioner_test.go b/operator/internal/controller/decommissioning/statefulset_decommissioner_test.go index d1767743d..deaea87e3 100644 --- a/operator/internal/controller/decommissioning/statefulset_decommissioner_test.go +++ b/operator/internal/controller/decommissioning/statefulset_decommissioner_test.go @@ -13,6 +13,7 @@ import ( "context" _ "embed" "encoding/json" + "fmt" "strings" "testing" "time" @@ -210,6 +211,53 @@ func (s *StatefulSetDecommissionerSuite) untaintNode(name string) { s.Require().NoError(s.client.Update(s.ctx, &node)) } +func (s *StatefulSetDecommissionerSuite) dumpCertManagerDiagnostics() { + t := s.T() + t.Log("=== cert-manager diagnostics ===") + + var pods corev1.PodList + if err := s.client.List(s.ctx, &pods, client.InNamespace("cert-manager")); err != nil { + t.Logf(" failed to list cert-manager pods: %v", err) + } else if len(pods.Items) == 0 { + t.Log(" NO pods found in cert-manager namespace") + } else { + for _, pod := range pods.Items { + t.Logf(" pod/%s phase=%s", pod.Name, pod.Status.Phase) + for _, cs := range pod.Status.ContainerStatuses { + msg := fmt.Sprintf(" container %s: ready=%v restarts=%d", cs.Name, cs.Ready, cs.RestartCount) + if cs.State.Waiting != nil { + msg += fmt.Sprintf(" waiting=%s(%s)", cs.State.Waiting.Reason, cs.State.Waiting.Message) + } + if cs.State.Terminated != nil { + msg += 
fmt.Sprintf(" terminated=%s(exit=%d)", cs.State.Terminated.Reason, cs.State.Terminated.ExitCode) + } + t.Log(msg) + } + } + } + + var nodes corev1.NodeList + if err := s.client.List(s.ctx, &nodes); err != nil { + t.Logf(" failed to list nodes: %v", err) + } else { + for _, node := range nodes.Items { + taints := "" + for _, taint := range node.Spec.Taints { + taints += fmt.Sprintf(" %s=%s:%s", taint.Key, taint.Value, taint.Effect) + } + ready := "NotReady" + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { + ready = "Ready" + } + } + t.Logf(" node/%s %s taints=[%s]", node.Name, ready, strings.TrimSpace(taints)) + } + } + + t.Log("=== end cert-manager diagnostics ===") +} + func (s *StatefulSetDecommissionerSuite) SetupSuite() { t := s.T() @@ -244,10 +292,11 @@ func (s *StatefulSetDecommissionerSuite) SetupSuite() { // proceeding. Without this, helm installs that create Certificate // resources can fail with "no endpoints available for service // cert-manager-webhook" under parallel test load. - s.Require().NoError( - testutil.WaitForCertManagerWebhook(s.ctx, s.client, 2*time.Minute), - "cert-manager webhook not ready", - ) + if err := testutil.WaitForCertManagerWebhook(s.ctx, s.client, 2*time.Minute); err != nil { + // Dump diagnostics before failing. 
+ s.dumpCertManagerDiagnostics() + s.Require().NoError(err, "cert-manager webhook not ready") + } s.env.SetupManager(s.setupRBAC(), func(mcmgr multicluster.Manager) error { mgr := mcmgr.GetLocalManager() diff --git a/pkg/k3d/cert-manager.yaml b/pkg/k3d/cert-manager.yaml index 09071e080..2b91df6c3 100644 --- a/pkg/k3d/cert-manager.yaml +++ b/pkg/k3d/cert-manager.yaml @@ -9,7 +9,7 @@ spec: chart: cert-manager createNamespace: true targetNamespace: cert-manager - version: "v1.16.1" + version: "v1.17.2" valuesContent: |- crds: enabled: true diff --git a/pkg/k3d/k3d.go b/pkg/k3d/k3d.go index 698de4aaf..c9c54cd0a 100644 --- a/pkg/k3d/k3d.go +++ b/pkg/k3d/k3d.go @@ -34,6 +34,7 @@ import ( batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/clientcmd" "sigs.k8s.io/controller-runtime/pkg/client" @@ -203,7 +204,23 @@ func GetOrCreate(name string, opts ...ClusterOpt) (*Cluster, error) { cluster, err := NewCluster(name, opts...) if err != nil { if errors.Is(err, ErrExists) { - return loadCluster(name, config) + c, loadErr := loadCluster(name, config) + if loadErr != nil { + // Cluster exists but is unhealthy (e.g. stale cert-manager + // from a previous build). Delete and recreate. + fmt.Fprintf(os.Stderr, "WARNING: existing k3d cluster %q is unhealthy (%v), deleting and recreating\n", name, loadErr) + deleteCluster(name) + clearImageMarkers(name) + c2, err2 := NewCluster(name, opts...) + if err2 != nil { + return nil, errors.Wrapf(err2, "recreating cluster %q after unhealthy load", name) + } + if err2 := c2.importImages("localhost/redpanda-operator:dev"); err2 != nil { + return nil, err2 + } + return c2, nil + } + return c, nil } return nil, err } @@ -215,6 +232,15 @@ func GetOrCreate(name string, opts ...ClusterOpt) (*Cluster, error) { return cluster, nil } +// deleteCluster removes a k3d cluster by name. 
Best-effort — errors are logged +// but not returned, since this is used as cleanup before recreation. +func deleteCluster(name string) { + out, err := exec.Command("k3d", "cluster", "delete", name).CombinedOutput() + if err != nil { + fmt.Fprintf(os.Stderr, "WARNING: failed to delete k3d cluster %q: %v: %s\n", name, err, out) + } +} + // imageMarkerPath returns a file path used to track whether an image has // already been imported into a given k3d cluster. func imageMarkerPath(clusterName, image string) string { @@ -342,12 +368,12 @@ Use testutils.SkipIfNotIntegration or testutils.SkipIfNotAcceptance to gate test // If k3d cluster create will fail please uncomment the following debug logs from containers for i := 0; i < config.agents; i++ { containerLogs, _ := exec.Command("docker", "logs", fmt.Sprintf("k3d-%s-agent-%d", name, i)).CombinedOutput() - fmt.Printf("Agent-%d logs:\n%s\n", i, string(containerLogs)) + fmt.Fprintf(os.Stderr, "Agent-%d logs:\n%s\n", i, string(containerLogs)) } containerLogs, _ := exec.Command("docker", "logs", fmt.Sprintf("k3d-%s-server-0", name)).CombinedOutput() - fmt.Printf("server-0 logs:\n%s\n", string(containerLogs)) + fmt.Fprintf(os.Stderr, "server-0 logs:\n%s\n", string(containerLogs)) containerLogs, _ = exec.Command("docker", "network", "inspect", config.network).CombinedOutput() - fmt.Printf("docker network inspect:\n%s\n", string(containerLogs)) + fmt.Fprintf(os.Stderr, "docker network inspect:\n%s\n", string(containerLogs)) return nil, errors.Wrapf(err, "%s", out) } @@ -509,6 +535,17 @@ func (c *Cluster) waitForJobs(ctx context.Context) error { } if !c.skipManifests { + // Pre-import cert-manager images into the k3d cluster so the k3s + // helm controller doesn't need to pull them from the internet. + // The CI `test:pull-images` task downloads these into the host + // Docker daemon, but k3d's containerd is separate — images must + // be explicitly imported via `k3d image import`. 
+ if err := c.importImages(certManagerImages()...); err != nil {
+ // Non-fatal: if the images aren't in the host Docker daemon
+ // (e.g. local dev), k3s will pull them from the registry.
+ fmt.Fprintf(os.Stderr, "WARNING: failed to import cert-manager images (will rely on registry pull): %v\n", err)
+ }
+
 // NB: Originally this functionality was achieved via the --volume flag to
 // k3d but CI runs via a docker in docker setup which makes it unreasonable
 // to use --volume.
@@ -549,11 +586,105 @@ func (c *Cluster) waitForJobs(ctx context.Context) error {
 // helm controller. Wait for its webhook to be ready before returning,
 // otherwise helm operations that create Certificate resources will fail.
 if !c.skipManifests {
- return testutil.WaitForCertManagerWebhook(ctx, cl, 2*time.Minute)
+ if err := testutil.WaitForCertManagerWebhook(ctx, cl, 5*time.Minute); err != nil {
+ // Dump diagnostic info to help debug CI failures.
+ c.dumpCertManagerDiagnostics(ctx, cl)
+ return err
+ }
 }
 return nil
}

+// dumpCertManagerDiagnostics prints cert-manager pod and job status to stderr
+// for CI debugging when the webhook readiness check fails.
+func (c *Cluster) dumpCertManagerDiagnostics(ctx context.Context, cl client.Client) {
+ fmt.Fprintf(os.Stderr, "\n=== cert-manager diagnostics for k3d cluster %q ===\n", c.Name)
+
+ // Dump pods in cert-manager namespace. 
+ var pods corev1.PodList + if err := cl.List(ctx, &pods, client.InNamespace("cert-manager")); err != nil { + fmt.Fprintf(os.Stderr, " failed to list cert-manager pods: %v\n", err) + } else if len(pods.Items) == 0 { + fmt.Fprintf(os.Stderr, " NO pods found in cert-manager namespace\n") + } else { + for _, pod := range pods.Items { + fmt.Fprintf(os.Stderr, " pod/%s phase=%s\n", pod.Name, pod.Status.Phase) + for _, cs := range pod.Status.ContainerStatuses { + fmt.Fprintf(os.Stderr, " container %s: ready=%v restarts=%d", cs.Name, cs.Ready, cs.RestartCount) + if cs.State.Waiting != nil { + fmt.Fprintf(os.Stderr, " waiting=%s(%s)", cs.State.Waiting.Reason, cs.State.Waiting.Message) + } + if cs.State.Terminated != nil { + fmt.Fprintf(os.Stderr, " terminated=%s(exit=%d)", cs.State.Terminated.Reason, cs.State.Terminated.ExitCode) + } + fmt.Println() + } + for _, cs := range pod.Status.InitContainerStatuses { + fmt.Fprintf(os.Stderr, " init/%s: ready=%v restarts=%d", cs.Name, cs.Ready, cs.RestartCount) + if cs.State.Waiting != nil { + fmt.Fprintf(os.Stderr, " waiting=%s(%s)", cs.State.Waiting.Reason, cs.State.Waiting.Message) + } + if cs.State.Terminated != nil { + fmt.Fprintf(os.Stderr, " terminated=%s(exit=%d)", cs.State.Terminated.Reason, cs.State.Terminated.ExitCode) + } + fmt.Println() + } + } + } + + // Dump helm controller jobs in kube-system. + var jobs batchv1.JobList + if err := cl.List(ctx, &jobs, client.InNamespace("kube-system")); err != nil { + fmt.Fprintf(os.Stderr, " failed to list kube-system jobs: %v\n", err) + } else { + for _, job := range jobs.Items { + if !strings.Contains(job.Name, "cert-manager") { + continue + } + fmt.Fprintf(os.Stderr, " job/%s active=%d succeeded=%d failed=%d\n", + job.Name, job.Status.Active, job.Status.Succeeded, job.Status.Failed) + for _, cond := range job.Status.Conditions { + fmt.Fprintf(os.Stderr, " condition %s=%s: %s\n", cond.Type, cond.Status, cond.Message) + } + } + } + + // Dump HelmChart in kube-system. 
+ var helmCharts unstructured.UnstructuredList + helmCharts.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "helm.cattle.io", + Version: "v1", + Kind: "HelmChartList", + }) + if err := cl.List(ctx, &helmCharts, client.InNamespace("kube-system")); err != nil { + fmt.Fprintf(os.Stderr, " failed to list HelmCharts: %v\n", err) + } else { + for _, hc := range helmCharts.Items { + if !strings.Contains(hc.GetName(), "cert-manager") { + continue + } + spec, _ := hc.Object["spec"].(map[string]any) + status, _ := hc.Object["status"].(map[string]any) + fmt.Fprintf(os.Stderr, " helmchart/%s version=%v jobName=%v\n", hc.GetName(), spec["version"], status["jobName"]) + } + } + + fmt.Fprintf(os.Stderr, "=== end cert-manager diagnostics ===\n\n") +} + +// certManagerImages returns the container images needed by the cert-manager +// version deployed via the startup manifests. These are imported into the k3d +// cluster so the k3s helm controller doesn't need to pull from the internet. +func certManagerImages() []string { + v := testutil.CertManagerVersion + return []string{ + "quay.io/jetstack/cert-manager-controller:" + v, + "quay.io/jetstack/cert-manager-webhook:" + v, + "quay.io/jetstack/cert-manager-cainjector:" + v, + "quay.io/jetstack/cert-manager-startupapicheck:" + v, + } +} + // startupManifests parses the embedded FS of Kubernetes manifests as a slice // of [client.Object]s. var startupManifests = sync.OnceValue(func() []client.Object { diff --git a/pkg/testutil/testutil.go b/pkg/testutil/testutil.go index daa2ea35a..9f2a663b3 100644 --- a/pkg/testutil/testutil.go +++ b/pkg/testutil/testutil.go @@ -28,7 +28,7 @@ var ( ) const ( - CertManagerVersion = "v1.17.1" + CertManagerVersion = "v1.17.2" VClusterVersion = "v0.31.2" )