Skip to content

Commit d618cbc

Browse files
david-yu and claude committed
fix: align cert-manager versions, pre-import images, and auto-recreate stale k3d clusters
Root cause: CI agents persist k3d clusters across builds. When the cert-manager version in the startup manifest changes, loadCluster patches the HelmChart, which triggers an in-place upgrade. This disrupts the running webhook during the transition, causing "cert-manager webhook not ready" timeouts.

Fixes:
1. Align cert-manager to v1.17.2 (pre-pulled by CI) in both pkg/k3d/cert-manager.yaml and pkg/testutil/testutil.go.
2. Pre-import cert-manager images into k3d containerd in waitForJobs() so the helm controller doesn't need to pull from the internet.
3. Auto-recreate stale clusters: if loadCluster fails (e.g. the webhook never becomes ready after a manifest upgrade), GetOrCreate deletes the unhealthy cluster and creates a fresh one. This handles the case where a CI agent has a k3d cluster from a previous build with an incompatible cert-manager version.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 86d2595 commit d618cbc

File tree

3 files changed

+138
-7
lines changed

3 files changed

+138
-7
lines changed

pkg/k3d/cert-manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
chart: cert-manager
1010
createNamespace: true
1111
targetNamespace: cert-manager
12-
version: "v1.16.1"
12+
version: "v1.17.2"
1313
valuesContent: |-
1414
crds:
1515
enabled: true

pkg/k3d/k3d.go

Lines changed: 136 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
batchv1 "k8s.io/api/batch/v1"
3535
corev1 "k8s.io/api/core/v1"
3636
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
37+
"k8s.io/apimachinery/pkg/runtime/schema"
3738
"k8s.io/apimachinery/pkg/util/wait"
3839
"k8s.io/client-go/tools/clientcmd"
3940
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -203,7 +204,23 @@ func GetOrCreate(name string, opts ...ClusterOpt) (*Cluster, error) {
203204
cluster, err := NewCluster(name, opts...)
204205
if err != nil {
205206
if errors.Is(err, ErrExists) {
206-
return loadCluster(name, config)
207+
c, loadErr := loadCluster(name, config)
208+
if loadErr != nil {
209+
// Cluster exists but is unhealthy (e.g. stale cert-manager
210+
// from a previous build). Delete and recreate.
211+
fmt.Fprintf(os.Stderr, "WARNING: existing k3d cluster %q is unhealthy (%v), deleting and recreating\n", name, loadErr)
212+
deleteCluster(name)
213+
clearImageMarkers(name)
214+
c2, err2 := NewCluster(name, opts...)
215+
if err2 != nil {
216+
return nil, errors.Wrapf(err2, "recreating cluster %q after unhealthy load", name)
217+
}
218+
if err2 := c2.importImages("localhost/redpanda-operator:dev"); err2 != nil {
219+
return nil, err2
220+
}
221+
return c2, nil
222+
}
223+
return c, nil
207224
}
208225
return nil, err
209226
}
@@ -215,6 +232,15 @@ func GetOrCreate(name string, opts ...ClusterOpt) (*Cluster, error) {
215232
return cluster, nil
216233
}
217234

235+
// deleteCluster tears down the named k3d cluster via the `k3d` CLI.
// It is intentionally best-effort: a failure is reported on stderr but
// never propagated, because callers invoke it purely as cleanup right
// before recreating the cluster from scratch.
func deleteCluster(name string) {
	cmd := exec.Command("k3d", "cluster", "delete", name)
	if out, err := cmd.CombinedOutput(); err != nil {
		fmt.Fprintf(os.Stderr, "WARNING: failed to delete k3d cluster %q: %v: %s\n", name, err, out)
	}
}
243+
218244
// imageMarkerPath returns a file path used to track whether an image has
219245
// already been imported into a given k3d cluster.
220246
func imageMarkerPath(clusterName, image string) string {
@@ -342,12 +368,12 @@ Use testutils.SkipIfNotIntegration or testutils.SkipIfNotAcceptance to gate test
342368
// If k3d cluster create will fail please uncomment the following debug logs from containers
343369
for i := 0; i < config.agents; i++ {
344370
containerLogs, _ := exec.Command("docker", "logs", fmt.Sprintf("k3d-%s-agent-%d", name, i)).CombinedOutput()
345-
fmt.Printf("Agent-%d logs:\n%s\n", i, string(containerLogs))
371+
fmt.Fprintf(os.Stderr, "Agent-%d logs:\n%s\n", i, string(containerLogs))
346372
}
347373
containerLogs, _ := exec.Command("docker", "logs", fmt.Sprintf("k3d-%s-server-0", name)).CombinedOutput()
348-
fmt.Printf("server-0 logs:\n%s\n", string(containerLogs))
374+
fmt.Fprintf(os.Stderr, "server-0 logs:\n%s\n", string(containerLogs))
349375
containerLogs, _ = exec.Command("docker", "network", "inspect", config.network).CombinedOutput()
350-
fmt.Printf("docker network inspect:\n%s\n", string(containerLogs))
376+
fmt.Fprintf(os.Stderr, "docker network inspect:\n%s\n", string(containerLogs))
351377

352378
return nil, errors.Wrapf(err, "%s", out)
353379
}
@@ -509,6 +535,17 @@ func (c *Cluster) waitForJobs(ctx context.Context) error {
509535
}
510536

511537
if !c.skipManifests {
538+
// Pre-import cert-manager images into the k3d cluster so the k3s
539+
// helm controller doesn't need to pull them from the internet.
540+
// The CI `test:pull-images` task downloads these into the host
541+
// Docker daemon, but k3d's containerd is separate — images must
542+
// be explicitly imported via `k3d image import`.
543+
if err := c.importImages(certManagerImages()...); err != nil {
544+
// Non-fatal: if the images aren't in the host Docker daemon
545+
// (e.g. local dev), k3s will pull them from the registry.
546+
fmt.Fprintf(os.Stderr, "WARNING: failed to import cert-manager images (will rely on registry pull): %v\n", err)
547+
}
548+
512549
// NB: Originally this functionality was achieved via the --volume flag to
513550
// k3d but CI runs via a docker in docker setup which makes it unreasonable
514551
// to use --volume.
@@ -549,11 +586,105 @@ func (c *Cluster) waitForJobs(ctx context.Context) error {
549586
// helm controller. Wait for its webhook to be ready before returning,
550587
// otherwise helm operations that create Certificate resources will fail.
551588
if !c.skipManifests {
552-
return testutil.WaitForCertManagerWebhook(ctx, cl, 2*time.Minute)
589+
if err := testutil.WaitForCertManagerWebhook(ctx, cl, 5*time.Minute); err != nil {
590+
// Dump diagnostic info to help debug CI failures.
591+
c.dumpCertManagerDiagnostics(ctx, cl)
592+
return err
593+
}
553594
}
554595
return nil
555596
}
556597

598+
// dumpCertManagerDiagnostics prints cert-manager pod and job status to stdout
599+
// for CI debugging when the webhook readiness check fails.
600+
func (c *Cluster) dumpCertManagerDiagnostics(ctx context.Context, cl client.Client) {
601+
fmt.Fprintf(os.Stderr, "\n=== cert-manager diagnostics for k3d cluster %q ===\n", c.Name)
602+
603+
// Dump pods in cert-manager namespace.
604+
var pods corev1.PodList
605+
if err := cl.List(ctx, &pods, client.InNamespace("cert-manager")); err != nil {
606+
fmt.Fprintf(os.Stderr, " failed to list cert-manager pods: %v\n", err)
607+
} else if len(pods.Items) == 0 {
608+
fmt.Fprintf(os.Stderr, " NO pods found in cert-manager namespace\n")
609+
} else {
610+
for _, pod := range pods.Items {
611+
fmt.Fprintf(os.Stderr, " pod/%s phase=%s\n", pod.Name, pod.Status.Phase)
612+
for _, cs := range pod.Status.ContainerStatuses {
613+
fmt.Fprintf(os.Stderr, " container %s: ready=%v restarts=%d", cs.Name, cs.Ready, cs.RestartCount)
614+
if cs.State.Waiting != nil {
615+
fmt.Fprintf(os.Stderr, " waiting=%s(%s)", cs.State.Waiting.Reason, cs.State.Waiting.Message)
616+
}
617+
if cs.State.Terminated != nil {
618+
fmt.Fprintf(os.Stderr, " terminated=%s(exit=%d)", cs.State.Terminated.Reason, cs.State.Terminated.ExitCode)
619+
}
620+
fmt.Println()
621+
}
622+
for _, cs := range pod.Status.InitContainerStatuses {
623+
fmt.Fprintf(os.Stderr, " init/%s: ready=%v restarts=%d", cs.Name, cs.Ready, cs.RestartCount)
624+
if cs.State.Waiting != nil {
625+
fmt.Fprintf(os.Stderr, " waiting=%s(%s)", cs.State.Waiting.Reason, cs.State.Waiting.Message)
626+
}
627+
if cs.State.Terminated != nil {
628+
fmt.Fprintf(os.Stderr, " terminated=%s(exit=%d)", cs.State.Terminated.Reason, cs.State.Terminated.ExitCode)
629+
}
630+
fmt.Println()
631+
}
632+
}
633+
}
634+
635+
// Dump helm controller jobs in kube-system.
636+
var jobs batchv1.JobList
637+
if err := cl.List(ctx, &jobs, client.InNamespace("kube-system")); err != nil {
638+
fmt.Fprintf(os.Stderr, " failed to list kube-system jobs: %v\n", err)
639+
} else {
640+
for _, job := range jobs.Items {
641+
if !strings.Contains(job.Name, "cert-manager") {
642+
continue
643+
}
644+
fmt.Fprintf(os.Stderr, " job/%s active=%d succeeded=%d failed=%d\n",
645+
job.Name, job.Status.Active, job.Status.Succeeded, job.Status.Failed)
646+
for _, cond := range job.Status.Conditions {
647+
fmt.Fprintf(os.Stderr, " condition %s=%s: %s\n", cond.Type, cond.Status, cond.Message)
648+
}
649+
}
650+
}
651+
652+
// Dump HelmChart in kube-system.
653+
var helmCharts unstructured.UnstructuredList
654+
helmCharts.SetGroupVersionKind(schema.GroupVersionKind{
655+
Group: "helm.cattle.io",
656+
Version: "v1",
657+
Kind: "HelmChartList",
658+
})
659+
if err := cl.List(ctx, &helmCharts, client.InNamespace("kube-system")); err != nil {
660+
fmt.Fprintf(os.Stderr, " failed to list HelmCharts: %v\n", err)
661+
} else {
662+
for _, hc := range helmCharts.Items {
663+
if !strings.Contains(hc.GetName(), "cert-manager") {
664+
continue
665+
}
666+
spec, _ := hc.Object["spec"].(map[string]any)
667+
status, _ := hc.Object["status"].(map[string]any)
668+
fmt.Fprintf(os.Stderr, " helmchart/%s version=%v jobName=%v\n", hc.GetName(), spec["version"], status["jobName"])
669+
}
670+
}
671+
672+
fmt.Fprintf(os.Stderr, "=== end cert-manager diagnostics ===\n\n")
673+
}
674+
675+
// certManagerImages returns the container images needed by the cert-manager
676+
// version deployed via the startup manifests. These are imported into the k3d
677+
// cluster so the k3s helm controller doesn't need to pull from the internet.
678+
func certManagerImages() []string {
679+
v := testutil.CertManagerVersion
680+
return []string{
681+
"quay.io/jetstack/cert-manager-controller:" + v,
682+
"quay.io/jetstack/cert-manager-webhook:" + v,
683+
"quay.io/jetstack/cert-manager-cainjector:" + v,
684+
"quay.io/jetstack/cert-manager-startupapicheck:" + v,
685+
}
686+
}
687+
557688
// startupManifests parses the embedded FS of Kubernetes manifests as a slice
558689
// of [client.Object]s.
559690
var startupManifests = sync.OnceValue(func() []client.Object {

pkg/testutil/testutil.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ var (
2828
)
2929

3030
const (
31-
CertManagerVersion = "v1.17.1"
31+
CertManagerVersion = "v1.17.2"
3232
VClusterVersion = "v0.31.2"
3333
)
3434

0 commit comments

Comments (0)