Skip to content

Commit 18e11fe

Browse files
david-yu and claude committed
fix: align cert-manager versions, pre-import images, and auto-recreate stale k3d clusters
Root cause: CI agents persist k3d clusters across builds. When the cert-manager version in the startup manifest changes, loadCluster patches the HelmChart which triggers an in-place upgrade. This disrupts the running webhook during transition, causing "cert-manager webhook not ready" timeouts.

Fixes:

1. Align cert-manager to v1.17.2 (pre-pulled by CI) in both pkg/k3d/cert-manager.yaml and pkg/testutil/testutil.go.
2. Pre-import cert-manager images into k3d containerd in waitForJobs() so the helm controller doesn't need to pull from the internet.
3. Auto-recreate stale clusters: if loadCluster fails (e.g. webhook never becomes ready after manifest upgrade), GetOrCreate deletes the unhealthy cluster and creates a fresh one. This handles the case where a CI agent has a k3d cluster from a previous build with an incompatible cert-manager version.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 86d2595 commit 18e11fe

3 files changed

Lines changed: 52 additions & 3 deletions

File tree

pkg/k3d/cert-manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
chart: cert-manager
1010
createNamespace: true
1111
targetNamespace: cert-manager
12-
version: "v1.16.1"
12+
version: "v1.17.2"
1313
valuesContent: |-
1414
crds:
1515
enabled: true

pkg/k3d/k3d.go

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,23 @@ func GetOrCreate(name string, opts ...ClusterOpt) (*Cluster, error) {
203203
cluster, err := NewCluster(name, opts...)
204204
if err != nil {
205205
if errors.Is(err, ErrExists) {
206-
return loadCluster(name, config)
206+
c, loadErr := loadCluster(name, config)
207+
if loadErr != nil {
208+
// Cluster exists but is unhealthy (e.g. stale cert-manager
209+
// from a previous build). Delete and recreate.
210+
fmt.Printf("WARNING: existing k3d cluster %q is unhealthy (%v), deleting and recreating\n", name, loadErr)
211+
deleteCluster(name)
212+
clearImageMarkers(name)
213+
c2, err2 := NewCluster(name, opts...)
214+
if err2 != nil {
215+
return nil, errors.Wrapf(err2, "recreating cluster %q after unhealthy load", name)
216+
}
217+
if err2 := c2.importImages("localhost/redpanda-operator:dev"); err2 != nil {
218+
return nil, err2
219+
}
220+
return c2, nil
221+
}
222+
return c, nil
207223
}
208224
return nil, err
209225
}
@@ -215,6 +231,15 @@ func GetOrCreate(name string, opts ...ClusterOpt) (*Cluster, error) {
215231
return cluster, nil
216232
}
217233

234+
// deleteCluster tears down the k3d cluster called name by running
// `k3d cluster delete <name>`.
//
// The call is best-effort and returns nothing: if the command fails, a
// warning containing the error and the command's combined stdout/stderr is
// printed, and execution continues. GetOrCreate relies on this when clearing
// an unhealthy cluster before creating a replacement.
func deleteCluster(name string) {
	cmd := exec.Command("k3d", "cluster", "delete", name)
	if output, runErr := cmd.CombinedOutput(); runErr != nil {
		fmt.Printf("WARNING: failed to delete k3d cluster %q: %v: %s\n", name, runErr, output)
	}
}
242+
218243
// imageMarkerPath returns a file path used to track whether an image has
219244
// already been imported into a given k3d cluster.
220245
func imageMarkerPath(clusterName, image string) string {
@@ -509,6 +534,17 @@ func (c *Cluster) waitForJobs(ctx context.Context) error {
509534
}
510535

511536
if !c.skipManifests {
537+
// Pre-import cert-manager images into the k3d cluster so the k3s
538+
// helm controller doesn't need to pull them from the internet.
539+
// The CI `test:pull-images` task downloads these into the host
540+
// Docker daemon, but k3d's containerd is separate — images must
541+
// be explicitly imported via `k3d image import`.
542+
if err := c.importImages(certManagerImages()...); err != nil {
543+
// Non-fatal: if the images aren't in the host Docker daemon
544+
// (e.g. local dev), k3s will pull them from the registry.
545+
fmt.Printf("WARNING: failed to import cert-manager images (will rely on registry pull): %v\n", err)
546+
}
547+
512548
// NB: Originally this functionality was achieved via the --volume flag to
513549
// k3d but CI runs via a docker in docker setup which makes it unreasonable
514550
// to use --volume.
@@ -554,6 +590,19 @@ func (c *Cluster) waitForJobs(ctx context.Context) error {
554590
return nil
555591
}
556592

593+
// certManagerImages returns the container images needed by the cert-manager
594+
// version deployed via the startup manifests. These are imported into the k3d
595+
// cluster so the k3s helm controller doesn't need to pull from the internet.
596+
func certManagerImages() []string {
597+
v := testutil.CertManagerVersion
598+
return []string{
599+
"quay.io/jetstack/cert-manager-controller:" + v,
600+
"quay.io/jetstack/cert-manager-webhook:" + v,
601+
"quay.io/jetstack/cert-manager-cainjector:" + v,
602+
"quay.io/jetstack/cert-manager-startupapicheck:" + v,
603+
}
604+
}
605+
557606
// startupManifests parses the embedded FS of Kubernetes manifests as a slice
558607
// of [client.Object]s.
559608
var startupManifests = sync.OnceValue(func() []client.Object {

pkg/testutil/testutil.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ var (
2828
)
2929

3030
const (
31-
CertManagerVersion = "v1.17.1"
31+
CertManagerVersion = "v1.17.2"
3232
VClusterVersion = "v0.31.2"
3333
)
3434

0 commit comments

Comments
 (0)