@@ -34,6 +34,7 @@ import (
3434 batchv1 "k8s.io/api/batch/v1"
3535 corev1 "k8s.io/api/core/v1"
3636 "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
37+ "k8s.io/apimachinery/pkg/runtime/schema"
3738 "k8s.io/apimachinery/pkg/util/wait"
3839 "k8s.io/client-go/tools/clientcmd"
3940 "sigs.k8s.io/controller-runtime/pkg/client"
@@ -203,7 +204,23 @@ func GetOrCreate(name string, opts ...ClusterOpt) (*Cluster, error) {
203204 cluster , err := NewCluster (name , opts ... )
204205 if err != nil {
205206 if errors .Is (err , ErrExists ) {
206- return loadCluster (name , config )
207+ c , loadErr := loadCluster (name , config )
208+ if loadErr != nil {
209+ // Cluster exists but is unhealthy (e.g. stale cert-manager
210+ // from a previous build). Delete and recreate.
211+ fmt .Fprintf (os .Stderr , "WARNING: existing k3d cluster %q is unhealthy (%v), deleting and recreating\n " , name , loadErr )
212+ deleteCluster (name )
213+ clearImageMarkers (name )
214+ c2 , err2 := NewCluster (name , opts ... )
215+ if err2 != nil {
216+ return nil , errors .Wrapf (err2 , "recreating cluster %q after unhealthy load" , name )
217+ }
218+ if err2 := c2 .importImages ("localhost/redpanda-operator:dev" ); err2 != nil {
219+ return nil , err2
220+ }
221+ return c2 , nil
222+ }
223+ return c , nil
207224 }
208225 return nil , err
209226 }
@@ -215,6 +232,15 @@ func GetOrCreate(name string, opts ...ClusterOpt) (*Cluster, error) {
215232 return cluster , nil
216233}
217234
// deleteCluster tears down the k3d cluster named name. Failures are reported
// on stderr rather than returned: callers invoke this as best-effort cleanup
// immediately before recreating the cluster, so there is nothing useful to do
// with the error besides surface it.
func deleteCluster(name string) {
	cmd := exec.Command("k3d", "cluster", "delete", name)
	if output, runErr := cmd.CombinedOutput(); runErr != nil {
		fmt.Fprintf(os.Stderr, "WARNING: failed to delete k3d cluster %q: %v: %s\n", name, runErr, output)
	}
}
243+
218244// imageMarkerPath returns a file path used to track whether an image has
219245// already been imported into a given k3d cluster.
220246func imageMarkerPath (clusterName , image string ) string {
@@ -342,12 +368,12 @@ Use testutils.SkipIfNotIntegration or testutils.SkipIfNotAcceptance to gate test
342368 // If k3d cluster create will fail please uncomment the following debug logs from containers
343369 for i := 0 ; i < config .agents ; i ++ {
344370 containerLogs , _ := exec .Command ("docker" , "logs" , fmt .Sprintf ("k3d-%s-agent-%d" , name , i )).CombinedOutput ()
345- fmt .Printf ( "Agent-%d logs:\n %s\n " , i , string (containerLogs ))
371+ fmt .Fprintf ( os . Stderr , "Agent-%d logs:\n %s\n " , i , string (containerLogs ))
346372 }
347373 containerLogs , _ := exec .Command ("docker" , "logs" , fmt .Sprintf ("k3d-%s-server-0" , name )).CombinedOutput ()
348- fmt .Printf ( "server-0 logs:\n %s\n " , string (containerLogs ))
374+ fmt .Fprintf ( os . Stderr , "server-0 logs:\n %s\n " , string (containerLogs ))
349375 containerLogs , _ = exec .Command ("docker" , "network" , "inspect" , config .network ).CombinedOutput ()
350- fmt .Printf ( "docker network inspect:\n %s\n " , string (containerLogs ))
376+ fmt .Fprintf ( os . Stderr , "docker network inspect:\n %s\n " , string (containerLogs ))
351377
352378 return nil , errors .Wrapf (err , "%s" , out )
353379 }
@@ -509,6 +535,17 @@ func (c *Cluster) waitForJobs(ctx context.Context) error {
509535 }
510536
511537 if ! c .skipManifests {
538+ // Pre-import cert-manager images into the k3d cluster so the k3s
539+ // helm controller doesn't need to pull them from the internet.
540+ // The CI `test:pull-images` task downloads these into the host
541+ // Docker daemon, but k3d's containerd is separate — images must
542+ // be explicitly imported via `k3d image import`.
543+ if err := c .importImages (certManagerImages ()... ); err != nil {
544+ // Non-fatal: if the images aren't in the host Docker daemon
545+ // (e.g. local dev), k3s will pull them from the registry.
546+ fmt .Fprintf (os .Stderr , "WARNING: failed to import cert-manager images (will rely on registry pull): %v\n " , err )
547+ }
548+
512549 // NB: Originally this functionality was achieved via the --volume flag to
513550 // k3d but CI runs via a docker in docker setup which makes it unreasonable
514551 // to use --volume.
@@ -549,11 +586,105 @@ func (c *Cluster) waitForJobs(ctx context.Context) error {
549586 // helm controller. Wait for its webhook to be ready before returning,
550587 // otherwise helm operations that create Certificate resources will fail.
551588 if ! c .skipManifests {
552- return testutil .WaitForCertManagerWebhook (ctx , cl , 2 * time .Minute )
589+ if err := testutil .WaitForCertManagerWebhook (ctx , cl , 5 * time .Minute ); err != nil {
590+ // Dump diagnostic info to help debug CI failures.
591+ c .dumpCertManagerDiagnostics (ctx , cl )
592+ return err
593+ }
553594 }
554595 return nil
555596}
556597
598+ // dumpCertManagerDiagnostics prints cert-manager pod and job status to stdout
599+ // for CI debugging when the webhook readiness check fails.
600+ func (c * Cluster ) dumpCertManagerDiagnostics (ctx context.Context , cl client.Client ) {
601+ fmt .Fprintf (os .Stderr , "\n === cert-manager diagnostics for k3d cluster %q ===\n " , c .Name )
602+
603+ // Dump pods in cert-manager namespace.
604+ var pods corev1.PodList
605+ if err := cl .List (ctx , & pods , client .InNamespace ("cert-manager" )); err != nil {
606+ fmt .Fprintf (os .Stderr , " failed to list cert-manager pods: %v\n " , err )
607+ } else if len (pods .Items ) == 0 {
608+ fmt .Fprintf (os .Stderr , " NO pods found in cert-manager namespace\n " )
609+ } else {
610+ for _ , pod := range pods .Items {
611+ fmt .Fprintf (os .Stderr , " pod/%s phase=%s\n " , pod .Name , pod .Status .Phase )
612+ for _ , cs := range pod .Status .ContainerStatuses {
613+ fmt .Fprintf (os .Stderr , " container %s: ready=%v restarts=%d" , cs .Name , cs .Ready , cs .RestartCount )
614+ if cs .State .Waiting != nil {
615+ fmt .Fprintf (os .Stderr , " waiting=%s(%s)" , cs .State .Waiting .Reason , cs .State .Waiting .Message )
616+ }
617+ if cs .State .Terminated != nil {
618+ fmt .Fprintf (os .Stderr , " terminated=%s(exit=%d)" , cs .State .Terminated .Reason , cs .State .Terminated .ExitCode )
619+ }
620+ fmt .Println ()
621+ }
622+ for _ , cs := range pod .Status .InitContainerStatuses {
623+ fmt .Fprintf (os .Stderr , " init/%s: ready=%v restarts=%d" , cs .Name , cs .Ready , cs .RestartCount )
624+ if cs .State .Waiting != nil {
625+ fmt .Fprintf (os .Stderr , " waiting=%s(%s)" , cs .State .Waiting .Reason , cs .State .Waiting .Message )
626+ }
627+ if cs .State .Terminated != nil {
628+ fmt .Fprintf (os .Stderr , " terminated=%s(exit=%d)" , cs .State .Terminated .Reason , cs .State .Terminated .ExitCode )
629+ }
630+ fmt .Println ()
631+ }
632+ }
633+ }
634+
635+ // Dump helm controller jobs in kube-system.
636+ var jobs batchv1.JobList
637+ if err := cl .List (ctx , & jobs , client .InNamespace ("kube-system" )); err != nil {
638+ fmt .Fprintf (os .Stderr , " failed to list kube-system jobs: %v\n " , err )
639+ } else {
640+ for _ , job := range jobs .Items {
641+ if ! strings .Contains (job .Name , "cert-manager" ) {
642+ continue
643+ }
644+ fmt .Fprintf (os .Stderr , " job/%s active=%d succeeded=%d failed=%d\n " ,
645+ job .Name , job .Status .Active , job .Status .Succeeded , job .Status .Failed )
646+ for _ , cond := range job .Status .Conditions {
647+ fmt .Fprintf (os .Stderr , " condition %s=%s: %s\n " , cond .Type , cond .Status , cond .Message )
648+ }
649+ }
650+ }
651+
652+ // Dump HelmChart in kube-system.
653+ var helmCharts unstructured.UnstructuredList
654+ helmCharts .SetGroupVersionKind (schema.GroupVersionKind {
655+ Group : "helm.cattle.io" ,
656+ Version : "v1" ,
657+ Kind : "HelmChartList" ,
658+ })
659+ if err := cl .List (ctx , & helmCharts , client .InNamespace ("kube-system" )); err != nil {
660+ fmt .Fprintf (os .Stderr , " failed to list HelmCharts: %v\n " , err )
661+ } else {
662+ for _ , hc := range helmCharts .Items {
663+ if ! strings .Contains (hc .GetName (), "cert-manager" ) {
664+ continue
665+ }
666+ spec , _ := hc .Object ["spec" ].(map [string ]any )
667+ status , _ := hc .Object ["status" ].(map [string ]any )
668+ fmt .Fprintf (os .Stderr , " helmchart/%s version=%v jobName=%v\n " , hc .GetName (), spec ["version" ], status ["jobName" ])
669+ }
670+ }
671+
672+ fmt .Fprintf (os .Stderr , "=== end cert-manager diagnostics ===\n \n " )
673+ }
674+
675+ // certManagerImages returns the container images needed by the cert-manager
676+ // version deployed via the startup manifests. These are imported into the k3d
677+ // cluster so the k3s helm controller doesn't need to pull from the internet.
678+ func certManagerImages () []string {
679+ v := testutil .CertManagerVersion
680+ return []string {
681+ "quay.io/jetstack/cert-manager-controller:" + v ,
682+ "quay.io/jetstack/cert-manager-webhook:" + v ,
683+ "quay.io/jetstack/cert-manager-cainjector:" + v ,
684+ "quay.io/jetstack/cert-manager-startupapicheck:" + v ,
685+ }
686+ }
687+
557688// startupManifests parses the embedded FS of Kubernetes manifests as a slice
558689// of [client.Object]s.
559690var startupManifests = sync .OnceValue (func () []client.Object {
0 commit comments