Commit 6c7ee78

feat: add diagnostic log collection on test failure
Adds collectDiagnosticLogs() to KubernetesClientHelper that captures cluster state (events, pods, deployments, statefulsets, routes, configmaps, per-container pod logs) to files on test failure. TeardownReporter now tracks failed projects and collects diagnostics before namespace deletion. Log collection runs on both CI and local; namespace deletion remains CI-only. Bumps version to 1.1.34.
1 parent 69cc164 commit 6c7ee78

8 files changed

Lines changed: 258 additions & 24 deletions
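The per-container collection strategy described in the commit message (one log file per init and app container, avoiding `--all-containers`) can be sketched as pure path/argument computation. `podLogTargets` and the `PodSpec` shape below are hypothetical illustrations, not the actual `KubernetesClientHelper` internals:

```typescript
// Sketch: compute one log target per container (init + app), mirroring the
// pods/<pod-name>/<container-name>.log layout described above.
// PodSpec and podLogTargets are hypothetical, for illustration only.
interface PodSpec {
  name: string;
  initContainers: string[];
  containers: string[];
}

interface LogTarget {
  container: string;
  file: string;   // relative path under the diagnostics output dir
  args: string[]; // kubectl arguments for this container
}

function podLogTargets(pod: PodSpec, namespace: string): LogTarget[] {
  const all = [...pod.initContainers, ...pod.containers];
  return all.map((container) => ({
    container,
    file: `pods/${pod.name}/${container}.log`,
    // One kubectl invocation per container, so a container that never
    // started only fails its own collection, not the whole pod's.
    args: ["logs", pod.name, "-c", container, "-n", namespace],
  }));
}
```

The design point is isolation: `kubectl logs --all-containers` fails wholesale when any container has no logs yet, while per-container invocations degrade gracefully.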

docs/.vitepress/config.ts

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ export default defineConfig({
   { text: "Examples", link: "/examples/" },
   { text: "Overlay Testing", link: "/overlay/" },
   {
-    text: "v1.1.33",
+    text: "v1.1.34",
     items: [{ text: "Changelog", link: "/changelog" }],
   },
 ],

docs/changelog.md

Lines changed: 16 additions & 1 deletion
@@ -2,12 +2,27 @@
 
 All notable changes to this project will be documented in this file.
 
-## [1.1.33] - Current
+## [1.1.34] - Current
+
+### Added
+
+- **Diagnostic log collection on failure**: `collectDiagnosticLogs(namespace, outputDir?)` on `KubernetesClientHelper` captures comprehensive cluster state (events, pod status, deployments, statefulsets, routes, and per-container pod logs including init containers and previous restarts) to files under `node_modules/.cache/e2e-test-results/logs/<namespace>/`. Uses `kubectl` for cross-platform compatibility. Empty files (e.g. no previous logs) are not created.
+- **TeardownReporter collects diagnostics on test failure**: When any test in a project fails, the teardown reporter automatically calls `collectDiagnosticLogs` before namespace deletion. Diagnostic collection runs on both CI and local; namespace deletion remains CI-only.
+- **Per-container pod log collection**: Logs are collected per-container (init + app containers) instead of via `--all-containers`, which fails entirely if any container hasn't started. Files are saved to `pods/<pod-name>/<container-name>.log` and `pods/<pod-name>/<container-name>.previous.log`.
+
+### Changed
+
+- **TeardownReporter tracks test failures**: Added a `_projectsWithFailures` set to track which projects had test failures, so diagnostic logs are only collected when needed.
+- **TeardownReporter active on non-CI**: The reporter now processes `onTestEnd`/`onEnd` regardless of the `CI` env var. Log collection always runs; namespace deletion is still gated on `CI=true`.
+
+## [1.1.33]
 
 ### Added
 
 - **Automatic Vault secret loading for local development**: Set `VAULT=1` or `VAULT=true` to automatically fetch secrets from HashiCorp Vault during global setup. Handles OIDC login, fetches global and per-workspace secrets, and injects them into `process.env`. Only secret key names are logged, never values. Configurable via `VAULT_ADDR` and `VAULT_BASE_PATH` env vars. Logs a Slack channel (`#rhdh-e2e-tests`) when permission is denied.
 
+## [1.1.32]
+
 ### Fixed
 
 - **Normalize `-dynamic` suffix in `extractPluginName`**: Plugins whose metadata `dynamicArtifact` is a local path (ending in `-dynamic`) were not matched during PR OCI resolution or config injection, because the metadata map key included the `-dynamic` suffix while OCI URL lookups did not. `extractPluginName` now strips the `-dynamic` suffix so local paths and OCI refs for the same logical plugin produce the same key. ([RHDHBUGS-2987](https://issues.redhat.com/browse/RHDHBUGS-2987))

docs/guide/core-concepts/error-handling.md

Lines changed: 8 additions & 0 deletions
@@ -280,6 +280,14 @@ await page.click('button[data-testid="save"]');
 await expect(page.getByText("Saved")).toBeVisible();
 ```
 
+## Cluster Diagnostic Logs
+
+When tests fail, the framework automatically collects cluster diagnostics (pod logs, events, deployments) to `node_modules/.cache/e2e-test-results/logs/<namespace>/`. This includes per-container logs for all pods (init and app containers), with previous restart logs when available.
+
+Check these files first when debugging deployment or pod failures — they're often more useful than Playwright's HTML report for infrastructure issues.
+
+See [Kubernetes Client — Diagnostic Log Collection](/guide/utilities/kubernetes-client#diagnostic-log-collection) for the full list of collected resources and API details.
+
 ## Error Handling Checklist
 
 - [ ] Use specific error messages that include context

docs/guide/utilities/kubernetes-client.md

Lines changed: 38 additions & 0 deletions
@@ -121,6 +121,44 @@ When a failure is detected, the method:
 2. Fetches container logs via `oc logs`
 3. Throws an error with the failure details
 
+## Diagnostic Log Collection
+
+### `collectDiagnosticLogs(namespace, outputDir?)`
+
+Collects comprehensive cluster diagnostics and saves them to files. Uses `kubectl` for cross-platform compatibility (OpenShift, EKS, GKE, etc.). OpenShift-specific resources (routes) are collected on a best-effort basis.
+
+```typescript
+await k8sClient.collectDiagnosticLogs("my-namespace");
+// Saves to: node_modules/.cache/e2e-test-results/logs/my-namespace/
+
+// Or with a custom output directory:
+await k8sClient.collectDiagnosticLogs("my-namespace", "/tmp/debug-logs");
+```
+
+**Collected resources:**
+
+| File | Content |
+|------|---------|
+| `events.txt` | Namespace events sorted by timestamp |
+| `pods.txt` | Pod status (`kubectl get pods -o wide`) |
+| `describe-pods.txt` | Full pod descriptions |
+| `deployments.txt` | Deployment status |
+| `describe-deployments.txt` | Full deployment descriptions |
+| `statefulsets.txt` | StatefulSet status |
+| `routes.txt` | OpenShift routes (skipped on non-OpenShift clusters) |
+| `pods/<pod>/<container>.log` | Current logs per container (init + app) |
+| `pods/<pod>/<container>.previous.log` | Previous restart logs (only if pod restarted) |
+
+**Key behaviors:**
+- Logs are collected per-container rather than via `--all-containers`, so a failed init container doesn't block collection of other container logs
+- Empty files are not created (e.g., when there are no previous logs)
+- Resource types that don't exist on the cluster (e.g., routes on non-OpenShift) are silently skipped
+- All resource collection runs in parallel via `Promise.allSettled`
+
+**Automatic collection on test failure:**
+
+In the overlay testing flow, you don't need to call this manually. The built-in `TeardownReporter` automatically calls `collectDiagnosticLogs` for any project that had test failures. This works on both CI and local runs.
+
 ## Deployment Operations
 
 ### `scaleDeployment(namespace, name, replicas)`
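The parallel, best-effort behavior documented for `collectDiagnosticLogs` (collectors run via `Promise.allSettled`; failed collectors and empty output produce no files) can be sketched as follows. `collectAll` and its collector map are hypothetical stand-ins for the real `kubectl` invocations, not the library's API:

```typescript
// Sketch: run all resource collectors in parallel and tolerate individual
// failures (e.g. `routes` on a non-OpenShift cluster). Collectors here are
// hypothetical stand-ins for kubectl calls.
type Collector = () => Promise<string>;

async function collectAll(
  collectors: Record<string, Collector>,
): Promise<Record<string, string>> {
  const names = Object.keys(collectors);
  // allSettled never rejects, so one failing collector can't abort the rest.
  const results = await Promise.allSettled(names.map((n) => collectors[n]()));
  const out: Record<string, string> = {};
  results.forEach((r, i) => {
    // Skip failed collectors and empty output, mirroring the documented
    // "empty files are not created" behavior.
    if (r.status === "fulfilled" && r.value.trim().length > 0) {
      out[names[i]] = r.value;
    }
  });
  return out;
}
```

A caller would write each surviving entry to a file named after its key; resources missing from the cluster simply never appear in the result.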

docs/overlay/reference/troubleshooting.md

Lines changed: 34 additions & 0 deletions
@@ -271,6 +271,40 @@ oc login --token=<token> --server=<server>
 - Check route/service configuration
 - Verify network policies
 
+## Diagnostic Logs
+
+When tests fail, the `TeardownReporter` automatically collects cluster diagnostics and saves them to:
+
+```
+node_modules/.cache/e2e-test-results/logs/<project-name>/
+├── events.txt                # Namespace events (sorted by time)
+├── pods.txt                  # Pod status
+├── describe-pods.txt         # Full pod descriptions
+├── deployments.txt           # Deployment status
+├── describe-deployments.txt
+├── statefulsets.txt
+├── routes.txt                # OpenShift routes
+└── pods/
+    └── <pod-name>/
+        ├── <container>.log            # Current logs
+        └── <container>.previous.log   # Previous restart logs
+```
+
+This runs automatically on **both CI and local** — no configuration needed. Namespace deletion remains CI-only.
+
+**When using `run-e2e.sh`**, logs are written relative to the repo root. When running from a workspace (`cd workspaces/my-plugin/e2e-tests && yarn test`), they're relative to the `e2e-tests/` directory.
+
+**Logs are only collected for projects with failures.** If all tests pass, no diagnostic logs are written.
+
+To collect diagnostics manually (e.g., from a custom script):
+
+```typescript
+import { KubernetesClientHelper } from "@red-hat-developer-hub/e2e-test-utils/utils";
+
+const k8sClient = new KubernetesClientHelper();
+await k8sClient.collectDiagnosticLogs("my-namespace", "./my-logs");
+```
+
 ## Debugging Tips
 
 ### Use Headed Mode

package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "name": "@red-hat-developer-hub/e2e-test-utils",
-  "version": "1.1.33",
+  "version": "1.1.34",
   "description": "Test utilities for RHDH E2E tests",
   "license": "Apache-2.0",
   "repository": {

src/playwright/teardown-reporter.ts

Lines changed: 45 additions & 17 deletions
@@ -4,28 +4,37 @@ import type {
   TestCase,
   TestResult,
 } from "@playwright/test/reporter";
+import path from "path";
 import { KubernetesClientHelper } from "../utils/kubernetes-client.js";
 import { getTeardownNamespaces } from "./teardown-namespaces.js";
 
 /**
- * Playwright reporter that deletes namespaces per-project as soon as all tests
- * in that project finish. This frees cluster resources early instead of waiting
- * for the entire suite to complete.
+ * Playwright reporter that collects diagnostic logs on failure and deletes
+ * namespaces per-project as soon as all tests in that project finish.
+ *
+ * Why a reporter (not afterAll / worker fixture teardown):
+ * - afterAll runs when a worker dies. On test failure Playwright kills the
+ *   worker for retries, so afterAll deletes the namespace before the retry.
+ * - Worker fixture teardown has the same problem — it fires on worker exit.
+ * - A reporter runs in the main Playwright process, survives worker restarts,
+ *   and can track per-project completion including retries.
  *
  * Handles retries: a test is only counted as done when it passes/is skipped,
  * or exhausts all retry attempts.
  *
  * Falls back in onEnd() to clean up any projects that didn't complete naturally
  * (e.g., interrupted runs, maxFailures).
  *
- * Only active when process.env.CI === "true".
+ * Diagnostic log collection runs always (CI and local).
+ * Namespace deletion only runs when process.env.CI === "true".
  *
  * By default, deletes the namespace matching the project name.
  * For custom namespaces, consumers can register them via registerTeardownNamespace().
  */
 export default class TeardownReporter implements Reporter {
   private _projectTestCounts = new Map<string, number>();
   private _projectCompleted = new Map<string, number>();
+  private _projectsWithFailures = new Set<string>();
   private _pendingDeletions = new Map<string, Promise<void>>();
 
   onBegin(_config: unknown, suite: Suite): void {
@@ -42,8 +51,6 @@ export default class TeardownReporter implements Reporter {
   }
 
   onTestEnd(test: TestCase, result: TestResult): void {
-    if (process.env.CI !== "true") return;
-
     const project = test.parent.project();
     if (!project) return;
 
@@ -55,10 +62,15 @@ export default class TeardownReporter implements Reporter {
     if (!isDone) return;
 
     const name = project.name;
+
+    if (result.status !== "passed" && result.status !== "skipped") {
+      this._projectsWithFailures.add(name);
+    }
+
     const completed = (this._projectCompleted.get(name) ?? 0) + 1;
     this._projectCompleted.set(name, completed);
 
-    // Start deletion immediately (fire-and-forget here, awaited in onEnd)
+    // Start cleanup immediately (fire-and-forget here, awaited in onEnd)
     if (
       completed === this._projectTestCounts.get(name) &&
       !this._pendingDeletions.has(name)
@@ -68,15 +80,14 @@ export default class TeardownReporter implements Reporter {
   }
 
   async onEnd(): Promise<void> {
-    if (process.env.CI !== "true") return;
-
-    // Await all in-flight deletions started from onTestEnd
+    // Await all in-flight cleanups started from onTestEnd
     await Promise.all(this._pendingDeletions.values());
 
     // Fallback: clean up projects that didn't complete naturally
-    // (e.g., interrupted run, maxFailures hit)
+    // (e.g., interrupted run, maxFailures hit) — always collect diagnostics
     for (const [project] of this._projectTestCounts) {
       if (!this._pendingDeletions.has(project)) {
+        this._projectsWithFailures.add(project);
         await this._deleteProjectNamespaces(project);
       }
     }
@@ -88,7 +99,7 @@ export default class TeardownReporter implements Reporter {
       k8sClient = new KubernetesClientHelper();
     } catch (error) {
       console.error(
-        `[TeardownReporter] Cannot connect to cluster, skipping teardown:`,
+        `[TeardownReporter] Cannot connect to cluster, skipping cleanup:`,
         error,
       );
       return;
@@ -98,11 +109,28 @@ export default class TeardownReporter implements Reporter {
     const namespaces =
       customNamespaces.length > 0 ? customNamespaces : [projectName];
 
-    for (const ns of namespaces) {
-      console.log(
-        `[TeardownReporter] Deleting namespace "${ns}" (project: ${projectName})`,
-      );
-      await k8sClient.deleteNamespace(ns);
+    // Collect diagnostic logs on failure (always, regardless of CI)
+    if (this._projectsWithFailures.has(projectName)) {
+      for (const ns of namespaces) {
+        const outputDir = path.join(
+          "node_modules",
+          ".cache",
+          "e2e-test-results",
+          "logs",
+          projectName,
+        );
+        await k8sClient.collectDiagnosticLogs(ns, outputDir);
+      }
+    }
+
+    // Delete namespaces only in CI
+    if (process.env.CI === "true") {
+      for (const ns of namespaces) {
+        console.log(
+          `[TeardownReporter] Deleting namespace "${ns}" (project: ${projectName})`,
        );
+        await k8sClient.deleteNamespace(ns);
+      }
     }
   }
 }
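The retry handling the reporter's doc comment describes (a test counts as done only once it passes, is skipped, or exhausts its retry attempts) reduces to a small predicate. This standalone sketch uses an assumed signature for illustration, not the reporter's actual private logic:

```typescript
// Sketch: decide whether a test result is final under Playwright retries.
// The signature is illustrative only; the real reporter derives this from
// TestCase/TestResult objects.
type Status = "passed" | "failed" | "timedOut" | "interrupted" | "skipped";

function isTestDone(
  status: Status,
  retryIndex: number, // which attempt this result is (0-based)
  maxRetries: number, // project-configured retry count
): boolean {
  // Passed/skipped results are never retried, so they are always final.
  if (status === "passed" || status === "skipped") return true;
  // A failure is only final once every retry attempt has been used;
  // otherwise Playwright will restart the worker and try again.
  return retryIndex >= maxRetries;
}
```

This is why the reporter counts a project's tests with a predicate like this before starting cleanup: deleting the namespace after a non-final failure would break the retry.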
