From cea1a9ff532b44971f14d575e4e1bae5ad63375c Mon Sep 17 00:00:00 2001
From: gmanal <gmanal@nvidia.com>
Date: Fri, 5 Jun 2026 14:05:52 +0530
Subject: [PATCH 1/4] Add nvcr.io public-image pull smoke-test workflow

---
 .github/workflows/test-nvcr-pull.yml | 157 +++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 .github/workflows/test-nvcr-pull.yml

diff --git a/.github/workflows/test-nvcr-pull.yml b/.github/workflows/test-nvcr-pull.yml
new file mode 100644
index 00000000000..4ac535a7625
--- /dev/null
+++ b/.github/workflows/test-nvcr-pull.yml
@@ -0,0 +1,157 @@
+# nvcr.io public-image pull smoke test
+#
+#   Goal: confirm the NVIDIA/cccl self-hosted runner pool can pull a
+#   canonical public NVIDIA image from nvcr.io. Validates the runner →
+#   nvcr.io network path and matches the dominant pattern used across
+#   NVIDIA OSS GHA workflows (TensorRT-LLM, apex, NeMo, NVFlare, etc.):
+#   anonymous pull from the public nvcr.io/nvidia/* namespace.
+#
+#   No registry credentials are required. No secrets are referenced.
+#   This test is deliberately scoped to the public nvcr.io/nvidia/*
+#   namespace — private-namespace authentication is out of scope and
+#   tracked separately.
+#
+# LOG DISCIPLINE
+#   PASS/FAIL gates only. No hostnames, no kernel strings, no docker
+#   info dumps, no resolved IPs — Actions logs on a public repo are
+#   world-readable.
+#
+# TRIGGER
+#   - Open a PR against NVIDIA/cccl from a fork. copy-pr-bot (already
+#     configured for cccl) mirrors the PR head to a `pull-request/N`
+#     branch on the upstream repo, and the workflow runs there on
+#     `linux-amd64-cpu4` (self-hosted) with upstream context. Same
+#     trust model as `secret-scan.yml`.
+#   - "Run workflow" button on the Actions tab (workflow_dispatch).
+
+name: Test nvcr.io Image Pull (public)
+
+run-name: nvcr public-pull test — ${{ github.ref_name }}
+
+on:
+  push:
+    branches:
+      # copy-pr-bot mirror branches — same trigger model as
+      # secret-scan.yml and ci-workflow-pull-request.yml.
+      - "pull-request/[0-9]+"
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  pull-nvcr-image:
+    name: docker pull nvcr.io public image
+    # NV self-hosted CPU runner on NVIDIA/cccl; GitHub-hosted fallback
+    # so the workflow does not error out on contributor forks where
+    # nv-gha-runners labels do not resolve. Mirrors `secret-scan.yml`.
+    runs-on: ${{ github.repository == 'NVIDIA/cccl' && 'linux-amd64-cpu4' || 'ubuntu-latest' }}
+    env:
+      # Canonical public NVIDIA round-trip test image. Public namespace
+      # (nvcr.io/nvidia/*); no auth required. Small (~80 MB) for a fast
+      # smoke test. Same image NGC docs recommend for verifying nvcr.io
+      # connectivity end-to-end.
+      NVCR_IMAGE: nvcr.io/nvidia/cuda:12.4.0-base-ubuntu22.04
+
+    steps:
+      - name: Preflight — docker available
+        run: |
+          set -euo pipefail
+          if ! command -v docker >/dev/null 2>&1; then
+            echo "::error::docker CLI not found on this runner."
+            exit 1
+          fi
+          if ! docker version --format '{{.Server.Version}}' >/dev/null 2>&1; then
+            echo "::error::docker daemon not reachable from this runner."
+            exit 1
+          fi
+          echo "docker: OK"
+
+      - name: Network reachability — nvcr.io (PASS/FAIL only)
+        run: |
+          set -euo pipefail
+          if curl --silent --show-error --fail --max-time 15 \
+              --output /dev/null --head https://nvcr.io/v2/; then
+            echo "nvcr.io reachability (HTTPS HEAD /v2/): PASS"
+          else
+            rc=$?
+            echo "::error::nvcr.io unreachable (curl exit ${rc})."
+            exit "${rc}"
+          fi
+
+      - name: Anonymous manifest probe (HTTP 200 expected)
+        # Hits the OCI/Docker manifest endpoint with NO auth. For an
+        # image in the public nvcr.io/nvidia/* namespace this MUST
+        # return 200; anything else is a real signal and fails the
+        # test. Cheaper than a full layer pull as a pre-check.
+        run: |
+          set -euo pipefail
+          img="${NVCR_IMAGE#nvcr.io/}"
+          if [[ "${img}" == *@* ]]; then
+            img_path="${img%@*}"; img_ref="${img##*@}"
+          elif [[ "${img}" == *:* ]]; then
+            img_path="${img%:*}"; img_ref="${img##*:}"
+          else
+            img_path="${img}"; img_ref="latest"
+          fi
+          code="$(curl --silent --show-error --max-time 15 \
+              --output /dev/null --write-out '%{http_code}' \
+              --header 'Accept: application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json' \
+              "https://nvcr.io/v2/${img_path}/manifests/${img_ref}" 2>/dev/null || echo '000')"
+          case "${code}" in
+            200) echo "anonymous manifest fetch: PASS (HTTP 200)" ;;
+            *)   echo "::error::anonymous manifest fetch: HTTP ${code} (expected 200 for public namespace)"
+                 exit 1 ;;
+          esac
+
+      - name: docker pull (anonymous)
+        run: |
+          set -euo pipefail
+          if docker pull "${NVCR_IMAGE}" >/dev/null 2>&1; then
+            echo "docker pull: PASS"
+          else
+            rc=$?
+            echo "::error::docker pull failed (exit ${rc})."
+            exit "${rc}"
+          fi
+
+      - name: Sanity — image present locally
+        run: |
+          set -euo pipefail
+          if docker image inspect "${NVCR_IMAGE}" \
+              --format '{{.Architecture}}/{{.Os}} size={{.Size}}' \
+              > /tmp/img_meta 2>/dev/null; then
+            echo "image present: PASS ($(cat /tmp/img_meta))"
+          else
+            echo "::error::image not present locally after pull."
+            exit 1
+          fi
+
+      - name: Cleanup
+        if: always()
+        run: |
+          if [[ -n "${NVCR_IMAGE:-}" ]]; then
+            docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1 || true
+          fi
+
+      - name: Summary (no runner identity)
+        if: always()
+        run: |
+          {
+            echo "### nvcr.io public-image pull smoke test"
+            echo
+            echo "| Field | Value |"
+            echo "|---|---|"
+            echo "| Repository | \`${GITHUB_REPOSITORY}\` |"
+            echo "| Branch | \`${GITHUB_REF_NAME}\` |"
+            echo "| Runner class | \`${RUNNER_OS}/${RUNNER_ARCH}\` |"
+            echo "| Image | \`${NVCR_IMAGE}\` |"
+            echo "| Job outcome | \`${{ job.status }}\` |"
+            echo
+            echo "_Step logs contain per-gate PASS/FAIL. Runner hostname,_"
+            echo "_kernel, daemon config, and resolved IPs are deliberately not echoed._"
+          } >> "${GITHUB_STEP_SUMMARY}"

From f67036a159380ff47c9a72a2d6a7f34fd4c1cb62 Mon Sep 17 00:00:00 2001
From: gmanal <gmanal@nvidia.com>
Date: Fri, 5 Jun 2026 15:14:01 +0530
Subject: [PATCH 2/4] Accept nvcr.io auth challenge in reachability check; drop manifest probe

---
 .github/workflows/test-nvcr-pull.yml | 64 ++++++++++++++--------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/test-nvcr-pull.yml b/.github/workflows/test-nvcr-pull.yml
index 4ac535a7625..6c65d44483f 100644
--- a/.github/workflows/test-nvcr-pull.yml
+++ b/.github/workflows/test-nvcr-pull.yml
@@ -71,43 +71,45 @@ jobs:
           fi
           echo "docker: OK"
 
-      - name: Network reachability — nvcr.io (PASS/FAIL only)
+      - name: Network reachability — nvcr.io
+        # Accepts ANY HTTP response from nvcr.io as PASS — the goal is
+        # purely "the runner can reach the registry over the network".
+        # HTTP 401 on `/v2/` is *expected* per OCI Distribution Spec:
+        # the endpoint returns a Bearer-token auth challenge, and the
+        # docker client follows that challenge automatically (anonymous
+        # token for public images). A plain `curl --fail` would treat
+        # 401 as failure, which is misleading — we only fail here on
+        # actual network errors (no HTTP response at all).
         run: |
           set -euo pipefail
-          if curl --silent --show-error --fail --max-time 15 \
-              --output /dev/null --head https://nvcr.io/v2/; then
-            echo "nvcr.io reachability (HTTPS HEAD /v2/): PASS"
-          else
-            rc=$?
-            echo "::error::nvcr.io unreachable (curl exit ${rc})."
-            exit "${rc}"
-          fi
-
-      - name: Anonymous manifest probe (HTTP 200 expected)
-        # Hits the OCI/Docker manifest endpoint with NO auth. For an
-        # image in the public nvcr.io/nvidia/* namespace this MUST
-        # return 200; anything else is a real signal and fails the
-        # test. Cheaper than a full layer pull as a pre-check.
-        run: |
-          set -euo pipefail
-          img="${NVCR_IMAGE#nvcr.io/}"
-          if [[ "${img}" == *@* ]]; then
-            img_path="${img%@*}"; img_ref="${img##*@}"
-          elif [[ "${img}" == *:* ]]; then
-            img_path="${img%:*}"; img_ref="${img##*:}"
-          else
-            img_path="${img}"; img_ref="latest"
-          fi
-          code="$(curl --silent --show-error --max-time 15 \
+          code="$(curl --silent --max-time 15 \
               --output /dev/null --write-out '%{http_code}' \
-              --header 'Accept: application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json' \
-              "https://nvcr.io/v2/${img_path}/manifests/${img_ref}" 2>/dev/null || echo '000')"
+              --head https://nvcr.io/v2/ || echo '000')"
           case "${code}" in
-            200) echo "anonymous manifest fetch: PASS (HTTP 200)" ;;
-            *)   echo "::error::anonymous manifest fetch: HTTP ${code} (expected 200 for public namespace)"
-                 exit 1 ;;
+            000)
+              echo "::error::nvcr.io unreachable (no HTTP response)."
+              exit 1
+              ;;
+            2*|401|403)
+              echo "nvcr.io reachable: PASS (HTTP ${code}; 401 is the expected OCI auth challenge)"
+              ;;
+            5*)
+              echo "::error::nvcr.io reachable but server error (HTTP ${code})."
+              exit 1
+              ;;
+            *)
+              echo "::warning::nvcr.io reachable but unexpected HTTP ${code}"
+              ;;
           esac
 
+      # NOTE: no standalone manifest probe. nvcr.io returns 401 on the
+      # manifest endpoint to unauthenticated clients (OCI auth challenge,
+      # same reason as `/v2/` above) — a plain HTTP probe is therefore
+      # not a reliable public-vs-private signal on this registry. The
+      # functional test is `docker pull` below, which handles the full
+      # token-dance correctly and is the ground truth for "can this
+      # runner pull this image".
+
       - name: docker pull (anonymous)
         run: |
           set -euo pipefail

From f10932dca4276e8442ba0ca99d4bbee7c873df6a Mon Sep 17 00:00:00 2001
From: gmanal <gmanal@nvidia.com>
Date: Fri, 5 Jun 2026 15:22:26 +0530
Subject: [PATCH 3/4] Reduce nvcr.io test to a single pull + size report of pulse-trufflehog

---
 .github/workflows/test-nvcr-pull.yml | 153 +++++----------------------
 1 file changed, 24 insertions(+), 129 deletions(-)

diff --git a/.github/workflows/test-nvcr-pull.yml b/.github/workflows/test-nvcr-pull.yml
index 6c65d44483f..f598f3026cf 100644
--- a/.github/workflows/test-nvcr-pull.yml
+++ b/.github/workflows/test-nvcr-pull.yml
@@ -1,38 +1,22 @@
-# nvcr.io public-image pull smoke test
+# nvcr.io image pull + size — single-job smoke test.
 #
-#   Goal: confirm the NVIDIA/cccl self-hosted runner pool can pull a
-#   canonical public NVIDIA image from nvcr.io. Validates the runner →
-#   nvcr.io network path and matches the dominant pattern used across
-#   NVIDIA OSS GHA workflows (TensorRT-LLM, apex, NeMo, NVFlare, etc.):
-#   anonymous pull from the public nvcr.io/nvidia/* namespace.
+# Pulls a specific nvcr.io image and reports its on-disk size. Does
+# not include `docker login` — relies on whatever credentials the
+# runner is configured with (the Packer-baked NGC pull-secret on
+# nv-gha-runners, if present). A failure with an auth error is
+# itself a useful signal that the runner is not pre-configured for
+# the target namespace.
 #
-#   No registry credentials are required. No secrets are referenced.
-#   This test is deliberately scoped to the public nvcr.io/nvidia/*
-#   namespace — private-namespace authentication is out of scope and
-#   tracked separately.
-#
-# LOG DISCIPLINE
-#   PASS/FAIL gates only. No hostnames, no kernel strings, no docker
-#   info dumps, no resolved IPs — Actions logs on a public repo are
-#   world-readable.
-#
-# TRIGGER
-#   - Open a PR against NVIDIA/cccl from a fork. copy-pr-bot (already
-#     configured for cccl) mirrors the PR head to a `pull-request/N`
-#     branch on the upstream repo, and the workflow runs there on
-#     `linux-amd64-cpu4` (self-hosted) with upstream context. Same
-#     trust model as `secret-scan.yml`.
-#   - "Run workflow" button on the Actions tab (workflow_dispatch).
+# Triggers: same as `secret-scan.yml` — copy-pr-bot mirror branches
+# (`pull-request/[0-9]+`) on NVIDIA/cccl + workflow_dispatch.
 
-name: Test nvcr.io Image Pull (public)
+name: nvcr image pull + size
 
-run-name: nvcr public-pull test — ${{ github.ref_name }}
+run-name: nvcr pull — ${{ github.ref_name }}
 
 on:
   push:
     branches:
-      # copy-pr-bot mirror branches — same trigger model as
-      # secret-scan.yml and ci-workflow-pull-request.yml.
       - "pull-request/[0-9]+"
   workflow_dispatch:
 
@@ -44,116 +28,27 @@ permissions:
   contents: read
 
 jobs:
-  pull-nvcr-image:
-    name: docker pull nvcr.io public image
+  pull-and-size:
+    name: docker pull + size
     # NV self-hosted CPU runner on NVIDIA/cccl; GitHub-hosted fallback
-    # so the workflow does not error out on contributor forks where
-    # nv-gha-runners labels do not resolve. Mirrors `secret-scan.yml`.
+    # on contributor forks where nv-gha-runners labels do not resolve.
     runs-on: ${{ github.repository == 'NVIDIA/cccl' && 'linux-amd64-cpu4' || 'ubuntu-latest' }}
     env:
-      # Canonical public NVIDIA round-trip test image. Public namespace
-      # (nvcr.io/nvidia/*); no auth required. Small (~80 MB) for a fast
-      # smoke test. Same image NGC docs recommend for verifying nvcr.io
-      # connectivity end-to-end.
-      NVCR_IMAGE: nvcr.io/nvidia/cuda:12.4.0-base-ubuntu22.04
+      NVCR_IMAGE: nvcr.io/nvidian/prodsec/pulse-trufflehog:1.33
 
     steps:
-      - name: Preflight — docker available
-        run: |
-          set -euo pipefail
-          if ! command -v docker >/dev/null 2>&1; then
-            echo "::error::docker CLI not found on this runner."
-            exit 1
-          fi
-          if ! docker version --format '{{.Server.Version}}' >/dev/null 2>&1; then
-            echo "::error::docker daemon not reachable from this runner."
-            exit 1
-          fi
-          echo "docker: OK"
-
-      - name: Network reachability — nvcr.io
-        # Accepts ANY HTTP response from nvcr.io as PASS — the goal is
-        # purely "the runner can reach the registry over the network".
-        # HTTP 401 on `/v2/` is *expected* per OCI Distribution Spec:
-        # the endpoint returns a Bearer-token auth challenge, and the
-        # docker client follows that challenge automatically (anonymous
-        # token for public images). A plain `curl --fail` would treat
-        # 401 as failure, which is misleading — we only fail here on
-        # actual network errors (no HTTP response at all).
-        run: |
-          set -euo pipefail
-          code="$(curl --silent --max-time 15 \
-              --output /dev/null --write-out '%{http_code}' \
-              --head https://nvcr.io/v2/ || echo '000')"
-          case "${code}" in
-            000)
-              echo "::error::nvcr.io unreachable (no HTTP response)."
-              exit 1
-              ;;
-            2*|401|403)
-              echo "nvcr.io reachable: PASS (HTTP ${code}; 401 is the expected OCI auth challenge)"
-              ;;
-            5*)
-              echo "::error::nvcr.io reachable but server error (HTTP ${code})."
-              exit 1
-              ;;
-            *)
-              echo "::warning::nvcr.io reachable but unexpected HTTP ${code}"
-              ;;
-          esac
-
-      # NOTE: no standalone manifest probe. nvcr.io returns 401 on the
-      # manifest endpoint to unauthenticated clients (OCI auth challenge,
-      # same reason as `/v2/` above) — a plain HTTP probe is therefore
-      # not a reliable public-vs-private signal on this registry. The
-      # functional test is `docker pull` below, which handles the full
-      # token-dance correctly and is the ground truth for "can this
-      # runner pull this image".
+      - name: docker pull
+        run: docker pull "${NVCR_IMAGE}"
 
-      - name: docker pull (anonymous)
+      - name: Report size
         run: |
           set -euo pipefail
-          if docker pull "${NVCR_IMAGE}" >/dev/null 2>&1; then
-            echo "docker pull: PASS"
-          else
-            rc=$?
-            echo "::error::docker pull failed (exit ${rc})."
-            exit "${rc}"
-          fi
-
-      - name: Sanity — image present locally
-        run: |
-          set -euo pipefail
-          if docker image inspect "${NVCR_IMAGE}" \
-              --format '{{.Architecture}}/{{.Os}} size={{.Size}}' \
-              > /tmp/img_meta 2>/dev/null; then
-            echo "image present: PASS ($(cat /tmp/img_meta))"
-          else
-            echo "::error::image not present locally after pull."
-            exit 1
-          fi
+          size_bytes="$(docker image inspect --format '{{.Size}}' "${NVCR_IMAGE}")"
+          size_mib="$(awk -v b="${size_bytes}" 'BEGIN { printf "%.1f", b / (1024 * 1024) }')"
+          size_gib="$(awk -v b="${size_bytes}" 'BEGIN { printf "%.2f", b / (1024 * 1024 * 1024) }')"
+          echo "Image: ${NVCR_IMAGE}"
+          echo "Size:  ${size_bytes} bytes  (${size_mib} MiB / ${size_gib} GiB)"
 
       - name: Cleanup
         if: always()
-        run: |
-          if [[ -n "${NVCR_IMAGE:-}" ]]; then
-            docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1 || true
-          fi
-
-      - name: Summary (no runner identity)
-        if: always()
-        run: |
-          {
-            echo "### nvcr.io public-image pull smoke test"
-            echo
-            echo "| Field | Value |"
-            echo "|---|---|"
-            echo "| Repository | \`${GITHUB_REPOSITORY}\` |"
-            echo "| Branch | \`${GITHUB_REF_NAME}\` |"
-            echo "| Runner class | \`${RUNNER_OS}/${RUNNER_ARCH}\` |"
-            echo "| Image | \`${NVCR_IMAGE}\` |"
-            echo "| Job outcome | \`${{ job.status }}\` |"
-            echo
-            echo "_Step logs contain per-gate PASS/FAIL. Runner hostname,_"
-            echo "_kernel, daemon config, and resolved IPs are deliberately not echoed._"
-          } >> "${GITHUB_STEP_SUMMARY}"
+        run: docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1 || true

From 2212b4c1b7cf000e9b3654355a88529417035709 Mon Sep 17 00:00:00 2001
From: gmanal <gmanal@nvidia.com>
Date: Fri, 5 Jun 2026 15:55:25 +0530
Subject: [PATCH 4/4] Add 15-minute job timeout and guarded image cleanup to nvcr.io pull test

---
 .github/workflows/test-nvcr-pull.yml | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test-nvcr-pull.yml b/.github/workflows/test-nvcr-pull.yml
index f598f3026cf..04ffc83d4fb 100644
--- a/.github/workflows/test-nvcr-pull.yml
+++ b/.github/workflows/test-nvcr-pull.yml
@@ -33,6 +33,11 @@ jobs:
     # NV self-hosted CPU runner on NVIDIA/cccl; GitHub-hosted fallback
     # on contributor forks where nv-gha-runners labels do not resolve.
     runs-on: ${{ github.repository == 'NVIDIA/cccl' && 'linux-amd64-cpu4' || 'ubuntu-latest' }}
+    # Cap stuck pulls / registry hangs so a wedged job doesn't sit on a
+    # self-hosted runner indefinitely. 15 min covers a multi-GB image
+    # pull on a busy runner with headroom; well under GitHub's default
+    # 6-hour job timeout.
+    timeout-minutes: 15
     env:
       NVCR_IMAGE: nvcr.io/nvidian/prodsec/pulse-trufflehog:1.33
 
@@ -51,4 +56,21 @@ jobs:
 
       - name: Cleanup
         if: always()
-        run: docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1 || true
+        # Best-effort removal of the pulled image after the job's other
+        # steps, whatever their outcome (`if: always()`). Three cases:
+        #   - Image present and `docker rmi` succeeds: log the removal.
+        #   - Image present but `docker rmi` fails: emit a `::warning::`
+        #     annotation (layers may be left behind on the runner); the
+        #     step still exits 0.
+        #   - Image absent (e.g. the pull failed): log a note, exit 0.
+        run: |
+          set -euo pipefail
+          if docker image inspect "${NVCR_IMAGE}" >/dev/null 2>&1; then
+            if docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1; then
+              echo "cleanup: removed ${NVCR_IMAGE}"
+            else
+              echo "::warning::cleanup: failed to remove ${NVCR_IMAGE} from runner cache; manual cleanup may be needed to avoid disk growth on this runner"
+            fi
+          else
+            echo "cleanup: image not present locally (likely pull failed); nothing to remove"
+          fi