From cea1a9ff532b44971f14d575e4e1bae5ad63375c Mon Sep 17 00:00:00 2001
From: gmanal
Date: Fri, 5 Jun 2026 14:05:52 +0530
Subject: [PATCH 1/4] test again

---
Review notes (text between the `---` separator and the diff is not part of
the commit message and is dropped by `git am`):
 * Creates the workflow. Steps: docker preflight, HTTPS HEAD probe of
   https://nvcr.io/v2/, anonymous manifest probe, `docker pull`, local
   image check, cleanup, and a step summary.
 * The reachability step runs curl with `--fail`; patch 2/4 replaces it
   because a 401 auth challenge on `/v2/` is then reported as a failure.
 * NOTE(review): in the manifest probe, `--write-out '%{http_code}'` is
   expected to print `000` even when curl gets no response, so the
   `|| echo '000'` fallback appends a second `000` and the captured value
   becomes `000000`. Harmless in this patch (every non-200 value fails the
   step), but the same construct matters in patch 2/4.

 .github/workflows/test-nvcr-pull.yml | 157 +++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 .github/workflows/test-nvcr-pull.yml

diff --git a/.github/workflows/test-nvcr-pull.yml b/.github/workflows/test-nvcr-pull.yml
new file mode 100644
index 00000000000..4ac535a7625
--- /dev/null
+++ b/.github/workflows/test-nvcr-pull.yml
@@ -0,0 +1,157 @@
+# nvcr.io public-image pull smoke test
+#
+# Goal: confirm the NVIDIA/cccl self-hosted runner pool can pull a
+# canonical public NVIDIA image from nvcr.io. Validates the runner →
+# nvcr.io network path and matches the dominant pattern used across
+# NVIDIA OSS GHA workflows (TensorRT-LLM, apex, NeMo, NVFlare, etc.):
+# anonymous pull from the public nvcr.io/nvidia/* namespace.
+#
+# No registry credentials are required. No secrets are referenced.
+# This test is deliberately scoped to the public nvcr.io/nvidia/*
+# namespace — private-namespace authentication is out of scope and
+# tracked separately.
+#
+# LOG DISCIPLINE
+# PASS/FAIL gates only. No hostnames, no kernel strings, no docker
+# info dumps, no resolved IPs — Actions logs on a public repo are
+# world-readable.
+#
+# TRIGGER
+# - Open a PR against NVIDIA/cccl from a fork. copy-pr-bot (already
+# configured for cccl) mirrors the PR head to a `pull-request/N`
+# branch on the upstream repo, and the workflow runs there on
+# `linux-amd64-cpu4` (self-hosted) with upstream context. Same
+# trust model as `secret-scan.yml`.
+# - "Run workflow" button on the Actions tab (workflow_dispatch).
+
+name: Test nvcr.io Image Pull (public)
+
+run-name: nvcr public-pull test — ${{ github.ref_name }}
+
+on:
+  push:
+    branches:
+      # copy-pr-bot mirror branches — same trigger model as
+      # secret-scan.yml and ci-workflow-pull-request.yml.
+      - "pull-request/[0-9]+"
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  pull-nvcr-image:
+    name: docker pull nvcr.io public image
+    # NV self-hosted CPU runner on NVIDIA/cccl; GitHub-hosted fallback
+    # so the workflow does not error out on contributor forks where
+    # nv-gha-runners labels do not resolve. Mirrors `secret-scan.yml`.
+    runs-on: ${{ github.repository == 'NVIDIA/cccl' && 'linux-amd64-cpu4' || 'ubuntu-latest' }}
+    env:
+      # Canonical public NVIDIA round-trip test image. Public namespace
+      # (nvcr.io/nvidia/*); no auth required. Small (~80 MB) for a fast
+      # smoke test. Same image NGC docs recommend for verifying nvcr.io
+      # connectivity end-to-end.
+      NVCR_IMAGE: nvcr.io/nvidia/cuda:12.4.0-base-ubuntu22.04
+
+    steps:
+      - name: Preflight — docker available
+        run: |
+          set -euo pipefail
+          if ! command -v docker >/dev/null 2>&1; then
+            echo "::error::docker CLI not found on this runner."
+            exit 1
+          fi
+          if ! docker version --format '{{.Server.Version}}' >/dev/null 2>&1; then
+            echo "::error::docker daemon not reachable from this runner."
+            exit 1
+          fi
+          echo "docker: OK"
+
+      - name: Network reachability — nvcr.io (PASS/FAIL only)
+        run: |
+          set -euo pipefail
+          if curl --silent --show-error --fail --max-time 15 \
+            --output /dev/null --head https://nvcr.io/v2/; then
+            echo "nvcr.io reachability (HTTPS HEAD /v2/): PASS"
+          else
+            rc=$?
+            echo "::error::nvcr.io unreachable (curl exit ${rc})."
+            exit "${rc}"
+          fi
+
+      - name: Anonymous manifest probe (HTTP 200 expected)
+        # Hits the OCI/Docker manifest endpoint with NO auth. For an
+        # image in the public nvcr.io/nvidia/* namespace this MUST
+        # return 200; anything else is a real signal and fails the
+        # test. Cheaper than a full layer pull as a pre-check.
+        run: |
+          set -euo pipefail
+          img="${NVCR_IMAGE#nvcr.io/}"
+          if [[ "${img}" == *@* ]]; then
+            img_path="${img%@*}"; img_ref="${img##*@}"
+          elif [[ "${img}" == *:* ]]; then
+            img_path="${img%:*}"; img_ref="${img##*:}"
+          else
+            img_path="${img}"; img_ref="latest"
+          fi
+          code="$(curl --silent --show-error --max-time 15 \
+            --output /dev/null --write-out '%{http_code}' \
+            --header 'Accept: application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json' \
+            "https://nvcr.io/v2/${img_path}/manifests/${img_ref}" 2>/dev/null || echo '000')"
+          case "${code}" in
+            200) echo "anonymous manifest fetch: PASS (HTTP 200)" ;;
+            *) echo "::error::anonymous manifest fetch: HTTP ${code} (expected 200 for public namespace)"
+               exit 1 ;;
+          esac
+
+      - name: docker pull (anonymous)
+        run: |
+          set -euo pipefail
+          if docker pull "${NVCR_IMAGE}" >/dev/null 2>&1; then
+            echo "docker pull: PASS"
+          else
+            rc=$?
+            echo "::error::docker pull failed (exit ${rc})."
+            exit "${rc}"
+          fi
+
+      - name: Sanity — image present locally
+        run: |
+          set -euo pipefail
+          if docker image inspect "${NVCR_IMAGE}" \
+            --format '{{.Architecture}}/{{.Os}} size={{.Size}}' \
+            > /tmp/img_meta 2>/dev/null; then
+            echo "image present: PASS ($(cat /tmp/img_meta))"
+          else
+            echo "::error::image not present locally after pull."
+            exit 1
+          fi
+
+      - name: Cleanup
+        if: always()
+        run: |
+          if [[ -n "${NVCR_IMAGE:-}" ]]; then
+            docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1 || true
+          fi
+
+      - name: Summary (no runner identity)
+        if: always()
+        run: |
+          {
+            echo "### nvcr.io public-image pull smoke test"
+            echo
+            echo "| Field | Value |"
+            echo "|---|---|"
+            echo "| Repository | \`${GITHUB_REPOSITORY}\` |"
+            echo "| Branch | \`${GITHUB_REF_NAME}\` |"
+            echo "| Runner class | \`${RUNNER_OS}/${RUNNER_ARCH}\` |"
+            echo "| Image | \`${NVCR_IMAGE}\` |"
+            echo "| Job outcome | \`${{ job.status }}\` |"
+            echo
+            echo "_Step logs contain per-gate PASS/FAIL. Runner hostname,_"
+            echo "_kernel, daemon config, and resolved IPs are deliberately not echoed._"
+          } >> "${GITHUB_STEP_SUMMARY}"
From f67036a159380ff47c9a72a2d6a7f34fd4c1cb62 Mon Sep 17 00:00:00 2001
From: gmanal
Date: Fri, 5 Jun 2026 15:14:01 +0530
Subject: [PATCH 2/4] test again - 01

---
Review notes (dropped by `git am`):
 * Rewrites the reachability step to classify the HTTP status from a HEAD
   request to https://nvcr.io/v2/: `000` and `5*` fail the step,
   `2*|401|403` pass, any other code only emits a `::warning::`.
 * Removes the anonymous manifest probe and leaves a comment explaining
   why; `docker pull` remains the functional check.
 * NOTE(review): the new step comment says ANY HTTP response is accepted
   as PASS, but the `5*` arm exits 1 — comment and code disagree.
 * NOTE(review): when curl gets no response, `--write-out '%{http_code}'`
   is expected to print `000` itself and `|| echo '000'` then appends
   another, so `code` is likely `000000`. That does not match the `000)`
   arm and falls through to `*)`, which only warns, so an unreachable
   registry would not fail this step — confirm. The step is deleted again
   in patch 3/4.

 .github/workflows/test-nvcr-pull.yml | 64 ++++++++++++++--------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/test-nvcr-pull.yml b/.github/workflows/test-nvcr-pull.yml
index 4ac535a7625..6c65d44483f 100644
--- a/.github/workflows/test-nvcr-pull.yml
+++ b/.github/workflows/test-nvcr-pull.yml
@@ -71,43 +71,45 @@ jobs:
           fi
           echo "docker: OK"
 
-      - name: Network reachability — nvcr.io (PASS/FAIL only)
+      - name: Network reachability — nvcr.io
+        # Accepts ANY HTTP response from nvcr.io as PASS — the goal is
+        # purely "the runner can reach the registry over the network".
+        # HTTP 401 on `/v2/` is *expected* per OCI Distribution Spec:
+        # the endpoint returns a Bearer-token auth challenge, and the
+        # docker client follows that challenge automatically (anonymous
+        # token for public images). A plain `curl --fail` would treat
+        # 401 as failure, which is misleading — we only fail here on
+        # actual network errors (no HTTP response at all).
         run: |
           set -euo pipefail
-          if curl --silent --show-error --fail --max-time 15 \
-            --output /dev/null --head https://nvcr.io/v2/; then
-            echo "nvcr.io reachability (HTTPS HEAD /v2/): PASS"
-          else
-            rc=$?
-            echo "::error::nvcr.io unreachable (curl exit ${rc})."
-            exit "${rc}"
-          fi
-
-      - name: Anonymous manifest probe (HTTP 200 expected)
-        # Hits the OCI/Docker manifest endpoint with NO auth. For an
-        # image in the public nvcr.io/nvidia/* namespace this MUST
-        # return 200; anything else is a real signal and fails the
-        # test. Cheaper than a full layer pull as a pre-check.
-        run: |
-          set -euo pipefail
-          img="${NVCR_IMAGE#nvcr.io/}"
-          if [[ "${img}" == *@* ]]; then
-            img_path="${img%@*}"; img_ref="${img##*@}"
-          elif [[ "${img}" == *:* ]]; then
-            img_path="${img%:*}"; img_ref="${img##*:}"
-          else
-            img_path="${img}"; img_ref="latest"
-          fi
-          code="$(curl --silent --show-error --max-time 15 \
+          code="$(curl --silent --max-time 15 \
             --output /dev/null --write-out '%{http_code}' \
-            --header 'Accept: application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json' \
-            "https://nvcr.io/v2/${img_path}/manifests/${img_ref}" 2>/dev/null || echo '000')"
+            --head https://nvcr.io/v2/ || echo '000')"
           case "${code}" in
-            200) echo "anonymous manifest fetch: PASS (HTTP 200)" ;;
-            *) echo "::error::anonymous manifest fetch: HTTP ${code} (expected 200 for public namespace)"
-               exit 1 ;;
+            000)
+              echo "::error::nvcr.io unreachable (no HTTP response)."
+              exit 1
+              ;;
+            2*|401|403)
+              echo "nvcr.io reachable: PASS (HTTP ${code}; 401 is the expected OCI auth challenge)"
+              ;;
+            5*)
+              echo "::error::nvcr.io reachable but server error (HTTP ${code})."
+              exit 1
+              ;;
+            *)
+              echo "::warning::nvcr.io reachable but unexpected HTTP ${code}"
+              ;;
           esac
 
+      # NOTE: no standalone manifest probe. nvcr.io returns 401 on the
+      # manifest endpoint to unauthenticated clients (OCI auth challenge,
+      # same reason as `/v2/` above) — a plain HTTP probe is therefore
+      # not a reliable public-vs-private signal on this registry. The
+      # functional test is `docker pull` below, which handles the full
+      # token-dance correctly and is the ground truth for "can this
+      # runner pull this image".
+
       - name: docker pull (anonymous)
         run: |
           set -euo pipefail
From f10932dca4276e8442ba0ca99d4bbee7c873df6a Mon Sep 17 00:00:00 2001
From: gmanal
Date: Fri, 5 Jun 2026 15:22:26 +0530
Subject: [PATCH 3/4] test again - 02

---
Review notes (dropped by `git am`):
 * Reduces the workflow to three steps: `docker pull`, a size report built
   from `docker image inspect --format '{{.Size}}'`, and cleanup. The
   preflight, reachability, local-image check and summary steps are
   removed, and the job is renamed to `pull-and-size`.
 * Changes NVCR_IMAGE from nvcr.io/nvidia/cuda:12.4.0-base-ubuntu22.04 to
   nvcr.io/nvidian/prodsec/pulse-trufflehog:1.33 and rewrites the header
   comment: the pull now relies on credentials already configured on the
   runner rather than being anonymous.
 * `docker pull` output is no longer redirected to /dev/null, and the
   "LOG DISCIPLINE" section of the header comment is removed with it.

 .github/workflows/test-nvcr-pull.yml | 153 +++++----------------------
 1 file changed, 24 insertions(+), 129 deletions(-)

diff --git a/.github/workflows/test-nvcr-pull.yml b/.github/workflows/test-nvcr-pull.yml
index 6c65d44483f..f598f3026cf 100644
--- a/.github/workflows/test-nvcr-pull.yml
+++ b/.github/workflows/test-nvcr-pull.yml
@@ -1,38 +1,22 @@
-# nvcr.io public-image pull smoke test
+# nvcr.io image pull + size — single-job smoke test.
 #
-# Goal: confirm the NVIDIA/cccl self-hosted runner pool can pull a
-# canonical public NVIDIA image from nvcr.io. Validates the runner →
-# nvcr.io network path and matches the dominant pattern used across
-# NVIDIA OSS GHA workflows (TensorRT-LLM, apex, NeMo, NVFlare, etc.):
-# anonymous pull from the public nvcr.io/nvidia/* namespace.
+# Pulls a specific nvcr.io image and reports its on-disk size. Does
+# not include `docker login` — relies on whatever credentials the
+# runner is configured with (the Packer-baked NGC pull-secret on
+# nv-gha-runners, if present). A failure with an auth error is
+# itself useful signal that the runner is not pre-configured for
+# the target namespace.
 #
-# No registry credentials are required. No secrets are referenced.
-# This test is deliberately scoped to the public nvcr.io/nvidia/*
-# namespace — private-namespace authentication is out of scope and
-# tracked separately.
-#
-# LOG DISCIPLINE
-# PASS/FAIL gates only. No hostnames, no kernel strings, no docker
-# info dumps, no resolved IPs — Actions logs on a public repo are
-# world-readable.
-#
-# TRIGGER
-# - Open a PR against NVIDIA/cccl from a fork. copy-pr-bot (already
-# configured for cccl) mirrors the PR head to a `pull-request/N`
-# branch on the upstream repo, and the workflow runs there on
-# `linux-amd64-cpu4` (self-hosted) with upstream context. Same
-# trust model as `secret-scan.yml`.
-# - "Run workflow" button on the Actions tab (workflow_dispatch).
+# Triggers: same as `secret-scan.yml` — copy-pr-bot mirror branches
+# (`pull-request/[0-9]+`) on NVIDIA/cccl + workflow_dispatch.
 
-name: Test nvcr.io Image Pull (public)
+name: nvcr image pull + size
 
-run-name: nvcr public-pull test — ${{ github.ref_name }}
+run-name: nvcr pull — ${{ github.ref_name }}
 
 on:
   push:
     branches:
-      # copy-pr-bot mirror branches — same trigger model as
-      # secret-scan.yml and ci-workflow-pull-request.yml.
       - "pull-request/[0-9]+"
   workflow_dispatch:
 
@@ -44,116 +28,27 @@ permissions:
   contents: read
 
 jobs:
-  pull-nvcr-image:
-    name: docker pull nvcr.io public image
+  pull-and-size:
+    name: docker pull + size
     # NV self-hosted CPU runner on NVIDIA/cccl; GitHub-hosted fallback
-    # so the workflow does not error out on contributor forks where
-    # nv-gha-runners labels do not resolve. Mirrors `secret-scan.yml`.
+    # on contributor forks where nv-gha-runners labels do not resolve.
     runs-on: ${{ github.repository == 'NVIDIA/cccl' && 'linux-amd64-cpu4' || 'ubuntu-latest' }}
     env:
-      # Canonical public NVIDIA round-trip test image. Public namespace
-      # (nvcr.io/nvidia/*); no auth required. Small (~80 MB) for a fast
-      # smoke test. Same image NGC docs recommend for verifying nvcr.io
-      # connectivity end-to-end.
-      NVCR_IMAGE: nvcr.io/nvidia/cuda:12.4.0-base-ubuntu22.04
+      NVCR_IMAGE: nvcr.io/nvidian/prodsec/pulse-trufflehog:1.33
 
     steps:
-      - name: Preflight — docker available
-        run: |
-          set -euo pipefail
-          if ! command -v docker >/dev/null 2>&1; then
-            echo "::error::docker CLI not found on this runner."
-            exit 1
-          fi
-          if ! docker version --format '{{.Server.Version}}' >/dev/null 2>&1; then
-            echo "::error::docker daemon not reachable from this runner."
-            exit 1
-          fi
-          echo "docker: OK"
-
-      - name: Network reachability — nvcr.io
-        # Accepts ANY HTTP response from nvcr.io as PASS — the goal is
-        # purely "the runner can reach the registry over the network".
-        # HTTP 401 on `/v2/` is *expected* per OCI Distribution Spec:
-        # the endpoint returns a Bearer-token auth challenge, and the
-        # docker client follows that challenge automatically (anonymous
-        # token for public images). A plain `curl --fail` would treat
-        # 401 as failure, which is misleading — we only fail here on
-        # actual network errors (no HTTP response at all).
-        run: |
-          set -euo pipefail
-          code="$(curl --silent --max-time 15 \
-            --output /dev/null --write-out '%{http_code}' \
-            --head https://nvcr.io/v2/ || echo '000')"
-          case "${code}" in
-            000)
-              echo "::error::nvcr.io unreachable (no HTTP response)."
-              exit 1
-              ;;
-            2*|401|403)
-              echo "nvcr.io reachable: PASS (HTTP ${code}; 401 is the expected OCI auth challenge)"
-              ;;
-            5*)
-              echo "::error::nvcr.io reachable but server error (HTTP ${code})."
-              exit 1
-              ;;
-            *)
-              echo "::warning::nvcr.io reachable but unexpected HTTP ${code}"
-              ;;
-          esac
-
-      # NOTE: no standalone manifest probe. nvcr.io returns 401 on the
-      # manifest endpoint to unauthenticated clients (OCI auth challenge,
-      # same reason as `/v2/` above) — a plain HTTP probe is therefore
-      # not a reliable public-vs-private signal on this registry. The
-      # functional test is `docker pull` below, which handles the full
-      # token-dance correctly and is the ground truth for "can this
-      # runner pull this image".
+      - name: docker pull
+        run: docker pull "${NVCR_IMAGE}"
 
-      - name: docker pull (anonymous)
+      - name: Report size
         run: |
           set -euo pipefail
-          if docker pull "${NVCR_IMAGE}" >/dev/null 2>&1; then
-            echo "docker pull: PASS"
-          else
-            rc=$?
-            echo "::error::docker pull failed (exit ${rc})."
-            exit "${rc}"
-          fi
-
-      - name: Sanity — image present locally
-        run: |
-          set -euo pipefail
-          if docker image inspect "${NVCR_IMAGE}" \
-            --format '{{.Architecture}}/{{.Os}} size={{.Size}}' \
-            > /tmp/img_meta 2>/dev/null; then
-            echo "image present: PASS ($(cat /tmp/img_meta))"
-          else
-            echo "::error::image not present locally after pull."
-            exit 1
-          fi
+          bytes=$(docker image inspect "${NVCR_IMAGE}" --format '{{.Size}}')
+          mib=$(awk -v b="${bytes}" 'BEGIN { printf "%.1f", b/1024/1024 }')
+          gib=$(awk -v b="${bytes}" 'BEGIN { printf "%.2f", b/1024/1024/1024 }')
+          echo "Image: ${NVCR_IMAGE}"
+          echo "Size: ${bytes} bytes (${mib} MiB / ${gib} GiB)"
 
       - name: Cleanup
         if: always()
-        run: |
-          if [[ -n "${NVCR_IMAGE:-}" ]]; then
-            docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1 || true
-          fi
-
-      - name: Summary (no runner identity)
-        if: always()
-        run: |
-          {
-            echo "### nvcr.io public-image pull smoke test"
-            echo
-            echo "| Field | Value |"
-            echo "|---|---|"
-            echo "| Repository | \`${GITHUB_REPOSITORY}\` |"
-            echo "| Branch | \`${GITHUB_REF_NAME}\` |"
-            echo "| Runner class | \`${RUNNER_OS}/${RUNNER_ARCH}\` |"
-            echo "| Image | \`${NVCR_IMAGE}\` |"
-            echo "| Job outcome | \`${{ job.status }}\` |"
-            echo
-            echo "_Step logs contain per-gate PASS/FAIL. Runner hostname,_"
-            echo "_kernel, daemon config, and resolved IPs are deliberately not echoed._"
-          } >> "${GITHUB_STEP_SUMMARY}"
+        run: docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1 || true
From 2212b4c1b7cf000e9b3654355a88529417035709 Mon Sep 17 00:00:00 2001
From: gmanal
Date: Fri, 5 Jun 2026 15:55:25 +0530
Subject: [PATCH 4/4] test again - 03

---
Review notes (dropped by `git am`):
 * Adds `timeout-minutes: 15` to the job.
 * Cleanup no longer swallows every `docker rmi` failure: it exits 0 early
   when `docker image inspect` shows the image is absent, and emits a
   `::warning::` when the image is present but removal fails. The step
   itself still succeeds in both cases.

 .github/workflows/test-nvcr-pull.yml | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test-nvcr-pull.yml b/.github/workflows/test-nvcr-pull.yml
index f598f3026cf..04ffc83d4fb 100644
--- a/.github/workflows/test-nvcr-pull.yml
+++ b/.github/workflows/test-nvcr-pull.yml
@@ -33,6 +33,11 @@ jobs:
     # NV self-hosted CPU runner on NVIDIA/cccl; GitHub-hosted fallback
     # on contributor forks where nv-gha-runners labels do not resolve.
     runs-on: ${{ github.repository == 'NVIDIA/cccl' && 'linux-amd64-cpu4' || 'ubuntu-latest' }}
+    # Cap stuck pulls / registry hangs so a wedged job doesn't sit on a
+    # self-hosted runner indefinitely. 15 min covers a multi-GB image
+    # pull on a busy runner with headroom; well under GitHub's default
+    # 6-hour job timeout.
+    timeout-minutes: 15
     env:
       NVCR_IMAGE: nvcr.io/nvidian/prodsec/pulse-trufflehog:1.33
 
@@ -51,4 +56,21 @@ jobs:
 
       - name: Cleanup
         if: always()
-        run: docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1 || true
+        # Only attempt removal if the image is actually present locally.
+        # If the pull failed (e.g. auth error on a private namespace),
+        # `docker rmi` would fail with "no such image" — that's expected,
+        # not a real disk-growth signal, so we skip silently. If the
+        # image IS present and `rmi` fails, that's a genuine problem on
+        # a self-hosted runner (leaks layers across runs) — surface as
+        # a warning so it shows up in the run log.
+        run: |
+          set -euo pipefail
+          if ! docker image inspect "${NVCR_IMAGE}" >/dev/null 2>&1; then
+            echo "cleanup: image not present locally (likely pull failed); nothing to remove"
+            exit 0
+          fi
+          if ! docker rmi "${NVCR_IMAGE}" >/dev/null 2>&1; then
+            echo "::warning::cleanup: failed to remove ${NVCR_IMAGE} from runner cache; manual cleanup may be needed to avoid disk growth on this runner"
+          else
+            echo "cleanup: removed ${NVCR_IMAGE}"
+          fi