simplify gpu_tests.yml

kevalmorabia97 · kevalmorabia97 · commit caeef809709f · 2026-05-29T11:51:19.000-07:00
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
@@ -40,56 +40,61 @@ jobs:
         include:
           - example: gpu
             timeout: 75
-            container_image: pytorch:26.04-py3
+            container_image: nvcr.io/nvidia/pytorch:26.04-py3
           - example: gpu_megatron
             timeout: 45
-            container_image: nemo:26.04
+            container_image: nvcr.io/nvidia/nemo:26.04
           - example: gpu_trtllm
             timeout: 30
-            container_image: tensorrt-llm/release:1.3.0rc16
+            container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16
+          - example: gpu_vllm
+            timeout: 30
+            container_image: docker.io/vllm/vllm-openai:v0.20.0
     runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
     timeout-minutes: ${{ matrix.timeout }}
     container:
-      image: nvcr.io/nvidia/${{ matrix.container_image }}
+      image: ${{ matrix.container_image }}
+      # nvcr.io images require NGC auth; public docker.io images (e.g. vllm) are pulled
+      # anonymously (the runner skips docker login when username/password are empty).
       credentials:
-        username: $oauthtoken
-        password: ${{ secrets.NGC_API_KEY }}
-      env:
-        GIT_DEPTH: 1000 # For correct version
-        PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/gpu-test-run
-        with:
-          example: ${{ matrix.example }}
-          codecov_token: ${{ secrets.CODECOV_TOKEN }}
-
-  # Docker Hub image: anonymous pull (no ``credentials:``) and no coverage
-  gpu-tests-vllm:
-    needs: [pr-gate]
-    if: needs.pr-gate.outputs.any_changed == 'true'
-    runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
-    timeout-minutes: 30
-    container:
-      image: docker.io/vllm/vllm-openai:v0.20.0
+        username: ${{ startsWith(matrix.container_image, 'nvcr.io/') && '$oauthtoken' || '' }} # trailing '/' anchors the match to the registry host
+        password: ${{ startsWith(matrix.container_image, 'nvcr.io/') && secrets.NGC_API_KEY || '' }}
       env:
         GIT_DEPTH: 1000 # For correct version
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
+      - name: Install git
+        # The vllm container ships without git; needed for a real checkout (correct
+        # setuptools-scm version) and for the Codecov upload below.
+        if: matrix.example == 'gpu_vllm'
+        run: apt-get update && apt-get install -y git
       - uses: actions/checkout@v6
-      - uses: ./.github/actions/gpu-test-run
+      - uses: nv-gha-runners/setup-proxy-cache@main
+      - name: Setup environment variables # ${VAR:+...} avoids a leading ':' (empty entry = current dir) when LD_LIBRARY_PATH is unset
+        run: |
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/include:/usr/lib/x86_64-linux-gnu" >> "$GITHUB_ENV"
+      - name: Run gpu tests
+        env:
+          COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
+          COVERAGE_FILE: ${{ github.workspace }}/.coverage
+        run: |
+          python -m pip install nox && nox -s ${{ matrix.example }}
+      - name: Upload GPU coverage to Codecov
+        uses: codecov/codecov-action@v5
         with:
-          example: gpu_vllm
-          with_coverage: "false"
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: coverage.xml
+          flags: gpu
+          fail_ci_if_error: false # test may be skipped if relevant file changes are not detected
+          verbose: true
 
   gpu-pr-required-check:
-    # Run even if any of the gpu jobs is skipped
+    # Run even if gpu-tests is skipped
     if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
-    needs: [pr-gate, gpu-tests, gpu-tests-vllm]
+    needs: [pr-gate, gpu-tests]
     runs-on: ubuntu-latest
     steps:
       - name: Required GPU tests did not succeed
-        if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && (needs.gpu-tests.result != 'success' || needs.gpu-tests-vllm.result != 'success')) }}
+        if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && needs.gpu-tests.result != 'success') }}
         run: exit 1