Commit b2ab030

Separate CI job for Megatron GPU tests (#888)
## What does this PR do?

**Short term:** Megatron-based tests take a long time, often causing CI/CD timeouts. This PR splits the Megatron tests into a dedicated CI/CD job for a faster overall CI/CD run.

**Mid/long term:** Run all Megatron GPU tests with `torchrun` instead of `pytest`, so the distributed processes are created once up front and individual tests no longer need to set up and destroy their own processes, which adds substantial per-test overhead.

## Testing

- [x] 1-GPU CI/CD passing (on this PR)
- [x] 2-GPU CI/CD passing (on a manually triggered nightly run): https://github.com/NVIDIA/Model-Optimizer/actions/runs/22000517688

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
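The per-test overhead motivating the `torchrun` plan can be made concrete with a toy cost model. All numbers below are illustrative assumptions, not measurements from this repo:

```python
# Toy model of the overhead described above: under pytest each distributed
# test sets up and destroys its own process group, while under torchrun that
# cost would be paid once for the whole suite. Numbers are assumed.
SETUP_TEARDOWN_S = 20.0  # assumed cost to create + destroy dist processes
TEST_BODY_S = 10.0       # assumed average time of the test body itself
N_TESTS = 30             # assumed number of Megatron GPU tests

per_test_setup = N_TESTS * (SETUP_TEARDOWN_S + TEST_BODY_S)  # pytest today
shared_setup = SETUP_TEARDOWN_S + N_TESTS * TEST_BODY_S      # torchrun plan
print(f"per-test setup: {per_test_setup:.0f}s, shared setup: {shared_setup:.0f}s")
```

Under these assumed numbers the fixed setup cost dominates, which is why amortizing it once per suite is attractive.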
1 parent 0781a04 commit b2ab030

22 files changed (+45, −39 lines)

.github/workflows/example_tests.yml

Lines changed: 10 additions & 27 deletions
@@ -56,18 +56,21 @@ jobs:
       match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
 
-  ##### PyTorch Example Tests #####
+  ##### PyTorch Example Tests (speculative_decoding requires 26.01 image) #####
   torch-pr:
     needs: [check-file-changes, wait-checks]
     if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
     strategy:
       fail-fast: false
       matrix:
         example: [llm_distill, llm_qat, llm_sparsity]
+        include:
+          - example: speculative_decoding
+            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
+      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-l4-latest-1
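The `||` in `${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}` is the GitHub Actions expression operator that returns the first truthy operand, so only matrix entries that set `docker_image` override the default. Python's `or` short-circuits the same way; a minimal sketch (the function name is mine, not from the workflow):

```python
def resolve_image(matrix_entry: dict) -> str:
    """Mirror of `${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}`."""
    # `or` returns the first truthy operand, matching Actions' `||` semantics.
    return matrix_entry.get("docker_image") or "nvcr.io/nvidia/pytorch:25.06-py3"

# Entries from the plain `example:` list get the default image...
print(resolve_image({"example": "llm_qat"}))
# ...while the `include:` entry for speculative_decoding overrides it.
print(resolve_image({"example": "speculative_decoding",
                     "docker_image": "nvcr.io/nvidia/pytorch:26.01-py3"}))
```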
@@ -78,36 +81,17 @@ jobs:
       fail-fast: false
       matrix:
         example: [llm_distill, llm_qat, llm_sparsity]
+        include:
+          - example: speculative_decoding
+            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
+      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-h100-latest-2
 
-  ##### Speculative Decoding Example Tests (requires 26.01 image) #####
-  speculative-decoding-pr:
-    needs: [check-file-changes, wait-checks]
-    if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
-    uses: ./.github/workflows/_example_tests_runner.yml
-    secrets: inherit
-    with:
-      docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
-      example: speculative_decoding
-      pip_install_extras: "[hf,dev-test]"
-      runner: linux-amd64-gpu-l4-latest-1
-
-  speculative-decoding-non-pr:
-    if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    uses: ./.github/workflows/_example_tests_runner.yml
-    secrets: inherit
-    with:
-      docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
-      example: speculative_decoding
-      pip_install_extras: "[hf,dev-test]"
-      runner: linux-amd64-gpu-h100-latest-2
-
   ##### TensorRT-LLM Example Tests #####
   trtllm-pr:
     needs: [check-file-changes, wait-checks]
 
@@ -172,15 +156,14 @@ jobs:
   example-pr-required-check:
     # Run even if example tests are skipped
     if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
-    needs: [check-file-changes, torch-pr, speculative-decoding-pr, trtllm-pr, onnx-pr]
+    needs: [check-file-changes, torch-pr, trtllm-pr, onnx-pr]
     runs-on: ubuntu-latest
     steps:
       - name: Required GPU tests did not succeed
         if: |
           needs.check-file-changes.result != 'success' ||
           (needs.check-file-changes.outputs.any_changed == 'true' && (
             needs.torch-pr.result != 'success' ||
-            needs.speculative-decoding-pr.result != 'success' ||
             needs.trtllm-pr.result != 'success' ||
             needs.onnx-pr.result != 'success'
           ))

.github/workflows/gpu_tests.yml

Lines changed: 19 additions & 3 deletions
@@ -59,8 +59,16 @@ jobs:
   gpu-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - example: py312-cuda12-gpu
+            timeout: 90
+          - example: py312-cuda12-gpu-megatron
+            timeout: 120
     runs-on: linux-amd64-gpu-l4-latest-1
-    timeout-minutes: 120
+    timeout-minutes: ${{ matrix.timeout }}
     container: &gpu_container
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
 
@@ -74,11 +82,19 @@ jobs:
         run: |
           echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
-        run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
+        run: pip install tox-current-env && tox -e ${{ matrix.example }} --current-env
 
   gpu-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - example: py312-cuda12-gpu
+            timeout: 90
+          - example: py312-cuda12-gpu-megatron
+            timeout: 120
     runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 150
+    timeout-minutes: ${{ matrix.timeout }}
     container: *gpu_container
     steps: *gpu_steps
 
   gpu-pr-required-check:
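Each `include:` entry above expands into its own job with a per-entry timeout and tox environment. A small sketch of that expansion, with the values copied from the workflow:

```python
# Sketch of how the new `matrix.include` expands into separate jobs, each
# with its own tox environment and timeout-minutes value.
matrix_include = [
    {"example": "py312-cuda12-gpu", "timeout": 90},
    {"example": "py312-cuda12-gpu-megatron", "timeout": 120},
]
jobs = [
    {
        "name": f"gpu-tests ({entry['example']})",
        "timeout-minutes": entry["timeout"],
        "run": f"pip install tox-current-env && tox -e {entry['example']} --current-env",
    }
    for entry in matrix_include
]
for job in jobs:
    print(job["name"], job["timeout-minutes"])
```

With `fail-fast: false`, a failure in one expanded job does not cancel the other, so the fast non-Megatron job still reports its result even if the Megatron job times out.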

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -132,8 +132,8 @@ disable_error_code = ["attr-defined"]
 [tool.pytest.ini_options]
 # Default additional options
 # Show a short test summary info for all except passed tests with -ra flag
-# print execution time for 20 slowest tests and generate coverage reports
-addopts = "-v -ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
+# print execution time for 50 slowest tests and generate coverage reports
+addopts = "-v -ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=50 --strict-markers"
 pythonpath = ["tests/"]
 markers = [
     "manual: Only run when --run-manual is given",
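`--durations=50` makes pytest print the 50 slowest test phases at the end of a run, which helps spot long-running tests like the Megatron ones this commit isolates. In miniature (the timings below are invented for illustration):

```python
# Miniature of pytest's --durations=N report: the N slowest entries,
# sorted by elapsed time, descending. Timings are made up, not measured.
N = 50
timings = {
    "test_unified_export_megatron": 412.0,
    "test_llm_qat_smoke": 35.5,
    "test_distill_megatron": 120.3,
}
slowest = sorted(timings.items(), key=lambda kv: kv[1], reverse=True)[:N]
for name, secs in slowest:
    print(f"{secs:8.2f}s  {name}")
```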

setup.py

Lines changed: 1 addition & 0 deletions
@@ -77,6 +77,7 @@
         "pytest-cov",
         "pytest-instafail",
         "pytest-timeout",
+        "sentencepiece",  # For test_unified_export_megatron.py, test_vllm_fakequant_megatron_export.py
         "timm",
         "torchprofile>=0.0.4",  # For computing flops of CV models
         "torchvision",

tests/gpu_megatron/_extensions

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+../gpu/_extensions/

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+../../gpu/torch/conftest.py
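The new `tests/gpu_megatron` tree reuses the existing GPU test fixtures via relative symlinks rather than copies. A minimal sketch of the same layout, built in a scratch directory for illustration:

```python
import os
import tempfile

# Recreate the layout: gpu_megatron/_extensions is a relative symlink
# pointing back into the shared tests/gpu/_extensions/ directory.
root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "tests", "gpu", "_extensions"))
os.makedirs(os.path.join(root, "tests", "gpu_megatron"))
link = os.path.join(root, "tests", "gpu_megatron", "_extensions")
os.symlink(os.path.join("..", "gpu", "_extensions"), link)

# The link resolves into the shared directory, so both test trees see
# identical fixture code without duplicating files.
print(os.path.realpath(link))
```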

tests/gpu/torch/distill/plugins/test_distill_megatron.py renamed to tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py

File renamed without changes.

tests/gpu/torch/export/test_unified_export_megatron.py renamed to tests/gpu_megatron/torch/export/test_unified_export_megatron.py

File renamed without changes.

tests/gpu/torch/export/test_vllm_fakequant_megatron_export.py renamed to tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py

File renamed without changes.

tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py renamed to tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py

File renamed without changes.
