Skip to content

Commit 7f8ccaf

Browse files
Add back sentencepiece dependency for tests + cleanup workflow
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent cdd7d79 commit 7f8ccaf

5 files changed

Lines changed: 25 additions & 37 deletions

File tree

.github/workflows/example_tests.yml

Lines changed: 10 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -56,18 +56,21 @@ jobs:
5656
match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
5757
delay: 300s
5858

59-
##### PyTorch Example Tests #####
59+
##### PyTorch Example Tests (speculative_decoding requires 26.01 image) #####
6060
torch-pr:
6161
needs: [check-file-changes, wait-checks]
6262
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
6363
strategy:
6464
fail-fast: false
6565
matrix:
6666
example: [llm_distill, llm_qat, llm_sparsity]
67+
include:
68+
- example: speculative_decoding
69+
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
6770
uses: ./.github/workflows/_example_tests_runner.yml
6871
secrets: inherit
6972
with:
70-
docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
73+
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
7174
example: ${{ matrix.example }}
7275
pip_install_extras: "[hf,dev-test]"
7376
runner: linux-amd64-gpu-l4-latest-1
@@ -78,36 +81,17 @@ jobs:
7881
fail-fast: false
7982
matrix:
8083
example: [llm_distill, llm_qat, llm_sparsity]
84+
include:
85+
- example: speculative_decoding
86+
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
8187
uses: ./.github/workflows/_example_tests_runner.yml
8288
secrets: inherit
8389
with:
84-
docker_image: "nvcr.io/nvidia/pytorch:25.06-py3"
90+
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
8591
example: ${{ matrix.example }}
8692
pip_install_extras: "[hf,dev-test]"
8793
runner: linux-amd64-gpu-h100-latest-2
8894

89-
##### Speculative Decoding Example Tests (requires 26.01 image) #####
90-
speculative-decoding-pr:
91-
needs: [check-file-changes, wait-checks]
92-
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
93-
uses: ./.github/workflows/_example_tests_runner.yml
94-
secrets: inherit
95-
with:
96-
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
97-
example: speculative_decoding
98-
pip_install_extras: "[hf,dev-test]"
99-
runner: linux-amd64-gpu-l4-latest-1
100-
101-
speculative-decoding-non-pr:
102-
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
103-
uses: ./.github/workflows/_example_tests_runner.yml
104-
secrets: inherit
105-
with:
106-
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
107-
example: speculative_decoding
108-
pip_install_extras: "[hf,dev-test]"
109-
runner: linux-amd64-gpu-h100-latest-2
110-
11195
##### TensorRT-LLM Example Tests #####
11296
trtllm-pr:
11397
needs: [check-file-changes, wait-checks]
@@ -172,15 +156,14 @@ jobs:
172156
example-pr-required-check:
173157
# Run even if example tests are skipped
174158
if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
175-
needs: [check-file-changes, torch-pr, speculative-decoding-pr, trtllm-pr, onnx-pr]
159+
needs: [check-file-changes, torch-pr, trtllm-pr, onnx-pr]
176160
runs-on: ubuntu-latest
177161
steps:
178162
- name: Required GPU tests did not succeed
179163
if: |
180164
needs.check-file-changes.result != 'success' ||
181165
(needs.check-file-changes.outputs.any_changed == 'true' && (
182166
needs.torch-pr.result != 'success' ||
183-
needs.speculative-decoding-pr.result != 'success' ||
184167
needs.trtllm-pr.result != 'success' ||
185168
needs.onnx-pr.result != 'success'
186169
))

.github/workflows/gpu_tests.yml

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -62,9 +62,13 @@ jobs:
6262
strategy:
6363
fail-fast: false
6464
matrix:
65-
example: [py312-cuda12-gpu, py312-cuda12-gpu-megatron]
65+
include:
66+
- example: py312-cuda12-gpu
67+
timeout: 90
68+
- example: py312-cuda12-gpu-megatron
69+
timeout: 120
6670
runs-on: linux-amd64-gpu-l4-latest-1
67-
timeout-minutes: 90
71+
timeout-minutes: ${{ matrix.timeout }}
6872
container: &gpu_container
6973
image: nvcr.io/nvidia/pytorch:25.06-py3
7074
env:
@@ -84,9 +88,13 @@ jobs:
8488
strategy:
8589
fail-fast: false
8690
matrix:
87-
example: [py312-cuda12-gpu, py312-cuda12-gpu-megatron]
91+
include:
92+
- example: py312-cuda12-gpu
93+
timeout: 90
94+
- example: py312-cuda12-gpu-megatron
95+
timeout: 120
8896
runs-on: linux-amd64-gpu-h100-latest-2
89-
timeout-minutes: 90
97+
timeout-minutes: ${{ matrix.timeout }}
9098
container: *gpu_container
9199
steps: *gpu_steps
92100
gpu-pr-required-check:

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -132,8 +132,8 @@ disable_error_code = ["attr-defined"]
132132
[tool.pytest.ini_options]
133133
# Default additional options
134134
# Show a short test summary info for all except passed tests with -ra flag
135-
# print execution time for 20 slowest tests and generate coverage reports
136-
addopts = "-v -ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=20 --strict-markers"
135+
# print execution time for 50 slowest tests and generate coverage reports
136+
addopts = "-v -ra --instafail --cov-report=term-missing --cov-report=html --cov-report=xml:coverage.xml --cov-config=pyproject.toml --durations=50 --strict-markers"
137137
pythonpath = ["tests/"]
138138
markers = [
139139
"manual: Only run when --run-manual is given",

setup.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,7 @@
7777
"pytest-cov",
7878
"pytest-instafail",
7979
"pytest-timeout",
80+
"sentencepiece", # For test_unified_export_megatron.py, test_vllm_fakequant_megatron_export.py
8081
"timm",
8182
"torchprofile>=0.0.4", # For computing flops of CV models
8283
"torchvision",

tox.ini

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -62,8 +62,6 @@ commands_pre =
6262
# Install deps here so that it gets installed even in --current-env
6363
pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
6464

65-
# NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
66-
# to avoid possible CUDA version mismatch
6765
pip install -e .[all,dev-test]
6866
commands =
6967
# Coverage fails with "Can't combine line data with arc data" error so not using "--cov"
@@ -79,8 +77,6 @@ commands_pre =
7977
pip-mark-installed triton
8078
pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git
8179

82-
# NOTE: User is expected to have correct torch-cuda version pre-installed if using --current-env
83-
# to avoid possible CUDA version mismatch
8480
pip install -e .[all,dev-test]
8581
commands =
8682
# Coverage fails with "Can't combine line data with arc data" error so not using "--cov"

0 commit comments

Comments (0)