## What does this PR do?
- Upgrade CI/CD test containers to the latest versions
- Enable PyTorch 2.10 testing in CI/CD
## Testing
The CI/CD pipelines in this PR should pass.
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
* **New Features**
  * Added support for mixed-precision gradient handling with FSDP2.
* **Documentation**
  * Updated Linux installation guide with CUDA 13.x support and cupy dependency guidance.
* **Chores**
  * Updated CI/CD workflows and test infrastructure to support PyTorch 2.10 and CUDA 13.
  * Updated container image versions and test environment configurations.
  * Updated TensorRT-LLM version requirements.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---------
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Signed-off-by: Daniel Korzekwa <dkorzekwa@nvidia.com>
* By default, ``cupy-cuda12x`` is installed for INT4 ONNX quantization. If you have CUDA 13, you need to run ``pip uninstall -y cupy-cuda12x`` and ``pip install cupy-cuda13x`` after installing ``nvidia-modelopt[onnx]``.
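The CUDA 13 switch described above can be sketched as a shell sequence (package names are taken from the doc line; only needed on CUDA 13 systems):

```shell
# Install ModelOpt with ONNX extras, then swap the default CUDA 12
# cupy build for the CUDA 13 one.
pip install "nvidia-modelopt[onnx]"
pip uninstall -y cupy-cuda12x
pip install cupy-cuda13x
```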
**Accelerated Quantization with Triton Kernels**

ModelOpt includes optimized quantization kernels implemented in the Triton language that accelerate quantization
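As a rough illustration of the math such quantization kernels accelerate (this is not ModelOpt's actual API; the function name and the symmetric 127-level INT8 scheme are assumptions for the example), here is a minimal NumPy sketch of per-tensor fake quantization:

```python
import numpy as np

def fake_quantize_int8(x: np.ndarray) -> np.ndarray:
    """Symmetric per-tensor INT8 fake quantization (quantize then dequantize).

    Maps values into 255 integer levels in [-127, 127], then scales back,
    which is the round trip that optimized kernels perform in fused form.
    """
    amax = float(np.abs(x).max())
    scale = amax / 127.0 if amax > 0 else 1.0
    q = np.clip(np.round(x / scale), -127, 127)  # integer grid
    return (q * scale).astype(x.dtype)           # dequantized values

# Example round trip: output matches input up to half a quantization step.
x = np.random.randn(4, 4).astype(np.float32)
y = fake_quantize_int8(x)
```

A Triton (or CUDA) kernel would fuse the scale, round, clip, and rescale steps into one pass over the tensor instead of materializing intermediates as NumPy does here.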