Skip to content

Commit dea2eeb

Browse files
Fix failures
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent f0bd99c commit dea2eeb

File tree

16 files changed

+93
-66
lines changed

16 files changed

+93
-66
lines changed

.github/workflows/example_tests.yml

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -66,11 +66,11 @@ jobs:
6666
example: [llm_distill, llm_qat, llm_sparsity]
6767
include:
6868
- example: speculative_decoding
69-
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
69+
docker_image: "26.01"
7070
uses: ./.github/workflows/_example_tests_runner.yml
7171
secrets: inherit
7272
with:
73-
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }}
73+
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
7474
example: ${{ matrix.example }}
7575
pip_install_extras: "[hf,dev-test]"
7676
runner: linux-amd64-gpu-l4-latest-1
@@ -83,11 +83,11 @@ jobs:
8383
example: [llm_distill, llm_qat, llm_sparsity]
8484
include:
8585
- example: speculative_decoding
86-
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
86+
docker_image: "26.01"
8787
uses: ./.github/workflows/_example_tests_runner.yml
8888
secrets: inherit
8989
with:
90-
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:26.01-py3' }}
90+
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
9191
example: ${{ matrix.example }}
9292
pip_install_extras: "[hf,dev-test]"
9393
runner: linux-amd64-gpu-h100-latest-2

.github/workflows/gpu_tests.yml

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -63,9 +63,9 @@ jobs:
6363
fail-fast: false
6464
matrix:
6565
include:
66-
- example: py312-cuda13-gpu
66+
- example: cuda13-gpu
6767
timeout: 90
68-
- example: py312-cuda13-gpu-megatron
68+
- example: cuda13-gpu-megatron
6969
timeout: 120
7070
runs-on: linux-amd64-gpu-l4-latest-1
7171
timeout-minutes: ${{ matrix.timeout }}
@@ -89,9 +89,9 @@ jobs:
8989
fail-fast: false
9090
matrix:
9191
include:
92-
- example: py312-cuda12-gpu
92+
- example: cuda13-gpu
9393
timeout: 90
94-
- example: py312-cuda12-gpu-megatron
94+
- example: cuda13-gpu-megatron
9595
timeout: 120
9696
runs-on: linux-amd64-gpu-h100-latest-2
9797
timeout-minutes: ${{ matrix.timeout }}

.github/workflows/unit_tests.yml

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -55,7 +55,8 @@ jobs:
5555
with:
5656
python-version: "3.12"
5757
- name: Run unit tests (without coverage)
58-
run: pip install tox && tox -e py312-torch210-tf_latest-unit
58+
# Some issues with torch 2.10 on Windows, so using 2.9 for now
59+
run: pip install tox && tox -e py312-torch29-tf_latest-unit
5960
multi-py:
6061
if: github.event_name == 'pull_request'
6162
needs: [linux]

modelopt/torch/quantization/plugins/transformers_trainer.py

Lines changed: 48 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -15,6 +15,7 @@
1515

1616
"""ModelOpt plugin for transformers Trainer."""
1717

18+
import contextlib
1819
import gc
1920
import json
2021
import os
@@ -100,6 +101,52 @@ class QuantizationArgumentsWithConfig(QuantizationArguments):
100101
)
101102

102103

104+
def _patch_fsdp2_post_backward():
105+
"""Patch FSDP2 ``post_backward`` to handle mixed-precision gradient dtypes.
106+
107+
FSDP2 with bf16 mixed precision upcasts bf16 parameters to fp32 for optimizer
108+
precision, while gradients are reduced in bf16. In PyTorch >= 2.6, assigning a
109+
bf16 gradient to a fp32 parameter raises a ``RuntimeError`` due to the
110+
``grad_dtype`` check, and the fused Adam optimizer also rejects mixed dtypes.
111+
112+
This patch wraps ``FSDPParamGroup.post_backward`` to:
113+
1. Set ``grad_dtype=None`` on sharded params before reduction (allowing bf16 assignment).
114+
2. Cast gradients to match parameter dtype after reduction (so the optimizer sees matching dtypes).
115+
116+
.. note::
117+
This is a workaround. The proper fix should come from PyTorch's FSDP2
118+
``foreach_reduce`` (which should cast gradients to match the parameter dtype)
119+
or from accelerate (which should set ``grad_dtype`` when it upcasts params).
120+
Remove this once the upstream fix is available.
121+
"""
122+
try:
123+
from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup
124+
except ImportError:
125+
return
126+
127+
if hasattr(FSDPParamGroup, "_modelopt_original_post_backward"):
128+
return # Already patched
129+
130+
FSDPParamGroup._modelopt_original_post_backward = FSDPParamGroup.post_backward
131+
132+
@torch.no_grad()
133+
def _patched_post_backward(self):
134+
# Allow bf16 gradients to be assigned to fp32 parameters
135+
for fsdp_param in self.fsdp_params:
136+
with contextlib.suppress(AttributeError):
137+
fsdp_param.sharded_param.grad_dtype = None
138+
139+
self._modelopt_original_post_backward()
140+
141+
# Cast gradients to parameter dtype so the optimizer sees matching dtypes
142+
for fsdp_param in self.fsdp_params:
143+
sp = fsdp_param.sharded_param
144+
if sp.grad is not None and sp.grad.dtype != sp.dtype:
145+
sp.grad = sp.grad.to(sp.dtype)
146+
147+
FSDPParamGroup.post_backward = _patched_post_backward
148+
149+
103150
def check_awq_smoothquant(quant_cfg):
104151
# TODO: Remove this once deepspeed for AWQ and SmoothQuant is added
105152
"""Get the quantization type from the configuration."""
@@ -337,6 +384,7 @@ def _patch_accelerate_for_fsdp2_fix(self):
337384
is causing issues with quantized models since quantization modules adds buffers which are not sharded.
338385
This patch hides the buffers added by quantization modules from the original accelerate prepare.
339386
"""
387+
_patch_fsdp2_post_backward()
340388

341389
def _modelopt_prepare(self, *args, **kwargs):
342390
if not self.is_fsdp2:

tests/_test_utils/import_helper.py

Lines changed: 22 additions & 11 deletions
Original file line number · Diff line number · Diff line change
@@ -12,8 +12,9 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15-
15+
import ctypes
1616
import importlib.metadata
17+
import os
1718
import shutil
1819

1920
import pytest
@@ -28,6 +29,23 @@ def skip_if_no_tensorrt():
2829
except (AssertionError, ImportError) as e:
2930
pytest.skip(f"{e}", allow_module_level=True)
3031

32+
# Also verify that ORT's TensorRT EP can actually load its native library.
33+
# The tensorrt Python package may be installed, but ORT's provider shared library
34+
# (libonnxruntime_providers_tensorrt.so) could fail to load due to CUDA version
35+
# mismatches (e.g., ORT built for CUDA 12 running on a CUDA 13 system).
36+
try:
37+
import onnxruntime
38+
39+
ort_capi_dir = os.path.join(os.path.dirname(onnxruntime.__file__), "capi")
40+
trt_provider_lib = os.path.join(ort_capi_dir, "libonnxruntime_providers_tensorrt.so")
41+
if os.path.isfile(trt_provider_lib):
42+
ctypes.CDLL(trt_provider_lib)
43+
except OSError as e:
44+
pytest.skip(
45+
f"ORT TensorRT EP native library cannot be loaded: {e}",
46+
allow_module_level=True,
47+
)
48+
3149

3250
def skip_if_no_trtexec():
3351
if not shutil.which("trtexec"):
@@ -43,19 +61,12 @@ def skip_if_no_libcudnn():
4361
pytest.skip(f"{e}!", allow_module_level=True)
4462

4563

46-
def skip_if_no_megatron(apex_or_te_required: bool = False, mamba_required: bool = False):
64+
def skip_if_no_megatron(*, te_required: bool = True, mamba_required: bool = False):
4765
try:
4866
import megatron # noqa: F401
4967
except ImportError:
5068
pytest.skip("megatron not available", allow_module_level=True)
5169

52-
try:
53-
import apex # noqa: F401
54-
55-
has_apex = True
56-
except ImportError:
57-
has_apex = False
58-
5970
try:
6071
import transformer_engine # noqa: F401
6172

@@ -70,8 +81,8 @@ def skip_if_no_megatron(apex_or_te_required: bool = False, mamba_required: bool
7081
except ImportError:
7182
has_mamba = False
7283

73-
if apex_or_te_required and not has_apex and not has_te:
74-
pytest.skip("Apex or TE required for Megatron test", allow_module_level=True)
84+
if te_required and not has_te:
85+
pytest.skip("TE required for Megatron test", allow_module_level=True)
7586

7687
if mamba_required and not has_mamba:
7788
pytest.skip("Mamba required for Megatron test", allow_module_level=True)

tests/gpu_megatron/torch/distill/plugins/test_distill_megatron.py

Lines changed: 0 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -16,10 +16,6 @@
1616
from functools import partial
1717

1818
import torch
19-
from _test_utils.import_helper import skip_if_no_megatron
20-
21-
skip_if_no_megatron(apex_or_te_required=True)
22-
2319
from _test_utils.torch.distributed.utils import spawn_multiprocess_job
2420
from _test_utils.torch.megatron.models import get_mcore_gpt_model
2521
from _test_utils.torch.megatron.utils import run_mcore_inference_with_dummy_input

tests/gpu_megatron/torch/export/test_unified_export_megatron.py

Lines changed: 0 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -21,14 +21,11 @@
2121
import pytest
2222
import torch
2323
import transformers
24-
from _test_utils.import_helper import skip_if_no_megatron
2524
from _test_utils.torch.distributed.utils import spawn_multiprocess_job
2625
from _test_utils.torch.megatron.models import get_mcore_gpt_model
2726
from _test_utils.torch.megatron.utils import get_forward
2827
from _test_utils.torch.transformers_models import create_tiny_llama_dir
2928

30-
skip_if_no_megatron(apex_or_te_required=True)
31-
3229
import modelopt.torch.quantization as mtq
3330
import modelopt.torch.speculative as mtsp
3431
from modelopt.torch.export import KV_CACHE_FP8, export_mcore_gpt_to_hf, import_mcore_gpt_from_hf

tests/gpu_megatron/torch/export/test_vllm_fakequant_megatron_export.py

Lines changed: 0 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -18,15 +18,12 @@
1818

1919
import pytest
2020
import torch
21-
from _test_utils.import_helper import skip_if_no_megatron
2221
from _test_utils.torch.distributed.utils import spawn_multiprocess_job
2322
from _test_utils.torch.megatron.models import get_mcore_gpt_model
2423

2524
import modelopt.torch.quantization as mtq
2625
from modelopt.torch.export import export_mcore_gpt_to_hf_vllm_fq
2726

28-
skip_if_no_megatron(apex_or_te_required=True)
29-
3027

3128
def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size):
3229
"""Test megatron-core model export for vLLM with fake quantization."""

tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py

Lines changed: 0 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -17,10 +17,6 @@
1717

1818
import pytest
1919
import torch
20-
from _test_utils.import_helper import skip_if_no_megatron
21-
22-
skip_if_no_megatron(apex_or_te_required=True)
23-
2420
from _test_utils.torch.distributed.utils import spawn_multiprocess_job
2521
from _test_utils.torch.megatron.models import get_mcore_gpt_model
2622
from _test_utils.torch.megatron.utils import run_mcore_inference

tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -17,7 +17,7 @@
1717
import torch
1818
from _test_utils.import_helper import skip_if_no_megatron
1919

20-
skip_if_no_megatron(apex_or_te_required=True, mamba_required=True)
20+
skip_if_no_megatron(mamba_required=True)
2121

2222
from _test_utils.torch.distributed.utils import spawn_multiprocess_job
2323
from _test_utils.torch.megatron.models import get_mcore_mamba_hybrid_model

0 commit comments

Comments (0)