NVIDIA
diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml
Lines changed: 8 additions & 8 deletions
diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
Lines changed: 5 additions & 5 deletions
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
Lines changed: 5 additions & 4 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 3 additions & 0 deletions
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
Lines changed: 17 additions & 0 deletions
diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst
Lines changed: 7 additions & 3 deletions
diff --git a/examples/deepseek/ptq.py b/examples/deepseek/ptq.py
Lines changed: 2 additions & 2 deletions
diff --git a/examples/deepseek/quantize_to_nvfp4.py b/examples/deepseek/quantize_to_nvfp4.py
Lines changed: 1 addition & 1 deletion
diff --git a/examples/diffusers/quantization/calibration.py b/examples/diffusers/quantization/calibration.py
Lines changed: 6 additions & 3 deletions
diff --git a/examples/diffusers/quantization/models_utils.py b/examples/diffusers/quantization/models_utils.py
Lines changed: 2 additions & 3 deletions
@@ -66,11 +66,11 @@ jobs:
         example: [llm_distill, llm_qat, llm_sparsity]
         include:
           - example: speculative_decoding
-            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
+            docker_image: "26.01"
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
+      docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-l4-latest-1
@@ -83,11 +83,11 @@ jobs:
         example: [llm_distill, llm_qat, llm_sparsity]
         include:
           - example: speculative_decoding
-            docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
+            docker_image: "26.01"
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
+      docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-h100-latest-2
@@ -103,7 +103,7 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4"
+      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-h100-latest-1
@@ -117,7 +117,7 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4"
+      docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
       example: ${{ matrix.example }}
       pip_install_extras: "[hf,dev-test]"
       runner: linux-amd64-gpu-h100-latest-2
@@ -133,7 +133,7 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3"
+      docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
       example: ${{ matrix.example }}
       pip_install_extras: "[all,dev-test]"
       runner: linux-amd64-gpu-l4-latest-1
@@ -147,7 +147,7 @@ jobs:
     uses: ./.github/workflows/_example_tests_runner.yml
     secrets: inherit
     with:
-      docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3"
+      docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
       example: ${{ matrix.example }}
       pip_install_extras: "[all,dev-test]"
       runner: linux-amd64-gpu-l4-latest-1
 
@@ -63,14 +63,14 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - example: py312-cuda12-gpu
+          - example: cuda13-gpu
             timeout: 90
-          - example: py312-cuda12-gpu-megatron
+          - example: cuda13-gpu-megatron
             timeout: 120
     runs-on: linux-amd64-gpu-l4-latest-1
     timeout-minutes: ${{ matrix.timeout }}
     container: &gpu_container
-      image: nvcr.io/nvidia/pytorch:25.06-py3
+      image: nvcr.io/nvidia/pytorch:26.01-py3
       env:
         GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
@@ -89,9 +89,9 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - example: py312-cuda12-gpu
+          - example: cuda13-gpu
             timeout: 90
-          - example: py312-cuda12-gpu-megatron
+          - example: cuda13-gpu-megatron
             timeout: 120
     runs-on: linux-amd64-gpu-h100-latest-2
     timeout-minutes: ${{ matrix.timeout }}
 
@@ -37,7 +37,7 @@ jobs:
       - uses: actions/checkout@v6
       - uses: ./.github/actions/ubuntu-setup
       - name: Run unit tests
-        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit
+        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch210-tf_latest-unit
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v5
         with:
@@ -55,6 +55,7 @@ jobs:
         with:
           python-version: "3.12"
       - name: Run unit tests (without coverage)
+        # torch 2.10 currently has issues on Windows, so keep running against torch 2.9 for now
         run: pip install tox && tox -e py312-torch29-tf_latest-unit
   multi-py:
     if: github.event_name == 'pull_request'
@@ -70,15 +71,15 @@ jobs:
         with:
           python-version: "3.${{ matrix.py }}"
       - name: Run unit tests
-        run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit
+        run: pip install tox && tox -e py3${{ matrix.py }}-torch210-tf_latest-unit
   multi-torch:
     if: github.event_name == 'pull_request'
     needs: [linux]
     runs-on: ubuntu-latest
     timeout-minutes: 30
     strategy:
       matrix:
-        torch: [26, 27, 28]
+        torch: [26, 27, 28, 29]
     steps:
       - uses: actions/checkout@v6
       - uses: ./.github/actions/ubuntu-setup
@@ -96,7 +97,7 @@ jobs:
       - uses: actions/checkout@v6
       - uses: ./.github/actions/ubuntu-setup
       - name: Run unit tests
-        run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit
+        run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit
   partial-install:
     if: github.event_name == 'pull_request'
     needs: [linux]
 
@@ -24,7 +24,9 @@ repos:
     hooks:
       - id: ruff-check
         args: [--fix, --exit-non-zero-on-fix]
+        exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$
       - id: ruff-format
+        exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$
 
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.17.1
@@ -93,6 +95,7 @@ repos:
               examples/llm_eval/modeling.py|
               examples/llm_qat/main.py|
               examples/llm_sparsity/weight_sparsity/finetune.py|
+              examples/specdec_bench/specdec_bench/models/specbench_medusa.py|
               examples/speculative_decoding/main.py|
               examples/speculative_decoding/medusa_utils.py|
               examples/speculative_decoding/server_generate.py|
 
@@ -1,6 +1,22 @@
 NVIDIA Model Optimizer Changelog (Linux)
 ========================================
 
+0.43 (2026-03-xx)
+^^^^^^^^^^^^^^^^^
+
+**Bug Fixes**
+
+- ONNX Runtime dependency upgraded to 1.24 to solve missing graph outputs when using the TensorRT Execution Provider.
+
+**New Features**
+
+- Users no longer need to manually register MoE modules to ensure expert calibration coverage in the PTQ workflow.
+- ``hf_ptq.py`` now saves the quantization summary and MoE expert token count table to the export directory.
+- Add ``--moe_calib_experts_ratio`` flag in ``hf_ptq.py`` to specify the ratio of experts to calibrate during the forward pass, improving expert coverage during calibration. Defaults to all experts.
+- Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
+- Add support for rotating the input before quantization for RHT.
+- Add support for advanced weight scale search for NVFP4 quantization and its export path.
+
 0.42 (2026-02-xx)
 ^^^^^^^^^^^^^^^^^
 
@@ -21,6 +37,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - Add LTX-2 and Wan2.2 (T2V) support in the diffusers quantization workflow.
 - Add PTQ support for GLM-4.7, including loading MTP layer weights from a separate ``mtp.safetensors`` file and export as-is.
 - Add support for image-text data calibration in PTQ for Nemotron VL models.
+- Add support for advanced weight scale search for NVFP4 quantization and its export path.
 - Add PTQ support for Nemotron Parse.
 - Add distillation support for LTX-2. See `examples/diffusers/distillation/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/diffusers/distillation>`_ for more details.
 - Add Megatron Core export/import mapping for Qwen3-VL (``Qwen3VLForConditionalGeneration``) vision-language models. The mapping handles the ``model.language_model.`` weight prefix used by Qwen3-VL.
 
@@ -14,13 +14,13 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
 +-------------------------+-----------------------------+
 | Python                  |  >=3.10,<3.13               |
 +-------------------------+-----------------------------+
-| CUDA                    |  >=12.0                     |
+| CUDA                    |  12.x, 13.x                 |
 +-------------------------+-----------------------------+
 | PyTorch                 |  >=2.6                      |
 +-------------------------+-----------------------------+
-| TensorRT-LLM (Optional) |  1.2.0rc4                   |
+| TensorRT-LLM (Optional) |  >=1.0                      |
 +-------------------------+-----------------------------+
-| ONNX Runtime (Optional) |  1.22                       |
+| ONNX Runtime (Optional) |  1.24                       |
 +-------------------------+-----------------------------+
 | TensorRT (Optional)     |  >=10.0                     |
 +-------------------------+-----------------------------+
@@ -126,6 +126,10 @@ Additionally, we support installing dependencies for following 3rd-party package
     *   - Huggingface (``transformers``, ``diffusers``, etc.)
         - ``[hf]``
 
+**CUDA specific dependencies**
+
+* By default, ``cupy-cuda12x`` is installed for INT4 ONNX quantization. If you have CUDA 13, you need to run ``pip uninstall -y cupy-cuda12x`` and ``pip install cupy-cuda13x`` after installing ``nvidia-modelopt[onnx]``.
+
 **Accelerated Quantization with Triton Kernels**
 
 ModelOpt includes optimized quantization kernels implemented with Triton language that accelerate quantization
 
@@ -56,6 +56,7 @@
 from modelopt.torch.export.model_config import KV_CACHE_FP8
 from modelopt.torch.export.quant_utils import get_quant_config
 from modelopt.torch.quantization.nn import TensorQuantizer
+from modelopt.torch.quantization.triton import weight_dequant
 from modelopt.torch.quantization.utils import (
     is_quantized_column_parallel_linear,
     is_quantized_parallel_linear,
@@ -77,7 +78,6 @@
     )
 
 import model as deekseep_model  # noqa: E402
-from ds_kernel import weight_dequant  # noqa: E402
 from kernel import act_quant, fp8_gemm  # noqa: E402
 
 
@@ -99,7 +99,7 @@ def linear(
                 weight = weight_quantizer(weight)
             return F.linear(x, weight, bias)
         elif gemm_impl == "bf16":
-            weight = weight_dequant(weight, weight.scale)
+            weight = weight_dequant(weight, weight.scale, dtype=torch.bfloat16)
             if act_quantizer is not None:
                 x = act_quantizer(x)
             if weight_quantizer is not None:
 
@@ -44,11 +44,11 @@
 from typing import Any
 
 import torch
-from ds_kernel import weight_dequant
 from safetensors.torch import load_file, save_file
 from tqdm import tqdm
 
 from modelopt.torch.quantization.qtensor import NVFP4QTensor
+from modelopt.torch.quantization.triton import weight_dequant
 
 
 def _remap_key(key_dict: dict[str, Any]):
 
@@ -121,6 +121,10 @@ def _run_wan_video_calibration(
 
     def _run_ltx2_calibration(self, prompt_batch: list[str], extra_args: dict[str, Any]) -> None:
         from ltx_core.model.video_vae import TilingConfig
+        from ltx_pipelines.utils.constants import (
+            DEFAULT_AUDIO_GUIDER_PARAMS,
+            DEFAULT_VIDEO_GUIDER_PARAMS,
+        )
 
         prompt = prompt_batch[0]
         extra_params = self.pipeline_manager.config.extra_params
@@ -134,9 +138,8 @@ def _run_ltx2_calibration(self, prompt_batch: list[str], extra_args: dict[str, A
             "num_frames": extra_params.get("num_frames", extra_args.get("num_frames", 121)),
             "frame_rate": extra_params.get("frame_rate", extra_args.get("frame_rate", 24.0)),
             "num_inference_steps": self.config.n_steps,
-            "cfg_guidance_scale": extra_params.get(
-                "cfg_guidance_scale", extra_args.get("cfg_guidance_scale", 4.0)
-            ),
+            "video_guider_params": DEFAULT_VIDEO_GUIDER_PARAMS,
+            "audio_guider_params": DEFAULT_AUDIO_GUIDER_PARAMS,
             "images": extra_params.get("images", []),
             "tiling_config": extra_params.get("tiling_config", TilingConfig.default()),
         }
 
@@ -163,11 +163,10 @@ def get_model_filter_func(model_type: ModelType) -> Callable[[str], bool]:
         "backbone": "transformer",
         "dataset": _SD_PROMPTS_DATASET,
         "inference_extra_args": {
-            "height": 1024,
-            "width": 1536,
+            "height": 768,
+            "width": 1280,
             "num_frames": 121,
             "frame_rate": 24.0,
-            "cfg_guidance_scale": 4.0,
             "negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
         },
     },