Skip to content

Commit 6de191e

Browse files
authored
Merge branch 'main' into main
2 parents 84765bd + a6cbcba commit 6de191e

156 files changed

Lines changed: 11965 additions & 1904 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/example_tests.yml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,11 @@ jobs:
6666
example: [llm_distill, llm_qat, llm_sparsity]
6767
include:
6868
- example: speculative_decoding
69-
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
69+
docker_image: "26.01"
7070
uses: ./.github/workflows/_example_tests_runner.yml
7171
secrets: inherit
7272
with:
73-
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
73+
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
7474
example: ${{ matrix.example }}
7575
pip_install_extras: "[hf,dev-test]"
7676
runner: linux-amd64-gpu-l4-latest-1
@@ -83,11 +83,11 @@ jobs:
8383
example: [llm_distill, llm_qat, llm_sparsity]
8484
include:
8585
- example: speculative_decoding
86-
docker_image: "nvcr.io/nvidia/pytorch:26.01-py3"
86+
docker_image: "26.01"
8787
uses: ./.github/workflows/_example_tests_runner.yml
8888
secrets: inherit
8989
with:
90-
docker_image: ${{ matrix.docker_image || 'nvcr.io/nvidia/pytorch:25.06-py3' }}
90+
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
9191
example: ${{ matrix.example }}
9292
pip_install_extras: "[hf,dev-test]"
9393
runner: linux-amd64-gpu-h100-latest-2
@@ -103,7 +103,7 @@ jobs:
103103
uses: ./.github/workflows/_example_tests_runner.yml
104104
secrets: inherit
105105
with:
106-
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4"
106+
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
107107
example: ${{ matrix.example }}
108108
pip_install_extras: "[hf,dev-test]"
109109
runner: linux-amd64-gpu-h100-latest-1
@@ -117,7 +117,7 @@ jobs:
117117
uses: ./.github/workflows/_example_tests_runner.yml
118118
secrets: inherit
119119
with:
120-
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc4"
120+
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3"
121121
example: ${{ matrix.example }}
122122
pip_install_extras: "[hf,dev-test]"
123123
runner: linux-amd64-gpu-h100-latest-2
@@ -133,7 +133,7 @@ jobs:
133133
uses: ./.github/workflows/_example_tests_runner.yml
134134
secrets: inherit
135135
with:
136-
docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3"
136+
docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
137137
example: ${{ matrix.example }}
138138
pip_install_extras: "[all,dev-test]"
139139
runner: linux-amd64-gpu-l4-latest-1
@@ -147,7 +147,7 @@ jobs:
147147
uses: ./.github/workflows/_example_tests_runner.yml
148148
secrets: inherit
149149
with:
150-
docker_image: "nvcr.io/nvidia/tensorrt:25.08-py3"
150+
docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
151151
example: ${{ matrix.example }}
152152
pip_install_extras: "[all,dev-test]"
153153
runner: linux-amd64-gpu-l4-latest-1

.github/workflows/gpu_tests.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,14 @@ jobs:
6363
fail-fast: false
6464
matrix:
6565
include:
66-
- example: py312-cuda12-gpu
66+
- example: cuda13-gpu
6767
timeout: 90
68-
- example: py312-cuda12-gpu-megatron
68+
- example: cuda13-gpu-megatron
6969
timeout: 120
7070
runs-on: linux-amd64-gpu-l4-latest-1
7171
timeout-minutes: ${{ matrix.timeout }}
7272
container: &gpu_container
73-
image: nvcr.io/nvidia/pytorch:25.06-py3
73+
image: nvcr.io/nvidia/pytorch:26.01-py3
7474
env:
7575
GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
7676
PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
@@ -89,9 +89,9 @@ jobs:
8989
fail-fast: false
9090
matrix:
9191
include:
92-
- example: py312-cuda12-gpu
92+
- example: cuda13-gpu
9393
timeout: 90
94-
- example: py312-cuda12-gpu-megatron
94+
- example: cuda13-gpu-megatron
9595
timeout: 120
9696
runs-on: linux-amd64-gpu-h100-latest-2
9797
timeout-minutes: ${{ matrix.timeout }}

.github/workflows/unit_tests.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
- uses: actions/checkout@v6
3838
- uses: ./.github/actions/ubuntu-setup
3939
- name: Run unit tests
40-
run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit
40+
run: pip install tox && COV_ARGS="--cov" tox -e py312-torch210-tf_latest-unit
4141
- name: Upload coverage reports to Codecov
4242
uses: codecov/codecov-action@v5
4343
with:
@@ -55,6 +55,7 @@ jobs:
5555
with:
5656
python-version: "3.12"
5757
- name: Run unit tests (without coverage)
58+
# Some issues with torch 2.10 on Windows, so using 2.9 for now
5859
run: pip install tox && tox -e py312-torch29-tf_latest-unit
5960
multi-py:
6061
if: github.event_name == 'pull_request'
@@ -70,15 +71,15 @@ jobs:
7071
with:
7172
python-version: "3.${{ matrix.py }}"
7273
- name: Run unit tests
73-
run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit
74+
run: pip install tox && tox -e py3${{ matrix.py }}-torch210-tf_latest-unit
7475
multi-torch:
7576
if: github.event_name == 'pull_request'
7677
needs: [linux]
7778
runs-on: ubuntu-latest
7879
timeout-minutes: 30
7980
strategy:
8081
matrix:
81-
torch: [26, 27, 28]
82+
torch: [26, 27, 28, 29]
8283
steps:
8384
- uses: actions/checkout@v6
8485
- uses: ./.github/actions/ubuntu-setup
@@ -96,7 +97,7 @@ jobs:
9697
- uses: actions/checkout@v6
9798
- uses: ./.github/actions/ubuntu-setup
9899
- name: Run unit tests
99-
run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit
100+
run: pip install tox && tox -e py312-torch210-tf_${{ matrix.tf }}-unit
100101
partial-install:
101102
if: github.event_name == 'pull_request'
102103
needs: [linux]

.pre-commit-config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ repos:
2424
hooks:
2525
- id: ruff-check
2626
args: [--fix, --exit-non-zero-on-fix]
27+
exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$
2728
- id: ruff-format
29+
exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$
2830

2931
- repo: https://github.com/pre-commit/mirrors-mypy
3032
rev: v1.17.1
@@ -93,6 +95,7 @@ repos:
9395
examples/llm_eval/modeling.py|
9496
examples/llm_qat/main.py|
9597
examples/llm_sparsity/weight_sparsity/finetune.py|
98+
examples/specdec_bench/specdec_bench/models/specbench_medusa.py|
9699
examples/speculative_decoding/main.py|
97100
examples/speculative_decoding/medusa_utils.py|
98101
examples/speculative_decoding/server_generate.py|

CHANGELOG.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,22 @@
11
NVIDIA Model Optimizer Changelog (Linux)
22
========================================
33

4+
0.43 (2026-03-xx)
5+
^^^^^^^^^^^^^^^^^
6+
7+
**Bug Fixes**
8+
9+
- ONNX Runtime dependency upgraded to 1.24 to solve missing graph outputs when using the TensorRT Execution Provider.
10+
11+
**New Features**
12+
13+
- Users no longer need to manually register MoE modules to ensure expert calibration coverage in the PTQ workflow.
14+
- ``hf_ptq.py`` now saves the quantization summary and MoE expert token count table to the export directory.
15+
- Add ``--moe_calib_experts_ratio`` flag in ``hf_ptq.py`` to specify the ratio of experts to calibrate during the forward pass, improving expert coverage during calibration. Defaults to all experts.
16+
- Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
17+
- Add support for rotating the input before quantization for RHT.
18+
- Add support for advanced weight scale search for NVFP4 quantization and its export path.
19+
420
0.42 (2026-02-xx)
521
^^^^^^^^^^^^^^^^^
622

@@ -21,6 +37,7 @@ NVIDIA Model Optimizer Changelog (Linux)
2137
- Add LTX-2 and Wan2.2 (T2V) support in the diffusers quantization workflow.
2238
- Add PTQ support for GLM-4.7, including loading MTP layer weights from a separate ``mtp.safetensors`` file and export as-is.
2339
- Add support for image-text data calibration in PTQ for Nemotron VL models.
40+
- Add support for advanced weight scale search for NVFP4 quantization and its export path.
2441
- Add PTQ support for Nemotron Parse.
2542
- Add distillation support for LTX-2. See `examples/diffusers/distillation/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/diffusers/distillation>`_ for more details.
2643
- Add Megatron Core export/import mapping for Qwen3-VL (``Qwen3VLForConditionalGeneration``) vision-language models. The mapping handles the ``model.language_model.`` weight prefix used by Qwen3-VL.

docs/source/getting_started/_installation_for_Linux.rst

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
1414
+-------------------------+-----------------------------+
1515
| Python | >=3.10,<3.13 |
1616
+-------------------------+-----------------------------+
17-
| CUDA | >=12.0 |
17+
| CUDA | 12.x, 13.x |
1818
+-------------------------+-----------------------------+
1919
| PyTorch | >=2.6 |
2020
+-------------------------+-----------------------------+
21-
| TensorRT-LLM (Optional) | 1.2.0rc4 |
21+
| TensorRT-LLM (Optional) | >=1.0 |
2222
+-------------------------+-----------------------------+
23-
| ONNX Runtime (Optional) | 1.22 |
23+
| ONNX Runtime (Optional) | 1.24 |
2424
+-------------------------+-----------------------------+
2525
| TensorRT (Optional) | >=10.0 |
2626
+-------------------------+-----------------------------+
@@ -126,6 +126,10 @@ Additionally, we support installing dependencies for following 3rd-party package
126126
* - Huggingface (``transformers``, ``diffusers``, etc.)
127127
- ``[hf]``
128128

129+
**CUDA specific dependencies**
130+
131+
* By default, ``cupy-cuda12x`` is installed for INT4 ONNX quantization. If you have CUDA 13, you need to run ``pip uninstall -y cupy-cuda12x`` and ``pip install cupy-cuda13x`` after installing ``nvidia-modelopt[onnx]``.
132+
129133
**Accelerated Quantization with Triton Kernels**
130134

131135
ModelOpt includes optimized quantization kernels implemented with Triton language that accelerate quantization

examples/deepseek/ptq.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from modelopt.torch.export.model_config import KV_CACHE_FP8
5757
from modelopt.torch.export.quant_utils import get_quant_config
5858
from modelopt.torch.quantization.nn import TensorQuantizer
59+
from modelopt.torch.quantization.triton import weight_dequant
5960
from modelopt.torch.quantization.utils import (
6061
is_quantized_column_parallel_linear,
6162
is_quantized_parallel_linear,
@@ -77,7 +78,6 @@
7778
)
7879

7980
import model as deekseep_model # noqa: E402
80-
from ds_kernel import weight_dequant # noqa: E402
8181
from kernel import act_quant, fp8_gemm # noqa: E402
8282

8383

@@ -99,7 +99,7 @@ def linear(
9999
weight = weight_quantizer(weight)
100100
return F.linear(x, weight, bias)
101101
elif gemm_impl == "bf16":
102-
weight = weight_dequant(weight, weight.scale)
102+
weight = weight_dequant(weight, weight.scale, dtype=torch.bfloat16)
103103
if act_quantizer is not None:
104104
x = act_quantizer(x)
105105
if weight_quantizer is not None:

examples/deepseek/quantize_to_nvfp4.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,11 @@
4444
from typing import Any
4545

4646
import torch
47-
from ds_kernel import weight_dequant
4847
from safetensors.torch import load_file, save_file
4948
from tqdm import tqdm
5049

5150
from modelopt.torch.quantization.qtensor import NVFP4QTensor
51+
from modelopt.torch.quantization.triton import weight_dequant
5252

5353

5454
def _remap_key(key_dict: dict[str, Any]):

examples/diffusers/quantization/calibration.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ def _run_wan_video_calibration(
121121

122122
def _run_ltx2_calibration(self, prompt_batch: list[str], extra_args: dict[str, Any]) -> None:
123123
from ltx_core.model.video_vae import TilingConfig
124+
from ltx_pipelines.utils.constants import (
125+
DEFAULT_AUDIO_GUIDER_PARAMS,
126+
DEFAULT_VIDEO_GUIDER_PARAMS,
127+
)
124128

125129
prompt = prompt_batch[0]
126130
extra_params = self.pipeline_manager.config.extra_params
@@ -134,9 +138,8 @@ def _run_ltx2_calibration(self, prompt_batch: list[str], extra_args: dict[str, A
134138
"num_frames": extra_params.get("num_frames", extra_args.get("num_frames", 121)),
135139
"frame_rate": extra_params.get("frame_rate", extra_args.get("frame_rate", 24.0)),
136140
"num_inference_steps": self.config.n_steps,
137-
"cfg_guidance_scale": extra_params.get(
138-
"cfg_guidance_scale", extra_args.get("cfg_guidance_scale", 4.0)
139-
),
141+
"video_guider_params": DEFAULT_VIDEO_GUIDER_PARAMS,
142+
"audio_guider_params": DEFAULT_AUDIO_GUIDER_PARAMS,
140143
"images": extra_params.get("images", []),
141144
"tiling_config": extra_params.get("tiling_config", TilingConfig.default()),
142145
}

examples/diffusers/quantization/models_utils.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,11 +163,10 @@ def get_model_filter_func(model_type: ModelType) -> Callable[[str], bool]:
163163
"backbone": "transformer",
164164
"dataset": _SD_PROMPTS_DATASET,
165165
"inference_extra_args": {
166-
"height": 1024,
167-
"width": 1536,
166+
"height": 768,
167+
"width": 1280,
168168
"num_frames": 121,
169169
"frame_rate": 24.0,
170-
"cfg_guidance_scale": 4.0,
171170
"negative_prompt": "worst quality, inconsistent motion, blurry, jittery, distorted",
172171
},
173172
},

0 commit comments

Comments
 (0)