Skip to content

Commit 41e1788

Browse files
committed
Move model-specific PTQ quant_cfg overrides from llm_ptq to YAML
Replace the hardcoded model-type branches in examples/llm_ptq (gemma/mpt AWQ alpha tuning, gemma SmoothQuant alpha, phi4mm exclusions, Nemotron VL exclusions) with opt-in declarative recipes under modelopt_recipes/huggingface/<model_type>/ptq/. Users select them with --recipe huggingface/<model_type>/ptq/<recipe>.

- Per-model recipes ship with FP8 KV-cache cast (kv_fp8_cast) and the algorithm/numerics each model needs.
- phi4mm and nemotron_vl each include a merged disabled_quantizers.yaml unit so recipes import a single disabled-quantizer slot instead of layering default + model-specific exclusions.
- Each ptq/ folder has a README describing what is model-specific.
- Drop now-unused qformat/model_type parameters from build_quant_cfg and the Nemotron VL append block in mono_quantize.

Signed-off-by: Shengliang Xu <shengliangx@nvidia.com>
1 parent 5ac8368 commit 41e1788

14 files changed

Lines changed: 385 additions & 37 deletions

File tree

examples/llm_ptq/example_utils.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -202,10 +202,8 @@ def calibrate_loop(_model):
202202

203203

204204
def build_quant_cfg(
205-
qformat,
206205
quant_cfg,
207206
awq_block_size,
208-
model_type,
209207
moe_calib_experts_ratio: float | None = None,
210208
) -> dict[str, Any]:
211209
quant_cfg = copy.deepcopy(quant_cfg)
@@ -222,10 +220,6 @@ def build_quant_cfg(
222220
if awq_block_size:
223221
weight_quantizer["block_sizes"][-1] = awq_block_size
224222

225-
# Coarser optimal scale search seems to resolve the overflow in TRT-LLM for some models
226-
if qformat == "w4a8_awq" and model_type in ["gemma", "mpt"]:
227-
quant_cfg["algorithm"] = {"method": "awq_lite", "alpha_step": 1}
228-
229223
if moe_calib_experts_ratio:
230224
assert 0 < moe_calib_experts_ratio <= 1, "moe_calib_experts_ratio must be between 0 and 1"
231225
if isinstance(quant_cfg["algorithm"], str):
@@ -240,17 +234,6 @@ def build_quant_cfg(
240234
f"Quantization algorithm: {quant_cfg['algorithm']} does not support setting moe_calib_experts_ratio"
241235
)
242236

243-
# Gemma 7B has accuracy regression using alpha 1. We set 0.5 instead.
244-
if model_type == "gemma" and "int8_sq" in qformat:
245-
quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": 0.5}
246-
247-
if model_type == "phi4mm":
248-
# Only quantize the language model
249-
quant_cfg["quant_cfg"].append({"quantizer_name": "*speech*", "enable": False})
250-
quant_cfg["quant_cfg"].append({"quantizer_name": "*audio*", "enable": False})
251-
quant_cfg["quant_cfg"].append({"quantizer_name": "*image*", "enable": False})
252-
quant_cfg["quant_cfg"].append({"quantizer_name": "*vision*", "enable": False})
253-
254237
return quant_cfg
255238

256239

examples/llm_ptq/hf_ptq.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -626,22 +626,6 @@ def mono_quantize(
626626
"Consider reducing calib_size to reduce calibration time.\n####\n"
627627
)
628628

629-
# For Nemotron VL models, disable quantization of vision components
630-
if is_nemotron_vl_model:
631-
print("Disabling quantization for vision components in Nemotron VL model")
632-
quant_cfg["quant_cfg"].append({"quantizer_name": "*vision*", "enable": False})
633-
quant_cfg["quant_cfg"].append({"quantizer_name": "*image*", "enable": False})
634-
# Also disable radio model components specifically (for Nemotron-Parse)
635-
quant_cfg["quant_cfg"].append({"quantizer_name": "*radio*", "enable": False})
636-
quant_cfg["quant_cfg"].append({"quantizer_name": "*visual*", "enable": False})
637-
quant_cfg["quant_cfg"].append(
638-
{"quantizer_name": "*encoder*", "enable": False}
639-
) # Disable encoder
640-
quant_cfg["quant_cfg"].append(
641-
{"quantizer_name": "*model_encoder*", "enable": False}
642-
) # Nemotron-Parse specific
643-
print("Quantization will only be applied to the decoder (text generation) component")
644-
645629
if not model_is_already_quantized or calibration_only:
646630
# quantize the model
647631

@@ -1110,10 +1094,8 @@ def _is_layerwise(obj):
11101094
quant_cfg = QUANT_CFG_CHOICES[args.qformat]
11111095

11121096
quant_cfg = build_quant_cfg(
1113-
args.qformat,
11141097
quant_cfg,
11151098
args.awq_block_size,
1116-
model_type,
11171099
args.moe_calib_experts_ratio,
11181100
)
11191101

examples/llm_ptq/multinode_ptq.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,10 +330,8 @@ def main(args):
330330
quant_cfg = QUANT_CFG_CHOICES[args.qformat]
331331

332332
quant_cfg = build_quant_cfg(
333-
args.qformat,
334333
quant_cfg,
335334
args.awq_block_size,
336-
model_type,
337335
)
338336

339337
enable_quant_kv_cache = args.kv_cache_qformat != "none"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Gemma PTQ recipes
2+
3+
Recipes here override the algorithm defaults that ship in the general PTQ
4+
presets because Gemma needs different settings to avoid TRT-LLM overflow and stay accurate.
5+
6+
| Recipe | What's model-specific |
7+
|--------|-----------------------|
8+
| `w4a8_awq-kv_fp8_cast.yaml` | Uses `awq_lite` with `alpha_step: 1` instead of the default AWQ search. The default search overflows in TRT-LLM kernels on Gemma; the coarser sweep avoids it without measurably hurting accuracy. Numerics: INT4 block weights + FP8 inputs + FP8 KV-cache cast (constant amax, no KV calibration). |
9+
| `int8_sq-kv_fp8_cast.yaml` | Sets SmoothQuant `alpha: 0.5` instead of the default `1.0`. Gemma 7B regresses with `alpha=1`; `0.5` recovers it. Numerics: INT8 per-channel weights + INT8 inputs + FP8 KV-cache cast. |
10+
11+
The base numerics units and the standard disabled-quantizer list are inherited
12+
from the shared `configs/`; only the algorithm fields are model-specific.
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Gemma-specific INT8 SmoothQuant PTQ recipe with FP8 KV-cache cast. Overrides
17+
# the SmoothQuant alpha from the default 1.0 to 0.5 to recover accuracy on
18+
# Gemma 7B (default alpha causes a regression).
19+
20+
imports:
21+
base_disable_all: configs/ptq/units/base_disable_all
22+
default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
23+
int8: configs/numerics/int8
24+
int8_per_channel: configs/numerics/int8_per_channel
25+
kv_fp8_cast: configs/ptq/units/kv_fp8_cast
26+
27+
metadata:
28+
recipe_type: ptq
29+
description: >-
30+
Gemma INT8 SmoothQuant recipe with FP8 KV-cache cast: alpha=0.5 (instead
31+
of the default 1.0) to avoid accuracy regression on Gemma 7B, plus FP8
32+
KV-cache using constant amax (no KV calibration).
33+
quantize:
34+
algorithm:
35+
method: smoothquant
36+
alpha: 0.5
37+
quant_cfg:
38+
- $import: base_disable_all
39+
- quantizer_name: '*weight_quantizer'
40+
cfg:
41+
$import: int8_per_channel
42+
- quantizer_name: '*input_quantizer'
43+
cfg:
44+
$import: int8
45+
- $import: kv_fp8_cast
46+
- $import: default_disabled_quantizers
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Gemma-specific W4A8 AWQ PTQ recipe with FP8 KV-cache cast. Uses a coarser
17+
# optimal-scale search (awq_lite with alpha_step=1) to avoid overflow observed
18+
# in TRT-LLM kernels when using the default AWQ search on Gemma.
19+
20+
imports:
21+
base_disable_all: configs/ptq/units/base_disable_all
22+
default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
23+
fp8: configs/numerics/fp8
24+
int4_per_block: configs/numerics/int4_per_block
25+
kv_fp8_cast: configs/ptq/units/kv_fp8_cast
26+
27+
metadata:
28+
recipe_type: ptq
29+
description: >-
30+
Gemma W4A8 AWQ recipe with FP8 KV-cache cast: INT4 block weights + FP8
31+
inputs, awq_lite with alpha_step=1 (coarser search) to avoid TRT-LLM
32+
overflow, plus FP8 KV-cache using constant amax (no KV calibration).
33+
quantize:
34+
algorithm:
35+
method: awq_lite
36+
alpha_step: 1
37+
quant_cfg:
38+
- $import: base_disable_all
39+
- quantizer_name: '*weight_quantizer'
40+
cfg:
41+
- $import: int4_per_block
42+
- $import: fp8
43+
- quantizer_name: '*input_quantizer'
44+
cfg:
45+
$import: fp8
46+
- $import: kv_fp8_cast
47+
- $import: default_disabled_quantizers
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# MPT PTQ recipes
2+
3+
| Recipe | What's model-specific |
4+
|--------|-----------------------|
5+
| `w4a8_awq-kv_fp8_cast.yaml` | Uses `awq_lite` with `alpha_step: 1` instead of the default AWQ search. The default search overflows in TRT-LLM kernels on MPT; the coarser sweep avoids it. Numerics: INT4 block weights + FP8 inputs + FP8 KV-cache cast (constant amax, no KV calibration). The same algorithm override is applied to Gemma — see `huggingface/gemma/ptq/`. |
6+
7+
The base numerics units and the standard disabled-quantizer list are inherited
8+
from the shared `configs/`; only the AWQ algorithm fields are model-specific.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# MPT-specific W4A8 AWQ PTQ recipe with FP8 KV-cache cast. Uses a coarser
17+
# optimal-scale search (awq_lite with alpha_step=1) to avoid overflow observed
18+
# in TRT-LLM kernels when using the default AWQ search on MPT.
19+
20+
imports:
21+
base_disable_all: configs/ptq/units/base_disable_all
22+
default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
23+
fp8: configs/numerics/fp8
24+
int4_per_block: configs/numerics/int4_per_block
25+
kv_fp8_cast: configs/ptq/units/kv_fp8_cast
26+
27+
metadata:
28+
recipe_type: ptq
29+
description: >-
30+
MPT W4A8 AWQ recipe with FP8 KV-cache cast: INT4 block weights + FP8
31+
inputs, awq_lite with alpha_step=1 (coarser search) to avoid TRT-LLM
32+
overflow, plus FP8 KV-cache using constant amax (no KV calibration).
33+
quantize:
34+
algorithm:
35+
method: awq_lite
36+
alpha_step: 1
37+
quant_cfg:
38+
- $import: base_disable_all
39+
- quantizer_name: '*weight_quantizer'
40+
cfg:
41+
- $import: int4_per_block
42+
- $import: fp8
43+
- quantizer_name: '*input_quantizer'
44+
cfg:
45+
$import: fp8
46+
- $import: kv_fp8_cast
47+
- $import: default_disabled_quantizers
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Nemotron VL PTQ recipes
2+
3+
Nemotron VL is a vision-language model family (including Nemotron-Parse).
4+
Quantization should be applied only to the decoder (text-generation
5+
component); the vision encoder, image encoder, and radio/model_encoder
6+
branches stay in full precision.
7+
8+
| File | What's model-specific |
9+
|------|-----------------------|
10+
| `disabled_quantizers.yaml` | Reusable unit (`QuantizerCfgListConfig`). Merges the standard `default_disabled_quantizers` exclusions with Nemotron-VL ones (`*vision*`, `*image*`, `*radio*`, `*visual*`, `*encoder*`, `*model_encoder*`). The `*radio*` and `*model_encoder*` patterns cover the Nemotron-Parse modules. Imported by the recipes below as the single `disabled_quantizers` slot so they don't pull in two disabled-quantizer sets. |
11+
| `nvfp4-kv_fp8_cast.yaml` | NVFP4 W4A4 model quantization + FP8 KV-cache cast (constant amax, no KV calibration). Identical numerics to the general `nvfp4` preset / `kv_fp8_cast` unit; what makes it model-specific is that it imports `disabled_quantizers.yaml` from this folder to skip the vision/encoder branches. |
12+
13+
Additional `<qformat>-kv_fp8_cast.yaml` recipes can be generated for other formats
14+
if needed; only `nvfp4-kv_fp8_cast.yaml` is shipped by default.
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# QuantizerCfgList snippet of disabled quantizers for Nemotron VL. Merges the
17+
# standard `default_disabled_quantizers` exclusions with Nemotron-VL-specific
18+
# ones (only the decoder is quantized; vision/encoder branches, including the
19+
# Nemotron-Parse radio/model_encoder modules, are skipped). Recipes that
20+
# import this should NOT also import `default_disabled_quantizers`.
21+
22+
# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig
23+
- quantizer_name: '*block_sparse_moe.gate*'
24+
enable: false
25+
- quantizer_name: '*linear_attn.conv1d*'
26+
enable: false
27+
- quantizer_name: '*lm_head*'
28+
enable: false
29+
- quantizer_name: '*mixer.conv1d*'
30+
enable: false
31+
- quantizer_name: '*mlp.gate.*'
32+
enable: false
33+
- quantizer_name: '*mlp.shared_expert_gate.*'
34+
enable: false
35+
- quantizer_name: '*output_layer*'
36+
enable: false
37+
- quantizer_name: '*proj_out.*'
38+
enable: false
39+
- quantizer_name: '*router*'
40+
enable: false
41+
- quantizer_name: 'output.*'
42+
enable: false
43+
- parent_class: 'nn.BatchNorm1d'
44+
quantizer_name: '*'
45+
enable: false
46+
- parent_class: 'nn.BatchNorm2d'
47+
quantizer_name: '*'
48+
enable: false
49+
- parent_class: 'nn.BatchNorm3d'
50+
quantizer_name: '*'
51+
enable: false
52+
- parent_class: 'nn.LeakyReLU'
53+
quantizer_name: '*'
54+
enable: false
55+
- quantizer_name: '*vision*'
56+
enable: false
57+
- quantizer_name: '*image*'
58+
enable: false
59+
- quantizer_name: '*radio*'
60+
enable: false
61+
- quantizer_name: '*visual*'
62+
enable: false
63+
- quantizer_name: '*encoder*'
64+
enable: false
65+
- quantizer_name: '*model_encoder*'
66+
enable: false

0 commit comments

Comments
 (0)