Move model-specific PTQ quant_cfg overrides from llm_ptq to YAML

shengliangxu · shengliangxu · commit 41e178862cf3 · 2026-05-19T16:28:41.000-07:00
Replace the hardcoded model-type branches in examples/llm_ptq (gemma/mpt
AWQ alpha tuning, gemma SmoothQuant alpha, phi4mm exclusions, Nemotron VL
exclusions) with opt-in declarative recipes under
modelopt_recipes/huggingface/<model_type>/ptq/. Users select them with
--recipe huggingface/<model_type>/ptq/<recipe>.

- Per-model recipes ship with FP8 KV-cache cast (kv_fp8_cast) and the
  algorithm/numerics each model needs.
- phi4mm and nemotron_vl each include a merged disabled_quantizers.yaml
  unit so recipes import a single disabled-quantizer slot instead of
  layering default + model-specific exclusions.
- Each ptq/ folder has a README describing what is model-specific.
- Drop the now-unused qformat/model_type parameters from build_quant_cfg and
  remove the Nemotron VL append block from mono_quantize.

Signed-off-by: Shengliang Xu <shengliangx@nvidia.com>
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
@@ -202,10 +202,8 @@ def calibrate_loop(_model):
 
 
 def build_quant_cfg(
-    qformat,
     quant_cfg,
     awq_block_size,
-    model_type,
     moe_calib_experts_ratio: float | None = None,
 ) -> dict[str, Any]:
     quant_cfg = copy.deepcopy(quant_cfg)
@@ -222,10 +220,6 @@ def build_quant_cfg(
         if awq_block_size:
             weight_quantizer["block_sizes"][-1] = awq_block_size
 
-        # Coarser optimal scale search seems to resolve the overflow in TRT-LLM for some models
-        if qformat == "w4a8_awq" and model_type in ["gemma", "mpt"]:
-            quant_cfg["algorithm"] = {"method": "awq_lite", "alpha_step": 1}
-
     if moe_calib_experts_ratio:
         assert 0 < moe_calib_experts_ratio <= 1, "moe_calib_experts_ratio must be between 0 and 1"
         if isinstance(quant_cfg["algorithm"], str):
@@ -240,17 +234,6 @@ def build_quant_cfg(
                 f"Quantization algorithm: {quant_cfg['algorithm']} does not support setting moe_calib_experts_ratio"
             )
 
-    # Gemma 7B has accuracy regression using alpha 1. We set 0.5 instead.
-    if model_type == "gemma" and "int8_sq" in qformat:
-        quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": 0.5}
-
-    if model_type == "phi4mm":
-        # Only quantize the language model
-        quant_cfg["quant_cfg"].append({"quantizer_name": "*speech*", "enable": False})
-        quant_cfg["quant_cfg"].append({"quantizer_name": "*audio*", "enable": False})
-        quant_cfg["quant_cfg"].append({"quantizer_name": "*image*", "enable": False})
-        quant_cfg["quant_cfg"].append({"quantizer_name": "*vision*", "enable": False})
-
     return quant_cfg
 
 
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -626,22 +626,6 @@ def mono_quantize(
             "Consider reducing calib_size to reduce calibration time.\n####\n"
         )
 
-    # For Nemotron VL models, disable quantization of vision components
-    if is_nemotron_vl_model:
-        print("Disabling quantization for vision components in Nemotron VL model")
-        quant_cfg["quant_cfg"].append({"quantizer_name": "*vision*", "enable": False})
-        quant_cfg["quant_cfg"].append({"quantizer_name": "*image*", "enable": False})
-        # Also disable radio model components specifically (for Nemotron-Parse)
-        quant_cfg["quant_cfg"].append({"quantizer_name": "*radio*", "enable": False})
-        quant_cfg["quant_cfg"].append({"quantizer_name": "*visual*", "enable": False})
-        quant_cfg["quant_cfg"].append(
-            {"quantizer_name": "*encoder*", "enable": False}
-        )  # Disable encoder
-        quant_cfg["quant_cfg"].append(
-            {"quantizer_name": "*model_encoder*", "enable": False}
-        )  # Nemotron-Parse specific
-        print("Quantization will only be applied to the decoder (text generation) component")
-
     if not model_is_already_quantized or calibration_only:
         # quantize the model
 
@@ -1110,10 +1094,8 @@ def _is_layerwise(obj):
             quant_cfg = QUANT_CFG_CHOICES[args.qformat]
 
             quant_cfg = build_quant_cfg(
-                args.qformat,
                 quant_cfg,
                 args.awq_block_size,
-                model_type,
                 args.moe_calib_experts_ratio,
             )
 
diff --git a/examples/llm_ptq/multinode_ptq.py b/examples/llm_ptq/multinode_ptq.py
@@ -330,10 +330,8 @@ def main(args):
     quant_cfg = QUANT_CFG_CHOICES[args.qformat]
 
     quant_cfg = build_quant_cfg(
-        args.qformat,
         quant_cfg,
         args.awq_block_size,
-        model_type,
     )
 
     enable_quant_kv_cache = args.kv_cache_qformat != "none"
diff --git a/modelopt_recipes/huggingface/gemma/ptq/README.md b/modelopt_recipes/huggingface/gemma/ptq/README.md
@@ -0,0 +1,12 @@
+# Gemma PTQ recipes
+
+Recipes here override the algorithm defaults that ship in the general PTQ
+presets because Gemma needs different settings to avoid overflow and stay accurate.
+
+| Recipe | What's model-specific |
+|--------|-----------------------|
+| `w4a8_awq-kv_fp8_cast.yaml` | Uses `awq_lite` with `alpha_step: 1` instead of the default AWQ search. The default search overflows in TRT-LLM kernels on Gemma; the coarser sweep avoids it without measurably hurting accuracy. Numerics: INT4 block weights + FP8 inputs + FP8 KV-cache cast (constant amax, no KV calibration). |
+| `int8_sq-kv_fp8_cast.yaml` | Sets SmoothQuant `alpha: 0.5` instead of the default `1.0`. Gemma 7B regresses with `alpha=1`; `0.5` recovers it. Numerics: INT8 per-channel weights + INT8 inputs + FP8 KV-cache cast. |
+
+The base numerics units and the standard disabled-quantizer list are inherited
+from the shared `configs/`; only the algorithm fields are model-specific.
diff --git a/modelopt_recipes/huggingface/gemma/ptq/int8_sq-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/gemma/ptq/int8_sq-kv_fp8_cast.yaml
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Gemma-specific INT8 SmoothQuant PTQ recipe with FP8 KV-cache cast. Overrides
+# the SmoothQuant alpha from the default 1.0 to 0.5 to recover accuracy on
+# Gemma 7B (default alpha causes a regression).
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  int8: configs/numerics/int8
+  int8_per_channel: configs/numerics/int8_per_channel
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    Gemma INT8 SmoothQuant recipe with FP8 KV-cache cast: alpha=0.5 (instead
+    of the default 1.0) to avoid accuracy regression on Gemma 7B, plus FP8
+    KV-cache using constant amax (no KV calibration).
+quantize:
+  algorithm:
+    method: smoothquant
+    alpha: 0.5
+  quant_cfg:
+    - $import: base_disable_all
+    - quantizer_name: '*weight_quantizer'
+      cfg:
+        $import: int8_per_channel
+    - quantizer_name: '*input_quantizer'
+      cfg:
+        $import: int8
+    - $import: kv_fp8_cast
+    - $import: default_disabled_quantizers
diff --git a/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/gemma/ptq/w4a8_awq-kv_fp8_cast.yaml
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Gemma-specific W4A8 AWQ PTQ recipe with FP8 KV-cache cast. Uses a coarser
+# optimal-scale search (awq_lite with alpha_step=1) to avoid overflow observed
+# in TRT-LLM kernels when using the default AWQ search on Gemma.
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  fp8: configs/numerics/fp8
+  int4_per_block: configs/numerics/int4_per_block
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    Gemma W4A8 AWQ recipe with FP8 KV-cache cast: INT4 block weights + FP8
+    inputs, awq_lite with alpha_step=1 (coarser search) to avoid TRT-LLM
+    overflow, plus FP8 KV-cache using constant amax (no KV calibration).
+quantize:
+  algorithm:
+    method: awq_lite
+    alpha_step: 1
+  quant_cfg:
+    - $import: base_disable_all
+    - quantizer_name: '*weight_quantizer'
+      cfg:
+        - $import: int4_per_block
+        - $import: fp8
+    - quantizer_name: '*input_quantizer'
+      cfg:
+        $import: fp8
+    - $import: kv_fp8_cast
+    - $import: default_disabled_quantizers
diff --git a/modelopt_recipes/huggingface/mpt/ptq/README.md b/modelopt_recipes/huggingface/mpt/ptq/README.md
@@ -0,0 +1,8 @@
+# MPT PTQ recipes
+
+| Recipe | What's model-specific |
+|--------|-----------------------|
+| `w4a8_awq-kv_fp8_cast.yaml` | Uses `awq_lite` with `alpha_step: 1` instead of the default AWQ search. The default search overflows in TRT-LLM kernels on MPT; the coarser sweep avoids it. Numerics: INT4 block weights + FP8 inputs + FP8 KV-cache cast (constant amax, no KV calibration). The same algorithm override is applied to Gemma — see `huggingface/gemma/ptq/`. |
+
+The base numerics units and the standard disabled-quantizer list are inherited
+from the shared `configs/`; only the AWQ algorithm fields are model-specific.
diff --git a/modelopt_recipes/huggingface/mpt/ptq/w4a8_awq-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/mpt/ptq/w4a8_awq-kv_fp8_cast.yaml
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# MPT-specific W4A8 AWQ PTQ recipe with FP8 KV-cache cast. Uses a coarser
+# optimal-scale search (awq_lite with alpha_step=1) to avoid overflow observed
+# in TRT-LLM kernels when using the default AWQ search on MPT.
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  fp8: configs/numerics/fp8
+  int4_per_block: configs/numerics/int4_per_block
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    MPT W4A8 AWQ recipe with FP8 KV-cache cast: INT4 block weights + FP8
+    inputs, awq_lite with alpha_step=1 (coarser search) to avoid TRT-LLM
+    overflow, plus FP8 KV-cache using constant amax (no KV calibration).
+quantize:
+  algorithm:
+    method: awq_lite
+    alpha_step: 1
+  quant_cfg:
+    - $import: base_disable_all
+    - quantizer_name: '*weight_quantizer'
+      cfg:
+        - $import: int4_per_block
+        - $import: fp8
+    - quantizer_name: '*input_quantizer'
+      cfg:
+        $import: fp8
+    - $import: kv_fp8_cast
+    - $import: default_disabled_quantizers
diff --git a/modelopt_recipes/huggingface/nemotron_vl/ptq/README.md b/modelopt_recipes/huggingface/nemotron_vl/ptq/README.md
@@ -0,0 +1,14 @@
+# Nemotron VL PTQ recipes
+
+Nemotron VL is a vision-language model family (including Nemotron-Parse).
+Quantization should be applied only to the decoder (text-generation
+component); the vision encoder, image encoder, and radio/model_encoder
+branches stay in full precision.
+
+| File | What's model-specific |
+|------|-----------------------|
+| `disabled_quantizers.yaml` | Reusable unit (`QuantizerCfgListConfig`). Merges the standard `default_disabled_quantizers` exclusions with Nemotron-VL ones (`*vision*`, `*image*`, `*radio*`, `*visual*`, `*encoder*`, `*model_encoder*`). The `*radio*` and `*model_encoder*` patterns target Nemotron-Parse. Imported by recipes below as the single `disabled_quantizers` slot so they don't pull in two disabled-quantizer sets. |
+| `nvfp4-kv_fp8_cast.yaml` | NVFP4 W4A4 model quantization + FP8 KV-cache cast (constant amax, no KV calibration). Identical numerics to the general `nvfp4` preset / `kv_fp8_cast` unit; what makes it model-specific is that it imports `disabled_quantizers.yaml` from this folder to skip the vision/encoder branches. |
+
+Additional `<qformat>-kv_fp8_cast.yaml` recipes can be generated for other formats
+if needed; only `nvfp4-kv_fp8_cast.yaml` is shipped by default.
diff --git a/modelopt_recipes/huggingface/nemotron_vl/ptq/disabled_quantizers.yaml b/modelopt_recipes/huggingface/nemotron_vl/ptq/disabled_quantizers.yaml
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# QuantizerCfgList snippet of disabled quantizers for Nemotron VL. Merges the
+# standard `default_disabled_quantizers` exclusions with Nemotron-VL-specific
+# ones (only the decoder is quantized; vision/encoder branches, including the
+# Nemotron-Parse radio/model_encoder modules, are skipped). Recipes that
+# import this should NOT also import `default_disabled_quantizers`.
+
+# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig
+  - quantizer_name: '*block_sparse_moe.gate*'
+    enable: false
+  - quantizer_name: '*linear_attn.conv1d*'
+    enable: false
+  - quantizer_name: '*lm_head*'
+    enable: false
+  - quantizer_name: '*mixer.conv1d*'
+    enable: false
+  - quantizer_name: '*mlp.gate.*'
+    enable: false
+  - quantizer_name: '*mlp.shared_expert_gate.*'
+    enable: false
+  - quantizer_name: '*output_layer*'
+    enable: false
+  - quantizer_name: '*proj_out.*'
+    enable: false
+  - quantizer_name: '*router*'
+    enable: false
+  - quantizer_name: 'output.*'
+    enable: false
+  - parent_class: 'nn.BatchNorm1d'
+    quantizer_name: '*'
+    enable: false
+  - parent_class: 'nn.BatchNorm2d'
+    quantizer_name: '*'
+    enable: false
+  - parent_class: 'nn.BatchNorm3d'
+    quantizer_name: '*'
+    enable: false
+  - parent_class: 'nn.LeakyReLU'
+    quantizer_name: '*'
+    enable: false
+  - quantizer_name: '*vision*'
+    enable: false
+  - quantizer_name: '*image*'
+    enable: false
+  - quantizer_name: '*radio*'
+    enable: false
+  - quantizer_name: '*visual*'
+    enable: false
+  - quantizer_name: '*encoder*'
+    enable: false
+  - quantizer_name: '*model_encoder*'
+    enable: false
diff --git a/modelopt_recipes/huggingface/nemotron_vl/ptq/nvfp4-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/nemotron_vl/ptq/nvfp4-kv_fp8_cast.yaml
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Nemotron VL-specific PTQ recipe for the `nvfp4` quantization format.
+# Equivalent to the general `nvfp4` preset plus the FP8 KV-cache cast, with
+# quantization disabled on non-language (vision/encoder) branches.
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  w4a4_nvfp4_nvfp4: configs/ptq/units/w4a4_nvfp4_nvfp4
+  disabled_quantizers: huggingface/nemotron_vl/ptq/disabled_quantizers
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+metadata:
+  recipe_type: ptq
+  description: 'Nemotron VL PTQ recipe (nvfp4): same numerics as the general nvfp4 preset, applied to the decoder (text generation) component only (vision/encoder
+    branches are skipped).'
+quantize:
+  algorithm: max
+  quant_cfg:
+    - '$import': base_disable_all
+    - '$import': w4a4_nvfp4_nvfp4
+    - '$import': kv_fp8_cast
+    - '$import': disabled_quantizers
diff --git a/modelopt_recipes/huggingface/phi4mm/ptq/README.md b/modelopt_recipes/huggingface/phi4mm/ptq/README.md
@@ -0,0 +1,13 @@
+# Phi-4-Multimodal PTQ recipes
+
+Phi-4-Multimodal is a multimodal model. Quantization should be applied only to
+the language model; the speech, audio, image, and vision branches are kept in
+full precision to avoid accuracy regressions on those modalities.
+
+| File | What's model-specific |
+|------|-----------------------|
+| `disabled_quantizers.yaml` | Reusable unit (`QuantizerCfgListConfig`). Merges the standard `default_disabled_quantizers` exclusions with Phi-4-MM ones (`*speech*`, `*audio*`, `*image*`, `*vision*`). Imported by recipes below as the single `disabled_quantizers` slot so they don't pull in two disabled-quantizer sets. |
+| `nvfp4-kv_fp8_cast.yaml` | NVFP4 W4A4 model quantization + FP8 KV-cache cast (constant amax, no KV calibration). Identical numerics to the general `nvfp4` preset / `kv_fp8_cast` unit; what makes it model-specific is that it imports `disabled_quantizers.yaml` from this folder to skip the non-language branches. |
+
+Additional `<qformat>-kv_fp8_cast.yaml` recipes can be generated for other formats
+if needed; only `nvfp4-kv_fp8_cast.yaml` is shipped by default.
diff --git a/modelopt_recipes/huggingface/phi4mm/ptq/disabled_quantizers.yaml b/modelopt_recipes/huggingface/phi4mm/ptq/disabled_quantizers.yaml
diff --git a/modelopt_recipes/huggingface/phi4mm/ptq/nvfp4-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/phi4mm/ptq/nvfp4-kv_fp8_cast.yaml

Original file line number	Diff line number	Diff line change
`@@ -330,10 +330,8 @@ def main(args):`
`330`	`330`	`quant_cfg = QUANT_CFG_CHOICES[args.qformat]`
`331`	`331`
`332`	`332`	`quant_cfg = build_quant_cfg(`
`333`		`- args.qformat,`
`334`	`333`	`quant_cfg,`
`335`	`334`	`args.awq_block_size,`
`336`		`- model_type,`
`337`	`335`	`)`
`338`	`336`
`339`	`337`	`enable_quant_kv_cache = args.kv_cache_qformat != "none"`