Move Qwen3.5/Qwen3.6 W4A16 recipe to huggingface/<model_type>/ptq/ convention

shengliangxu · shengliangxu · commit 7e8d2216f416 · 2026-05-20T20:14:36.000-07:00
Relocate the Qwen3.5/3.6 W4A16 recipe from modelopt_recipes/models/ to the
per-model_type layout under modelopt_recipes/huggingface/qwen3_5/ptq/ and
modelopt_recipes/huggingface/qwen3_5_moe/ptq/, matching the convention used
by the other model-specific recipes.

The two HuggingFace model_types share the same hybrid linear-attention +
softmax-attention architecture (transformers qwen3_5 and qwen3_5_moe), so
the recipe's quant_cfg list applies identically to both. Extract that list
into a single QuantizerCfgListConfig snippet
(nvfp4_mlp-fp8_attn-kv_fp8_cast.quant_cfg.yaml) and have both per-model_type
recipe wrappers $import it, so there is one source of truth.

Pull in the shared base_disable_all and default_disabled_quantizers units
via $import, replace the inline NVFP4/FP8 cfg literals with the existing
configs/numerics/{nvfp4,fp8} snippets, and replace the explicit FP8 KV
constant-amax block with the existing configs/ptq/units/kv_fp8_cast unit.

Signed-off-by: Shengliang Xu <shengliangx@nvidia.com>
diff --git a/modelopt_recipes/huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.quant_cfg.yaml b/modelopt_recipes/huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.quant_cfg.yaml
@@ -0,0 +1,90 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Shared `quant_cfg` snippet for the Qwen3.5 family's
+# `nvfp4_mlp-fp8_attn-kv_fp8_cast` recipe. Imported by both
+# `huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.yaml` (dense
+# `qwen3_5`) and `huggingface/qwen3_5_moe/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.yaml`
+# (MoE `qwen3_5_moe`); the two families share the hybrid linear-attention +
+# softmax-attention architecture, so the wildcard rules apply identically.
+# MoE-only patterns inside `default_disabled_quantizers`
+# (`*block_sparse_moe.gate*`, `*mlp.shared_expert_gate.*`, `*router*`) are
+# no-ops on dense.
+
+# modelopt-schema: modelopt.torch.quantization.config.QuantizerCfgListConfig
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  fp8: configs/numerics/fp8
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+  nvfp4: configs/numerics/nvfp4
+---
+  - $import: base_disable_all
+
+  # W4A16: NVFP4 on MLP weight quantizers only (dense MLPs, shared experts,
+  # fused MoE experts). The trailing `*` keeps per-expert names such as
+  # gate_up_proj_weight_quantizers.N in scope of the `up_proj` rule.
+  - quantizer_name: '*mlp*gate_proj*weight_quantizer*'
+    cfg: {$import: nvfp4}
+  - quantizer_name: '*mlp*up_proj*weight_quantizer*'
+    cfg: {$import: nvfp4}
+  - quantizer_name: '*mlp*down_proj*weight_quantizer*'
+    cfg: {$import: nvfp4}
+
+  # FP8 on self-attention projections: weight and input quantizers.
+  - quantizer_name: '*self_attn*weight_quantizer'
+    cfg: {$import: fp8}
+  - quantizer_name: '*self_attn*input_quantizer'
+    cfg: {$import: fp8}
+
+  # FP8 on the large linear-attention projections (qkv, z, out), weights and
+  # inputs. in_proj_a / in_proj_b / conv1d stay disabled (reference checkpoint).
+  - quantizer_name: '*linear_attn.in_proj_qkv*weight_quantizer'
+    cfg: {$import: fp8}
+  - quantizer_name: '*linear_attn.in_proj_qkv*input_quantizer'
+    cfg: {$import: fp8}
+  - quantizer_name: '*linear_attn.in_proj_z*weight_quantizer'
+    cfg: {$import: fp8}
+  - quantizer_name: '*linear_attn.in_proj_z*input_quantizer'
+    cfg: {$import: fp8}
+  - quantizer_name: '*linear_attn.out_proj*weight_quantizer'
+    cfg: {$import: fp8}
+  - quantizer_name: '*linear_attn.out_proj*input_quantizer'
+    cfg: {$import: fp8}
+
+  # FP8 KV cache with constant amax, via the shared kv_fp8_cast unit.
+  - $import: kv_fp8_cast
+
+  # Standard exclusions (BatchNorm, LeakyReLU, gates, routers, conv1d, output
+  # heads, etc.). Also disables `*lm_head*`; the last rule re-enables its weights.
+  - $import: default_disabled_quantizers
+
+  # Qwen-specific exclusions: linear-attention in_proj_a / in_proj_b (not in the
+  # reference recipe) and any visual / MTP siblings on multimodal releases.
+  - quantizer_name: '*linear_attn.in_proj_a*'
+    enable: false
+  - quantizer_name: '*linear_attn.in_proj_b*'
+    enable: false
+  - quantizer_name: '*visual*'
+    enable: false
+  - quantizer_name: '*vision_tower*'
+    enable: false
+  - quantizer_name: '*mtp*'
+    enable: false
+
+  # Re-enable NVFP4 on lm_head weights; must follow default_disabled_quantizers.
+  # NOTE(review): it also follows the `*mtp*` disable; confirm no MTP lm_head.
+  - quantizer_name: '*lm_head*weight_quantizer'
+    cfg: {$import: nvfp4}
diff --git a/modelopt_recipes/huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.yaml
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NVFP4 MLP / FP8 attention / FP8 KV-cast PTQ recipe for HuggingFace `qwen3_5`
+# (dense) models. Covers Qwen3.5 and Qwen3.6 dense releases, which share the
+# `qwen3_5` model_type and hybrid linear-attention + softmax-attention
+# architecture. Shares its `quant_cfg` with the MoE counterpart at
+# `huggingface/qwen3_5_moe/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.yaml`; the
+# snippet lives at
+# `huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.quant_cfg.yaml`.
+
+imports:  # alias -> snippet path; the alias is what $import refers to below
+  quant_cfg: huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.quant_cfg
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    NVFP4 MLP / FP8 attention / FP8 KV-cast PTQ recipe for HuggingFace
+    `qwen3_5` (dense) models: NVFP4 for MLP projection weights and lm_head;
+    FP8 for self-attention and the large linear-attention projections; FP8 KV
+    cache with constant amax.
+quantize:
+  algorithm:
+    method: max
+    layerwise: false
+  quant_cfg:  # rule list supplied entirely by the imported snippet
+    - $import: quant_cfg
diff --git a/modelopt_recipes/huggingface/qwen3_5_moe/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.yaml b/modelopt_recipes/huggingface/qwen3_5_moe/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.yaml
@@ -0,0 +1,41 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NVFP4 MLP / FP8 attention / FP8 KV-cast PTQ recipe for HuggingFace
+# `qwen3_5_moe` models. Covers Qwen3.5-MoE and Qwen3.6-MoE releases, which
+# share the `qwen3_5_moe` model_type and hybrid linear-attention +
+# softmax-attention MoE architecture. Shares its `quant_cfg` with the dense
+# counterpart at
+# `huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.yaml`; the snippet
+# lives at
+# `huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.quant_cfg.yaml`.
+
+imports:  # snippet is shared and lives in the dense qwen3_5 directory
+  quant_cfg: huggingface/qwen3_5/ptq/nvfp4_mlp-fp8_attn-kv_fp8_cast.quant_cfg
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    NVFP4 MLP / FP8 attention / FP8 KV-cast PTQ recipe for HuggingFace
+    `qwen3_5_moe` models (Qwen3.5-MoE and Qwen3.6-MoE releases): NVFP4 for MoE
+    / shared-expert MLP projection weights and lm_head; FP8 for self-attention
+    and the large linear-attention projections; FP8 KV cache with constant
+    amax.
+quantize:
+  algorithm:
+    method: max
+    layerwise: false
+  quant_cfg:  # rule list supplied entirely by the imported snippet
+    - $import: quant_cfg
diff --git a/modelopt_recipes/models/Qwen3.5-Qwen3.6/w4a16.yaml b/modelopt_recipes/models/Qwen3.5-Qwen3.6/w4a16.yaml