Add Qwen3.6 W4A16 PTQ recipe (#1503)

meenchen · web-flow · commit 910dc49a2c04 · 2026-05-20T13:10:07.000-07:00
### What does this PR do? Type of change: ?   ### Usage ```python # Add a code snippet demonstrating how to use this ``` ### Testing  ### Before your PR is "*Ready for review*" Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.). - Is this change backward compatible?: ✅ / ❌ / N/A  - If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: ✅ / ❌ / N/A  - Did you write any new necessary tests?: ✅ / ❌ / N/A  - Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ✅ / ❌ / N/A  - Did you get Claude approval on this PR?: ✅ / ❌ / N/A  ### Additional Information   ## Summary by CodeRabbit * **New Features** * Added a W4A16 PTQ recipe for Qwen3.5/Qwen3.6 with mixed-precision rules (NVFP4 for MLP projections, FP8 for attention layers and KV cache). * **Updates** * PTQ workflow now respects supplied recipes and applies quantization rules to the full model (recipe-driven targeting enabled).  [![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/NVIDIA/Model-Optimizer/pull/1503?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack)   --------- Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -542,12 +542,17 @@ def load_model(args: argparse.Namespace):
             ]
 
             # We only quantize the language model for VLMs other than the type supported above.
-            extracted_lm, extracted_model_type = extract_and_prepare_language_model_from_vl(
-                full_model
-            )
-            if extracted_lm is not None:
-                language_model = extracted_lm
-                model_type = extracted_model_type
+            # Recipe mode is the exception: in Qwen3.5/3.6-MoE VLMs, lm_head sits
+            # on the outer CausalLM, not the inner language backbone. A recipe that targets
+            # lm_head must therefore quantize against the full model and explicitly keep visual
+            # and MTP siblings disabled.
+            if args.recipe is None:
+                extracted_lm, extracted_model_type = extract_and_prepare_language_model_from_vl(
+                    full_model
+                )
+                if extracted_lm is not None:
+                    language_model = extracted_lm
+                    model_type = extracted_model_type
 
         tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)
 
diff --git a/modelopt_recipes/models/Qwen3.5-Qwen3.6/w4a16.yaml b/modelopt_recipes/models/Qwen3.5-Qwen3.6/w4a16.yaml
@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+metadata:
+  recipe_type: ptq  # post-training quantization recipe
+  description: >
+    W4A16 PTQ recipe for Qwen3.5/Qwen3.6 models: weight-only NVFP4 for dense
+    MLP, routed MoE, shared-expert MLP projections, and lm_head; FP8 weights
+    and inputs for self-attention and the large linear-attention projections;
+    FP8 KV cache with constant amax (fp8_cast behavior).
+
+quantize:
+  algorithm:
+    method: max  # calibration algorithm name; NOTE(review): presumably max-based amax calibration — confirm
+    layerwise: false
+
+  quant_cfg:  # ordered rule list; NOTE(review): presumably later matching entries override earlier ones — confirm
+    - quantizer_name: '*'  # baseline rule: matches every quantizer name
+      enable: false  # everything starts disabled; the entries below opt specific quantizers back in
+
+    # W4A16 NVFP4 MLP projection targets. Matching the gate/up/down projection
+    # names covers dense MLPs, shared experts, and fused MoE expert quantizers
+    # such as gate_up_proj_weight_quantizers.N/down_proj_weight_quantizers.N.
+    - quantizer_name: '*mlp*gate_proj*weight_quantizer*'  # weight quantizers only; no MLP input quantizer is enabled
+      enable: true
+      cfg: &nvfp4_cfg  # anchor reused by the up_proj, down_proj, and lm_head entries below
+        block_sizes:
+          -1: 16  # NOTE(review): presumably blocks of 16 along the last axis — confirm against the config schema
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*mlp*up_proj*weight_quantizer*'  # assuming glob matching, this is the rule that catches fused gate_up_proj_*
+      enable: true
+      cfg: *nvfp4_cfg
+    - quantizer_name: '*mlp*down_proj*weight_quantizer*'  # trailing '*' admits the fused "..._weight_quantizers.N" suffix
+      enable: true
+      cfg: *nvfp4_cfg
+    - quantizer_name: '*lm_head*weight_quantizer'  # lm_head is quantized with the same NVFP4 weight config
+      enable: true
+      cfg: *nvfp4_cfg
+
+    # FP8 self-attention projections: weight and input quantizers share one config.
+    - quantizer_name: '*self_attn*weight_quantizer'
+      enable: true
+      cfg: &fp8_cfg  # anchor reused by every FP8 weight/input entry in this recipe
+        num_bits: e4m3
+        axis: null  # explicit null rather than a bare empty value; NOTE(review): presumably per-tensor scaling — confirm
+    - quantizer_name: '*self_attn*input_quantizer'
+      enable: true
+      cfg: *fp8_cfg
+
+    # FP8 weights and inputs for the large linear-attention projections (in_proj_qkv,
+    # in_proj_z, out_proj). in_proj_a, in_proj_b, and conv1d stay disabled to match the reference checkpoint policy.
+    - quantizer_name: '*linear_attn.in_proj_qkv*weight_quantizer'
+      enable: true
+      cfg: *fp8_cfg
+    - quantizer_name: '*linear_attn.in_proj_qkv*input_quantizer'
+      enable: true
+      cfg: *fp8_cfg
+    - quantizer_name: '*linear_attn.in_proj_z*weight_quantizer'
+      enable: true
+      cfg: *fp8_cfg
+    - quantizer_name: '*linear_attn.in_proj_z*input_quantizer'
+      enable: true
+      cfg: *fp8_cfg
+    - quantizer_name: '*linear_attn.out_proj*weight_quantizer'
+      enable: true
+      cfg: *fp8_cfg
+    - quantizer_name: '*linear_attn.out_proj*input_quantizer'
+      enable: true
+      cfg: *fp8_cfg
+
+    # FP8 KV cache with constant amax. This matches fp8_cast behavior and
+    # avoids exporting per-layer KV scale tensors.
+    - quantizer_name: '*[kv]_bmm_quantizer'  # assuming glob matching, [kv] selects k_bmm_quantizer and v_bmm_quantizer
+      enable: true
+      cfg:
+        num_bits: e4m3
+        axis: null  # explicit null rather than a bare empty value; both load as null
+        use_constant_amax: true
+
+    # Explicitly keep non-reference targets unquantized. These follow the enable rules above so they can veto them.
+    - quantizer_name: '*linear_attn.conv1d*'
+      enable: false
+    - quantizer_name: '*linear_attn.in_proj_a*'
+      enable: false
+    - quantizer_name: '*linear_attn.in_proj_b*'
+      enable: false
+    - quantizer_name: '*mlp.gate.*'  # the trailing '.' keeps this from matching mlp.gate_proj
+      enable: false
+    - quantizer_name: '*mlp.shared_expert_gate.*'
+      enable: false
+    - quantizer_name: '*router*'
+      enable: false
+    - quantizer_name: '*block_sparse_moe.gate*'
+      enable: false
+    - quantizer_name: '*mixer.conv1d*'
+      enable: false
+    - quantizer_name: '*output_layer*'
+      enable: false
+    - quantizer_name: '*proj_out.*'
+      enable: false
+    - quantizer_name: 'output.*'  # no leading '*': assuming glob matching, only names that start with "output."
+      enable: false
+    - quantizer_name: '*visual*'  # recipe mode quantizes the full VLM, so vision modules must be excluded here
+      enable: false
+    - quantizer_name: '*vision_tower*'
+      enable: false
+    - quantizer_name: '*mtp*'  # MTP siblings of the language model stay unquantized, including their mlp/self_attn layers
+      enable: false
+    - parent_class: 'nn.BatchNorm1d'  # NOTE(review): presumably disables all quantizers owned by modules of this class — confirm
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm2d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm3d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.LeakyReLU'
+      quantizer_name: '*'
+      enable: false