Commit 4fcb798

Merge branch 'main' into nvfp4-block-size-validation

2 parents 19d26f8 + 6a3b6b8

6 files changed: 247 additions & 29 deletions


examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 21 additions & 26 deletions
@@ -49,18 +49,7 @@ dense | sparsegpt) ;;
         ;;
 esac
 
-#Iterate over list of qformats provided and check if they are valid
-IFS=","
-for qformat in $QFORMAT; do
-    case $qformat in
-        fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian) ;;
-        *)
-            echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian]" >&2
-            exit 1
-            ;;
-    esac
-done
-IFS=" "
+# Quant format / recipe validation is delegated to hf_ptq.py.
 
 script_dir="$(dirname "$(readlink -f "$0")")"
 
@@ -72,7 +61,14 @@ fi
 
 QFORMAT_MODIFIED="${QFORMAT//,/_}"
 
-MODEL_NAME=$(basename $MODEL_PATH | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
+# When using --recipe, build the model name from the recipe basename (without
+# directory or .yaml suffix) so each recipe gets its own SAVE_PATH.
+if [ -n "$RECIPE" ]; then
+    RECIPE_TAG=$(basename "$RECIPE" .yaml | sed 's/[^0-9a-zA-Z\-]/_/g')
+    MODEL_NAME=$(basename "$MODEL_PATH" | sed 's/[^0-9a-zA-Z\-]/_/g')_recipe_${RECIPE_TAG}
+else
+    MODEL_NAME=$(basename "$MODEL_PATH" | sed 's/[^0-9a-zA-Z\-]/_/g')_${QFORMAT_MODIFIED}${KV_CACHE_QUANT:+_kv_${KV_CACHE_QUANT}}
+fi
 
 SAVE_PATH=${ROOT_SAVE_PATH}/saved_models_${MODEL_NAME}
 
@@ -164,24 +160,18 @@ fi
 
 if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH) ]]; then
 
-    if [ "$qformat" == "bf16" ] || [ "$qformat" == "fp16" ]; then
-        if [ -d "$MODEL_PATH" ]; then
-            MODEL_CONFIG_EXIST=true
-            MODEL_CONFIG=$MODEL_PATH/config.json
-            for file in $MODEL_PATH/*; do ln -sf "$file" $SAVE_PATH/; done
-        else
-            echo "Please use the model directory where the config.json file is present."
-            exit 1
-        fi
-    fi
-
     if [[ "$MODEL_CONFIG_EXIST" == false ]]; then
         echo "Quantizing original model..."
+        if [ -n "$RECIPE" ]; then
+            QUANT_SPEC_ARGS="--recipe=$RECIPE"
+        else
+            QUANT_SPEC_ARGS="--qformat=${QFORMAT// /,}"
+        fi
         python hf_ptq.py \
             --pyt_ckpt_path=$MODEL_PATH \
             --export_path=$SAVE_PATH \
             --sparsity_fmt=$SPARSITY_FMT \
-            --qformat="${QFORMAT// /,}" \
+            $QUANT_SPEC_ARGS \
             --calib_size=$CALIB_SIZE \
             --batch_size=$CALIB_BATCH_SIZE \
             --inference_tensor_parallel=$TP \
@@ -203,7 +193,7 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         exit 0
     fi
 
-    if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then
+    if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]] || [[ "$RECIPE" == *"nvfp4"* ]]; then
        cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)
 
        if [ "$cuda_major" -lt 10 ]; then
@@ -212,6 +202,11 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
        fi
     fi
 
+    if [ -n "$RECIPE" ]; then
+        echo "Recipe $RECIPE used. Please deploy with TensorRT-LLM directly. Checkpoint export_path: $SAVE_PATH"
+        exit 0
+    fi
+
    if [[ ! " fp8 nvfp4 bf16 fp16 " =~ " ${QFORMAT} " ]]; then
        echo "Quant $QFORMAT specified. Please read TensorRT-LLM quantization support matrix https://nvidia.github.io/TensorRT-LLM/features/quantization.html#quantization-in-tensorrt-llm and use TensorRT-LLM for deployment. Checkpoint export_path: $SAVE_PATH"
        exit 0
examples/llm_ptq/scripts/parser.sh

Lines changed: 13 additions & 3 deletions
@@ -20,6 +20,7 @@ parse_options() {
     # Default values
     MODEL_PATH=""
     QFORMAT=""
+    RECIPE=""
     KV_CACHE_QUANT=""
     TP=1
     PP=1
@@ -37,13 +38,14 @@ parse_options() {
     CAST_MXFP4_TO_NVFP4=false
 
     # Parse command-line options
-    ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
+    ARGS=$(getopt -o "" -l "model:,quant:,recipe:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:,auto_quantize_method:,auto_quantize_score_size:,auto_quantize_checkpoint:,moe_calib_experts_ratio:,cast_mxfp4_to_nvfp4" -n "$0" -- "$@")
 
     eval set -- "$ARGS"
     while true; do
         case "$1" in
             --model ) MODEL_PATH="$2"; shift 2;;
             --quant ) QFORMAT="$2"; shift 2;;
+            --recipe ) RECIPE="$2"; shift 2;;
             --kv_cache_quant ) KV_CACHE_QUANT="$2"; shift 2;;
             --tp ) TP="$2"; shift 2;;
             --pp ) PP="$2"; shift 2;;
@@ -99,12 +101,19 @@ parse_options() {
     fi
 
     # Verify required options are provided
-    if [ -z "$MODEL_PATH" ] || [ -z "$QFORMAT" ] || [ -z "$TASKS" ]; then
-        echo "Usage: $0 --model=<MODEL_PATH> --quant=<QFORMAT> --tasks=<TASK,...>"
+    if [ -z "$MODEL_PATH" ] || [ -z "$TASKS" ] || ([ -z "$QFORMAT" ] && [ -z "$RECIPE" ]); then
+        echo "Usage: $0 --model=<MODEL_PATH> (--quant=<QFORMAT> | --recipe=<RECIPE>) --tasks=<TASK,...>"
         echo "Optional args: --sparsity=<SPARSITY_FMT> --awq_block_size=<AWQ_BLOCK_SIZE> --calib=<CALIB_SIZE>"
         exit 1
     fi
 
+    # --quant and --recipe are mutually exclusive: --recipe is a full PTQ spec, while
+    # --quant selects a built-in qformat preset. Pick exactly one.
+    if [ -n "$QFORMAT" ] && [ -n "$RECIPE" ]; then
+        echo "Cannot specify both --quant and --recipe; pick one." >&2
+        exit 1
+    fi
+
     VALID_TASKS=("quant" "mmlu" "lm_eval" "livecodebench" "simple_eval")
 
     for task in $(echo "$TASKS" | tr ',' ' '); do
@@ -135,6 +144,7 @@ parse_options() {
     echo "================="
     echo "model: $MODEL_PATH"
    echo "quant: $QFORMAT"
+    echo "recipe: $RECIPE"
    echo "tp (TensorRT-LLM Checkpoint only): $TP"
    echo "pp (TensorRT-LLM Checkpoint only): $PP"
    echo "sparsity: $SPARSITY_FMT"
modelopt/torch/export/moe_utils.py

Lines changed: 23 additions & 0 deletions
@@ -62,6 +62,29 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
     for idx in range(n):
         expert = nn.Module()
 
+        # If the gate_up source quantizer was never calibrated (rare expert
+        # that received no calibration tokens), derive its amax once from the
+        # FUSED tensor so gate and up share the same weight_scale_2 below.
+        # Why: vLLM fuses W1 (gate) and W3 (up) at load time and asserts a
+        # single per-tensor scale across the fusion. The per-projection
+        # fallback further down would otherwise compute amax independently from
+        # each half — gate's max and up's max generally differ — producing
+        # mismatched weight_scale_2 and garbled MoE output at inference.
+        gate_up_q = module.gate_up_proj_weight_quantizers[idx]
+        if getattr(gate_up_q, "is_enabled", False) and (
+            not hasattr(gate_up_q, "_amax")
+            or gate_up_q._amax is None
+            or torch.all(gate_up_q._amax == 0)
+        ):
+            gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32)
+            warnings.warn(
+                f"Expert {idx} gate_up_proj weight quantizer was not calibrated "
+                f"(amax missing or zero). Using fused-tensor amax as fallback "
+                f"(shared by gate and up so weight_scale_2 stays consistent). "
+                f"Consider increasing calibration size to activate all experts.",
+                stacklevel=2,
+            )
+
         projections = [
             ("gate_proj", gate_up[idx, :expert_dim, :], 0, fused_dim0, True),
             ("up_proj", gate_up[idx, expert_dim:, :], expert_dim, fused_dim0, True),
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  nvfp4: configs/numerics/nvfp4
+  nvfp4_static: configs/numerics/nvfp4_static
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+
+metadata:
+  recipe_type: ptq
+  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), FP8 KV cache with constant amax.
+quantize:
+  algorithm:
+    method: mse
+    fp8_scale_sweep: true
+    # layerwise=false required for VLMs where the decoder layers are nested under
+    # `model.language_model.layers` (layerwise_calibrate can't find them otherwise).
+    layerwise: false
+  quant_cfg:
+    - $import: base_disable_all
+    - quantizer_name: '*mlp.experts*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*mlp.experts*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - quantizer_name: '*block_sparse_moe*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*block_sparse_moe*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - $import: kv_fp8_cast
+    - $import: default_disabled_quantizers
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  nvfp4: configs/numerics/nvfp4
+  nvfp4_static: configs/numerics/nvfp4_static
+  kv_fp8_cast: configs/ptq/units/kv_fp8_cast
+
+metadata:
+  recipe_type: ptq
+  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for MLP/MoE linear layers (W4A4), FP8 KV cache with constant amax.
+quantize:
+  algorithm:
+    method: mse
+    fp8_scale_sweep: true
+    # layerwise=false required for VLMs where the decoder layers are nested under
+    # `model.language_model.layers` (layerwise_calibrate can't find them otherwise).
+    layerwise: false
+  quant_cfg:
+    - $import: base_disable_all
+    - quantizer_name: '*mlp*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*mlp*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - quantizer_name: '*block_sparse_moe*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*block_sparse_moe*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - quantizer_name: '*.experts.*weight_quantizer'
+      cfg:
+        $import: nvfp4_static
+    - quantizer_name: '*.experts.*input_quantizer'
+      cfg:
+        $import: nvfp4
+    - $import: kv_fp8_cast
+    - $import: default_disabled_quantizers
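
Both recipes select quantizers purely by wildcard name patterns, with everything else left disabled by base_disable_all. Assuming glob-style matching semantics (shown here with Python's fnmatchcase; the matcher ModelOpt actually uses may differ, and the module names below are made up for illustration), the patterns discriminate like this:

```python
from fnmatch import fnmatchcase

# Hypothetical quantizer names for a dense-MLP model and an MoE model.
names = [
    "model.layers.0.mlp.gate_proj.weight_quantizer",
    "model.layers.0.mlp.experts.3.up_proj.input_quantizer",
    "model.layers.0.block_sparse_moe.experts.1.w2.weight_quantizer",
    "model.layers.0.self_attn.q_proj.weight_quantizer",
]
patterns = [
    "*mlp*weight_quantizer",
    "*mlp*input_quantizer",
    "*block_sparse_moe*weight_quantizer",
    "*.experts.*weight_quantizer",
]

for name in names:
    hits = [p for p in patterns if fnmatchcase(name, p)]
    print(name, "->", hits or "no match (stays disabled via base_disable_all)")
# The attention quantizer matches nothing, so it remains disabled; the MoE
# expert weight matches two patterns (both map to nvfp4_static here anyway).
```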

tests/unit/torch/quantization/plugins/test_fused_experts.py

Lines changed: 88 additions & 0 deletions
@@ -300,6 +300,94 @@ def test_export_creates_per_expert_submodules(self):
             if QuantModuleRegistry.get(expert_type) is not None:
                 QuantModuleRegistry.unregister(expert_type)
 
+    def test_uncalibrated_expert_gate_up_share_amax(self, monkeypatch):
+        """gate_proj and up_proj must share weight_scale_2 even when an expert
+        was never routed during calibration.
+
+        Regression for the bug where ``_export_fused_experts``'s per-projection
+        fallback computed amax independently from the gate and up halves of the
+        fused tensor — producing mismatched ``weight_scale_2`` values for any
+        uncalibrated expert. vLLM fuses W1 (gate) and W3 (up) at load time and
+        asserts a single shared scale; mismatched scales corrupted MoE output.
+        The fix derives the fallback amax once from the fused ``gate_up[idx]``
+        tensor before the deepcopies, so gate's clone and up's clone start with
+        the same amax.
+        """
+        from modelopt.torch.export.moe_utils import _export_fused_experts
+
+        # Build experts where gate and up have very different magnitudes —
+        # any per-half fallback would clearly produce different amaxes.
+        experts = _SyntheticFusedExperts()
+        gate = torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.02
+        up = torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.20
+        with torch.no_grad():
+            experts.gate_up_proj.copy_(torch.cat([gate, up], dim=1))
+
+        expert_type = type(experts)
+        if QuantModuleRegistry.get(expert_type) is None:
+            QuantModuleRegistry.register({expert_type: "test.SyntheticFusedExperts"})(
+                _QuantFusedExperts
+            )
+        try:
+            converted = QuantModuleRegistry.convert(experts)
+
+            # Leave every expert weight quantizer uncalibrated (no _amax).
+            # Mark them enabled to exercise the export-time fallback path.
+            for q in converted.gate_up_proj_weight_quantizers:
+                q._disabled = False
+            for q in converted.down_proj_weight_quantizers:
+                q._disabled = False
+
+            # Capture the amax each per-projection wrapper carries into the
+            # FP4 quantization step. Patching here avoids needing CUDA / FP4.
+            seen = {}  # (expert_idx, proj_name) -> amax tensor
+
+            def _spy_export(wrapper, dtype):
+                # Identify which expert/projection this wrapper belongs to by
+                # matching the weight tensor against the fused parameters.
+                w = wrapper.weight.data
+                # gate_up_proj is (N, 2*INTER, HIDDEN); split halves are
+                # contiguous .data views or .contiguous() copies — we can match
+                # by shape and value identity for this synthetic case.
+                amax = wrapper.weight_quantizer._amax.detach().clone()
+                # Identify by matching against gate vs. up slices of each expert.
+                for idx in range(NUM_EXPERTS):
+                    g_slice = converted.gate_up_proj.data[idx, :INTERMEDIATE_DIM, :]
+                    u_slice = converted.gate_up_proj.data[idx, INTERMEDIATE_DIM:, :]
+                    d_slice = converted.down_proj.data[idx]
+                    if w.shape == g_slice.shape and torch.equal(w, g_slice):
+                        seen[(idx, "gate_proj")] = amax
+                        return
+                    if w.shape == u_slice.shape and torch.equal(w, u_slice):
+                        seen[(idx, "up_proj")] = amax
+                        return
+                    if w.shape == d_slice.shape and torch.equal(w, d_slice):
+                        seen[(idx, "down_proj")] = amax
+                        return
+
+            monkeypatch.setattr(
+                "modelopt.torch.export.unified_export_hf._export_quantized_weight",
+                _spy_export,
+            )
+
+            _export_fused_experts(converted, torch.float16)
+
+            # Assert: for every expert, gate's amax matches up's amax.
+            for idx in range(NUM_EXPERTS):
+                g_amax = seen.get((idx, "gate_proj"))
+                u_amax = seen.get((idx, "up_proj"))
+                assert g_amax is not None and u_amax is not None, (
+                    f"Expert {idx}: missing recorded amax (gate={g_amax}, up={u_amax})"
+                )
+                assert torch.allclose(g_amax, u_amax), (
+                    f"Expert {idx}: gate amax {g_amax.item()} != up amax {u_amax.item()}. "
+                    f"Uncalibrated fused experts must share gate/up amax so that "
+                    f"weight_scale_2 stays consistent across the fusion."
+                )
+        finally:
+            if QuantModuleRegistry.get(expert_type) is not None:
+                QuantModuleRegistry.unregister(expert_type)
+
 
 # ---------------------------------------------------------------------------
 # Tests for force_eager_experts_impl_on_the_fly
