
Commit 7b34de6

Unify weight_scale_2 between gate_proj/up_proj (and w1/w3) in the HF export path for MOE models (#1033)
### What does this PR do?

Unify `weight_scale_2` between `gate_proj/up_proj` (and `w1/w3`) in the HF export path for MOE models. Serving engines fuse these projections into a single `gate_up_proj` and require a shared scale; this PR takes the element-wise max of the two independent scales as a conservative choice that avoids overflow.

**Type of change:** Bug fix

### Usage

```python
# Add a code snippet demonstrating how to use this
```

### Testing

<!-- Mention how you have tested your change if applicable. -->

### Before your PR is "*Ready for review*"

Make sure you read and follow the [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.).

- Is this change backward compatible?: ✅ / N/A
- If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`?: ✅ / ❌ / N/A
- Did you write any new necessary tests?: ✅ / ❌ / N/A
- Did you update the [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ✅ / ❌ / N/A

### Additional Information

<!-- E.g. related issue. -->

## Summary by CodeRabbit

* **New Features**
  * Automatic synchronization of quantization scaling between Mixture-of-Experts gate and up projections during model export for non‑fused MoE setups (e.g., Qwen MoE, DeepSeek).
* **Bug Fixes / Improvements**
  * Export now emits a brief notification when gate/up scaling values are adjusted to ensure consistent quantization.

---

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
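The conservative-max choice described above can be sketched with made-up numbers. This is illustrative only: plain Python floats stand in for per-expert amax tensors, and none of these names come from the modelopt API.

```python
# Hypothetical per-expert amax values (invented for illustration).
gate_amax = [0.8, 1.2, 0.5]  # amax of gate_proj weights, per expert
up_amax = [1.0, 0.9, 0.7]    # amax of up_proj weights, per expert

# A fused gate_up_proj needs one shared global scale; the element-wise max
# is conservative because neither projection's amax can exceed it.
shared_amax = [max(g, u) for g, u in zip(gate_amax, up_amax)]

def weight_scale_2(amax):
    # Global scale as stated in the PR: weight_scale_2 = amax / (6 * 448).
    return [a / (6 * 448) for a in amax]

# The shared scale dominates both per-projection scales, so neither
# projection's quantized values overflow under the fused scale.
for s, g, u in zip(weight_scale_2(shared_amax),
                   weight_scale_2(gate_amax),
                   weight_scale_2(up_amax)):
    assert s >= g and s >= u

print(shared_amax)  # → [1.0, 1.2, 0.7]
```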
1 parent 1070d89 commit 7b34de6

File tree

2 files changed: +62 −0 lines


modelopt/torch/export/layer_utils.py

Lines changed: 49 additions & 0 deletions
```diff
@@ -1171,6 +1171,55 @@ def set_expert_quantizer_amax(
     return uncalibrated_modules
 
 
+# Gate/up naming pairs for standard (unfused) MoE architectures.
+# Fused variants (gate_up_proj, linear_fc1) already share a single quantizer and need no sync.
+_GATE_UP_PAIRS = [("gate_proj", "up_proj"), ("w1", "w3")]
+
+
+def sync_moe_gate_up_amax(model: nn.Module) -> int:
+    """Take element-wise max of gate and up weight quantizer amaxes per expert.
+
+    Serving engines fuse gate_proj and up_proj into a single gate_up_proj and
+    require a single weight_scale_2. Since weight_scale_2 = amax / (6 * 448),
+    syncing amaxes before quantization ensures the per-block weight_scale values
+    are computed against a consistent global scale.
+
+    Only affects standard MoE models with separate gate/up linear layers
+    (e.g. Qwen MoE, DeepSeek). Models with already-fused gate_up_proj
+    (e.g. Llama4, GptOss) are unaffected.
+
+    Returns:
+        Number of expert gate/up pairs whose amaxes were synced.
+    """
+    synced = 0
+    for _, sub_module in model.named_modules():
+        if not (is_moe(sub_module) and hasattr(sub_module, "experts")):
+            continue
+        if not hasattr(sub_module.experts, "__iter__"):
+            continue
+        for expert in sub_module.experts:
+            for gate_name, up_name in _GATE_UP_PAIRS:
+                gate_linear = getattr(expert, gate_name, None)
+                up_linear = getattr(expert, up_name, None)
+                if gate_linear is None or up_linear is None:
+                    continue
+                gate_wq = getattr(gate_linear, "weight_quantizer", None)
+                up_wq = getattr(up_linear, "weight_quantizer", None)
+                if gate_wq is None or up_wq is None:
+                    break
+                gate_amax = getattr(gate_wq, "amax", None)
+                up_amax = getattr(up_wq, "amax", None)
+                if gate_amax is None or up_amax is None:
+                    break
+                if not torch.equal(gate_amax, up_amax):
+                    shared_amax = torch.max(gate_amax, up_amax)
+                    gate_wq.amax = shared_amax
+                    up_wq.amax = shared_amax.clone()
+                    synced += 1
+                break
+    return synced
+
+
 def build_stacked_experts(
     experts: nn.Module,
     linear_names: list[str],
```

modelopt/torch/export/unified_export_hf.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -73,6 +73,7 @@
     is_moe,
     is_quantlinear,
     set_expert_quantizer_amax,
+    sync_moe_gate_up_amax,
 )
 from .model_config import (
     QUANTIZATION_FP8,
@@ -775,6 +776,18 @@ def _export_transformers_checkpoint(
             exclude_modules.append(pattern)
             print(f"Adding MTP layer to quantization_config ignore: {pattern}")
 
+    # Safety net: sync any gate/up weight quantizer amaxes that
+    # requantize_resmooth_fused_llm_layers did not reach (e.g. experts not
+    # activated during the dummy forward, or non-standard expert naming).
+    synced = sync_moe_gate_up_amax(model)
+    if synced:
+        warnings.warn(
+            f"Found {synced} MoE expert gate/up projection pair(s) with mismatched "
+            f"weight_scale_2 after requantize_resmooth_fused_llm_layers. "
+            f"This typically means the dummy forward did not activate these experts. "
+            f"Taking element-wise max of amaxes for serving-engine fusion."
+        )
+
     # Process all quantized modules and export weights
     _process_quantized_modules(model, dtype, is_modelopt_qlora)
```
