Address reviews: share fused-expert input quantizers; map old-style Mixtral experts to w1/w2/w3

Edwardf0t1 · Edwardf0t1 · commit ea44272cf05f · 2026-04-08T11:52:32.000-07:00
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
@@ -979,10 +979,14 @@ def module_match_name_list(module, name_list):
             "Qwen3NextSparseMoeBlock",
             "Qwen3_5MoeSparseMoeBlock",
             "DeepseekMoE",
-            "MixtralSparseMoeBlock",
         ],
     ):
         return ["gate_proj", "down_proj", "up_proj"]
+    elif module_match_name_list(module, ["MixtralSparseMoeBlock"]):
+        # Old-style Mixtral (iterable experts) uses w1/w2/w3.
+        # Fused Mixtral (transformers 5.0+) is already handled by the
+        # structural gate_up_proj_weight_quantizers check above.
+        return ["w1", "w2", "w3"]
     elif module_match_name_list(module, ["MixtralMoeSparseMoeBlock"]):
         # Older transformers naming for Mixtral
         return ["linear_fc1", "linear_fc2"]
diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py
@@ -16,6 +16,7 @@
 """Utilities for Mixture-of-Experts (MoE) model export."""
 
 import copy
+import warnings
 from pathlib import Path
 
 import torch
@@ -49,17 +50,9 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
     n = module.num_experts
     expert_dim = _get_fused_expert_intermediate_dim(module)
 
-    # 1. Input amax fallback — borrow from calibrated peers.
-    for quantizer_list in [
-        module.gate_up_proj_input_quantizers,
-        module.down_proj_input_quantizers,
-    ]:
-        wrappers = []
-        for q in quantizer_list:
-            w = nn.Module()
-            w.input_quantizer = q
-            wrappers.append(w)
-        set_expert_quantizer_amax(modules=wrappers, quantizer_attrs=["input_quantizer"])
+    # 1. Shared input quantizers — one per projection type, shared across all experts.
+    gate_up_input_q = module.gate_up_proj_input_quantizer
+    down_input_q = module.down_proj_input_quantizer
 
     gate_up = module.gate_up_proj.data
     down = module.down_proj.data
@@ -82,11 +75,7 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
                 if is_gate_up
                 else module.down_proj_weight_quantizers[idx]
             )
-            i_quantizer = (
-                module.gate_up_proj_input_quantizers[idx]
-                if is_gate_up
-                else module.down_proj_input_quantizers[idx]
-            )
+            i_quantizer = gate_up_input_q if is_gate_up else down_input_q
 
             # gate/up share a weight quantizer — clone so each gets independent amax.
             w_quantizer = copy.deepcopy(w_quantizer_src) if is_gate_up else w_quantizer_src
@@ -116,6 +105,12 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
                 )
             ):
                 w_quantizer.amax = weight_slice.abs().amax().to(torch.float32)
+                warnings.warn(
+                    f"Expert {idx} {proj_name} weight quantizer was not calibrated "
+                    f"(amax missing or zero). Using weight-derived amax as fallback. "
+                    f"Consider using more calibration data to activate all experts.",
+                    stacklevel=2,
+                )
 
             wrapper = nn.Module()
             wrapper.weight = nn.Parameter(weight_slice.contiguous(), requires_grad=False)
@@ -139,9 +134,9 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
         "gate_up_proj",
         "down_proj",
         "gate_up_proj_weight_quantizers",
-        "gate_up_proj_input_quantizers",
+        "gate_up_proj_input_quantizer",
         "down_proj_weight_quantizers",
-        "down_proj_input_quantizers",
+        "down_proj_input_quantizer",
     ):
         if hasattr(module, attr):
             delattr(module, attr)
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
@@ -892,8 +892,9 @@ class _QuantFusedExperts(_QuantFunctionalMixin):
 
     Per-expert quantization is achieved by intercepting ``F.linear`` and recovering
     the expert index from the weight tensor's storage offset into the 3-D parameter.
-    Each expert gets its own weight and input quantizers (``nn.ModuleList``), so
-    calibration granularity matches the per-expert decomposition approach.
+    Each expert gets its own weight quantizers (``nn.ModuleList``), while each projection
+    (gate_up / down) uses a single input ``TensorQuantizer`` shared across all experts, to
+    match the shared input quantization scale used by downstream inference frameworks.
 
     Verified compatible models: Mixtral, Qwen2-MoE, Qwen3-MoE, Qwen3.5-MoE,
     DeepSeek-V3, Jamba, OLMoE.
@@ -926,9 +927,9 @@ def _get_expert_idx_from_gate_up(self, weight: torch.Tensor) -> int:
 
     def _setup(self):
         n = self.num_experts
-        self.gate_up_proj_input_quantizers = nn.ModuleList([TensorQuantizer() for _ in range(n)])
+        self.gate_up_proj_input_quantizer = TensorQuantizer()
         self.gate_up_proj_weight_quantizers = nn.ModuleList([TensorQuantizer() for _ in range(n)])
-        self.down_proj_input_quantizers = nn.ModuleList([TensorQuantizer() for _ in range(n)])
+        self.down_proj_input_quantizer = TensorQuantizer()
         self.down_proj_weight_quantizers = nn.ModuleList([TensorQuantizer() for _ in range(n)])
 
         self._register_temp_attribute("_down_proj_linear", False)
@@ -944,12 +945,12 @@ def functionals_to_replace(self):
         def _quantized_linear(input, weight, bias=None):
             if self._down_proj_linear:
                 idx = self._current_expert_idx
-                input = self.down_proj_input_quantizers[idx](input)
+                input = self.down_proj_input_quantizer(input)
                 weight = self.down_proj_weight_quantizers[idx](weight)
             else:
                 idx = self._get_expert_idx_from_gate_up(weight)
                 self._current_expert_idx = idx
-                input = self.gate_up_proj_input_quantizers[idx](input)
+                input = self.gate_up_proj_input_quantizer(input)
                 weight = self.gate_up_proj_weight_quantizers[idx](weight)
             self._down_proj_linear = not self._down_proj_linear
             return _orig_linear(input, weight, bias)
diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py
@@ -189,19 +189,21 @@ def test_two_level_registration(self):
         self._cleanup_registry(block_type)
 
     def test_convert_creates_quantizers(self):
-        """After conversion, fused experts should have per-expert quantizer ModuleLists."""
+        """After conversion, fused experts should have shared input and per-expert weight quantizers."""
         model = _TinyMoEModel()
         expert_type = type(model.moe.experts)
         self._cleanup_registry(expert_type)
 
         register_fused_experts_on_the_fly(model)
         converted = QuantModuleRegistry.convert(model.moe.experts)
 
-        assert hasattr(converted, "gate_up_proj_input_quantizers")
+        # Shared input quantizers (single TensorQuantizer, not ModuleList)
+        assert hasattr(converted, "gate_up_proj_input_quantizer")
+        assert hasattr(converted, "down_proj_input_quantizer")
+        # Per-expert weight quantizers (ModuleList)
         assert hasattr(converted, "gate_up_proj_weight_quantizers")
-        assert hasattr(converted, "down_proj_input_quantizers")
         assert hasattr(converted, "down_proj_weight_quantizers")
-        assert len(converted.gate_up_proj_input_quantizers) == NUM_EXPERTS
+        assert len(converted.gate_up_proj_weight_quantizers) == NUM_EXPERTS
         assert len(converted.down_proj_weight_quantizers) == NUM_EXPERTS
         self._cleanup_registry(expert_type)