Add weight-only support in _QuantFusedExperts

hychiang-git · hychiang-git · commit 08cd44fe0cda · 2026-05-07T18:58:35.000Z
Signed-off-by: Hung-Yueh Chiang <hungyuehc@nvidia.com>
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
@@ -900,6 +900,26 @@ def forward(self, *args, **kwargs):
         self._down_proj_linear = False
         return super().forward(*args, **kwargs)
 
+    def iter_weights_for_calibration(self):
+        """Yield one ``(weight_slice, quantizer)`` pair per expert for each fused weight.
+
+        Overrides the base implementation, which resolves singular
+        ``*_weight_quantizer`` names via ``quantizer_attr_names``.  Fused experts
+        instead hold their per-expert quantizers in ``nn.ModuleList`` attributes
+        (``gate_up_proj_weight_quantizers``, ``down_proj_weight_quantizers``), so
+        each quantizer is yielded together with the slice of the fused weight found
+        at the same index.  A projection whose weight or quantizer list is absent
+        (``None``) contributes no pairs.
+        """
+        for proj_name in ("gate_up_proj", "down_proj"):
+            fused_weight = getattr(self, proj_name, None)
+            expert_quantizers = getattr(self, f"{proj_name}_weight_quantizers", None)
+            # Either attribute may be missing or None; such a projection is skipped.
+            if fused_weight is not None and expert_quantizers is not None:
+                # Quantizer i is paired with fused_weight[i].
+                for expert_idx, quantizer in enumerate(expert_quantizers):
+                    yield fused_weight[expert_idx], quantizer
+
     def fold_weight(self, keep_attrs: bool = False):
         """Fold per-expert weight quantizers into the fused 3-D weights.