
Commit ac7c985

[NVBUG: 5804406] Auto detect MOE layers (#900)
## What does this PR do?

**Type of change:** New feature, new tests

**Overview:** Replace hardcoded per-model MoE class registrations (Mixtral, Qwen2Moe, Qwen3Moe, Qwen3Next, Llama4TextMoe, Qwen3VLMoe, MiniMaxM2, etc.) with a single generic auto-detection mechanism (`register_sparse_moe_on_the_fly`) that walks the model tree and identifies MoE blocks by their structural attributes (`gate` + `experts` with `top_k`/`num_experts`). This makes MoE quantization forward-compatible with new HuggingFace MoE architectures without requiring explicit registration for each model family.

Additionally, this PR:

- Tracks per-expert token routing counts during calibration via a gate forward hook, enabling visibility into expert utilization.
- Saves an HTML report of expert token counts during export (`save_expert_token_count_table`), highlighting under-utilized experts.
- Fixes the `topk` -> `top_k` attribute name for transformers >= 5.0 compatibility.
- Moves the PTQ summary prints in `hf_ptq.py` to a file to reduce console output.

## Usage

Auto-detection is transparent -- no user-facing API changes are needed. Any HuggingFace MoE model with the standard `gate`/`experts` pattern is automatically detected and quantized:

```python
import modelopt.torch.quantization as mtq

# Any HuggingFace MoE model (Mixtral, Qwen3Moe, DeepSeek, etc.)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-30B-A3B")
mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_loop)

# During export, an .moe.html report with per-expert token counts is saved automatically
```

## Testing

Unit tests; also tested exporting a Qwen MoE model.

## Before your PR is "*Ready for review*"

<!-- If you haven't finished some of the above items you can still open `Draft` PR. -->

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No <!--- If No, explain why. -->
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No <!--- Only for new features, API changes, critical bug fixes or bw breaking changes. -->

## Additional Information

<!-- E.g. related issue. -->

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **New Features**
  * Added expert token count visualization for Mixture of Experts models, exported as HTML reports during model export.
  * Enhanced sparse MoE quantization with improved calibration-aware routing and automatic model block detection.
* **Tests**
  * Added comprehensive test suite for sparse MoE quantization validation.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
1 parent c4b662f commit ac7c985

File tree

7 files changed: +531 −77 lines changed


CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
```diff
@@ -6,6 +6,8 @@ NVIDIA Model Optimizer Changelog (Linux)
 
 **New Features**
 
+- User does not need to manually register MOE modules to cover experts calibration coverage in PTQ workflow.
+- ``hf_ptq.py`` now saves the quantization summary and moe expert token count table to the export directory.
 - Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
 
 0.42 (2026-02-xx)
```

examples/llm_ptq/hf_ptq.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -53,6 +53,7 @@
     export_hf_checkpoint,
     export_tensorrt_llm_checkpoint,
     get_model_type,
+    save_expert_token_count_table,
 )
 from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
 from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration
@@ -726,7 +727,12 @@ def post_quantize(
     """
 
     if args.verbose:
-        mtq.print_quant_summary(full_model)
+        try:
+            mtq.print_quant_summary(full_model, args.export_path)
+            save_expert_token_count_table(full_model, args.export_path)
+        except Exception as e:
+            print(f"Error saving quant summary: {e}")
+            print("Continuing with generation...")
 
     # Run some samples
     torch.cuda.empty_cache()
```

modelopt/torch/export/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -19,6 +19,7 @@
 from .model_config import *
 from .model_config_export import *
 from .model_utils import *
+from .moe_utils import *
 from .plugins import *
 from .transformer_engine import *
 from .unified_export_hf import *
```

modelopt/torch/export/moe_utils.py

Lines changed: 77 additions & 0 deletions
New file (77 lines added):

```python
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for Mixture-of-Experts (MoE) model export."""

from pathlib import Path

import torch.nn as nn


def save_expert_token_count_table(model: nn.Module, output_dir: str | Path | None = None):
    """Collect expert_token_count from all quantized MoE layers and save as an HTML table.

    The table has rows for each MoE layer and columns for each expert, with cell values
    showing the number of tokens routed to that expert during calibration.

    Args:
        model: The model containing quantized MoE layers with ``expert_token_count`` attributes.
        output_dir: Directory to save the HTML file. Defaults to current directory.
    """
    rows = []
    for name, module in model.named_modules():
        if hasattr(module, "expert_token_count") and module.expert_token_count.numel() > 0:
            rows.append((name, module.expert_token_count))

    if not rows:
        return

    num_experts = rows[0][1].shape[0]
    assert all(r[1].shape[0] == num_experts for r in rows), (
        "All MoE layers must have the same number of experts"
    )
    html_parts = [
        "<html><head><style>",
        "table { border-collapse: collapse; font-family: monospace; }",
        "th, td { border: 1px solid #ccc; padding: 4px 8px; text-align: right; }",
        "th { background: #f0f0f0; }",
        "</style></head><body>",
        "<h2>Expert Token Counts (per MoE layer)</h2>",
        "<table><tr><th>Layer/Expert</th>",
    ]
    html_parts.extend(f"<th>{i}</th>" for i in range(num_experts))
    html_parts.append("</tr>")

    for name, counts in rows:
        avg = counts.float().mean().item()
        html_parts.append(f"<tr><td>{name}</td>")
        for c in counts.tolist():
            if avg > 0 and c < avg * 0.05:
                style = ' style="background: #ff6666;"'
            elif avg > 0 and c < avg * 0.1:
                style = ' style="background: #ffcccc;"'
            else:
                style = ""
            html_parts.append(f"<td{style}>{c}</td>")
        html_parts.append("</tr>")

    html_parts.append("</table></body></html>")
    html_content = "\n".join(html_parts)

    if output_dir is None:
        output_dir = Path(".")
    output_path = Path(output_dir) / ".moe.html"
    output_path.write_text(html_content, encoding="utf-8")
    print(f"\033[1mExpert token count table saved to {output_path}\033[0m")
```
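The under-utilization highlighting rule above (cells turn red below 5% of the per-layer mean, light red below 10%) can be exercised in isolation. The `highlight_style` helper below is a hypothetical, torch-free extraction for illustration, not a function in this PR:

```python
# Standalone sketch of the cell-highlighting rule used by
# save_expert_token_count_table: experts receiving fewer than 5% of the
# mean token count are marked red, fewer than 10% light red.
def highlight_style(count: int, avg: float) -> str:
    if avg > 0 and count < avg * 0.05:
        return ' style="background: #ff6666;"'
    if avg > 0 and count < avg * 0.1:
        return ' style="background: #ffcccc;"'
    return ""

counts = [1000, 20, 50, 1200]
avg = sum(counts) / len(counts)  # 567.5
styles = [highlight_style(c, avg) for c in counts]
print(styles)
```

With these counts, only expert 1 (20 tokens, below 5% of the mean) is flagged red and expert 2 (50 tokens, below 10%) light red; well-utilized experts get no style.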

modelopt/torch/quantization/model_quant.py

Lines changed: 19 additions & 7 deletions
```diff
@@ -508,14 +508,26 @@ def enable_quantizer(model: nn.Module, wildcard_or_filter_func: str | Callable):
 
 
 @atomic_print
-def print_quant_summary(model: nn.Module):
+def print_quant_summary(model: nn.Module, output_dir: str | None = None):
     """Print summary of all quantizer modules in the model."""
-    count = 0
-    for name, mod in model.named_modules():
-        if isinstance(mod, TensorQuantizer):
-            print(f"{name:80} {mod}")
-            count += 1
-    print(f"{count} TensorQuantizers found in model")
+    lines = [
+        f"{name:80} {mod}"
+        for name, mod in model.named_modules()
+        if isinstance(mod, TensorQuantizer)
+    ]
+    lines.append(f"{len(lines)} TensorQuantizers found in model")
+
+    if output_dir:
+        path = (
+            output_dir.joinpath(".quant_summary.txt")
+            if hasattr(output_dir, "joinpath")
+            else f"{output_dir}/.quant_summary.txt"
+        )
+        with open(path, "w", encoding="utf-8") as f:
+            f.write("\n".join(lines) + "\n")
+        print(f"\033[1mQuant summary saved to {path}\033[0m")
+    else:
+        print("\n".join(lines))
 
 
 def fold_weight(model: nn.Module):
```
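The new `print_quant_summary` accepts either a plain string or a `Path`-like `output_dir` via duck typing on `joinpath`. The `summary_path` helper below is a hypothetical extraction of just that branch for illustration, not a function in this PR:

```python
from pathlib import Path


def summary_path(output_dir):
    # Mirrors the branch in the diff: Path-like objects expose joinpath,
    # plain strings fall back to f-string concatenation.
    return (
        output_dir.joinpath(".quant_summary.txt")
        if hasattr(output_dir, "joinpath")
        else f"{output_dir}/.quant_summary.txt"
    )


print(summary_path("export_dir"))        # a plain string path
print(summary_path(Path("export_dir")))  # a pathlib.Path
```

Both call sites in `hf_ptq.py` pass `args.export_path` (a string), but the duck-typed branch keeps the API usable from code that already works with `pathlib`.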

modelopt/torch/quantization/plugins/huggingface.py

Lines changed: 99 additions & 69 deletions
```diff
@@ -450,20 +450,56 @@ class _QuantSparseMoe(QuantModule):
     """
 
     def _setup(self):
-        pass
+        num_experts = 0
+        if hasattr(self, "gate") and hasattr(self.gate, "num_experts"):
+            num_experts = self.gate.num_experts
+        elif hasattr(self, "num_experts"):
+            num_experts = self.num_experts
+        elif hasattr(self, "experts") and hasattr(self.experts, "num_experts"):
+            num_experts = self.experts.num_experts
+
+        self.expert_token_count = torch.zeros(num_experts, dtype=torch.long, device="cpu")
+        self._count_expert_tokens = False
+
+        if num_experts == 0:
+            warnings.warn(
+                f"{self.__class__.__name__}: could not resolve num_experts; "
+                "expert routing will not be tracked for this layer."
+            )
+            return
+
+        if hasattr(self, "gate"):
+            self.gate.register_forward_hook(self._gate_forward_hook)
+
+    def _gate_forward_hook(self, module, input, output):
+        if not self._count_expert_tokens:
+            return
+        with torch.no_grad():
+            if isinstance(output, tuple) and len(output) >= 3:
+                # v5.x TopKRouter: returns (logits, scores, indices)
+                indices = output[2]
+            else:
+                # v4.x nn.Linear gate: returns logits tensor
+                logits = output if not isinstance(output, tuple) else output[0]
+                top_k = self.gate.top_k if hasattr(self.gate, "top_k") else self.top_k
+                _, indices = torch.topk(logits.float(), top_k, dim=-1)
+            counts = torch.bincount(
+                indices.reshape(-1).cpu(), minlength=len(self.expert_token_count)
+            )
+            self.expert_token_count += counts
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if any(getattr(m, "_if_calib", False) for m in self.experts.modules()):
+        is_calib = any(getattr(m, "_if_calib", False) for m in self.experts.modules())
+        if is_calib:
             # If any of the experts are in calibration mode, we will forward all tokens to all experts
             # This is used only for calibration, we need to re-calculate the actual outputs again using
             # the original top_k
             if TRANSFORMERS_VERSION_GE_5_0:
-                assert hasattr(self, "gate")
-                # Path for transformers >= 5.0
-                original_top_k = self.gate.topk
-                self.gate.topk = self.gate.num_experts
+                assert hasattr(self, "gate") and hasattr(self.gate, "top_k")
+                original_top_k = self.gate.top_k
+                self.gate.top_k = self.gate.num_experts
                 super().forward(hidden_states)
-                self.gate.topk = original_top_k
+                self.gate.top_k = original_top_k
             else:
                 # Path for transformers < 5.0
                 original_top_k = self.top_k
@@ -475,7 +511,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                 raise ValueError(f"Could not find num_experts in module {self}")
             super().forward(hidden_states)
             self.top_k = original_top_k
-        return super().forward(hidden_states)
+        # Enable counting only for the real-routing forward during calibration
+        self._count_expert_tokens = is_calib
+        output = super().forward(hidden_states)
+        self._count_expert_tokens = False
+        return output
 
 
 class _QuantLlama4TextExperts(QuantModule):
@@ -765,10 +805,7 @@ def unpack_weight(self):
 
 
 try:
-    from transformers.models.llama4.modeling_llama4 import Llama4TextExperts, Llama4TextMoe
-
-    if Llama4TextMoe not in QuantModuleRegistry:
-        QuantModuleRegistry.register({Llama4TextMoe: "hf.Llama4TextMoe"})(_QuantSparseMoe)
+    from transformers.models.llama4.modeling_llama4 import Llama4TextExperts
 
     if Llama4TextExperts not in QuantModuleRegistry:
         QuantModuleRegistry.register({Llama4TextExperts: "hf.Llama4TextExperts"})(
@@ -791,16 +828,6 @@ def unpack_weight(self):
 except ImportError:
     pass
 
-try:
-    from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
-
-    if MixtralSparseMoeBlock not in QuantModuleRegistry:
-        QuantModuleRegistry.register({MixtralSparseMoeBlock: "hf.MixtralSparseMoeBlock"})(
-            _QuantSparseMoe
-        )
-except ImportError:
-    pass
-
 try:
     from transformers.models.falcon.modeling_falcon import FalconLinear
 
@@ -809,36 +836,6 @@ def unpack_weight(self):
 except ImportError:
     pass
 
-try:
-    from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock
-
-    if Qwen3MoeSparseMoeBlock not in QuantModuleRegistry:
-        QuantModuleRegistry.register({Qwen3MoeSparseMoeBlock: "hf.Qwen3MoeSparseMoeBlock"})(
-            _QuantSparseMoe
-        )
-except ImportError:
-    pass
-
-try:
-    from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
-
-    if Qwen2MoeSparseMoeBlock not in QuantModuleRegistry:
-        QuantModuleRegistry.register({Qwen2MoeSparseMoeBlock: "hf.Qwen2MoeSparseMoeBlock"})(
-            _QuantSparseMoe
-        )
-except ImportError:
-    pass
-
-try:
-    from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextSparseMoeBlock
-
-    if Qwen3NextSparseMoeBlock not in QuantModuleRegistry:
-        QuantModuleRegistry.register({Qwen3NextSparseMoeBlock: "hf.Qwen3NextSparseMoeBlock"})(
-            _QuantSparseMoe
-        )
-except ImportError:
-    pass
-
 try:
     from compressed_tensors.linear.compressed_linear import CompressedLinear
 
@@ -850,15 +847,7 @@ def unpack_weight(self):
     pass
 
 try:
-    from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import (
-        Qwen3VLMoeTextExperts,
-        Qwen3VLMoeTextSparseMoeBlock,
-    )
-
-    if Qwen3VLMoeTextSparseMoeBlock not in QuantModuleRegistry:
-        QuantModuleRegistry.register(
-            {Qwen3VLMoeTextSparseMoeBlock: "hf.Qwen3VLMoeTextSparseMoeBlock"}
-        )(_QuantSparseMoe)
+    from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts
 
     if Qwen3VLMoeTextExperts not in QuantModuleRegistry:
         QuantModuleRegistry.register({Qwen3VLMoeTextExperts: "hf.Qwen3VLMoeTextExperts"})(
@@ -989,15 +978,56 @@ def register_falcon_linears_on_the_fly(model):
             QuantModuleRegistry.register({linear_type: linear_type.__name__})(_QuantLinear)
 
 
-def register_minimax_m2_moe_on_the_fly(model):
-    """Register MiniMax M2 MoE modules as a QUANT_MODULE.
+def _is_sparse_moe_block(module):
+    """Check if a module is structurally a sparse MoE block compatible with _QuantSparseMoe.
+
+    All HuggingFace MoE blocks (Mixtral, Qwen3Moe, Qwen2Moe, Qwen3Next, Llama4, MiniMax, etc.)
+    share a common structural pattern: a ``gate`` (TopKRouter) sub-module with routing attributes
+    (``top_k`` and ``num_experts``), and an ``experts`` sub-module.
 
-    MiniMax M2 MoE modules are defined in the model card, so we need to register them on the fly.
+    This function detects that pattern instead of relying on class names, making it forward-compatible
+    with new MoE architectures. Some MoE models (e.g. Glm4MoeMoE) have ``gate`` and ``experts`` but
+    use a different routing interface (``n_routed_experts`` instead of ``num_experts``, custom
+    ``route_tokens_to_experts``), so we require ``num_experts`` to be present to avoid false positives.
     """
-    if type(model).__name__ in ["MiniMaxM2ForCausalLM"]:
-        moe_type = type(model.model.layers[0].block_sparse_moe)
-        if QuantModuleRegistry.get(moe_type) is None:
-            QuantModuleRegistry.register({moe_type: moe_type.__name__})(_QuantSparseMoe)
+    if not hasattr(module, "experts"):
+        return False
+
+    # Primary: gate sub-module has top_k + num_experts (standard TopKRouter pattern)
+    if hasattr(module, "gate"):
+        gate = module.gate
+        has_topk = hasattr(gate, "top_k")
+        has_num_experts = hasattr(gate, "num_experts")
+        if has_topk and has_num_experts:
+            return True
+
+    # Fallback: top_k + num_experts on the block itself (older transformers, e.g. v4.x Qwen3Next)
+    return hasattr(module, "top_k") and hasattr(module, "num_experts")
+
+
+def register_sparse_moe_on_the_fly(model):
+    """Auto-detect and register MOE modules as _QuantSparseMoe.
+
+    Walks the model tree, identifies MoE blocks by their structural attributes
+    (``gate`` + ``experts``), and registers unregistered ones with ``_QuantSparseMoe``.
+    """
+    visited_types = set()
+    for name, module in model.named_modules():
+        mod_type = type(module)
+
+        # Avoid duplicate registration: skip if we already processed this type
+        # in this walk, or if it was previously registered in the QuantModuleRegistry.
+        if mod_type in visited_types or QuantModuleRegistry.get(mod_type) is not None:
+            continue
+
+        visited_types.add(mod_type)
+
+        if _is_sparse_moe_block(module):
+            print(
+                f"\033[1mDetected MOE module '{name}' of type {mod_type.__name__}, "
+                f"registering with _QuantSparseMoe.\033[0m"
+            )
+            QuantModuleRegistry.register({mod_type: f"hf.{mod_type.__name__}"})(_QuantSparseMoe)
 
 
 def _is_supported_hf_model(model):
@@ -1065,7 +1095,7 @@ def _is_param_grad_enabled_for_auto_quantize(pname, model):
     [
         register_falcon_linears_on_the_fly,
         register_dbrx_moe_on_the_fly,
-        register_minimax_m2_moe_on_the_fly,
+        register_sparse_moe_on_the_fly,
         register_hf_attentions_on_the_fly,
         convert_hf_parallel_linears_on_the_fly,
     ]
```
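The structural detection rule can be exercised without transformers or torch. The mock objects below are illustrative stand-ins, and `is_sparse_moe_block` re-states the logic of `_is_sparse_moe_block` from the diff above under those assumptions:

```python
from types import SimpleNamespace


def is_sparse_moe_block(module) -> bool:
    # Re-statement of _is_sparse_moe_block: require an `experts` sub-module
    # plus `top_k` + `num_experts` on the gate (v5 TopKRouter pattern) or,
    # as a fallback, on the block itself (older transformers).
    if not hasattr(module, "experts"):
        return False
    if hasattr(module, "gate"):
        gate = module.gate
        if hasattr(gate, "top_k") and hasattr(gate, "num_experts"):
            return True
    return hasattr(module, "top_k") and hasattr(module, "num_experts")


# v5-style block: routing attributes live on the gate -> detected
router_block = SimpleNamespace(experts=object(), gate=SimpleNamespace(top_k=2, num_experts=8))
# v4.x-style block: routing attributes on the block itself -> detected
flat_block = SimpleNamespace(experts=object(), gate=object(), top_k=2, num_experts=8)
# Glm4MoeMoE-like block: `n_routed_experts` instead of `num_experts` -> rejected
glm_like = SimpleNamespace(experts=object(), gate=SimpleNamespace(top_k=2, n_routed_experts=8))
```

This illustrates why the docstring insists on `num_experts`: a gate exposing only `n_routed_experts` falls through both checks and avoids a false-positive registration.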
