Support force tokens to % of total experts during calibration (#910)
## What does this PR do?
**Type of change:** New feature
**Overview:** Adds a configurable `moe_calib_experts_ratio` parameter
that controls the fraction of experts that tokens are routed to during
the calibration forward pass in MoE (Mixture of Experts) models.
Previously, the calibration forward always routed tokens to **all**
experts, which is expensive. This PR lets the user specify a ratio (the
config default remains all experts, so there is no behavior change) to
retain broad expert calibration coverage without the cost of a
full-expert forward. The token counting for the expert coverage table
now tracks the calibration routing and runs on CUDA for efficiency.
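The selection logic itself is not shown in this description; as a rough illustrative sketch only (the helper name, the round-robin padding, and the union-with-routed-experts behavior are assumptions, not the PR's actual code), forcing tokens to a ratio of experts might look like keeping the router's picks and padding up to the requested fraction:

```python
import math


def experts_to_calibrate(num_experts: int, routed: set[int], ratio: float) -> set[int]:
    """Hypothetical sketch: select at least ceil(ratio * num_experts) experts,
    always keeping the experts the router actually chose for the token."""
    target = math.ceil(ratio * num_experts)
    selected = set(routed)
    # Pad with additional experts until the target count is reached
    # (round-robin here; a real implementation could randomize instead).
    for expert_id in range(num_experts):
        if len(selected) >= target:
            break
        selected.add(expert_id)
    return selected


# With 8 experts, a router choice of {3, 5}, and ratio 0.5,
# calibration covers 4 experts in total.
print(sorted(experts_to_calibrate(8, {3, 5}, 0.5)))  # → [0, 1, 3, 5]
```

With `ratio=1.0` this degenerates to the previous all-experts behavior, which is why the config default introduces no change.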
**Changes include:**
- New `moe_calib_experts_ratio` field in `QuantizeAlgorithmConfig`
(`config.py`)
- Propagation of the ratio from the algorithm config to MoE modules
during calibration (`mode.py`)
- Updated `_QuantSparseMoe.forward` to use the configurable ratio
instead of hard-coding all experts (`huggingface.py`)
- New `--moe_calib_experts_ratio` CLI flag in `hf_ptq.py` (default
`0.25`)
- Moved `expert_token_count` tensor to CUDA and updated the HTML table
title in `moe_utils.py`
## Usage

Via the `hf_ptq.py` CLI, calibrating 50% of experts during MoE calibration:

```shell
python hf_ptq.py --model <model> --qformat int4_awq \
    --moe_calib_experts_ratio 0.5
```

Via the Python API, passing the ratio through the algorithm config:

```python
import modelopt.torch.quantization as mtq

quant_cfg = {
    "quant_cfg": { ... },
    "algorithm": {
        "method": "awq_lite",
        "moe_calib_experts_ratio": 0.25,  # calibrate 1/4 of experts
    },
}
mtq.quantize(model, quant_cfg, forward_loop=calib_loop)
```
## Testing
Test with Qwen3 30B A3B calibration and check the tokens per expert.
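One way to sanity-check coverage is to tally how many tokens each expert processed during calibration. Below is a minimal stand-in for that bookkeeping (the actual table in `moe_utils.py` is a CUDA tensor rendered as HTML; this pure-Python version is only illustrative):

```python
from collections import Counter


def count_tokens_per_expert(routing: list[list[int]]) -> Counter:
    """Tally tokens per expert, given the expert ids each token was sent to."""
    counts: Counter = Counter()
    for expert_ids in routing:
        counts.update(expert_ids)
    return counts


# Three tokens routed during calibration (expert ids are illustrative).
routing = [[0, 3], [1, 3], [0, 1]]
counts = count_tokens_per_expert(routing)
print(dict(counts))  # experts 0, 1, and 3 each saw 2 tokens
```

A zero (or near-zero) count for an expert signals that the calibration ratio left it under-covered.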
---------
Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
Signed-off-by: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Co-authored-by: realAsma <86726418+realAsma@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
**CHANGELOG.rst** (1 addition, 0 deletions):

```diff
@@ -8,6 +8,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - User does not need to manually register MOE modules to cover experts calibration coverage in PTQ workflow.
 - ``hf_ptq.py`` now saves the quantization summary and moe expert token count table to the export directory.
+- Add ``--moe_calib_experts_ratio`` flag in ``hf_ptq.py`` to specify the ratio of experts to calibrate during forward pass to improve expert coverage during calibration. Default to all the experts.
 - Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
 - Add support for rotating the input before quantization for RHT.
```