Remove _moe_count_expert_calib_tokens flag; tie token counting to moe_calib_experts_ratio (#1062)

cjluo-nv · web-flow · commit 1dc890d97105 · 2026-03-18T10:09:24.000-07:00
Cherry-pick for 0.43.0

## Summary

- **Remove `moe_count_expert_calib_tokens`** config field and the `_moe_count_expert_calib_tokens` internal flag. Token counting is now implicitly enabled when `moe_calib_experts_ratio` is set, removing a redundant knob.
- **Change `--moe_calib_experts_ratio` default to `None`** in `hf_ptq.py` (was `1.0`). Previously all experts were force-calibrated by default; now the feature is opt-in and non-MoE models are unaffected without any flag.
- **Disable `layer_sync_moe_local_experts_amax`** when `moe_calib_experts_ratio` is set, since each expert is calibrated independently with sufficient token coverage in that mode.
- **Simplify `_QuantSparseMoe.forward`**: remove redundant truthy checks on `_moe_calib_experts_ratio` inside the branch that already assumes it is set.

## Changed files

| File | Change |
|------|--------|
| `modelopt/torch/quantization/config.py` | Remove `moe_count_expert_calib_tokens` field; update `moe_calib_experts_ratio` description to document amax sync behavior |
| `modelopt/torch/quantization/mode.py` | Remove `moe_count_expert_calib_tokens` propagation in `wrapped_calib_func` |
| `modelopt/torch/quantization/plugins/huggingface.py` | Remove `_moe_count_expert_calib_tokens` from `_QuantSparseMoe`; simplify `forward`; skip `layer_sync_moe_local_experts_amax` when ratio is set |
| `examples/llm_ptq/hf_ptq.py` | Default `--moe_calib_experts_ratio` to `None`; guard validation |
| `tests/unit/.../test_sparse_moe.py` | Update tests to use `_moe_calib_experts_ratio` instead of removed flag |

## Test plan

- [x] Verify `hf_ptq.py` works without `--moe_calib_experts_ratio` (non-MoE model, default `None`)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

## Summary by CodeRabbit

* **Configuration Changes**
  * moe_calib_experts_ratio now defaults to None (disabled) instead of 1.0; validation only occurs when a value is provided.
* **Refactor**
  * Simplified MoE calibration flow and token-counting behavior; removed a deprecated expert-calibration configuration field.
* **Documentation**
  * Changelog and docstrings updated to reflect the new default and calibration behavior.

---------

Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -18,7 +18,7 @@ NVIDIA Model Optimizer Changelog
 - Add ``fp8_cast`` and ``nvfp4_cast`` modes for ``--kv_cache_qformat`` in ``hf_ptq.py``. These use a constant amax (FP8 E4M3 max, 448.0) without data-driven calibration, since the downstream engine uses FP8 attention math for both FP8 and NVFP4 quantization. A new ``use_constant_amax`` field in :class:`QuantizerAttributeConfig <modelopt.torch.quantization.config.QuantizerAttributeConfig>` controls this behavior.
 - User does not need to manually register MOE modules to cover experts calibration coverage in PTQ workflow.
 - ``hf_ptq.py`` now saves the quantization summary and moe expert token count table to the export directory.
-- Add ``--moe_calib_experts_ratio`` flag in ``hf_ptq.py`` to specify the ratio of experts to calibrate during forward pass to improve expert coverage during calibration. Default to all the experts.
+- Add ``--moe_calib_experts_ratio`` flag in ``hf_ptq.py`` to specify the ratio of experts to calibrate during forward pass to improve expert coverage during calibration. Defaults to ``None`` (disabled).
 - Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
 - Add support for rotating the input before quantization for RHT.
 - Add support for advanced weight scale search for NVFP4 quantization and its export path.
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -1207,16 +1207,16 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--moe_calib_experts_ratio",
         type=float,
-        default=1.0,
+        default=None,
         help=(
             "Fraction of experts to calibrate during forward pass (ratio in (0.0, 1.0]). "
-            "Only used for MOE models; used to reduce the number of experts calibrated during the forward pass."
+            "Only used for MOE models; used to reduce the number of experts calibrated during the forward pass. "
             "Does not impact non-MOE models."
         ),
     )
 
     args = parser.parse_args()
-    if not (0.0 < args.moe_calib_experts_ratio <= 1.0):
+    if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0):
         parser.error("--moe_calib_experts_ratio must be in the range (0.0, 1.0].")
 
     return args
diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
@@ -1066,21 +1066,15 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):
 
     moe_calib_experts_ratio: float | None = ModeloptField(
         default=None,
+        gt=0.0,
+        le=1.0,
         title="% of experts to calibrate during forward pass.",
         description=(
             "If specified, we force forward tokens to % of experts during the calibration"
             " pass. This forward is for calibration purpose only and will not affect the"
-            " actual inference. Not supported for all MoE architectures; currently works"
-            " with a few HuggingFace models such as Mixtral, Qwen3Moe, MiniMax."
-        ),
-    )
-
-    moe_count_expert_calib_tokens: bool = ModeloptField(
-        default=False,
-        title="Enable expert token counting during MoE calibration.",
-        description=(
-            "If True, counts how many tokens are routed to each expert during calibration."
-            " Not supported for all MoE architectures; currently works with a few HuggingFace"
+            " actual inference. NOTE: when set, ``layer_sync_moe_local_experts_amax`` is"
+            " disabled so each expert maintains its own calibration statistics. Not"
+            " supported for all MoE architectures; currently works with a few HuggingFace"
             " models such as Mixtral, Qwen3Moe, MiniMax."
         ),
     )
diff --git a/modelopt/torch/quantization/mode.py b/modelopt/torch/quantization/mode.py
@@ -236,12 +236,6 @@ def wrapped_calib_func(
             if hasattr(module, "_moe_calib_experts_ratio"):
                 module._moe_calib_experts_ratio = moe_calib_experts_ratio
 
-    moe_count_expert_calib_tokens = kwargs.pop("moe_count_expert_calib_tokens", False)
-    if moe_count_expert_calib_tokens:
-        for module in model.modules():
-            if hasattr(module, "_moe_count_expert_calib_tokens"):
-                module._moe_count_expert_calib_tokens = True
-
     if func is not None:
         if sequential:
             if forward_loop is None:
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
@@ -446,16 +446,15 @@ class _QuantSparseMoe(QuantModule):
 
     Supports ``layer_sync_moe_local_experts_amax`` to sync input quantizer amax across experts.
 
-    Optionally supports two config-driven features (disabled by default):
+    Optionally supports config-driven features (disabled by default):
     - ``_moe_calib_experts_ratio``: force-forward tokens to more experts during calibration.
-    - ``_moe_count_expert_calib_tokens``: count tokens routed to each expert during calibration.
+      When set to a value < 1.0, also enables per-expert token counting during calibration.
 
-    When both are disabled, forward is a direct pass-through with zero overhead.
+    When disabled, forward is a direct pass-through with zero overhead.
     """
 
     def _setup(self):
         self._moe_calib_experts_ratio = None
-        self._moe_count_expert_calib_tokens = False
         self._token_counting_initialized = False
 
     def _init_token_counting(self):
@@ -503,24 +502,18 @@ def _gate_forward_hook(self, module, input, output):
             self.expert_token_count += counts.to(self.expert_token_count.device)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if not self._moe_calib_experts_ratio and not self._moe_count_expert_calib_tokens:
+        if self._moe_calib_experts_ratio is None:
             return super().forward(hidden_states)
 
-        if self._moe_count_expert_calib_tokens and not self._token_counting_initialized:
-            self._init_token_counting()
-
         is_calib = any(getattr(m, "_if_calib", False) for m in self.experts.modules())
-        self._count_expert_tokens = is_calib and self._moe_count_expert_calib_tokens
-
-        # If any of the experts are in calibration mode, we will forward all tokens to
-        # self._moe_calib_experts_ratio % of the experts to improve the calibration coverage.
-        # This is used only for calibration, we need to re-calculate the actual outputs again using
-        # the original top_k
-        if is_calib and self._moe_calib_experts_ratio:
-            self._count_expert_tokens = True
-            assert 0 < self._moe_calib_experts_ratio <= 1, (
-                "moe_calib_experts_ratio must be between 0 and 1"
-            )
+
+        # During calibration, forward all tokens to a larger fraction of experts to improve
+        # calibration coverage, then re-run with the original top_k for actual outputs.
+        if is_calib:
+            # Skip counting when all experts are calibrated (ratio == 1.0).
+            self._count_expert_tokens = self._moe_calib_experts_ratio < 1.0
+            if self._count_expert_tokens and not self._token_counting_initialized:
+                self._init_token_counting()
             if TRANSFORMERS_VERSION_GE_5_0:
                 assert hasattr(self, "gate") and hasattr(self.gate, "top_k")
                 original_top_k = self.gate.top_k
@@ -561,7 +554,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return output
 
     def layer_sync_moe_local_experts_amax(self):
-        """Sync input_quantizer amax across experts so all share the same amax per quantizer."""
+        """Sync input_quantizer amax across experts so all share the same amax per quantizer.
+
+        Skipped when _moe_calib_experts_ratio is set, as each expert is calibrated independently.
+        """
+        if self._moe_calib_experts_ratio is not None:
+            return
         sync_moe_expert_amax(self.experts)
 
 
diff --git a/tests/unit/torch/quantization/plugins/test_sparse_moe.py b/tests/unit/torch/quantization/plugins/test_sparse_moe.py
@@ -202,7 +202,6 @@ def test_setup_config_knobs_default(self):
 
         converted = QuantModuleRegistry.convert(moe_block)
         assert converted._moe_calib_experts_ratio is None
-        assert converted._moe_count_expert_calib_tokens is False
         assert not hasattr(converted, "expert_token_count")
 
     def test_forward_default_config_passthrough(self):
@@ -259,17 +258,22 @@ def test_forward_calib_restores_top_k(self):
             assert converted.top_k == original_top_k
 
     def test_token_counting_lazy_init(self):
-        """When moe_count_expert_calib_tokens is enabled, token counting infra is lazy-inited."""
+        """When moe_calib_experts_ratio < 1.0 during calibration, token counting is lazy-inited."""
         model = get_tiny_qwen3_moe()
         moe_block = self._get_moe_block(model)
         if QuantModuleRegistry.get(type(moe_block)) is None:
             register_sparse_moe_on_the_fly(model)
 
         converted = QuantModuleRegistry.convert(moe_block)
-        converted._moe_count_expert_calib_tokens = True
+        converted._moe_calib_experts_ratio = 0.5
 
         assert not hasattr(converted, "expert_token_count")
 
+        # Simulate calibration mode so lazy-init triggers during forward.
+        # ``_if_calib`` is not set by default because only the MoE block was converted, not
+        # the full model; ``modules()`` yields the ``experts`` container first, so flag that.
+        next(converted.experts.modules())._if_calib = True
+
         x = torch.randn(1, 4, 32)
         with torch.no_grad():
             converted(x)
@@ -305,8 +309,7 @@ def test_qwen3_moe_quantize_with_token_forcing_and_counting():
     quant_cfg = copy.deepcopy(mtq.INT8_DEFAULT_CFG)
     quant_cfg["algorithm"] = {
         "method": "max",
-        "moe_calib_experts_ratio": 1.0,
-        "moe_count_expert_calib_tokens": True,
+        "moe_calib_experts_ratio": 0.5,
     }
 
     def calib_fn(model):