Skip to content

Commit af51539

Browse files
committed
adopt *experts.{id}.* naming pattern
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
1 parent f0326e5 commit af51539

File tree

2 files changed

+38
-23
lines changed

2 files changed

+38
-23
lines changed

modelopt/torch/export/unified_export_hf.py

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -959,6 +959,36 @@ def _export_diffusers_checkpoint(
959959
print(f"Export complete. Saved to: {export_dir}")
960960

961961

962+
def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict:
963+
"""No-op replacement for transformers' revert_weight_conversion."""
964+
return state_dict
965+
966+
967+
def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]:
    """Patch ``revert_weight_conversion`` in transformers to a no-op.

    The stock implementation raises ``IndexError`` on quantized state dicts
    whose scalar scale tensors have 0 dimensions. Both the defining module and
    the importing module must be patched, because ``modeling_utils`` does
    ``from core_model_loading import revert_weight_conversion``.

    Returns:
        ``(module, original_function)`` pairs suitable for
        ``_unpatch_revert_weight_conversion``. Modules that cannot be imported
        (e.g. on transformers versions lacking them) are silently skipped.
    """
    import importlib

    patches: list[tuple[Any, Any]] = []
    for mod_path in (
        "transformers.core_model_loading",
        "transformers.modeling_utils",
    ):
        # Keep the try body minimal: only the import can legitimately fail here.
        try:
            mod = importlib.import_module(mod_path)
        except ImportError:
            continue
        if hasattr(mod, "revert_weight_conversion"):
            # Direct attribute access instead of getattr() with a constant
            # literal (flake8-bugbear B009); behavior is identical.
            patches.append((mod, mod.revert_weight_conversion))
            mod.revert_weight_conversion = _revert_weight_conversion_noop
    return patches
984+
985+
986+
def _unpatch_revert_weight_conversion(patches: list[tuple[Any, Any]]) -> None:
987+
"""Restore the original revert_weight_conversion functions."""
988+
for mod, original in patches:
989+
mod.revert_weight_conversion = original
990+
991+
962992
def export_hf_checkpoint(
963993
model: Any,
964994
dtype: torch.dtype | None = None,
@@ -1022,21 +1052,7 @@ def export_hf_checkpoint(
10221052
# quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
10231053
# We must patch both the source module and the importing module since
10241054
# modeling_utils does `from core_model_loading import revert_weight_conversion`.
1025-
_patches = []
1026-
_noop = lambda model, state_dict: state_dict
1027-
for _mod_path in [
1028-
"transformers.core_model_loading",
1029-
"transformers.modeling_utils",
1030-
]:
1031-
try:
1032-
import importlib
1033-
1034-
_mod = importlib.import_module(_mod_path)
1035-
if hasattr(_mod, "revert_weight_conversion"):
1036-
_patches.append((_mod, getattr(_mod, "revert_weight_conversion")))
1037-
setattr(_mod, "revert_weight_conversion", _noop)
1038-
except (ImportError, AttributeError):
1039-
pass
1055+
_patches = _patch_revert_weight_conversion()
10401056

10411057
try:
10421058
model.save_pretrained(
@@ -1045,8 +1061,7 @@ def export_hf_checkpoint(
10451061
save_modelopt_state=save_modelopt_state,
10461062
)
10471063
finally:
1048-
for _mod, _original in _patches:
1049-
_mod.revert_weight_conversion = _original
1064+
_unpatch_revert_weight_conversion(_patches)
10501065

10511066
original_config = f"{export_dir}/config.json"
10521067
config_data = {}

modelopt/torch/quantization/plugins/huggingface.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,7 @@ def forward(
653653
return next_states
654654

655655

656-
class _Qwen3_5MoeExpertModule(nn.Module):
656+
class _Qwen35MoeExpertModule(nn.Module):
657657
"""Container for a single Qwen3.5 MoE expert's linear layers.
658658
659659
Produces the naming pattern: experts.{id}.gate_proj.weight
@@ -667,7 +667,7 @@ def __init__(self, hidden_dim: int, expert_dim: int):
667667
self.down_proj = nn.Linear(expert_dim, hidden_dim, bias=False)
668668

669669

670-
class _QuantQwen3_5MoeExperts(QuantModule):
670+
class _QuantQwen35MoeExperts(QuantModule):
671671
def _setup(self):
672672
"""Modify the Qwen3_5MoeExperts by using per-expert nn.Module containers.
673673
@@ -688,7 +688,7 @@ def _copy_weight(module, weight):
688688
with init_empty_weights():
689689
expert_modules = nn.ModuleList(
690690
[
691-
_Qwen3_5MoeExpertModule(self.hidden_dim, expert_dim)
691+
_Qwen35MoeExpertModule(self.hidden_dim, expert_dim)
692692
for _ in range(self.num_experts)
693693
]
694694
)
@@ -898,7 +898,7 @@ def unpack_weight(self):
898898
pass
899899

900900

901-
class _QuantQwen3_5MoeSparseMoeBlock(_QuantSparseMoe):
901+
class _QuantQwen35MoeSparseMoeBlock(_QuantSparseMoe):
902902
"""Qwen3.5 MoE stores top_k/num_experts in the router (self.gate), not as direct attributes.
903903
904904
We override forward instead of just bridging attributes because the router (self.gate)
@@ -927,12 +927,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
927927

928928
if Qwen3_5MoeSparseMoeBlock not in QuantModuleRegistry:
929929
QuantModuleRegistry.register({Qwen3_5MoeSparseMoeBlock: "hf.Qwen3_5MoeSparseMoeBlock"})(
930-
_QuantQwen3_5MoeSparseMoeBlock
930+
_QuantQwen35MoeSparseMoeBlock
931931
)
932932

933933
if Qwen3_5MoeExperts not in QuantModuleRegistry:
934934
QuantModuleRegistry.register({Qwen3_5MoeExperts: "hf.Qwen3_5MoeExperts"})(
935-
_QuantQwen3_5MoeExperts
935+
_QuantQwen35MoeExperts
936936
)
937937
except ImportError:
938938
pass

0 commit comments

Comments (0)