From 0b6a7c9b19e848f63eff47601c2bd3372f6e994d Mon Sep 17 00:00:00 2001
From: Jennifer Chen <jennifchen@nvidia.com>
Date: Mon, 18 May 2026 08:06:40 -0700
Subject: [PATCH] Support expert model parallelism in AutoQuantize for MCore MoE models

Signed-off-by: Jennifer Chen <jennifchen@nvidia.com>
---
 modelopt/torch/quantization/algorithms.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py
index 992717983db..391f8cb8648 100644
--- a/modelopt/torch/quantization/algorithms.py
+++ b/modelopt/torch/quantization/algorithms.py
@@ -291,13 +291,10 @@ def get_score(self, recipe: QuantRecipe) -> float:
                 total_score += importance.cpu().item()
                 continue
 
-            if parallel_state.expert_model_parallel_group.is_initialized():
-                # TODO: Support expert model parallelism for score estimation
-                warnings.warn("AutoQuantize does not support expert model parallelism yet.")
             importance = importance.cpu()
             importance = DistributedProcessGroup.get_dist_syncd_obj(
                 importance,
-                [parallel_state.tensor_parallel_group, parallel_state.data_parallel_group],
+                [parallel_state.tensor_parallel_group, parallel_state.data_parallel_group, parallel_state.expert_model_parallel_group],
                 sum,
             )
             total_score += importance.item()
@@ -318,13 +315,9 @@ def get_cost(self, recipe: QuantRecipe) -> float:
                 cost += weight_size * recipe.compression
                 continue
 
-            if parallel_state.expert_model_parallel_group.is_initialized():
-                # TODO: Support expert model parallelism
-                warnings.warn("AutoQuantize does not support expert model parallelism yet.")
-
             weight_size = DistributedProcessGroup.get_dist_syncd_obj(
                 weight_size,
-                [parallel_state.tensor_parallel_group],
+                [parallel_state.tensor_parallel_group, parallel_state.expert_model_parallel_group],
                 sum,
             )
 
@@ -362,6 +355,8 @@ class _AutoQuantizeBaseSearcher(BaseSearcher, ABC):
         # gate_proj, up_proj, down_proj for Qwen3 like MoE models
         r"^(.*?\.mlp\.experts)\.\d+\.(gate_proj|up_proj|down_proj)$",
         r"^(.*?\.mixer\.experts)\.\d+\.(up_proj|down_proj)$",  # NemotronH MoE experts
+        # MoE experts in MCore naming (linear_fc1=up, presumably fused with gate if gated; linear_fc2=down)
+        r"^(.*?\.mlp\.experts\.local_experts)\.\d+\.(linear_fc1|linear_fc2)$",
         r"^(.*?)\.(gate_proj|up_proj)$",  # gate_proj, up_proj for llama like models
         r"^(.*?)\.(\d+\.(w1|w2|w3))$",  # mixtral experts
         r"^(.*?)\.((w1_linear|w2_linear|w3_linear)\.\d+)$",  # dbrx experts