From 0b6a7c9b19e848f63eff47601c2bd3372f6e994d Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Mon, 18 May 2026 08:06:40 -0700 Subject: [PATCH] support mcore autoquant Signed-off-by: Jennifer Chen --- modelopt/torch/quantization/algorithms.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py index 992717983db..391f8cb8648 100644 --- a/modelopt/torch/quantization/algorithms.py +++ b/modelopt/torch/quantization/algorithms.py @@ -291,13 +291,10 @@ def get_score(self, recipe: QuantRecipe) -> float: total_score += importance.cpu().item() continue - if parallel_state.expert_model_parallel_group.is_initialized(): - # TODO: Support expert model parallelism for score estimation - warnings.warn("AutoQuantize does not support expert model parallelism yet.") importance = importance.cpu() importance = DistributedProcessGroup.get_dist_syncd_obj( importance, - [parallel_state.tensor_parallel_group, parallel_state.data_parallel_group], + [parallel_state.tensor_parallel_group, parallel_state.data_parallel_group, parallel_state.expert_model_parallel_group], sum, ) total_score += importance.item() @@ -318,13 +315,9 @@ def get_cost(self, recipe: QuantRecipe) -> float: cost += weight_size * recipe.compression continue - if parallel_state.expert_model_parallel_group.is_initialized(): - # TODO: Support expert model parallelism - warnings.warn("AutoQuantize does not support expert model parallelism yet.") - weight_size = DistributedProcessGroup.get_dist_syncd_obj( weight_size, - [parallel_state.tensor_parallel_group], + [parallel_state.tensor_parallel_group, parallel_state.expert_model_parallel_group], sum, ) @@ -362,6 +355,8 @@ class _AutoQuantizeBaseSearcher(BaseSearcher, ABC): # gate_proj, up_proj, down_proj for Qwen3 like MoE models r"^(.*?\.mlp\.experts)\.\d+\.(gate_proj|up_proj|down_proj)$", r"^(.*?\.mixer\.experts)\.\d+\.(up_proj|down_proj)$", # NemotronH MoE experts + # NemotronH MoE experts in MCore naming (linear_fc1=gate+up fused, linear_fc2=down) + r"^(.*?\.mlp\.experts\.local_experts)\.\d+\.(linear_fc1|linear_fc2)$", r"^(.*?)\.(gate_proj|up_proj)$", # gate_proj, up_proj for llama like models r"^(.*?)\.(\d+\.(w1|w2|w3))$", # mixtral experts r"^(.*?)\.((w1_linear|w2_linear|w3_linear)\.\d+)$", # dbrx experts