
Commit cfff760

sync moe input quantizer only

Signed-off-by: Jennifer Chen <jennifchen@nvidia.com>

1 parent: 9e38041

2 files changed: 48 additions & 7 deletions

modelopt/torch/quantization/plugins/megatron.py

Lines changed: 15 additions & 7 deletions
@@ -575,26 +575,30 @@ def _setup(self):
             expert.linear_fc2.parallel_state = self.parallel_state
 
     def layer_sync_moe_local_experts_amax(self):
-        """Sync amax across local experts in a SequentialMLP.
+        """Sync input quantizer amax across local experts in a SequentialMLP.
 
-        Synchronize the amax values across local experts in a lyaer such that all local experts will
-        share the same amax. This function operates on a single rank and does not require distributed sync.
+        Ensures all experts have the same input quantizer amax.This function operates
+        on a single rank and does not require distributed sync.
 
         Distributed amax sync across EP and ETP (for RowParallel) happens in model_calib.max_calibrate().
         This function should be called before the distributed sync to ensure the amax values
         are synchronized across the layer first.
 
         Note:
            Because there are logic which calls collective communication based on whether amax is not None,
-           We need to garuantee that all experts must have amax. Otherwise, there will be deadlock
-           when synchroizing over EP since some ranks may have amax None and not calling the collective
+           We need to guarantee that all experts must have amax. Otherwise, there will be deadlock
+           when synchronizing over EP since some ranks may have amax None and not calling the collective
            communication.
        """
         # Collect amax from all local experts
         amax_dict = {}
         for expert in self.local_experts:
             for name, module in expert.named_modules():
-                if isinstance(module, TensorQuantizer) and module.amax is not None:
+                if (
+                    isinstance(module, TensorQuantizer)
+                    and module.amax is not None
+                    and name == "input_quantizer"
+                ):
                     stored_amax = amax_dict.get(name)
                     amax_tensor = module.amax.detach().clone()
                     amax_dict[name] = (
@@ -606,7 +610,11 @@ def layer_sync_moe_local_experts_amax(self):
         # Apply synchronized amax values back to all local experts
         for expert in self.local_experts:
             for name, module in expert.named_modules():
-                if isinstance(module, TensorQuantizer) and name in amax_dict:
+                if (
+                    isinstance(module, TensorQuantizer)
+                    and name in amax_dict
+                    and name == "input_quantizer"
+                ):
                     module.amax = amax_dict[name].detach().clone()
 
     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
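
The first hunk above ends at amax_dict[name] = ( and the expression that completes the assignment falls between the two hunks, so the diff does not show it. A minimal sketch of what the elided update presumably computes, assuming a running elementwise maximum across experts (the helper name _reduce_amax is hypothetical, not from this file):

import torch

def _reduce_amax(stored_amax, amax_tensor):
    # Hypothetical stand-in for the elided expression: keep the elementwise
    # maximum of the amax collected so far and the current expert's amax.
    if stored_amax is None:
        return amax_tensor
    return torch.maximum(stored_amax, amax_tensor)

The second hunk then writes the resulting maximum back to every local expert, so each one quantizes its input with the same layer-wide calibration range.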

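For context on the Note in the docstring above: model_calib.max_calibrate() later performs the distributed amax sync, and a collective issued conditionally on amax being present can hang if ranks disagree. A hedged illustration of that failure mode (sync_amax_over_ep and ep_group are hypothetical names, not from the diff):

import torch.distributed as dist

def sync_amax_over_ep(quantizer, ep_group):
    # If the collective is only issued when amax exists, a rank whose local
    # experts never produced an amax skips the call while its EP peers wait
    # inside all_reduce, and the job deadlocks.
    if quantizer.amax is None:
        return
    dist.all_reduce(quantizer.amax, op=dist.ReduceOp.MAX, group=ep_group)

Syncing the input quantizer amax across local experts first helps guarantee that every rank sees an amax on its experts, so all EP ranks enter the collective together.
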
tests/gpu_megatron/torch/quantization/plugins/test_megatron.py

Lines changed: 33 additions & 0 deletions
@@ -735,6 +735,39 @@ def test_te_grouped_vs_sequential_quantize(need_4_gpus):
     )
 
 
+def test_layer_sync_moe_local_experts_amax(moe_grouped_gemm):
+    initialize_for_megatron(
+        tensor_model_parallel_size=1,
+        pipeline_model_parallel_size=1,
+        expert_model_parallel_size=2,
+        expert_tensor_parallel_size=1,
+        seed=SEED,
+    )
+    model = _gpt_model_provider(
+        tp_size=1,
+        ep_size=2,
+        etp_size=1,
+        hidden_size=256,
+        moe_grouped_gemm=moe_grouped_gemm,
+        use_te=moe_grouped_gemm,
+        num_moe_experts=8,
+        transformer_impl="modelopt",
+    )
+    # model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, get_forward(model))
+    forward = get_forward(model)
+    forward()
+    print(model)
+
+    model.layer_sync_moe_local_experts_amax()
+    prev_amax = None
+    for expert in model.local_experts:
+        assert expert.input_quantizer.amax is not None
+        if prev_amax is None:
+            prev_amax = expert.input_quantizer.amax
+        else:
+            assert torch.allclose(prev_amax, expert.input_quantizer.amax)
+
+
 def _test_expert_model_parallel_amax_sync(
     tp_size, ep_size, etp_size, moe_grouped_gemm, config, rank, size
 ):
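
The new test takes moe_grouped_gemm as an argument so it can exercise both the grouped-GEMM and sequential MoE paths. A hedged sketch of how it might be driven (the parametrize decorator is an assumption; the file may instead supply the value through a fixture):

import pytest

@pytest.mark.parametrize("moe_grouped_gemm", [False, True])
def test_layer_sync_moe_local_experts_amax(moe_grouped_gemm):
    ...  # body as in the diff above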
