From e3e98fe583897def1f6a7a1c06eba7d6b232c171 Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Wed, 25 Feb 2026 18:40:43 +0000 Subject: [PATCH] initialize moe experts differently Signed-off-by: Jennifer Chen --- .../torch/quantization/plugins/test_megatron.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py index 5704e369e5..3142ef750e 100644 --- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py +++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py @@ -765,6 +765,12 @@ def _test_layer_sync_moe_local_experts_amax(ep_size, moe_grouped_gemm, rank, siz num_moe_experts=8, transformer_impl="modelopt", ) + # Make weight initialization different across experts, otherwise experts will have similar amax values + for layer in model.decoder.layers: + for i, expert in enumerate(layer.mlp.experts.local_experts): + expert.linear_fc1.weight.data.fill_(0.1 + i * 0.05) + expert.linear_fc2.weight.data.fill_(0.2 + i * 0.05) + quant_cfg = mtq.FP8_DEFAULT_CFG model = mtq.quantize(model, quant_cfg, get_forward(model))