From e3e98fe583897def1f6a7a1c06eba7d6b232c171 Mon Sep 17 00:00:00 2001
From: Jennifer Chen <jennifchen@nvidia.com>
Date: Wed, 25 Feb 2026 18:40:43 +0000
Subject: [PATCH] initialize moe experts differently

Signed-off-by: Jennifer Chen <jennifchen@nvidia.com>
---
 .../torch/quantization/plugins/test_megatron.py             | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py
index 5704e369e5..3142ef750e 100644
--- a/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py
+++ b/tests/gpu_megatron/torch/quantization/plugins/test_megatron.py
@@ -765,6 +765,12 @@ def _test_layer_sync_moe_local_experts_amax(ep_size, moe_grouped_gemm, rank, siz
         num_moe_experts=8,
         transformer_impl="modelopt",
     )
+    # Give each expert distinct weights; otherwise all experts would have similar amax values.
+    for layer in model.decoder.layers:
+        for i, expert in enumerate(layer.mlp.experts.local_experts):
+            expert.linear_fc1.weight.data.fill_(0.1 + i * 0.05)
+            expert.linear_fc2.weight.data.fill_(0.2 + i * 0.05)
+
     quant_cfg = mtq.FP8_DEFAULT_CFG
     model = mtq.quantize(model, quant_cfg, get_forward(model))