Update minimax_m2.py: raise the qkv.shape[0] threshold for the fused QK-norm all-reduce path from 64 to 256 (#820)

xytpai · sijyang · commit b2d552db7238 · 2026-05-25T01:22:43.000+08:00
diff --git a/atom/models/minimax_m2.py b/atom/models/minimax_m2.py
@@ -232,7 +232,7 @@ def forward(
             # TP-aware RMSNorm: all-reduce variance across TP ranks so
             # normalization uses the global variance (over 6144/1024 dims)
             # rather than per-rank variance (768/128 dims).
-            if qkv.shape[0] <= 64 and self.tp_size > 1:
+            if qkv.shape[0] <= 256 and self.tp_size > 1:
                 q, k, v = tensor_model_parallel_fused_qknorm_allreduce(
                     qkv, self.q_norm.weight, self.k_norm.weight, self.rms_norm_eps
                 )

Original file line number	Diff line number	Diff line change
`@@ -232,7 +232,7 @@ def forward(`
`232`	`232`	`# TP-aware RMSNorm: all-reduce variance across TP ranks so`
`233`	`233`	`# normalization uses the global variance (over 6144/1024 dims)`
`234`	`234`	`# rather than per-rank variance (768/128 dims).`
`235`		`- if qkv.shape[0] <= 64 and self.tp_size > 1:`
	`235`	`+ if qkv.shape[0] <= 256 and self.tp_size > 1:`
`236`	`236`	`q, k, v = tensor_model_parallel_fused_qknorm_allreduce(`
`237`	`237`	`qkv, self.q_norm.weight, self.k_norm.weight, self.rms_norm_eps`
`238`	`238`	`)`