Pad MoE expert input to multiple of 32 for MXFP8 compatibility

trvachov · claude · trvachov · commit 2b8763ee8ad9 · 2026-04-20T12:18:42.000-04:00
After all-to-all dispatch in the MoE block, the per-rank token count is
data-dependent (routing decisions produce different expert loads per step).
MXFP8 requires tensor dims divisible by 32, and FP8 requires the product of
the non-last dims to be divisible by 8. These assertions fire on the
post-dispatch expert_input whenever the per-rank token count lands on an
unaligned value, causing training to hang or crash.

Pad the token dimension to the next multiple of 32 before GroupedLinear,
attribute the padding to the last expert so m_splits still sums correctly,
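then slice the padding off the output. The branch is a no-op when the count
is already aligned. Note that the check depends only on the token count, not
on the active quantization recipe, so unaligned inputs are padded in
non-MXFP8 runs as well (a multiple of 32 also satisfies the FP8
divisible-by-8 requirement).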

Upstream attention layers get alignment via the collator's
pad_sequences_to_be_divisible_by config; this patch only addresses the
MoE block where alltoall creates a second source of misalignment.

Verified on 8x B300 SXM6 with Mixtral-8x7B EP=8 at SEQ=8192:
FP8 1.196 s/step, MXFP8 1.248 s/step (previously hung/crashed).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Timur Rvachov <trvachov@nvidia.com>
diff --git a/bionemo-recipes/models/mixtral/modeling_mixtral_te.py b/bionemo-recipes/models/mixtral/modeling_mixtral_te.py
@@ -409,7 +409,28 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         self._sync_expert_views()
 
         dispatch_output = self.dispatcher.dispatch(hidden_states, selected_experts, routing_weights)
-        expert_output = self._expert_ffn(dispatch_output.expert_input, dispatch_output.tokens_per_expert)
+
+        expert_input = dispatch_output.expert_input
+        tokens_per_expert = dispatch_output.tokens_per_expert
+
+        # MXFP8 requires both tensor dims divisible by 32.  Upstream attention layers
+        # get this from the collator (pad_sequences_to_be_divisible_by=32), but after
+        # all-to-all dispatch the per-rank token count is data-dependent (routing
+        # decisions pick different expert loads). Pad here so GroupedLinear's MXFP8
+        # kernels don't assert, then slice the padding off afterwards.
+        n_tokens = expert_input.shape[0]
+        mxfp8_pad = (32 - n_tokens % 32) % 32
+        if mxfp8_pad:
+            expert_input = torch.nn.functional.pad(expert_input, (0, 0, 0, mxfp8_pad))
+            # Attribute the padding tokens to the last expert so m_splits still sums correctly.
+            tokens_per_expert = list(tokens_per_expert)
+            tokens_per_expert[-1] += mxfp8_pad
+
+        expert_output = self._expert_ffn(expert_input, tokens_per_expert)
+
+        if mxfp8_pad:
+            expert_output = expert_output[:n_tokens]
+
         output = self.dispatcher.combine(expert_output, dispatch_output.handle)
 
         return output.reshape(original_shape)
diff --git a/bionemo-recipes/recipes/mixtral_native_te/modeling_mixtral_te.py b/bionemo-recipes/recipes/mixtral_native_te/modeling_mixtral_te.py
@@ -415,7 +415,28 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         self._sync_expert_views()
 
         dispatch_output = self.dispatcher.dispatch(hidden_states, selected_experts, routing_weights)
-        expert_output = self._expert_ffn(dispatch_output.expert_input, dispatch_output.tokens_per_expert)
+
+        expert_input = dispatch_output.expert_input
+        tokens_per_expert = dispatch_output.tokens_per_expert
+
+        # MXFP8 requires both tensor dims divisible by 32.  Upstream attention layers
+        # get this from the collator (pad_sequences_to_be_divisible_by=32), but after
+        # all-to-all dispatch the per-rank token count is data-dependent (routing
+        # decisions pick different expert loads). Pad here so GroupedLinear's MXFP8
+        # kernels don't assert, then slice the padding off afterwards.
+        n_tokens = expert_input.shape[0]
+        mxfp8_pad = (32 - n_tokens % 32) % 32
+        if mxfp8_pad:
+            expert_input = torch.nn.functional.pad(expert_input, (0, 0, 0, mxfp8_pad))
+            # Attribute the padding tokens to the last expert so m_splits still sums correctly.
+            tokens_per_expert = list(tokens_per_expert)
+            tokens_per_expert[-1] += mxfp8_pad
+
+        expert_output = self._expert_ffn(expert_input, tokens_per_expert)
+
+        if mxfp8_pad:
+            expert_output = expert_output[:n_tokens]
+
         output = self.dispatcher.combine(expert_output, dispatch_output.handle)
 
         return output.reshape(original_shape)
diff --git a/bionemo-recipes/recipes/opengenome2_mixtral_native_te/modeling_mixtral_te.py b/bionemo-recipes/recipes/opengenome2_mixtral_native_te/modeling_mixtral_te.py
@@ -409,7 +409,28 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         self._sync_expert_views()
 
         dispatch_output = self.dispatcher.dispatch(hidden_states, selected_experts, routing_weights)
-        expert_output = self._expert_ffn(dispatch_output.expert_input, dispatch_output.tokens_per_expert)
+
+        expert_input = dispatch_output.expert_input
+        tokens_per_expert = dispatch_output.tokens_per_expert
+
+        # MXFP8 requires both tensor dims divisible by 32.  Upstream attention layers
+        # get this from the collator (pad_sequences_to_be_divisible_by=32), but after
+        # all-to-all dispatch the per-rank token count is data-dependent (routing
+        # decisions pick different expert loads). Pad here so GroupedLinear's MXFP8
+        # kernels don't assert, then slice the padding off afterwards.
+        n_tokens = expert_input.shape[0]
+        mxfp8_pad = (32 - n_tokens % 32) % 32
+        if mxfp8_pad:
+            expert_input = torch.nn.functional.pad(expert_input, (0, 0, 0, mxfp8_pad))
+            # Attribute the padding tokens to the last expert so m_splits still sums correctly.
+            tokens_per_expert = list(tokens_per_expert)
+            tokens_per_expert[-1] += mxfp8_pad
+
+        expert_output = self._expert_ffn(expert_input, tokens_per_expert)
+
+        if mxfp8_pad:
+            expert_output = expert_output[:n_tokens]
+
         output = self.dispatcher.combine(expert_output, dispatch_output.handle)
 
         return output.reshape(original_shape)