Fix MiniMax TP8: pad MoE intermediate size for FP8 block-quant alignment and narrow padded weights on load

wzhao18 · wzhao18 · commit 8a1f82b82f1a · 2026-04-05T16:41:39.000-07:00
Signed-off-by: wzhao18 <wzhao18.sz@gmail.com>
diff --git a/tests/kernels/moe/test_moe_weight_loading_padded.py b/tests/kernels/moe/test_moe_weight_loading_padded.py
@@ -9,6 +9,9 @@
 correctly handles this mismatch.
 """
 
+import math
+from unittest.mock import MagicMock
+
 import pytest
 import torch
 
@@ -290,3 +293,182 @@ def test_bnb_shape_mismatch_raises(self):
                 shard_id="w2",
                 expert_id=0,
             )
+
+
+def _make_fused_moe_mock(*, is_act_and_mul: bool = True):
+    """Build a FusedMoE mock for weight loading tests."""
+    moe_module = MagicMock(spec=FusedMoE)
+    moe_module.moe_config = MagicMock()
+    moe_module.moe_config.is_act_and_mul = is_act_and_mul
+
+    moe_module._get_hidden_dim = FusedMoE._get_hidden_dim
+    moe_module._narrow_expert_data_for_padding = (
+        FusedMoE._narrow_expert_data_for_padding
+    )
+    return moe_module
+
+
+class TestBlockQuantPaddedHiddenAndIntermediateSize:
+    """Tests weight loading with padded hidden_size and intermediate_size
+    across TP ranks.
+
+    hidden_size: 192 -> 256 (DeepEP-style round-up)
+    intermediate_size_per_partition: 448 -> 512 (block_n=128 alignment)
+    """
+
+    BLOCK_N = 128
+    HIDDEN_UNPADDED = 192
+    HIDDEN_PADDED = math.ceil(HIDDEN_UNPADDED / BLOCK_N) * BLOCK_N
+    INTERMEDIATE_UNPADDED = 448
+    INTERMEDIATE_PADDED = math.ceil(INTERMEDIATE_UNPADDED / BLOCK_N) * BLOCK_N
+    TP_SIZE = 4
+    GLOBAL_INTER = INTERMEDIATE_UNPADDED * TP_SIZE
+
+    def _make_fused_moe(self):
+        return _make_fused_moe_mock()
+
+    def test_load_w1_weight_all_tp_ranks(self):
+        """Each TP rank loads block-aligned rows into the w1 half.
+        The last rank gets fewer rows; the rest is padding."""
+        moe_module = self._make_fused_moe()
+        checkpoint = torch.randn(self.GLOBAL_INTER, self.HIDDEN_UNPADDED)
+
+        for tp_rank in range(self.TP_SIZE):
+            expert_data = torch.zeros(2 * self.INTERMEDIATE_PADDED, self.HIDDEN_PADDED)
+            FusedMoE._load_w13(
+                moe_module,
+                expert_data=expert_data,
+                shard_dim=0,
+                shard_id="w1",
+                loaded_weight=checkpoint.clone(),
+                tp_rank=tp_rank,
+            )
+            w1 = expert_data[: self.INTERMEDIATE_PADDED]
+            start = tp_rank * self.INTERMEDIATE_PADDED
+            n_available = min(self.INTERMEDIATE_PADDED, self.GLOBAL_INTER - start)
+            expected = checkpoint[start : start + n_available]
+
+            assert torch.equal(w1[:n_available, : self.HIDDEN_UNPADDED], expected)
+            assert torch.all(w1[n_available:] == 0)
+            assert torch.all(w1[:n_available, self.HIDDEN_UNPADDED :] == 0)
+            assert torch.all(expert_data[self.INTERMEDIATE_PADDED :] == 0)
+
+    def test_load_w3_weight_into_second_half(self):
+        """w3 weight is written into the second half of the w13 allocation."""
+        moe_module = self._make_fused_moe()
+        checkpoint = torch.randn(self.GLOBAL_INTER, self.HIDDEN_UNPADDED)
+        tp_rank = 2
+
+        expert_data = torch.zeros(2 * self.INTERMEDIATE_PADDED, self.HIDDEN_PADDED)
+        FusedMoE._load_w13(
+            moe_module,
+            expert_data=expert_data,
+            shard_dim=0,
+            shard_id="w3",
+            loaded_weight=checkpoint.clone(),
+            tp_rank=tp_rank,
+        )
+        assert torch.all(expert_data[: self.INTERMEDIATE_PADDED] == 0)
+
+        w3 = expert_data[self.INTERMEDIATE_PADDED :]
+        start = tp_rank * self.INTERMEDIATE_PADDED
+        n_available = min(self.INTERMEDIATE_PADDED, self.GLOBAL_INTER - start)
+        assert torch.equal(
+            w3[:n_available, : self.HIDDEN_UNPADDED],
+            checkpoint[start : start + n_available],
+        )
+        assert torch.all(w3[n_available:] == 0)
+
+    def test_load_w2_weight_all_tp_ranks(self):
+        """Each TP rank loads block-aligned columns of w2."""
+        moe_module = self._make_fused_moe()
+        checkpoint = torch.randn(self.HIDDEN_UNPADDED, self.GLOBAL_INTER)
+
+        for tp_rank in range(self.TP_SIZE):
+            expert_data = torch.zeros(self.HIDDEN_PADDED, self.INTERMEDIATE_PADDED)
+            FusedMoE._load_w2(
+                moe_module,
+                expert_data=expert_data,
+                shard_dim=1,
+                loaded_weight=checkpoint.clone(),
+                tp_rank=tp_rank,
+            )
+            start = tp_rank * self.INTERMEDIATE_PADDED
+            n_available = min(self.INTERMEDIATE_PADDED, self.GLOBAL_INTER - start)
+            expected = checkpoint[:, start : start + n_available]
+            assert torch.equal(
+                expert_data[: self.HIDDEN_UNPADDED, :n_available], expected
+            )
+            assert torch.all(expert_data[:, n_available:] == 0)
+            assert torch.all(expert_data[self.HIDDEN_UNPADDED :] == 0)
+
+    def test_load_w1_scale_all_tp_ranks(self):
+        """Each TP rank loads block-aligned scale rows for w1."""
+        moe_module = self._make_fused_moe()
+        n_rows_global = math.ceil(self.GLOBAL_INTER / self.BLOCK_N)
+        n_cols_ckpt = math.ceil(self.HIDDEN_UNPADDED / self.BLOCK_N)
+        n_rows_local = math.ceil(self.INTERMEDIATE_PADDED / self.BLOCK_N)
+        n_cols_alloc = math.ceil(self.HIDDEN_PADDED / self.BLOCK_N)
+
+        checkpoint_scale = torch.randn(n_rows_global, n_cols_ckpt)
+
+        for tp_rank in range(self.TP_SIZE):
+            expert_data = torch.zeros(2 * n_rows_local, n_cols_alloc)
+            FusedMoE._load_w13(
+                moe_module,
+                expert_data=expert_data,
+                shard_dim=0,
+                shard_id="w1",
+                loaded_weight=checkpoint_scale.clone(),
+                tp_rank=tp_rank,
+            )
+            w1_scale = expert_data[:n_rows_local]
+            start = n_rows_local * tp_rank
+            loaded = min(n_rows_local, n_rows_global - start)
+            expected = checkpoint_scale[start : start + loaded]
+            assert torch.equal(w1_scale[:loaded, :n_cols_ckpt], expected)
+
+    def test_load_w2_scale_all_tp_ranks(self):
+        """Each TP rank loads block-aligned scale columns for w2."""
+        moe_module = self._make_fused_moe()
+        n_rows = math.ceil(self.HIDDEN_UNPADDED / self.BLOCK_N)
+        n_cols_global = math.ceil(self.GLOBAL_INTER / self.BLOCK_N)
+        n_cols_local = math.ceil(self.INTERMEDIATE_PADDED / self.BLOCK_N)
+
+        checkpoint_scale = torch.randn(n_rows, n_cols_global)
+
+        for tp_rank in range(self.TP_SIZE):
+            expert_data = torch.zeros(n_rows, n_cols_local)
+            FusedMoE._load_w2(
+                moe_module,
+                expert_data=expert_data,
+                shard_dim=1,
+                loaded_weight=checkpoint_scale.clone(),
+                tp_rank=tp_rank,
+            )
+            start = n_cols_local * tp_rank
+            loaded = min(n_cols_local, n_cols_global - start)
+            expected = checkpoint_scale[:, start : start + loaded]
+            assert torch.equal(expert_data[:, :loaded], expected)
+
+    def test_no_padding_matches_simple_shard(self):
+        """When sizes are already block-aligned, loading is a simple
+        shard_size * tp_rank partition."""
+        intermediate = 512
+        hidden = 256
+        moe_module = _make_fused_moe_mock()
+        checkpoint = torch.randn(intermediate * self.TP_SIZE, hidden)
+
+        for tp_rank in range(self.TP_SIZE):
+            expert_data = torch.zeros(2 * intermediate, hidden)
+            FusedMoE._load_w13(
+                moe_module,
+                expert_data=expert_data,
+                shard_dim=0,
+                shard_id="w1",
+                loaded_weight=checkpoint.clone(),
+                tp_rank=tp_rank,
+            )
+            w1 = expert_data[:intermediate]
+            start = tp_rank * intermediate
+            assert torch.equal(w1, checkpoint[start : start + intermediate])
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -842,7 +842,10 @@ def _load_per_channel_weight_scale(
         if shard_id == "w2":
             hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim)
             expert_data = self._narrow_expert_data_for_padding(
-                expert_data, loaded_weight, hidden_dim=hidden_dim
+                expert_data,
+                loaded_weight,
+                hidden_dim=hidden_dim,
+                intermediate_dim=shard_dim,
             )
             expert_data.copy_(loaded_weight)
         elif shard_id in ("w1", "w3"):
@@ -882,29 +885,32 @@ def _narrow_expert_data_for_padding(
         expert_data: torch.Tensor,
         loaded_weight: torch.Tensor,
         hidden_dim: int,
+        intermediate_dim: int = -1,
     ) -> torch.Tensor:
-        """Narrow expert_data hidden dim to match loaded_weight for padded
-        hidden_size.
+        """Narrow expert_data to match loaded_weight for padded dimensions.
 
         When backends (e.g., DeepEP) round up hidden_size, weight parameters
-        are larger than checkpoint weights. Narrow the padded hidden dimension
-        before copying.
+        are larger than checkpoint weights. Similarly, on the last TP rank the
+        intermediate dimension of loaded_weight may be smaller than the padded
+        allocation. Narrow both dimensions before copying.
 
         Args:
             expert_data: The (possibly padded) parameter tensor to narrow.
             loaded_weight: The checkpoint weight tensor with original size.
             hidden_dim: The dimension index corresponding to hidden_size.
                 Must be non-negative.
+            intermediate_dim: The dimension index corresponding to the
+                intermediate size. When >= 0, expert_data is also narrowed
+                along this axis if it is larger than loaded_weight.
         """
-        if (
-            loaded_weight.ndim > 0
-            and 0 <= hidden_dim < expert_data.ndim
-            and hidden_dim < loaded_weight.ndim
-            and expert_data.shape[hidden_dim] > loaded_weight.shape[hidden_dim]
-        ):
-            expert_data = expert_data.narrow(
-                hidden_dim, 0, loaded_weight.shape[hidden_dim]
-            )
+        for dim in (hidden_dim, intermediate_dim):
+            if (
+                loaded_weight.ndim > 0
+                and 0 <= dim < expert_data.ndim
+                and dim < loaded_weight.ndim
+                and expert_data.shape[dim] > loaded_weight.shape[dim]
+            ):
+                expert_data = expert_data.narrow(dim, 0, loaded_weight.shape[dim])
         return expert_data
 
     def _load_w13(
@@ -922,6 +928,7 @@ def _load_w13(
             shard_size = expert_data.shape[shard_dim] // 2
         else:
             shard_size = expert_data.shape[shard_dim]
+
         # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
         # and we're not loading the full weight
         if not load_full and loaded_weight.ndim > 0:
@@ -946,7 +953,10 @@ def _load_w13(
             expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
         hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim)
         expert_data = self._narrow_expert_data_for_padding(
-            expert_data, loaded_weight, hidden_dim=hidden_dim
+            expert_data,
+            loaded_weight,
+            hidden_dim=hidden_dim,
+            intermediate_dim=shard_dim,
         )
         expert_data.copy_(loaded_weight)
 
@@ -962,6 +972,7 @@ def _load_w2(
         # down_proj: "RowParallel" so tp sharding on input_dim
         # Narrow parameter and load.
         shard_size = expert_data.shape[shard_dim]
+
         # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
         # and we're not loading the full weight
         if not load_full and loaded_weight.ndim > 0:
@@ -979,7 +990,10 @@ def _load_w2(
         # w2, down_proj: Load into only logical weight of w2.
         hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim)
         expert_data = self._narrow_expert_data_for_padding(
-            expert_data, loaded_weight, hidden_dim=hidden_dim
+            expert_data,
+            loaded_weight,
+            hidden_dim=hidden_dim,
+            intermediate_dim=shard_dim,
         )
         expert_data.copy_(loaded_weight)
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
@@ -24,6 +24,7 @@
     FusedMoeWeightScaleSupported,
 )
 from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEParallelConfig,
     FusedMoEQuantConfig,
 )
 from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
@@ -608,6 +609,36 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
             allow_vllm_cutlass=False,
         )
 
+    def maybe_roundup_sizes(
+        self,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        act_dtype: torch.dtype,
+        moe_parallel_config: "FusedMoEParallelConfig",
+    ) -> tuple[int, int]:
+        hidden_size, intermediate_size_per_partition = super().maybe_roundup_sizes(
+            hidden_size=hidden_size,
+            intermediate_size_per_partition=intermediate_size_per_partition,
+            act_dtype=act_dtype,
+            moe_parallel_config=moe_parallel_config,
+        )
+        if self.block_quant:
+            assert self.weight_block_size is not None
+            block_n = self.weight_block_size[0]
+            if intermediate_size_per_partition % block_n != 0:
+                padded = (
+                    (intermediate_size_per_partition + block_n - 1) // block_n * block_n
+                )
+                logger.info_once(
+                    "Padding MoE intermediate size per partition from %d to "
+                    "%d for FP8 block quantization alignment (block_n=%d).",
+                    intermediate_size_per_partition,
+                    padded,
+                    block_n,
+                )
+                intermediate_size_per_partition = padded
+        return hidden_size, intermediate_size_per_partition
+
     def create_weights(
         self,
         layer: Module,
@@ -635,13 +666,13 @@ def create_weights(
             # NOTE: To ensure proper alignment of the block-wise quantization
             # scales, the output_size of the weights for both the gate and up
             # layers must be divisible by block_n.
-            # Required by column parallel or enabling merged weights
-            if intermediate_size_per_partition % block_n != 0:
-                raise ValueError(
-                    f"The output_size of gate's and up's weight = "
-                    f"{intermediate_size_per_partition} is not divisible by "
-                    f"weight quantization block_n = {block_n}."
-                )
+            # Required by column parallel or enabling merged weights.
+            # This is guaranteed by maybe_roundup_sizes() which pads
+            # intermediate_size_per_partition to the next block_n multiple.
+            assert intermediate_size_per_partition % block_n == 0, (
+                f"intermediate_size_per_partition={intermediate_size_per_partition} "
+                f"should have been padded to a multiple of block_n={block_n}"
+            )
             if tp_size > 1 and intermediate_size_per_partition % block_k != 0:
                 # Required by row parallel
                 raise ValueError(