Skip to content

Commit e06745f

Browse files
Merge pull request #3678 from AI-Hypercomputer:mattdavidow-ragged-buffer-a1
PiperOrigin-RevId: 901000348
2 parents 60282d1 + 61a14b9 commit e06745f

5 files changed

Lines changed: 113 additions & 10 deletions

File tree

docs/reference/core_concepts/moe_configuration.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@ Dropping:
8888
- Value > 0: Enforces a strict capacity limit; tokens exceeding this limit are dropped.
8989
- Value = -1: Dropless with dense matrix multiplication, which is computationally expensive and typically used only as a baseline.
9090

91+
`ragged_buffer_factor`: A scalar multiplier for the size of the ragged buffer (effectively expert capacity). Effective only when `sparse_matmul` is True.
92+
93+
- Value > 0: Uses an explicit buffer size, which may drop tokens when this size is exceeded.
94+
- Value = -1: Uses a worst-case calculated buffer size, which is guaranteed not to drop any tokens.
95+
9196
`use_custom_sort_vjp`: If enabled, use a custom Vector-Jacobian Product (VJP) sort for efficient backward pass processing in sparse matmul. Recommended to replace the inefficient scatter-add generated by the `jax.numpy.take` in the backward pass.
9297

9398
`mlp_bias`: If enabled, add learnable bias terms for MLP matmul. Originally implemented to support the GPT-OSS model architecture.

src/maxtext/configs/base.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,11 @@ num_experts_per_tok: 1
195195
megablox: true
196196
sparse_matmul: true
197197
capacity_factor: -1.0 # a factor to decide expert capacity for token dropping, and no dropping by default
198+
ragged_buffer_factor: -1.0 # a factor to determine the size of the ragged buffer for routed MoE activations.
199+
# By default (-1), the routed buffer is worst case size to ensure no dropping.
200+
# When set to 1.0 this buffer is set to the size assuming perfectly balanced routing. If the routing dictates
201+
# a size larger than this then tokens are dropped.
202+
# In general if ragged_buffer_factor > 0, the ragged_buffer_size is balanced_size * ragged_buffer_factor.
198203
load_balance_loss_weight: 0.0 # weight for the load balance loss
199204
use_random_routing: false # whether to use random routing for debug/test purpose
200205
use_custom_sort_vjp: true # whether to use a custom VJP sort for efficient backward pass processing in sparse matmul

src/maxtext/configs/types.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,7 @@ class MoEGeneral(BaseModel):
646646
num_experts: PositiveInt = Field(1, description="The total number of experts in each MoE layer.")
647647
num_experts_per_tok: PositiveInt = Field(1, description="The number of experts to route each token to.")
648648
capacity_factor: float = Field(-1.0, description="Expert capacity factor. If < 0, no token dropping.")
649+
ragged_buffer_factor: float = Field(-1.0, description="Ragged buffer factor. If < 0, ragged buffer is worst case size.")
649650
load_balance_loss_weight: NonNegativeFloat = Field(0.0, description="Weight for the load balancing auxiliary loss.")
650651
use_custom_sort_vjp: bool = Field(
651652
True,
@@ -2082,6 +2083,12 @@ def load_model_specific_defaults(cls, values: dict[str, Any]) -> dict[str, Any]:
20822083
"""This method is a no-op because `pyconfig` handles model-specific config loading."""
20832084
return values
20842085

2086+
def validate_ragged_buffer_factor(self):
    """Checks that an explicit ragged buffer factor is used with a supported a2a approach.

    Raises:
      ValueError: if `ragged_buffer_factor` is set (> 0) while `use_ring_of_experts`
        is enabled, since the two are currently incompatible.
    """
    # A non-positive factor selects the worst-case buffer size, so there is nothing to validate.
    if self.ragged_buffer_factor > 0 and self.use_ring_of_experts:
        raise ValueError("Currently we only support ragged buffer factor with ragged a2a approach.")
2091+
20852092
@model_validator(mode="after")
20862093
def set_derived_and_validate_values(self) -> "MaxTextConfig":
20872094
"""
@@ -2570,6 +2577,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
25702577
raise ValueError("GPT-OSS MoE only supports dropless (capacity_factor=-1) with dense matmul.")
25712578
if self.routed_bias and self.routed_bias_update_rate > 0.0 and self.decoder_block != DecoderBlockType.DEEPSEEK:
25722579
raise ValueError("Loss-free load balancing is only supported for the DeepSeek decoder block.")
2580+
self.validate_ragged_buffer_factor()
25732581
if self.use_multimodal:
25742582
valid_mm_models = (
25752583
"gemma3-4b",

src/maxtext/layers/moe.py

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -901,6 +901,42 @@ def transform_bias(self, experts_index, *biases):
901901
"""Selects bias values for a variable number of bias tensors based on chosen experts."""
902902
return tuple(bias[experts_index] for bias in biases)
903903

904+
@staticmethod
905+
def get_ragged_buffer_size(local_batch, ep_degree, global_experts, top_k, ragged_buffer_factor):
906+
"""Calculates the token batch size of the ragged buffer.
907+
When explicitly setting ragged_buffer_factor>0, this is balanced_size * ragged_buffer_factor, which can drop tokens.
908+
Otherwise this will be worst case size to ensure no dropping.
909+
910+
Inputs:
911+
local_batch: local token batch (batch*seq blown up by top_k) shard on this device (e.g. inside shard_map)
912+
ep_degree: degree of expert parallelism, generally equal to ici_expert_parallelism
913+
global_experts: unsharded expert count, e.g. 256 for deepseek
914+
top_k: aka num_experts_per_tok, 8 for deepseek.
915+
ragged_buffer_factor: When set > 0, the buffer is balanced_size * ragged_buffer_factor.
916+
The value 1.0 will be dropless only in the perfectly balanced case, else tokens will be dropped.
917+
Outputs:
918+
The ragged buffer's token batch size.
919+
"""
920+
balanced_size = local_batch
921+
if ragged_buffer_factor > 0.0:
922+
# This will drop tokens if the true distribution exceeds this buffer.
923+
return int(balanced_size * ragged_buffer_factor)
924+
else:
925+
# Worst case
926+
# Either determined by degree of EP, or can be less when num_local_exp is smaller than top_k:
927+
# Example: If we have 4 EP shards, top_k=8, and experts=256 (deepseek), then worst case is
928+
# all tokens in our EP replica get routed to a single shard, e.g. rank 0 - thus is |EP|=4x larger than perfectly
929+
# balanced. However if we use EP=128, then there are only 256/128 = 2 local experts, and thus at most in an EP
930+
# replica group only the 2 experts of top_k=8 can be chosen, so at most 1/4 of all tokens goes to the most
931+
# popular shard. Thus the imbalance factor goes like |EP|/(top_k/local_exp) = 128/4 = 32.
932+
# In general for local_experts < top_k (e.g. |EP|>32), the balance will go as
933+
# EP * local_experts / top_k = EP * (global_exp/EP) / top_k = global_exp / top_k.
934+
# This is constant as a function of the model - e.g. for deepseek the imbalance is never worse than
935+
# 256 exp / 8 top_k = 32. In practice the imbalance should be much less and potentially can use
936+
# ragged_buffer_factor set to >1 e.g. 3.0, and likely have no dropping (not guaranteed)
937+
worst_case_factor = min(ep_degree, global_experts / top_k)
938+
return int(balanced_size * worst_case_factor)
939+
904940
def sparse_matmul(
905941
self,
906942
inputs,
@@ -1165,16 +1201,13 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
11651201
num_expert_parallelism,
11661202
)
11671203

1168-
# TODO(ranran): For better performance, we could update output buffer to a smaller
1169-
# size to replace self.get_expert_parallelism_size() for efficiency,
1170-
# Or we could apply capacity_factor for excessive experts.
1171-
# Note: Reducing buffer increase the risk of token dropping under unbalanced distribution.
1172-
1173-
# In the worst case, all of the global input data is assigned to each expert in the current shard.
1174-
# This would result in num_expert_shards * input_size * experts_per_shard assignments. However, if
1175-
# experts_per_shard > num_experts_per_tok we cannot assign more than num_experts_per_tok to all of the inputs.
1176-
max_local_experts_per_tok = min(local_expert_size, self.config.num_experts_per_tok)
1177-
buffer_size = int(num_expert_parallelism * batch_size * sequence_length * max_local_experts_per_tok)
1204+
buffer_size = self.get_ragged_buffer_size(
1205+
jnp.shape(x)[0],
1206+
num_expert_parallelism,
1207+
self.config.num_experts,
1208+
self.config.num_experts_per_tok,
1209+
self.config.ragged_buffer_factor,
1210+
)
11781211
output_shape = jax.lax.empty((buffer_size, self.config.emb_dim), dtype=x.dtype)
11791212

11801213
x = jax.lax.ragged_all_to_all(

tests/unit/moe_test.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,6 +1063,58 @@ def test_get_all_to_all_params_unsharded_batch(self):
10631063
jnp.array_equal(recv_sz, exp_recv_sz), f"Unsharded Batch: Receive sizes mismatch for shard {expert_shard_id}"
10641064
)
10651065

1066+
def test_ragged_buffer_balanced(self):
    """An explicit factor of 1.0 keeps the buffer at exactly the local batch size."""
    local_batch = 32768
    # ep_degree (4), global_experts (256) and top_k (8) are ignored when the factor is > 0.
    actual_ragged_buffer = moe.RoutedMoE.get_ragged_buffer_size(local_batch, 4, 256, 8, 1.0)
    self.assertEqual(local_batch, actual_ragged_buffer)
1078+
1079+
def test_ragged_buffer_larger(self):
    """An explicit factor of 2.0 doubles the balanced buffer size."""
    local_batch = 32768
    ragged_buffer_factor = 2.0
    # ep_degree (4), global_experts (256) and top_k (8) are ignored when the factor is > 0.
    actual_ragged_buffer = moe.RoutedMoE.get_ragged_buffer_size(
        local_batch, 4, 256, 8, ragged_buffer_factor
    )
    self.assertEqual(int(local_batch * ragged_buffer_factor), actual_ragged_buffer)
1091+
1092+
def test_small_ep_worst_case(self):
    """With few EP shards the worst-case buffer is local_batch * ep_degree."""
    local_batch, ep_degree = 32768, 4
    # A factor of -1.0 selects worst-case (dropless) sizing.
    actual_ragged_buffer = moe.RoutedMoE.get_ragged_buffer_size(
        local_batch, ep_degree, 256, 8, -1.0
    )
    self.assertEqual(local_batch * ep_degree, actual_ragged_buffer)
1104+
1105+
def test_large_ep_worst_case(self):
    """With many EP shards the worst case is capped by global_experts / top_k."""
    local_batch, global_experts, top_k = 32768, 256, 8
    # A factor of -1.0 selects worst-case (dropless) sizing; ep_degree=128 exceeds the
    # global_experts / top_k = 32 cap, so the cap applies.
    actual_ragged_buffer = moe.RoutedMoE.get_ragged_buffer_size(
        local_batch, 128, global_experts, top_k, -1.0
    )
    self.assertEqual(local_batch * (global_experts // top_k), actual_ragged_buffer)
1117+
10661118

10671119
if __name__ == "__main__":
10681120
unittest.main()

0 commit comments

Comments
 (0)