refactor(moe): Remove tokamax_gmm_autotune and unconditionally use custom GMM tile sizes with Pallas subclass

darisoy · darisoy · commit 796ed9d5d558 · 2026-06-02T21:38:04.000Z
This change addresses review comments on PR #3779: (1) Replaces the brittle global monkey-patch and the separate WI/WO subclasses with a single, reusable `PallasMosaicTpuRaggedDotCustom` class defined in `moe.py`. (2) Implements a custom `__post_init__` in this subclass so that the JAX backward pass (VJP) is built from the same subclass and keeps the configured forward, dLHS and dRHS tile sizes. (3) Removes the `tokamax_gmm_autotune` configuration flag entirely, so the custom tile sizes are used unconditionally whenever Tokamax GMM is active. (4) Updates the unit tests in `moe_test.py` to verify the new subclass-based tiling overrides. CONV=2c6843af-dcf7-403b-b67e-2fedd5f81b95
diff --git a/src/maxtext/layers/moe.py b/src/maxtext/layers/moe.py
@@ -55,7 +55,46 @@
     DLHS_RAGGED_DOT_DIM_NUMS,
     DRHS_RAGGED_DOT_DIM_NUMS,
 )
+from tokamax._src.ops.ragged_dot import base
 from tokamax._src.ops import op
+import dataclasses
+
+
+@jax.tree_util.register_dataclass
+@dataclasses.dataclass(frozen=True, kw_only=True, slots=True)
+class PallasMosaicTpuRaggedDotCustom(PallasMosaicTpuRaggedDot):
+  config: Config | None = None
+  fwd_tile: tuple[int, int, int] = (128, 128, 128)
+  dlhs_tile: tuple[int, int, int] = (128, 128, 128)
+  drhs_tile: tuple[int, int, int] = (128, 128, 128)
+
+  def __post_init__(self):
+    from tokamax._src.ops.ragged_dot import base
+    qdtype = self.qdtype if self.qdtype is None else jnp.dtype(self.qdtype).name
+    if self.vjp is None:
+      fn = lambda *args, **kw: PallasMosaicTpuRaggedDotCustom(
+          qdtype=qdtype,
+          interpret=self.interpret,
+          fwd_tile=self.fwd_tile,
+          dlhs_tile=self.dlhs_tile,
+          drhs_tile=self.drhs_tile,
+      )(*args, **kw)
+      object.__setattr__(
+          self,
+          "vjp",
+          functools.partial(base.vjp, dlhs_ragged_dot=fn, drhs_ragged_dot=fn),
+      )
+
+  def _get_heuristics_config(self, ba) -> Config:
+    dims = ba.arguments.get("ragged_dot_dimension_numbers", DEFAULT_RAGGED_DOT_DIM_NUMS)
+    if dims == DEFAULT_RAGGED_DOT_DIM_NUMS:
+      return Config(tile_m=self.fwd_tile[0], tile_k=self.fwd_tile[1], tile_n=self.fwd_tile[2])
+    elif dims == DLHS_RAGGED_DOT_DIM_NUMS:
+      return Config(tile_m=self.dlhs_tile[0], tile_k=self.dlhs_tile[1], tile_n=self.dlhs_tile[2])
+    elif dims == DRHS_RAGGED_DOT_DIM_NUMS:
+      return Config(tile_m=self.drhs_tile[0], tile_k=self.drhs_tile[1], tile_n=self.drhs_tile[2])
+    return Config()
+
 
 set_xla_metadata = xla_metadata.set_xla_metadata
 
@@ -557,8 +596,7 @@ def __init__(
     ):
       self.wo.value = self.wo.value * self.per_expert_scale.value[:, None, None]
 
-    # Monkey-patch Tokamax heuristics globally once
-    _monkey_patch_tokamax_heuristics(self.config)
+
 
   def _maybe_shard_with_logical(self, inputs, logical_name):
     return maybe_shard_with_logical(
@@ -1095,6 +1133,18 @@ def sparse_matmul(
       wo_bias,
   ):
     """Perform sparse matrix multiplication of inputs and Experts."""
+    config = self.config
+
+    gmm_impl_wi = PallasMosaicTpuRaggedDotCustom(
+        fwd_tile=(config.wi_tile_fwd_batch_seq, config.wi_tile_fwd_embed_dim, config.wi_tile_fwd_mlp_dim),
+        dlhs_tile=(config.wi_tile_dlhs_batch_seq, config.wi_tile_dlhs_mlp_dim, config.wi_tile_dlhs_embed_dim),
+        drhs_tile=(config.wi_tile_drhs_batch_seq, config.wi_tile_drhs_embed_dim, config.wi_tile_drhs_mlp_dim),
+    )
+    gmm_impl_wo = PallasMosaicTpuRaggedDotCustom(
+        fwd_tile=(config.wo_tile_fwd_batch_seq, config.wo_tile_fwd_mlp_dim, config.wo_tile_fwd_embed_dim),
+        dlhs_tile=(config.wo_tile_dlhs_batch_seq, config.wo_tile_dlhs_embed_dim, config.wo_tile_dlhs_mlp_dim),
+        drhs_tile=(config.wo_tile_drhs_batch_seq, config.wo_tile_drhs_mlp_dim, config.wo_tile_drhs_embed_dim),
+    )
 
     def jax_ragged_dot_gmm(inputs, kernel, tiling, group_sizes, expert_assignments, padding_amount):
       """Execute jax.lax.ragged_dot, with potential quantization"""
@@ -1139,6 +1189,15 @@ def jax_ragged_dot_gmm(inputs, kernel, tiling, group_sizes, expert_assignments,
         output *= scales
       return output
 
+    def get_gmm_group_sizes(inputs, kernel, ep):
+      # Calculates perfectly balanced group sizes where each local expert receives an equal
+      # share of local tokens, adjusted for expert parallelism.
+      #
+      # Note: This function assumes the inputs are ragged and padded to the worst-case size
+      # (which is generally a factor of EP larger than perfectly balanced). This is why we must
+      # divide by EP.
+      return (inputs.shape[0] // kernel.shape[0] // ep,) * kernel.shape[0]
+
     def get_tokamax_group_sizes(group_sizes, inputs, kernel):
       # TODO (b/491979205) pipeline fsdp ag per repeat fails tokamax gmm
       if self.config.use_qwix_quantization or (
@@ -1151,7 +1210,7 @@ def get_tokamax_group_sizes(group_sizes, inputs, kernel):
         ep = self.get_expert_parallelism_size()
         return tokamax.RaggedDotGroupSizes(
             group_sizes,
-            (inputs.shape[0] // kernel.shape[0] // ep,) * kernel.shape[0],
+            get_gmm_group_sizes(inputs, kernel, ep),
         )
 
     def get_quantization_dtypes():
@@ -1162,7 +1221,7 @@ def get_quantization_dtypes():
         rhs_quantize_dtype = quant_dg.fwd.dg_quantizer.rhs.numerics.get_dtype()
       return lhs_quantize_dtype, rhs_quantize_dtype
 
-    def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes):
+    def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes, gmm_impl=None):
       if inputs.shape[0] != expert_assignments.shape[0]:
         raise ValueError("The number of input tokens must match the number of expert assignments!")
 
@@ -1196,7 +1255,7 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_a
               group_sizes=tokamax_group_sizes,
               precision=jax.lax.Precision.DEFAULT,
               preferred_element_type=self.dtype,
-              implementation="mosaic",
+              implementation="mosaic" if gmm_impl is None else [gmm_impl],
           )
       elif self.config.megablox:  # Older forked megablox
         output = mblx.gmm(
@@ -1485,6 +1544,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
           w0,
           tiling=wi_tile_size,
           weight_gather_axes=wi_gather_axes,
+          gmm_impl=gmm_impl_wi,
       )
       if self.get_tensor_transpose_parallelism_size() > 1:
         layer_w0 = jax.lax.psum(layer_w0, "tensor_transpose")
@@ -1497,6 +1557,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
           w1,
           tiling=wi_tile_size,
           weight_gather_axes=wi_gather_axes,
+          gmm_impl=gmm_impl_wi,
       )
       if self.get_tensor_transpose_parallelism_size() > 1:
         layer_w1 = jax.lax.psum(layer_w1, "tensor_transpose")
@@ -1510,6 +1571,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
           wo,
           tiling=wo_tile_size,
           weight_gather_axes=wo_gather_axes,
+          gmm_impl=gmm_impl_wo,
       )
       if self.get_tensor_parallelism_size() > 1:
         intermediate_output = jax.lax.psum_scatter(
@@ -2553,74 +2615,3 @@ def get_routed_and_shared_moe(
       abstract_init=False,
   )
   return module
-
-
-_heuristics_patched = False
-
-
-def _monkey_patch_tokamax_heuristics(config, force=False):
-  """Globally monkey-patches Tokamax GMM heuristics with manual tiling overrides."""
-  global _heuristics_patched
-  if _heuristics_patched and not force:
-    return
-
-  def custom_heuristics(self, ba: op.BoundArguments) -> Config:
-    lhs, rhs = ba.arguments["lhs"], ba.arguments["rhs"]
-    dims = ba.arguments.get("ragged_dot_dimension_numbers", DEFAULT_RAGGED_DOT_DIM_NUMS)
-
-    is_wo = False
-    if dims == DEFAULT_RAGGED_DOT_DIM_NUMS:
-      is_wo = rhs.shape[1] == config.base_mlp_dim
-    elif dims == DLHS_RAGGED_DOT_DIM_NUMS:
-      is_wo = rhs.shape[2] == config.base_emb_dim
-    elif dims == DRHS_RAGGED_DOT_DIM_NUMS:
-      is_wo = lhs.shape[1] == config.base_mlp_dim
-
-    if is_wo:
-      # Return wo tile sizes
-      if dims == DEFAULT_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wo_tile_fwd_batch_seq,
-            tile_k=config.wo_tile_fwd_mlp_dim,
-            tile_n=config.wo_tile_fwd_embed_dim,
-        )
-      elif dims == DLHS_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wo_tile_dlhs_batch_seq,
-            tile_k=config.wo_tile_dlhs_embed_dim,
-            tile_n=config.wo_tile_dlhs_mlp_dim,
-        )
-      elif dims == DRHS_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wo_tile_drhs_batch_seq,
-            tile_k=config.wo_tile_drhs_mlp_dim,
-            tile_n=config.wo_tile_drhs_embed_dim,
-        )
-    else:
-      # Return wi tile sizes
-      if dims == DEFAULT_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wi_tile_fwd_batch_seq,
-            tile_k=config.wi_tile_fwd_embed_dim,
-            tile_n=config.wi_tile_fwd_mlp_dim,
-        )
-      elif dims == DLHS_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wi_tile_dlhs_batch_seq,
-            tile_k=config.wi_tile_dlhs_mlp_dim,
-            tile_n=config.wi_tile_dlhs_embed_dim,
-        )
-      elif dims == DRHS_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wi_tile_drhs_batch_seq,
-            tile_k=config.wi_tile_drhs_embed_dim,
-            tile_n=config.wi_tile_drhs_mlp_dim,
-        )
-
-    return Config()
-
-  # Apply class-level monkey patch!
-  # pylint: disable=protected-access
-  PallasMosaicTpuRaggedDot._get_heuristics_config = custom_heuristics
-  _heuristics_patched = True
-  print("[TOKAMAX_PATCH] Successfully monkey-patched Tokamax GMM heuristics globally!")
diff --git a/src/maxtext/models/deepseek_batchsplit_fp8.py b/src/maxtext/models/deepseek_batchsplit_fp8.py
@@ -29,7 +29,7 @@
 from maxtext.layers import quantizations
 import qwix.pallas as qpl
 import tokamax
-from maxtext.layers.moe import _monkey_patch_tokamax_heuristics
+
 
 
 @functools.partial(
@@ -833,9 +833,6 @@ def moe(
     config,
     quant,
 ):
-  """Performs dropless MoE with tensor/expert parallelism."""
-  # Monkey-patch Tokamax heuristics globally once
-  _monkey_patch_tokamax_heuristics(config)
   xs, ys = list(zip(*inputs))
   ys = with_data_parallel_constraint(
       process_activations(
@@ -943,6 +940,16 @@ def unroute(
 
 def compute(x, w0, w1, wo, group_sizes, weights, *, config, mesh):
   """Processes routed tokens through the MLP."""
+  gmm_impl_wi = moe_lib.PallasMosaicTpuRaggedDotCustom(
+      fwd_tile=(config.wi_tile_fwd_batch_seq, config.wi_tile_fwd_embed_dim, config.wi_tile_fwd_mlp_dim),
+      dlhs_tile=(config.wi_tile_dlhs_batch_seq, config.wi_tile_dlhs_mlp_dim, config.wi_tile_dlhs_embed_dim),
+      drhs_tile=(config.wi_tile_drhs_batch_seq, config.wi_tile_drhs_embed_dim, config.wi_tile_drhs_mlp_dim),
+  )
+  gmm_impl_wo = moe_lib.PallasMosaicTpuRaggedDotCustom(
+      fwd_tile=(config.wo_tile_fwd_batch_seq, config.wo_tile_fwd_mlp_dim, config.wo_tile_fwd_embed_dim),
+      dlhs_tile=(config.wo_tile_dlhs_batch_seq, config.wo_tile_dlhs_embed_dim, config.wo_tile_dlhs_mlp_dim),
+      drhs_tile=(config.wo_tile_drhs_batch_seq, config.wo_tile_drhs_mlp_dim, config.wo_tile_drhs_embed_dim),
+  )
 
   def gmm(
       inputs,
@@ -951,6 +958,7 @@ def gmm(
       group_sizes,
       preferred_element_type,
       weight_gather_axes,
+      gmm_impl=None,
   ):
     if config.use_qwix_quantization:
       output = megablox.gmm(
@@ -971,7 +979,7 @@ def gmm(
           group_sizes=tokamax.RaggedDotGroupSizes(group_sizes, len(inputs)),
           precision=jax.lax.Precision.DEFAULT,
           preferred_element_type=preferred_element_type,
-          implementation="mosaic",
+          implementation="mosaic" if gmm_impl is None else [gmm_impl],
       )
     return output
 
@@ -1031,6 +1039,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
         w01,
         tiling=wi_tile_size,
         weight_gather_axes=wi_gather_axes,
+        gmm_impl=gmm_impl_wi,
     )
     layer_w0, layer_w1 = jnp.split(layer_w01, 2, axis=-1)
   else:
@@ -1039,12 +1048,14 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
         w0,
         tiling=wi_tile_size,
         weight_gather_axes=wi_gather_axes,
+        gmm_impl=gmm_impl_wi,
     )
     layer_w1 = gmm_fn(
         x,
         w1,
         tiling=wi_tile_size,
         weight_gather_axes=wi_gather_axes,
+        gmm_impl=gmm_impl_wi,
     )
   layer_w0 = jax.ad_checkpoint.checkpoint_name(layer_w0, "mlpwi_0")
   layer_w1 = jax.ad_checkpoint.checkpoint_name(layer_w1, "mlpwi_1")
@@ -1055,6 +1066,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
       wo,
       tiling=wo_tile_size,
       weight_gather_axes=wo_gather_axes,
+      gmm_impl=gmm_impl_wo,
   )
   return layer_wo
 
diff --git a/tests/unit/moe_test.py b/tests/unit/moe_test.py