AI-Hypercomputer
diff --git a/src/maxtext/layers/moe.py b/src/maxtext/layers/moe.py
Lines changed: 93 additions & 76 deletions
@@ -55,6 +55,7 @@
     DLHS_RAGGED_DOT_DIM_NUMS,
     DRHS_RAGGED_DOT_DIM_NUMS,
 )
+from tokamax._src.ops.ragged_dot import base
 from tokamax._src.ops import op
 
 set_xla_metadata = xla_metadata.set_xla_metadata
@@ -557,8 +558,7 @@ def __init__(
     ):
       self.wo.value = self.wo.value * self.per_expert_scale.value[:, None, None]
 
-    # Monkey-patch Tokamax heuristics globally once
-    _monkey_patch_tokamax_heuristics(self.config)
+
 
   def _maybe_shard_with_logical(self, inputs, logical_name):
     return maybe_shard_with_logical(
@@ -1095,6 +1095,86 @@ def sparse_matmul(
       wo_bias,
   ):
     """Perform sparse matrix multiplication of inputs and Experts."""
+    config = self.config
+
+    class PallasMosaicTpuRaggedDotWI(PallasMosaicTpuRaggedDot):
+      """Ragged-dot implementation for the `wi` GMMs with tile sizes read from `config`.
+
+      The heuristics lookup is overridden so that the forward, dlhs and drhs
+      ragged dots pick the matching `config.wi_tile_*` values instead of the
+      inherited heuristics.
+      """
+
+      def __post_init__(self):
+        """Install a default `vjp` that routes both backward ragged dots through this class.
+
+        Does nothing when the instance was constructed with an explicit `vjp`.
+        """
+        # `base` is the module-level `tokamax._src.ops.ragged_dot.base` import;
+        # no function-scope re-import is needed.
+        if self.vjp is None:
+          # Normalise the quantization dtype to its name; `None` is passed through.
+          qdtype = self.qdtype if self.qdtype is None else jnp.dtype(self.qdtype).name
+          fn = lambda *args, **kw: PallasMosaicTpuRaggedDotWI(
+              qdtype=qdtype,
+              interpret=self.interpret,
+          )(*args, **kw)
+          # `object.__setattr__` is used instead of plain assignment
+          # (presumably because the parent is a frozen dataclass; confirm).
+          object.__setattr__(
+              self,
+              "vjp",
+              functools.partial(base.vjp, dlhs_ragged_dot=fn, drhs_ragged_dot=fn),
+          )
+
+      def _get_heuristics_config(self, ba) -> Config:
+        """Return the `wi` tile `Config` for the ragged-dot variant described by `ba`.
+
+        Args:
+          ba: Bound arguments of the ragged-dot call. Its optional
+            `ragged_dot_dimension_numbers` entry selects the variant and
+            defaults to `DEFAULT_RAGGED_DOT_DIM_NUMS` when absent.
+
+        Returns:
+          A `Config` populated from the `config.wi_tile_fwd_*`,
+          `config.wi_tile_dlhs_*` or `config.wi_tile_drhs_*` values, or a
+          default-constructed `Config()` when the dimension numbers match
+          none of the three known variants.
+        """
+        dims = ba.arguments.get("ragged_dot_dimension_numbers", DEFAULT_RAGGED_DOT_DIM_NUMS)
+        if dims == DEFAULT_RAGGED_DOT_DIM_NUMS:
+          return Config(
+              tile_m=config.wi_tile_fwd_batch_seq,
+              tile_k=config.wi_tile_fwd_embed_dim,
+              tile_n=config.wi_tile_fwd_mlp_dim,
+          )
+        elif dims == DLHS_RAGGED_DOT_DIM_NUMS:
+          return Config(
+              tile_m=config.wi_tile_dlhs_batch_seq,
+              tile_k=config.wi_tile_dlhs_mlp_dim,
+              tile_n=config.wi_tile_dlhs_embed_dim,
+          )
+        elif dims == DRHS_RAGGED_DOT_DIM_NUMS:
+          return Config(
+              tile_m=config.wi_tile_drhs_batch_seq,
+              tile_k=config.wi_tile_drhs_embed_dim,
+              tile_n=config.wi_tile_drhs_mlp_dim,
+          )
+        return Config()
+
+    class PallasMosaicTpuRaggedDotWO(PallasMosaicTpuRaggedDot):
+      """Ragged-dot implementation for the `wo` GMM with tile sizes read from `config`.
+
+      The heuristics lookup is overridden so that the forward, dlhs and drhs
+      ragged dots pick the matching `config.wo_tile_*` values instead of the
+      inherited heuristics.
+      """
+
+      def __post_init__(self):
+        """Install a default `vjp` that routes both backward ragged dots through this class.
+
+        Does nothing when the instance was constructed with an explicit `vjp`.
+        """
+        # `base` is the module-level `tokamax._src.ops.ragged_dot.base` import;
+        # no function-scope re-import is needed.
+        if self.vjp is None:
+          # Normalise the quantization dtype to its name; `None` is passed through.
+          qdtype = self.qdtype if self.qdtype is None else jnp.dtype(self.qdtype).name
+          fn = lambda *args, **kw: PallasMosaicTpuRaggedDotWO(
+              qdtype=qdtype,
+              interpret=self.interpret,
+          )(*args, **kw)
+          # `object.__setattr__` is used instead of plain assignment
+          # (presumably because the parent is a frozen dataclass; confirm).
+          object.__setattr__(
+              self,
+              "vjp",
+              functools.partial(base.vjp, dlhs_ragged_dot=fn, drhs_ragged_dot=fn),
+          )
+
+      def _get_heuristics_config(self, ba) -> Config:
+        """Return the `wo` tile `Config` for the ragged-dot variant described by `ba`.
+
+        Args:
+          ba: Bound arguments of the ragged-dot call. Its optional
+            `ragged_dot_dimension_numbers` entry selects the variant and
+            defaults to `DEFAULT_RAGGED_DOT_DIM_NUMS` when absent.
+
+        Returns:
+          A `Config` populated from the `config.wo_tile_fwd_*`,
+          `config.wo_tile_dlhs_*` or `config.wo_tile_drhs_*` values, or a
+          default-constructed `Config()` when the dimension numbers match
+          none of the three known variants.
+        """
+        dims = ba.arguments.get("ragged_dot_dimension_numbers", DEFAULT_RAGGED_DOT_DIM_NUMS)
+        if dims == DEFAULT_RAGGED_DOT_DIM_NUMS:
+          return Config(
+              tile_m=config.wo_tile_fwd_batch_seq,
+              tile_k=config.wo_tile_fwd_mlp_dim,
+              tile_n=config.wo_tile_fwd_embed_dim,
+          )
+        elif dims == DLHS_RAGGED_DOT_DIM_NUMS:
+          return Config(
+              tile_m=config.wo_tile_dlhs_batch_seq,
+              tile_k=config.wo_tile_dlhs_embed_dim,
+              tile_n=config.wo_tile_dlhs_mlp_dim,
+          )
+        elif dims == DRHS_RAGGED_DOT_DIM_NUMS:
+          return Config(
+              tile_m=config.wo_tile_drhs_batch_seq,
+              tile_k=config.wo_tile_drhs_mlp_dim,
+              tile_n=config.wo_tile_drhs_embed_dim,
+          )
+        return Config()
+
+    gmm_impl_wi = PallasMosaicTpuRaggedDotWI(qdtype=None, interpret=False)
+    gmm_impl_wo = PallasMosaicTpuRaggedDotWO(qdtype=None, interpret=False)
 
     def jax_ragged_dot_gmm(inputs, kernel, tiling, group_sizes, expert_assignments, padding_amount):
       """Execute jax.lax.ragged_dot, with potential quantization"""
@@ -1139,6 +1219,11 @@ def jax_ragged_dot_gmm(inputs, kernel, tiling, group_sizes, expert_assignments,
         output *= scales
       return output
 
+    def get_gmm_group_sizes(inputs, kernel, ep):
+      """Return perfectly balanced group sizes, one entry per leading-dim slice of `kernel`.
+
+      Every entry equals `inputs.shape[0] // kernel.shape[0] // ep`: each local
+      expert receives an equal share of the local tokens, adjusted for the
+      expert parallelism degree `ep`.
+      """
+      num_local_experts = kernel.shape[0]
+      tokens_per_expert = inputs.shape[0] // num_local_experts // ep
+      return (tokens_per_expert,) * num_local_experts
+
     def get_tokamax_group_sizes(group_sizes, inputs, kernel):
       # TODO (b/491979205) pipeline fsdp ag per repeat fails tokamax gmm
       if self.config.use_qwix_quantization or (
@@ -1151,7 +1236,7 @@ def get_tokamax_group_sizes(group_sizes, inputs, kernel):
         ep = self.get_expert_parallelism_size()
         return tokamax.RaggedDotGroupSizes(
             group_sizes,
-            (inputs.shape[0] // kernel.shape[0] // ep,) * kernel.shape[0],
+            get_gmm_group_sizes(inputs, kernel, ep),
         )
 
     def get_quantization_dtypes():
@@ -1162,7 +1247,7 @@ def get_quantization_dtypes():
         rhs_quantize_dtype = quant_dg.fwd.dg_quantizer.rhs.numerics.get_dtype()
       return lhs_quantize_dtype, rhs_quantize_dtype
 
-    def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes):
+    def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes, gmm_impl=None):
       if inputs.shape[0] != expert_assignments.shape[0]:
         raise ValueError("The number of input tokens must match the number of expert assignments!")
 
@@ -1196,7 +1281,7 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_a
               group_sizes=tokamax_group_sizes,
               precision=jax.lax.Precision.DEFAULT,
               preferred_element_type=self.dtype,
-              implementation="mosaic",
+              implementation="mosaic" if gmm_impl is None else [gmm_impl],
           )
       elif self.config.megablox:  # Older forked megablox
         output = mblx.gmm(
@@ -1485,6 +1570,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
           w0,
           tiling=wi_tile_size,
           weight_gather_axes=wi_gather_axes,
+          gmm_impl=gmm_impl_wi,
       )
       if self.get_tensor_transpose_parallelism_size() > 1:
         layer_w0 = jax.lax.psum(layer_w0, "tensor_transpose")
@@ -1497,6 +1583,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
           w1,
           tiling=wi_tile_size,
           weight_gather_axes=wi_gather_axes,
+          gmm_impl=gmm_impl_wi,
       )
       if self.get_tensor_transpose_parallelism_size() > 1:
         layer_w1 = jax.lax.psum(layer_w1, "tensor_transpose")
@@ -1510,6 +1597,7 @@ def get_active_sharding_axes(pspec_dim_axes, tensor_dim_index):
           wo,
           tiling=wo_tile_size,
           weight_gather_axes=wo_gather_axes,
+          gmm_impl=gmm_impl_wo,
       )
       if self.get_tensor_parallelism_size() > 1:
         intermediate_output = jax.lax.psum_scatter(
@@ -2553,74 +2641,3 @@ def get_routed_and_shared_moe(
       abstract_init=False,
   )
   return module
-
-
-_heuristics_patched = False
-
-
-def _monkey_patch_tokamax_heuristics(config, force=False):
-  """Globally monkey-patches Tokamax GMM heuristics with manual tiling overrides."""
-  global _heuristics_patched
-  if _heuristics_patched and not force:
-    return
-
-  def custom_heuristics(self, ba: op.BoundArguments) -> Config:
-    lhs, rhs = ba.arguments["lhs"], ba.arguments["rhs"]
-    dims = ba.arguments.get("ragged_dot_dimension_numbers", DEFAULT_RAGGED_DOT_DIM_NUMS)
-
-    is_wo = False
-    if dims == DEFAULT_RAGGED_DOT_DIM_NUMS:
-      is_wo = rhs.shape[1] == config.base_mlp_dim
-    elif dims == DLHS_RAGGED_DOT_DIM_NUMS:
-      is_wo = rhs.shape[2] == config.base_emb_dim
-    elif dims == DRHS_RAGGED_DOT_DIM_NUMS:
-      is_wo = lhs.shape[1] == config.base_mlp_dim
-
-    if is_wo:
-      # Return wo tile sizes
-      if dims == DEFAULT_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wo_tile_fwd_batch_seq,
-            tile_k=config.wo_tile_fwd_mlp_dim,
-            tile_n=config.wo_tile_fwd_embed_dim,
-        )
-      elif dims == DLHS_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wo_tile_dlhs_batch_seq,
-            tile_k=config.wo_tile_dlhs_embed_dim,
-            tile_n=config.wo_tile_dlhs_mlp_dim,
-        )
-      elif dims == DRHS_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wo_tile_drhs_batch_seq,
-            tile_k=config.wo_tile_drhs_mlp_dim,
-            tile_n=config.wo_tile_drhs_embed_dim,
-        )
-    else:
-      # Return wi tile sizes
-      if dims == DEFAULT_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wi_tile_fwd_batch_seq,
-            tile_k=config.wi_tile_fwd_embed_dim,
-            tile_n=config.wi_tile_fwd_mlp_dim,
-        )
-      elif dims == DLHS_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wi_tile_dlhs_batch_seq,
-            tile_k=config.wi_tile_dlhs_mlp_dim,
-            tile_n=config.wi_tile_dlhs_embed_dim,
-        )
-      elif dims == DRHS_RAGGED_DOT_DIM_NUMS:
-        return Config(
-            tile_m=config.wi_tile_drhs_batch_seq,
-            tile_k=config.wi_tile_drhs_embed_dim,
-            tile_n=config.wi_tile_drhs_mlp_dim,
-        )
-
-    return Config()
-
-  # Apply class-level monkey patch!
-  # pylint: disable=protected-access
-  PallasMosaicTpuRaggedDot._get_heuristics_config = custom_heuristics
-  _heuristics_patched = True
-  print("[TOKAMAX_PATCH] Successfully monkey-patched Tokamax GMM heuristics globally!")