Enable Tokamax GMM with autotuning fallback in MaxText (opt-in via the new `tokamax_gmm_autotune` config flag, default off)

darisoy · darisoy · commit 682db7f848b7 · 2026-05-05T17:18:43.000Z
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -561,6 +561,7 @@ class Attention(BaseModel):
       False,
       description="Whether to use the Tokamax library for GMM kernel implementation.",
   )
+  tokamax_gmm_autotune: bool = Field(False, description="Whether to use tokamax auto-tuner for GMM.")
   ragged_block_size: int = Field(256, description="Block size for ragged attention.")
   enable_padding_causal_mask: bool = Field(True, description="Temporary flag for TE padding.")
   use_tokamax_splash: bool = Field(False, description="Whether to use tokamax splash attention.")
diff --git a/src/maxtext/layers/moe.py b/src/maxtext/layers/moe.py
@@ -43,6 +43,8 @@
 from qwix.contrib.sparsity import sparsity_module
 import qwix.pallas as qpl
 import tokamax
+from tokamax import config as tokamax_config
+from tokamax._src.ops.ragged_dot.pallas_mosaic_tpu import PallasMosaicTpuRaggedDot, Config
 
 set_xla_metadata = xla_metadata.set_xla_metadata
 
@@ -1121,14 +1123,25 @@ def gmm(inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_a
               weight_gather_axes=weight_gather_axes,
           )
         else:  # tokamax (unquantized)
-          output = tokamax.ragged_dot(
-              lhs=inputs,
-              rhs=kernel,
-              group_sizes=tokamax_group_sizes,
-              precision=jax.lax.Precision.DEFAULT,
-              preferred_element_type=self.dtype,
-              implementation="mosaic",
-          )
+          if self.config.tokamax_gmm_autotune:
+            with tokamax_config.autotuning_cache_miss_fallback("autotune"):
+              output = tokamax.ragged_dot(
+                  lhs=inputs,
+                  rhs=kernel,
+                  group_sizes=tokamax_group_sizes,
+                  precision=jax.lax.Precision.DEFAULT,
+                  preferred_element_type=self.dtype,
+                  implementation="mosaic",
+              )
+          else:
+            output = tokamax.ragged_dot(
+                lhs=inputs,
+                rhs=kernel,
+                group_sizes=tokamax_group_sizes,
+                precision=jax.lax.Precision.DEFAULT,
+                preferred_element_type=self.dtype,
+                implementation="mosaic",
+            )
       elif self.config.megablox:  # Older forked megablox
         output = mblx.gmm(
             lhs=inputs,
diff --git a/src/maxtext/models/deepseek_batchsplit_fp8.py b/src/maxtext/models/deepseek_batchsplit_fp8.py
@@ -29,6 +29,8 @@
 from maxtext.layers import quantizations
 import qwix.pallas as qpl
 import tokamax
+from tokamax import config as tokamax_config
+from tokamax._src.ops.ragged_dot.pallas_mosaic_tpu import PallasMosaicTpuRaggedDot, Config
 
 
 @functools.partial(
@@ -962,14 +964,25 @@ def gmm(
           qwix_rule=quantizations.get_fp8_full_qwix_rule_w_sparsity(config)[0],
       )
     else:
-      output = tokamax.ragged_dot(
-          lhs=inputs,
-          rhs=kernel,
-          group_sizes=tokamax.RaggedDotGroupSizes(group_sizes, len(inputs)),
-          precision=jax.lax.Precision.DEFAULT,
-          preferred_element_type=preferred_element_type,
-          implementation="mosaic",
-      )
+      if config.tokamax_gmm_autotune:
+        with tokamax_config.autotuning_cache_miss_fallback("autotune"):
+          output = tokamax.ragged_dot(
+              lhs=inputs,
+              rhs=kernel,
+              group_sizes=tokamax.RaggedDotGroupSizes(group_sizes, len(inputs)),
+              precision=jax.lax.Precision.DEFAULT,
+              preferred_element_type=preferred_element_type,
+              implementation="mosaic",
+          )
+      else:
+        output = tokamax.ragged_dot(
+            lhs=inputs,
+            rhs=kernel,
+            group_sizes=tokamax.RaggedDotGroupSizes(group_sizes, len(inputs)),
+            precision=jax.lax.Precision.DEFAULT,
+            preferred_element_type=preferred_element_type,
+            implementation="mosaic",
+        )
     return output
 
   gmm_fn = functools.partial(gmm, group_sizes=group_sizes, preferred_element_type=config.dtype)

Original file line number	Diff line number	Diff line change
`@@ -561,6 +561,7 @@ class Attention(BaseModel):`
`561`	`561`	`False,`
`562`	`562`	`description="Whether to use the Tokamax library for GMM kernel implementation.",`
`563`	`563`	`)`
	`564`	`+ tokamax_gmm_autotune: bool = Field(False, description="Whether to use tokamax auto-tuner for GMM.")`
`564`	`565`	`ragged_block_size: int = Field(256, description="Block size for ragged attention.")`
`565`	`566`	`enable_padding_causal_mask: bool = Field(True, description="Temporary flag for TE padding.")`
`566`	`567`	`use_tokamax_splash: bool = Field(False, description="Whether to use tokamax splash attention.")`