Commit 9c2bff5

update for group_size for tokamax
update utils, update format

1 parent 72e96f5

3 files changed: 24 additions & 4 deletions

src/maxtext/layers/moe.py

Lines changed: 6 additions & 2 deletions
@@ -891,6 +891,10 @@ def sparse_matmul(
     def gmm(
         inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes, input_buffer_count, combine_scopes
     ):
+      tokamax_group_sizes = tokamax.RaggedDotGroupSizes(
+          group_sizes,
+          representative_value=max_utils.generate_representative_group_sizes(inputs.shape[0], kernel.shape[0]),
+      )
       pad_length = self.config.wi_tile_fwd_batch_seq
       hs_shape = inputs.shape
       # pad length is the 1st dimension of tiling size in gmm call
@@ -921,7 +925,7 @@ def gmm(
       output = mblx.gmm(
           lhs=inputs,
           rhs=kernel,
-          group_sizes=group_sizes,
+          group_sizes=tokamax_group_sizes,
           preferred_element_type=self.dtype,
           tiling=tiling,
           lhs_quantize_dtype=lhs_quantize_dtype,
@@ -936,7 +940,7 @@ def gmm(
       output = tokamax.ragged_dot(
           lhs=inputs,
           rhs=kernel,
-          group_sizes=group_sizes,
+          group_sizes=tokamax_group_sizes,
           precision=jax.lax.Precision.DEFAULT,
           preferred_element_type=self.dtype,
           implementation="mosaic",
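
Note: the change above routes the existing group_sizes array through tokamax.RaggedDotGroupSizes, attaching a static representative_value hint derived from the number of token rows (inputs.shape[0]) and the number of experts (kernel.shape[0]). For context, the sketch below is a plain-NumPy reference of the grouped ("ragged") matmul semantics that mblx.gmm and tokamax.ragged_dot implement; ragged_dot_reference is an illustrative name and is not part of this commit.

```python
import numpy as np

def ragged_dot_reference(lhs, rhs, group_sizes):
  """Reference semantics of the grouped matmul used by gmm.

  Rows of lhs are partitioned into contiguous groups (one per expert);
  group i of lhs is multiplied by the expert weight matrix rhs[i].
  """
  m, k = lhs.shape          # (tokens, hidden)
  g, _, n = rhs.shape       # (experts, hidden, out)
  assert len(group_sizes) == g and sum(group_sizes) == m
  out = np.zeros((m, n), dtype=lhs.dtype)
  start = 0
  for i, size in enumerate(group_sizes):
    out[start:start + size] = lhs[start:start + size] @ rhs[i]
    start += size
  return out
```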

src/maxtext/models/deepseek_batchsplit.py

Lines changed: 8 additions & 2 deletions
@@ -27,6 +27,7 @@
 from maxtext.layers import attention_op
 from maxtext.layers import moe as moe_lib
 from maxtext.layers import quantizations
+from maxtext.utils import max_utils
 import qwix.pallas as qpl
 import tokamax

@@ -803,11 +804,16 @@ def gmm(
     input_buffer_count,
     combine_scopes,
 ):
+
+  tokamax_group_sizes = tokamax.RaggedDotGroupSizes(
+      group_sizes,
+      representative_value=max_utils.generate_representative_group_sizes(inputs.shape[0], kernel.shape[0]),
+  )
   if config.use_qwix_quantization:
     output = megablox.gmm(
         lhs=inputs,
         rhs=kernel,
-        group_sizes=group_sizes,
+        group_sizes=tokamax_group_sizes,
         preferred_element_type=preferred_element_type,
         tiling=tiling,
         use_qwix_quantization=config.use_qwix_quantization,
@@ -820,7 +826,7 @@ def gmm(
   output = tokamax.ragged_dot(
       lhs=inputs,
       rhs=kernel,
-      group_sizes=group_sizes,
+      group_sizes=tokamax_group_sizes,
       precision=jax.lax.Precision.DEFAULT,
       preferred_element_type=preferred_element_type,
       implementation="mosaic",
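
Both call sites make the same change: build the wrapper once at the top of gmm and pass it to whichever backend runs. A minimal sketch of that pattern, assuming only the two-argument RaggedDotGroupSizes constructor shown in the diff above; wrap_group_sizes is a hypothetical helper for illustration.

```python
import tokamax
from maxtext.utils import max_utils

def wrap_group_sizes(group_sizes, inputs, kernel):
  # inputs: (m, k) token activations; kernel: (g, k, n) stacked expert weights.
  m = inputs.shape[0]  # total (padded) token rows
  g = kernel.shape[0]  # number of experts / groups
  return tokamax.RaggedDotGroupSizes(
      group_sizes,
      representative_value=max_utils.generate_representative_group_sizes(m, g),
  )
```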

src/maxtext/utils/max_utils.py

Lines changed: 10 additions & 0 deletions
@@ -1078,3 +1078,13 @@ def transformer_engine_context():
     yield
   except (ImportError, AttributeError):
     yield
+
+
+def generate_representative_group_sizes(target_m: int, g: int) -> tuple[int, ...]:
+  """Generate group sizes for a given target m."""
+  np.random.seed(0)
+  repr_val = np.random.uniform(size=(g,))
+  repr_val = np.random.binomial(1, 0.9, (g,)) * repr_val
+  repr_val = np.int32((repr_val / np.sum(repr_val)) * target_m)
+  repr_val[0] += target_m - np.sum(repr_val)
+  return tuple(map(int, repr_val))
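
The new helper deterministically splits target_m rows across g groups (fixed seed 0), zeroes roughly 10% of groups to mimic unbalanced routing, and folds any rounding remainder into group 0 so the sizes always sum back to target_m. Below is a standalone check of that invariant; it copies the helper's logic verbatim so it runs without MaxText installed.

```python
import numpy as np

def generate_representative_group_sizes(target_m: int, g: int) -> tuple[int, ...]:
  # Same logic as the new max_utils helper above.
  np.random.seed(0)
  repr_val = np.random.uniform(size=(g,))
  repr_val = np.random.binomial(1, 0.9, (g,)) * repr_val
  repr_val = np.int32((repr_val / np.sum(repr_val)) * target_m)
  repr_val[0] += target_m - np.sum(repr_val)
  return tuple(map(int, repr_val))

sizes = generate_representative_group_sizes(target_m=1024, g=8)
assert len(sizes) == 8 and sum(sizes) == 1024  # remainder folded into group 0
print(sizes)
```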
