Skip to content

Commit ccd91f4

Browse files
Merge pull request #3330 from AI-Hypercomputer:qinwen/change_ragged_dot_group_size
PiperOrigin-RevId: 880247095
2 parents ed706be + 9c2bff5 commit ccd91f4

3 files changed

Lines changed: 24 additions & 4 deletions

File tree

src/maxtext/layers/moe.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,10 @@ def sparse_matmul(
896896
def gmm(
897897
inputs, kernel, tiling, group_sizes, expert_assignments, weight_gather_axes, input_buffer_count, combine_scopes
898898
):
899+
tokamax_group_sizes = tokamax.RaggedDotGroupSizes(
900+
group_sizes,
901+
representative_value=max_utils.generate_representative_group_sizes(inputs.shape[0], kernel.shape[0]),
902+
)
899903
pad_length = self.config.wi_tile_fwd_batch_seq
900904
hs_shape = inputs.shape
901905
# pad length is the 1st dimension of tiling size in gmm call
@@ -926,7 +930,7 @@ def gmm(
926930
output = mblx.gmm(
927931
lhs=inputs,
928932
rhs=kernel,
929-
group_sizes=group_sizes,
933+
group_sizes=tokamax_group_sizes,
930934
preferred_element_type=self.dtype,
931935
tiling=tiling,
932936
lhs_quantize_dtype=lhs_quantize_dtype,
@@ -941,7 +945,7 @@ def gmm(
941945
output = tokamax.ragged_dot(
942946
lhs=inputs,
943947
rhs=kernel,
944-
group_sizes=group_sizes,
948+
group_sizes=tokamax_group_sizes,
945949
precision=jax.lax.Precision.DEFAULT,
946950
preferred_element_type=self.dtype,
947951
implementation="mosaic",

src/maxtext/models/deepseek_batchsplit.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from maxtext.layers import attention_op
2828
from maxtext.layers import moe as moe_lib
2929
from maxtext.layers import quantizations
30+
from maxtext.utils import max_utils
3031
import qwix.pallas as qpl
3132
import tokamax
3233

@@ -803,11 +804,16 @@ def gmm(
803804
input_buffer_count,
804805
combine_scopes,
805806
):
807+
808+
tokamax_group_sizes = tokamax.RaggedDotGroupSizes(
809+
group_sizes,
810+
representative_value=max_utils.generate_representative_group_sizes(inputs.shape[0], kernel.shape[0]),
811+
)
806812
if config.use_qwix_quantization:
807813
output = megablox.gmm(
808814
lhs=inputs,
809815
rhs=kernel,
810-
group_sizes=group_sizes,
816+
group_sizes=tokamax_group_sizes,
811817
preferred_element_type=preferred_element_type,
812818
tiling=tiling,
813819
use_qwix_quantization=config.use_qwix_quantization,
@@ -821,7 +827,7 @@ def gmm(
821827
output = tokamax.ragged_dot(
822828
lhs=inputs,
823829
rhs=kernel,
824-
group_sizes=group_sizes,
830+
group_sizes=tokamax_group_sizes,
825831
precision=jax.lax.Precision.DEFAULT,
826832
preferred_element_type=preferred_element_type,
827833
implementation="mosaic",

src/maxtext/utils/max_utils.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1078,3 +1078,13 @@ def transformer_engine_context():
10781078
yield
10791079
except (ImportError, AttributeError):
10801080
yield
1081+
1082+
1083+
def generate_representative_group_sizes(target_m: int, g: int) -> tuple[int, ...]:
  """Generate a deterministic, representative split of `target_m` across `g` groups.

  Produces `g` non-negative integers that sum exactly to `target_m`, intended
  as a representative value for ragged-dot group sizes. About 10% of the
  entries are zeroed via a Bernoulli(0.9) mask (presumably to model groups
  that receive no tokens); the rest get proportional pseudo-random shares.

  Args:
    target_m: Total count to distribute across the groups.
    g: Number of groups.

  Returns:
    A tuple of `g` Python ints summing exactly to `target_m`.
  """
  # Use a private legacy RandomState(0) instead of np.random.seed(0): the
  # drawn sequence is identical, but the process-global RNG state is left
  # untouched (seeding the global RNG here would silently clobber any
  # caller's randomness).
  rng = np.random.RandomState(0)
  shares = rng.uniform(size=(g,))
  # Zero out ~10% of entries with a Bernoulli(p=0.9) keep-mask.
  shares = rng.binomial(1, 0.9, (g,)) * shares
  total = np.sum(shares)
  if total == 0:
    # Degenerate case: the mask removed every entry; avoid division by zero
    # and let the remainder-assignment below put everything in group 0.
    sizes = np.zeros((g,), dtype=np.int32)
  else:
    sizes = np.int32((shares / total) * target_m)
  # int32 truncation leaves a non-negative remainder; fold it into group 0
  # so the sizes sum exactly to target_m.
  sizes[0] += target_m - np.sum(sizes)
  return tuple(map(int, sizes))

0 commit comments

Comments
 (0)