
Commit 3fc8a2b

Merge pull request #3627 from AI-Hypercomputer:nicogrande/fused-moe-gmm
PiperOrigin-RevId: 903492693
2 parents 24a3c79 + d0a0744

7 files changed: 485 additions & 21 deletions


src/maxtext/configs/inference/vllm.yml

Lines changed: 12 additions & 12 deletions
@@ -30,17 +30,19 @@ weight_dtype: bfloat16
 mesh_axes: ['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']
 logical_axis_rules: [
   ['activation_batch', ['data']],
-  ['activation_batch_moe', []],
+  ['activation_batch_moe', ['data']],
   ['activation_embed_and_logits_batch', ['data', 'expert']],
   ['activation_embed_and_logits_batch_sequence', ['data', 'expert']],
   ['activation_heads', ['model', 'expert']],
   ['activation_kv_heads', ['model', 'expert']],
   ['activation_attn_length', []],
-  ['activation_length', ['data']],
-  ['activation_length_moe', ['data', 'expert']],
-  ['activation_length_moe', 'data'],
+  ['activation_length', []],
+  ['activation_length_moe', []],
   ['activation_q_length', ['expert', 'attn_dp_expert']],
   ['activation_attn_embed', 'model'],
+  # Expert is missing explicitly from activation_embed despite using TP.
+  # We are going for a replicate-AR style of TP as opposed to our typical AG-RS style of TP
+  # due to the output sharding of the fused_moe_gmm kernel in tpu-inference.
   ['activation_embed', ['model', 'attn_dp']],
   ['activation_embed_moe', ['model', 'attn_dp']],
   ['activation_mlp', ['model', 'attn_dp']],
@@ -53,23 +55,21 @@ logical_axis_rules: [
   ['activation_norm_length', []],
   ['activation_norm_length_moe', []],
   ['activation_exp', ['expert', 'attn_dp_expert']],
-  ['decode_batch', ['expert', 'attn_dp_expert']],
-  ['decode_batch_moe', []],
+  ['decode_batch', ['data']],
+  ['decode_batch_moe', ['data']],
   ['decode_length', []],
   ['mlp', ['model', 'attn_dp']],
   ['mlp_moe', ['model', 'attn_dp']],
   ['mlp_no_fsdp', ['model', 'attn_dp']],
   ['vocab', ['model', 'attn_dp']],
-  ['heads', ['model']],
+  # Expert is intended to act like TP for attention.
+  # We target two all-reduces, one at the end of attention out projection and one at the end of the feedforward.
+  ['heads', ['model', 'expert']],
   ['q_heads', ['model', 'expert']],
   ['kv_heads', ['model', 'expert']],
   ['kv_head_dim', []],
   ['kv', []],
-  ['embed', ['expert', 'attn_dp_expert']],
-  ['embed', ['attn_dp_expert']],
-  ['embed_vocab', ['expert', 'attn_dp_expert']],
-  ['embed_vocab', ['attn_dp_expert']],
-  ['embed_moe', []],
+  ['embed', []],
   ['embed_moe', []],
   ['embed_tensor_transpose', ['attn_dp', 'model']],
   ['q_lora', ['expert', 'attn_dp_expert']],
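
Note on the rule changes above: each logical_axis_rules entry maps a logical activation or weight axis to the mesh axes it is sharded over, and an empty list means the axis is replicated. A minimal sketch of how such rules resolve to a PartitionSpec via Flax's logical_to_mesh_axes, using an assumed subset of the rules (illustrative only, not MaxText code):

# Illustrative sketch (not MaxText code): how logical axis rules like the ones
# above resolve into a PartitionSpec. The rule subset here is an assumption
# chosen just for the example.
import flax.linen as nn

rules = (
    ("activation_batch_moe", "data"),   # shard this logical axis over the 'data' mesh axis
    ("activation_length_moe", None),    # None (or [] in the yml) -> replicated
    ("embed", None),
)

spec = nn.logical_to_mesh_axes(("activation_batch_moe", "activation_length_moe", "embed"), rules)
print(spec)  # e.g. PartitionSpec('data', None, None)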

src/maxtext/configs/types.py

Lines changed: 5 additions & 0 deletions
@@ -698,6 +698,11 @@ class MoEGeneral(BaseModel):
       False,
       description="Whether to cast inputs to fp32 to compute MoE gate logits for numerical stability.",
   )
+  prefuse_moe_weights: bool = Field(
+      False,
+      description="Whether to pre-fuse MoE weights (w0 and w1) during initialization. "
+      "This is useful for inference performance in vllm_rpa mode.",
+  )


 class MoEKernels(BaseModel):

src/maxtext/inference/vllm_decode.py

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ def decode_with_vllm(config: Config) -> None:
           "weight_dtype": "bfloat16",
           "allow_split_physical_axes": True,
           "debug_sharding": config.debug_sharding,
+          "prefuse_moe_weights": config.prefuse_moe_weights,
       },
       "sharding": {
           "sharding_strategy": {

src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py

Lines changed: 8 additions & 5 deletions
@@ -124,6 +124,9 @@ def __init__(self, vllm_config: VllmConfig, rng_key: jax.Array, mesh: Mesh):
     # Model creation
     self.model: nnx.Module | None = None

+    # Indicates that the model handles its own sharding logic
+    self._self_manages_sharding = True
+
     # Handle dummy weight loading during initialization
     if vllm_config.load_config.load_format == "dummy":
       self.load_weights(rng_key)
@@ -161,8 +164,8 @@ def __call__(
       raise ValueError("Model must be an instance of type nnx.Module.")

     # Ensure inputs are at least 2D with a batch dimension
-    input_ids = jnp.atleast_2d(input_ids)
-    input_positions = jnp.atleast_2d(attention_metadata.input_positions)
+    input_ids = jnp.expand_dims(input_ids, axis=1)
+    input_positions = jnp.expand_dims(attention_metadata.input_positions, axis=1)

     with self.mesh, nn.logical_axis_rules(self.maxtext_config.logical_axis_rules):
       aux_hidden_states = []
@@ -233,7 +236,7 @@ def compute_logits(self, hidden_states: jax.Array) -> jax.Array:

     with self.mesh, nn.logical_axis_rules(self.maxtext_config.logical_axis_rules):
       # Reshape to (num_tokens, 1, hidden_dim) for decoder output head
-      y = hidden_states[:, jnp.newaxis, :]
+      y = jnp.expand_dims(hidden_states, axis=1)

       # Compute logits using the MaxText decoder's output head
       logits = self.model.decoder.apply_output_head(self.model.token_embedder, y, True, self.model_mode)
@@ -250,8 +253,8 @@ def load_weights(self, rng_key: jax.Array) -> None:
     if self.model is not None:
       return

-    with self.mesh, nn.logical_axis_rules(""):
-      model = model_creation_utils.from_pretrained(
+    with self.mesh, nn.logical_axis_rules(self.maxtext_config.logical_axis_rules):
+      model, _ = model_creation_utils.create_nnx_model(
           self.maxtext_config, mesh=self.mesh, model_mode=self.model_mode, rng_key=rng_key
       )
     self.model = nnx.data(model)
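
Note on the input reshaping change above: for a flat stream of T token ids, jnp.atleast_2d prepends the new axis and yields shape (1, T), whereas jnp.expand_dims(..., axis=1) yields (T, 1), so each token becomes its own length-1 row. A minimal shape check (illustrative values only):

# Shape check for the change above: jnp.atleast_2d vs jnp.expand_dims(axis=1)
# on a flat stream of token ids (toy values).
import jax.numpy as jnp

input_ids = jnp.array([11, 42, 7, 99])           # shape (4,) — four tokens
print(jnp.atleast_2d(input_ids).shape)           # (1, 4): one row of four tokens
print(jnp.expand_dims(input_ids, axis=1).shape)  # (4, 1): four rows of one token each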

src/maxtext/layers/moe.py

Lines changed: 106 additions & 4 deletions
@@ -388,7 +388,10 @@ def __init__(
         kernel_init=self.kernel_init,
         kernel_axes=self.kernel_axes,
         use_bias=self.config.routed_bias,
-        score_func=self.config.routed_score_func,
+        # tpu-inference applies the score function in the fused_moe_gmm kernel,
+        # so we don't apply it here to avoid redundant computation.
+        # See https://github.com/vllm-project/tpu-inference/blob/main/tpu_inference/layers/common/fused_moe_gmm.py#L58.
+        score_func="" if self.config.attention == "vllm_rpa" else self.config.routed_score_func,
         matmul_precision=self.config.matmul_precision,
         shard_mode=config.shard_mode,
         rngs=self.rngs,
@@ -407,6 +410,27 @@ def __init__(
       self.wi_0 = jnp.zeros((num_experts, self.moe_expert_input_dim, intermediate_dim))
       self.wi_1 = jnp.zeros((num_experts, self.moe_expert_input_dim, intermediate_dim))
       self.wo = jnp.zeros((num_experts, intermediate_dim, self.moe_expert_input_dim))
+    elif self.config.prefuse_moe_weights and self.config.attention == "vllm_rpa":
+      self.wi = nnx.Param(
+          self.kernel_init(
+              self.rngs.params(),
+              (num_experts, self.moe_expert_input_dim, intermediate_dim * 2),
+              weight_dtype,
+              kernel_in_axis,
+              kernel_out_axis,
+          ),
+          sharding=self.wi_kernel_axes,
+      )
+      self.wo = nnx.Param(
+          self.kernel_init(
+              self.rngs.params(),
+              (self.num_experts, self.intermediate_dim, self.moe_expert_input_dim),
+              self.weight_dtype,
+              kernel_in_axis,
+              kernel_out_axis,
+          ),
+          sharding=self.wo_kernel_axes,
+      )
     else:
       self.wi_0 = nnx.Param(
           self.kernel_init(
@@ -2009,6 +2033,72 @@ def dense_matmul(
     ).astype(self.dtype)
     return output, lb_loss, bias_updates

+  def fused_moe_matmul(
+      self,
+      inputs,
+      gate_logits,
+      wo_kernel,
+      w0_kernel=None,
+      w1_kernel=None,
+      fused_kernel=None,
+  ) -> tuple[jax.Array, None, None]:
+    """Fused MoE via tpu_inference fused_moe_func (vllm_rpa path only).
+
+    fused_moe_func handles routing, GMM, and weighted combination internally.
+    It does not compute lb_loss or bias_updates (inference-only).
+    """
+    try:
+      # pylint: disable=import-outside-toplevel
+      # pytype: disable=import-error
+      from tpu_inference.layers.common.fused_moe_gmm import fused_moe_func
+    except ImportError as e:
+      raise ImportError("fused_moe_matmul requires the tpu-inference package.") from e
+
+    # Reshape 3D [B, S, D] -> 2D [T, D] (fused_moe_func expects 2D input)
+    batch_size, seq_len, emb_dim = inputs.shape
+    hidden_states = jnp.reshape(inputs, (batch_size * seq_len, emb_dim))
+    gating_output = jnp.reshape(gate_logits, (batch_size * seq_len, self.num_experts))
+
+    # Concatenate gate and up projections: [E, D, H] + [E, D, H] -> [E, D, 2H]
+    # fused_moe_func splits this internally: gate=w1[..., :H], up=w1[..., H:]
+    if fused_kernel is None:
+      fused_kernel = jnp.concatenate([w0_kernel, w1_kernel], axis=-1)
+
+    # Use expert parallelism if the expert axis has size > 1
+    use_ep = self.get_expert_parallelism_size() > 1
+
+    # Map MaxText config fields to fused_moe_func args
+    activation = self.config.mlp_activations[0]  # e.g. "silu"
+    scoring_fn = self.config.routed_score_func if self.config.routed_score_func else "softmax"
+
+    # Check if the model architecture intrinsically renormalizes weights
+    renormalize = self.config.norm_topk_prob or (
+        self.config.decoder_block not in (ctypes.DecoderBlockType.LLAMA4, ctypes.DecoderBlockType.GEMMA4)
+    )
+
+    output_2d = fused_moe_func(
+        hidden_states=hidden_states,
+        w1=fused_kernel,
+        w2=wo_kernel,
+        w1_scale=None,
+        w2_scale=None,
+        w1_bias=None,
+        w2_bias=None,
+        gating_output=gating_output,
+        topk=self.num_experts_per_tok,
+        renormalize=renormalize,
+        mesh=self.mesh,
+        use_ep=use_ep,
+        activation=activation,
+        scoring_fn=scoring_fn,
+        sc_kernel_threshold=16777216,
+        sc_kernel_col_chunk_size=1024,
+    )
+
+    # Reshape output 2D [T, D] -> 3D [B, S, D]
+    output = jnp.reshape(output_2d, (batch_size, seq_len, emb_dim))
+    return output, None, None
+
   def retrieve_quantized_weight(
       self,
       inputs,
@@ -2047,10 +2137,17 @@ def __call__(
     routing_inputs = inputs if gate_inputs is None else gate_inputs.astype(gate_dtype)
     gate_logits, pre_bias_logits = self.gate(routing_inputs)

-    w0_kernel = jnp.asarray(self.wi_0[...], self.dtype)
-    w1_kernel = jnp.asarray(self.wi_1[...], self.dtype)
     wo_kernel = jnp.asarray(self.wo[...], self.dtype)

+    fused_kernel = None
+    w0_kernel = None
+    w1_kernel = None
+    if cfg.prefuse_moe_weights and cfg.attention == "vllm_rpa":
+      fused_kernel = jnp.asarray(self.wi[...], self.dtype)
+    else:
+      w0_kernel = jnp.asarray(self.wi_0[...], self.dtype)
+      w1_kernel = jnp.asarray(self.wi_1[...], self.dtype)
+
     if self.per_expert_scale is not None:
       wo_kernel = wo_kernel * jnp.asarray(self.per_expert_scale[...], self.dtype)[:, None, None]

@@ -2061,7 +2158,12 @@ def __call__(
     else:
       w0_bias, w1_bias, wo_bias = None, None, None

-    if cfg.sparse_matmul:
+    # vllm_rpa codepath uses fused_moe_func from tpu_inference for optimized inference.
+    if cfg.attention == "vllm_rpa":
+      output, lb_loss, bias_updates = self.fused_moe_matmul(
+          inputs, gate_logits, wo_kernel, w0_kernel=w0_kernel, w1_kernel=w1_kernel, fused_kernel=fused_kernel
+      )
+    elif cfg.sparse_matmul:
      if quantizations.in_serve_mode(self.quant):
        w0_kernel, w1_kernel, wo_kernel = self.retrieve_quantized_weight(
            inputs,
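
Note on the prefused layout used by fused_moe_matmul above: wi stores the gate (w0) and up (w1) projections concatenated along the last axis into [E, D, 2H], and the kernel is expected to slice gate = wi[..., :H] and up = wi[..., H:]. A toy check in plain JAX (not the tpu-inference kernel; shapes and values are arbitrary) that the fused layout reproduces the separate-weight computation:

# Toy check (plain JAX, not the Pallas kernel): concatenating w0 (gate) and w1 (up)
# along the last axis and slicing the halves back out gives the same per-expert MLP.
# E=2 experts, D=4 model dim, H=3 hidden dim, 5 tokens per expert — arbitrary sizes.
import jax
import jax.numpy as jnp

k0, k1, kx = jax.random.split(jax.random.PRNGKey(0), 3)
E, D, H = 2, 4, 3
w0 = jax.random.normal(k0, (E, D, H))   # gate projection per expert
w1 = jax.random.normal(k1, (E, D, H))   # up projection per expert
x = jax.random.normal(kx, (E, 5, D))    # tokens already routed to each expert

wi = jnp.concatenate([w0, w1], axis=-1)  # [E, D, 2H], the prefused layout

# Separate-weight reference: silu(x @ w0) * (x @ w1)
ref = jax.nn.silu(jnp.einsum("etd,edh->eth", x, w0)) * jnp.einsum("etd,edh->eth", x, w1)
# Fused layout: slice the halves back out of wi before the same computation
fused = jax.nn.silu(jnp.einsum("etd,edh->eth", x, wi[..., :H])) * jnp.einsum("etd,edh->eth", x, wi[..., H:])

print(jnp.allclose(ref, fused))  # True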

src/maxtext/utils/model_creation_utils.py

Lines changed: 69 additions & 0 deletions
@@ -28,6 +28,7 @@
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
+import numpy as np
 from jax.sharding import Mesh
 from maxtext.configs import pyconfig
 from maxtext.common.common_types import MODEL_MODE_TRAIN
@@ -507,6 +508,39 @@ def create_sharded_state():
   # Get the structure of checkpoint in `config.load_parameters_path`
   metadata = ckptr.metadata(config.load_parameters_path)

+  def _adjust_target_for_moe_fusion(target, meta_tree, is_nnx):
+    if not hasattr(target, "items") or not hasattr(meta_tree, "items"):
+      return target
+    new_target = {}
+    for k, v in target.items():
+      if k == "wi" and "wi" not in meta_tree and "wi_0" in meta_tree and "wi_1" in meta_tree:
+        if not is_nnx:
+          arr = v
+          half_dim = arr.shape[-1] // 2
+          new_target["wi_0"] = jax.ShapeDtypeStruct(
+              shape=arr.shape[:-1] + (half_dim,), dtype=arr.dtype, sharding=arr.sharding
+          )
+          new_target["wi_1"] = jax.ShapeDtypeStruct(
+              shape=arr.shape[:-1] + (half_dim,), dtype=arr.dtype, sharding=arr.sharding
+          )
+        else:
+          arr = v["value"]
+          half_dim = arr.shape[-1] // 2
+          new_target["wi_0"] = {
+              "value": jax.ShapeDtypeStruct(
+                  shape=arr.shape[:-1] + (half_dim,), dtype=arr.dtype, sharding=arr.sharding
+              )
+          }
+          new_target["wi_1"] = {
+              "value": jax.ShapeDtypeStruct(
+                  shape=arr.shape[:-1] + (half_dim,), dtype=arr.dtype, sharding=arr.sharding
+              )
+          }
+      else:
+        new_target[k] = _adjust_target_for_moe_fusion(v, meta_tree.get(k, {}), is_nnx)
+
+    return new_target
+
   is_nnx_checkpoint = True
   if (
       "params" in metadata.item_metadata.tree.keys()
@@ -520,6 +554,10 @@ def create_sharded_state():
         is_leaf=lambda n: hasattr(n, "value"),
     )

+    target_for_restore = _adjust_target_for_moe_fusion(
+        target_for_restore, metadata.item_metadata.tree["params"]["params"], False
+    )
+
     item_to_restore = {"params": {"params": target_for_restore}}
     base_restore_args = ocp.checkpoint_utils.construct_restore_args(target_for_restore)
     restore_args = {
@@ -538,6 +576,7 @@ def create_sharded_state():
         sharded_state,
         is_leaf=lambda n: isinstance(n, nnx.Variable),
     )
+    target_for_restore = _adjust_target_for_moe_fusion(target_for_restore, metadata.item_metadata.tree, True)
     item_to_restore = target_for_restore
     base_restore_args = ocp.checkpoint_utils.construct_restore_args(target_for_restore)
     restore_args = _fix_restore_args_for_shape_mismatch(
@@ -577,6 +616,36 @@ def create_sharded_state():
       sharded_state,
       is_leaf=lambda n: isinstance(n, nnx.Variable),
   )
+
+  def to_dict(tree):
+    if hasattr(tree, "items"):
+      return {k: to_dict(v) for k, v in tree.items()}
+    return tree
+
+  model_arrays = to_dict(model_arrays)
+  checkpoint = to_dict(checkpoint)
+
+  def _fuse_moe_weights(ckpt_tree, model_arrays_tree):
+    if not hasattr(ckpt_tree, "items") or not hasattr(model_arrays_tree, "items"):
+      return ckpt_tree
+    new_ckpt = {}
+    for k, v in ckpt_tree.items():
+      if k in ("wi_0", "wi_1") and "wi" in model_arrays_tree:
+        continue
+      new_ckpt[k] = _fuse_moe_weights(v, model_arrays_tree.get(k, {}))
+
+    if "wi" in model_arrays_tree and "wi_0" in ckpt_tree and "wi_1" in ckpt_tree:
+      wi_0 = ckpt_tree["wi_0"]
+      wi_1 = ckpt_tree["wi_1"]
+      new_ckpt["wi"] = np.concatenate([wi_0, wi_1], axis=-1)
+
+    return new_ckpt
+
+  checkpoint = _fuse_moe_weights(checkpoint, model_arrays)
+  # Release the raw restored buffers now that wi_0/wi_1 have been fused (if needed).
+  # This prevents the replicated intermediate copies from persisting until function return.
+  del restored
+
   checkpoint = jax.tree.map(_expand_checkpoint_to_model_shapes, checkpoint, model_arrays)
   nnx.update(model, checkpoint)
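
Note on the checkpoint fusion above: an unfused checkpoint carries wi_0 and wi_1, while a model built with prefuse_moe_weights expects a single wi of doubled last dimension, so the restore target is first split back into two half-width ShapeDtypeStructs and the restored halves are then concatenated. A standalone toy sketch of that fusion step on plain dict trees (no Orbax involved; names and shapes are made up):

# Toy sketch of the fusion step on plain dict trees: a checkpoint carrying
# separate wi_0/wi_1 is rewritten to match a model that expects a single
# prefused "wi" with doubled last dimension.
import numpy as np

checkpoint = {
    "moe_block": {
        "wi_0": np.ones((2, 4, 3)),        # [experts, model_dim, hidden_dim]
        "wi_1": np.full((2, 4, 3), 2.0),
        "wo": np.zeros((2, 3, 4)),
    }
}
model_arrays = {"moe_block": {"wi": np.zeros((2, 4, 6)), "wo": np.zeros((2, 3, 4))}}

def fuse(ckpt, model):
    if not isinstance(ckpt, dict):
        return ckpt
    # Drop the unfused halves wherever the model wants a fused "wi"; recurse elsewhere.
    out = {k: fuse(v, model.get(k, {})) for k, v in ckpt.items()
           if not (k in ("wi_0", "wi_1") and "wi" in model)}
    if "wi" in model and "wi_0" in ckpt and "wi_1" in ckpt:
        out["wi"] = np.concatenate([ckpt["wi_0"], ckpt["wi_1"]], axis=-1)
    return out

fused = fuse(checkpoint, model_arrays)
print(fused["moe_block"]["wi"].shape)  # (2, 4, 6)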