Skip to content

Commit 1d2eefa

Browse files
NicoGrande and khatwanimohit
authored and committed
change EP axis to expert from attn_dp_expert.
1 parent 060fcd4 commit 1d2eefa

5 files changed

Lines changed: 282 additions & 9 deletions

File tree

src/maxtext/configs/inference/vllm.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ logical_axis_rules: [
3333
['activation_batch_no_exp', []],
3434
['activation_embed_and_logits_batch', ['expert']],
3535
['activation_embed_and_logits_batch_sequence', ['expert']],
36-
['activation_heads', ['model']],
37-
['activation_kv_heads', ['model']],
36+
['activation_heads', ['model', 'expert']],
37+
['activation_kv_heads', ['model', 'expert']],
3838
['activation_attn_length', ['expert']],
3939
['activation_attn_length_no_exp', []],
4040
['activation_length', ['data', 'expert']],
@@ -58,11 +58,12 @@ logical_axis_rules: [
5858
['moe_mlp', ['model', 'attn_dp']],
5959
['vocab', ['model', 'attn_dp']],
6060
['heads', ['model']],
61-
['q_heads', ['model']],
62-
['kv_heads', ['model']],
61+
['q_heads', ['model', 'expert']],
62+
['kv_heads', ['model', 'expert']],
6363
['kv_head_dim', []],
6464
['kv', []],
6565
['embed', ['expert', 'attn_dp_expert']],
66+
['embed', ['attn_dp_expert']],
6667
['embed_tensor_transpose', ['attn_dp', 'model']],
6768
['embed_no_exp', []],
6869
['q_lora', ['expert', 'attn_dp_expert']],

src/maxtext/inference/vllm_decode.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ def decode_with_vllm(config: Config) -> None:
145145
max_tokens=max_tokens_to_generate,
146146
top_k=config.decode_sampling_top_k,
147147
top_p=config.decode_sampling_nucleus_p,
148-
seed=FLAGS.seed,
149148
)
150149

151150
outputs = llm.generate(prompts, sampling_params)

src/maxtext/integration/vllm/maxtext_vllm_adapter/adapter.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
try:
3232
from tpu_inference.layers.common.attention_metadata import AttentionMetadata
33+
from tpu_inference.layers.common.attention_interface import ShardingAxisName
3334
except ImportError:
3435
# Mock for documentation build or environments without tpu_inference
3536
class AttentionMetadata:
@@ -39,7 +40,7 @@ class AttentionMetadata:
3940
from vllm.config import VllmConfig
4041

4142

42-
def generate_maxtext_config(vllm_config: VllmConfig) -> pyconfig.HyperParameters:
43+
def generate_maxtext_config(vllm_config: VllmConfig, mesh: Mesh) -> pyconfig.HyperParameters:
4344
"""Generates a MaxText configuration from a vLLM configuration.
4445
4546
This function takes a vLLM configuration object and translates relevant
@@ -50,6 +51,7 @@ def generate_maxtext_config(vllm_config: VllmConfig) -> pyconfig.HyperParameters
5051
Args:
5152
vllm_config: The vLLM configuration object containing model and load
5253
parameters.
54+
mesh: The JAX mesh device for model sharding.
5355
5456
Returns:
5557
A `pyconfig.HyperParameters` object configured for MaxText.
@@ -73,6 +75,22 @@ def generate_maxtext_config(vllm_config: VllmConfig) -> pyconfig.HyperParameters
7375
base_config_path = os.path.join(MAXTEXT_CONFIGS_DIR, "inference", "vllm.yml")
7476
argv_list = ["", str(base_config_path)]
7577

78+
# Pad the number of KV heads if it's less than the TP / EP size
79+
if isinstance(ShardingAxisName.ATTN_HEAD, tuple):
80+
tp_sizes = [mesh.shape[axis_name] for axis_name in ShardingAxisName.ATTN_HEAD]
81+
max_tp_size = max(tp_sizes)
82+
else:
83+
max_tp_size = mesh.shape[ShardingAxisName.ATTN_HEAD]
84+
85+
if (
86+
max_tp_size % vllm_config.model_config.get_total_num_kv_heads() == 0
87+
and vllm_config.model_config.get_total_num_kv_heads() < max_tp_size
88+
):
89+
max_logging.log(
90+
f"Padding num_kv_heads from {vllm_config.model_config.get_total_num_kv_heads()} to {max_tp_size} to match tp_size."
91+
)
92+
overrides["base_num_kv_heads"] = max_tp_size
93+
7694
maxtext_config = pyconfig.initialize(argv_list, **overrides)
7795
return maxtext_config
7896

@@ -96,7 +114,7 @@ def __init__(self, vllm_config: VllmConfig, rng_key: jax.Array, mesh: Mesh):
96114
"""
97115
self.vllm_config = vllm_config
98116
self.cfg = vllm_config.model_config
99-
self.maxtext_config = generate_maxtext_config(vllm_config)
117+
self.maxtext_config = generate_maxtext_config(vllm_config, mesh)
100118

101119
# Model configuration
102120
self.mesh = mesh

src/maxtext/utils/model_creation_utils.py

Lines changed: 147 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# pylint: disable=bare-except, consider-using-generator
1616
""" Utils that are only interesting for creating a model in MaxText. """
1717

18+
import dataclasses
1819
from collections.abc import Sequence
1920
from functools import partial
2021
from typing import overload
@@ -23,15 +24,128 @@
2324
from flax import nnx
2425
import flax.linen as nn
2526
import jax
27+
import jax.numpy as jnp
2628
from jax.sharding import AxisType, Mesh
2729
from maxtext.configs import pyconfig
2830
from maxtext.common.common_types import MODEL_MODE_TRAIN, ShardMode
2931
from maxtext.layers import quantizations
3032
from maxtext.models import models
33+
from maxtext.utils import max_logging
3134
from maxtext.utils import max_utils
3235
from maxtext.utils import maxtext_utils
3336
from orbax import checkpoint as ocp
3437

38+
try:
39+
from orbax.checkpoint.metadata import ArrayMetadata as _OrbaxArrayMetadata
40+
41+
def _is_orbax_array_metadata(x):
42+
return isinstance(x, _OrbaxArrayMetadata)
43+
44+
except ImportError:
45+
46+
def _is_orbax_array_metadata(x):
47+
return hasattr(x, "shape") and hasattr(x, "sharding") and hasattr(x, "dtype") and not isinstance(x, jax.Array)
48+
49+
50+
def _expand_checkpoint_to_model_shapes(ckpt_arr, model_arr):
51+
"""Expand ckpt_arr to model_arr's shape and re-shard to model_arr's sharding.
52+
53+
Used to expand checkpoint KV-head (and similar) arrays that were saved with
54+
fewer heads than the padded model shape requires (e.g. due to TP/EP padding
55+
in adapter.py). Each dimension must divide evenly into the corresponding
56+
model dimension.
57+
58+
Uses jnp.repeat so that each original slice is placed adjacent to its copies.
59+
For GQA with TP, device i needs KV head i//ratio from the original checkpoint,
60+
so the correct layout is e.g. [h0, h0, h1, h1, h2, h2, h3, h3] rather than
61+
[h0, h1, h2, h3, h0, h1, h2, h3].
62+
"""
63+
ckpt_shape = ckpt_arr.shape
64+
model_shape = model_arr.shape
65+
if ckpt_shape == model_shape:
66+
return jax.device_put(ckpt_arr, model_arr.sharding)
67+
if len(ckpt_shape) != len(model_shape):
68+
raise ValueError(
69+
f"Checkpoint and model arrays have different ranks: {ckpt_shape} vs {model_shape}. "
70+
"If the checkpoint was saved with scan_layers=True (stacked layers), convert it to "
71+
"unscanned format before loading with vLLM (vllm.yml sets scan_layers=False)."
72+
)
73+
result = ckpt_arr
74+
for axis, (ckpt_dim, model_dim) in enumerate(zip(ckpt_shape, model_shape)):
75+
if model_dim % ckpt_dim != 0:
76+
raise ValueError(
77+
f"Model dimension {model_dim} is not evenly divisible by checkpoint dimension {ckpt_dim}."
78+
f" Full shapes — checkpoint: {ckpt_shape}, model: {model_shape}"
79+
)
80+
if model_dim != ckpt_dim:
81+
result = jnp.repeat(result, model_dim // ckpt_dim, axis=axis)
82+
return jax.device_put(result, model_arr.sharding)
83+
84+
85+
def _fix_restore_args_for_shape_mismatch(restore_args, stored_metadata_tree, mesh):
86+
"""Use replicated sharding for arrays whose checkpoint shape differs from the model shape.
87+
88+
When the model is initialized with padded shapes (e.g. KV heads padded to match
89+
TP size) but the checkpoint was saved with smaller shapes, Orbax will reject the
90+
restore because the provided sharding is incompatible with the stored shape.
91+
For those arrays we switch to a fully-replicated sharding and clear global_shape
92+
so Orbax loads the array as-written. _expand_checkpoint_to_model_shapes then
93+
expands and re-shards the loaded arrays to match the model.
94+
95+
Uses tree_map_with_path so each ArrayRestoreArgs is looked up by path in the
96+
metadata dict — avoids ordering/count mismatches from flattening two trees with
97+
different pytree node types (e.g. nnx.State vs plain dict) independently.
98+
"""
99+
replicated = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec())
100+
101+
def _key_str(key):
102+
"""Extract string name from a JAX path key (DictKey, GetAttrKey, etc.)."""
103+
if hasattr(key, "key"):
104+
return str(key.key)
105+
if hasattr(key, "attr"):
106+
return str(key.attr)
107+
return str(key)
108+
109+
def _lookup_stored_meta(path):
110+
"""Navigate stored_metadata_tree using path keys from the restore_args tree."""
111+
node = stored_metadata_tree
112+
for key in path:
113+
name = _key_str(key)
114+
if isinstance(node, dict) and name in node:
115+
node = node[name]
116+
else:
117+
return None
118+
return node
119+
120+
mismatched_paths = []
121+
122+
def _fix_one(path, restore_arg):
123+
if not isinstance(restore_arg, ocp.ArrayRestoreArgs):
124+
return restore_arg
125+
stored_meta = _lookup_stored_meta(path)
126+
if stored_meta is not None and _is_orbax_array_metadata(stored_meta):
127+
stored_shape = tuple(stored_meta.shape)
128+
if (
129+
restore_arg.global_shape is not None
130+
and restore_arg.global_shape != stored_shape
131+
and len(stored_shape) == len(restore_arg.global_shape)
132+
):
133+
mismatched_paths.append(
134+
f" {'.'.join(_key_str(k) for k in path)}: stored={stored_shape} -> model={restore_arg.global_shape}"
135+
)
136+
return dataclasses.replace(
137+
restore_arg, global_shape=None, shape=None, sharding=replicated, mesh=None, mesh_axes=None
138+
)
139+
return restore_arg
140+
141+
fixed = jax.tree_util.tree_map_with_path(_fix_one, restore_args, is_leaf=lambda x: isinstance(x, ocp.ArrayRestoreArgs))
142+
if mismatched_paths:
143+
max_logging.log(
144+
f"Checkpoint shape mismatches ({len(mismatched_paths)} arrays): loading with replicated "
145+
"sharding and expanding to model shape after restore.\n" + "\n".join(mismatched_paths)
146+
)
147+
return fixed
148+
35149

36150
@overload
37151
def from_config(
@@ -154,6 +268,7 @@ def create_sharded_state():
154268
with nn.logical_axis_rules(config.logical_axis_rules):
155269
sharded_state = create_sharded_state()
156270
model = nnx.merge(graphdef, sharded_state)
271+
157272
# print weights sharding info under debug sharding mode
158273
if config.debug_sharding:
159274
max_utils.print_non_trivial_mesh_axis(model.mesh)
@@ -163,6 +278,7 @@ def create_sharded_state():
163278
mesh=model.mesh,
164279
logical_annotations=specs,
165280
)
281+
166282
if config.load_parameters_path:
167283
try:
168284
ckptr = ocp.Checkpointer(
@@ -196,7 +312,16 @@ def create_sharded_state():
196312
)
197313

198314
item_to_restore = {"params": {"params": target_for_restore}}
199-
restore_args = {"params": {"params": ocp.checkpoint_utils.construct_restore_args(target_for_restore)}}
315+
base_restore_args = ocp.checkpoint_utils.construct_restore_args(target_for_restore)
316+
restore_args = {
317+
"params": {
318+
"params": _fix_restore_args_for_shape_mismatch(
319+
base_restore_args,
320+
metadata.item_metadata.tree["params"]["params"],
321+
mesh,
322+
)
323+
}
324+
}
200325
else:
201326
# structure of nnx checkpoint: {'decoder': {'value': ...}}
202327
target_for_restore = jax.tree.map(
@@ -205,7 +330,12 @@ def create_sharded_state():
205330
is_leaf=lambda n: isinstance(n, nnx.Variable),
206331
)
207332
item_to_restore = target_for_restore
208-
restore_args = ocp.checkpoint_utils.construct_restore_args(target_for_restore)
333+
base_restore_args = ocp.checkpoint_utils.construct_restore_args(target_for_restore)
334+
restore_args = _fix_restore_args_for_shape_mismatch(
335+
base_restore_args,
336+
metadata.item_metadata.tree,
337+
mesh,
338+
)
209339

210340
restored = ckptr.restore(
211341
epath.Path(config.load_parameters_path),
@@ -223,7 +353,22 @@ def create_sharded_state():
223353
else:
224354
checkpoint = restored["params"]["params"]
225355

356+
loaded_count = len(jax.tree_util.tree_leaves(checkpoint))
357+
expected_count = len(jax.tree_util.tree_leaves(target_for_restore))
358+
if loaded_count < expected_count:
359+
raise ValueError(
360+
f"Checkpoint at '{config.load_parameters_path}' loaded only {loaded_count} of {expected_count} "
361+
"expected parameter arrays. This usually means a scanned (stacked-layers) checkpoint was provided "
362+
"where an unscanned checkpoint is required. Please convert the checkpoint to unscanned format first."
363+
)
364+
226365
if checkpoint:
366+
model_arrays = jax.tree.map(
367+
lambda v: v.value,
368+
sharded_state,
369+
is_leaf=lambda n: isinstance(n, nnx.Variable),
370+
)
371+
checkpoint = jax.tree.map(_expand_checkpoint_to_model_shapes, checkpoint, model_arrays)
227372
nnx.update(model, checkpoint)
228373

229374
except Exception as e:

0 commit comments

Comments (0)