@@ -29,59 +29,78 @@ weight_dtype: bfloat16
2929# -------------- Logical Axis Rules --------------
3030mesh_axes : ['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']
3131logical_axis_rules : [
32- ['activation_batch', ['data']],
33- ['activation_batch_moe', ['data']],
34- ['activation_embed_and_logits_batch', ['data', 'expert']],
35- ['activation_embed_and_logits_batch_sequence', ['data', 'expert']],
32+ # ==========================================
33+ # Vocabulary Embedding
34+ # ==========================================
35+ # Vocab Activations
36+ ['activation_embed_and_logits_batch', ['data']],
37+ ['activation_embed_and_logits_batch_sequence', ['data']],
38+ ['activation_vocab', ['model', 'expert', 'attn_dp', 'attn_dp_expert']],
39+ # Vocab Weights
40+ ['vocab', ['model', 'expert', 'attn_dp', 'attn_dp_expert']],
41+ ['embed_vocab', []],
42+ # ==========================================
43+ # Attention
44+ # ==========================================
45+ # Attention Activations
46+ ['activation_batch_attn', ['data', 'attn_dp', 'attn_dp_expert']],
3647 ['activation_heads', ['model', 'expert']],
3748 ['activation_kv_heads', ['model', 'expert']],
38- ['activation_attn_length', []],
39- ['activation_length', []],
40- ['activation_length_moe', []],
41- ['activation_q_length', ['expert', 'attn_dp_expert']],
42- ['activation_attn_embed', 'model'],
43- # Expert is missing explicitly from activation_embed despite using TP.
44- # We are going for a replicate-AR style of TP as opposed to our typical AG-RS style of TP
45- # due to the output sharding of the fused_moe_gmm kernel in tpu-inference.
46- ['activation_embed', ['model', 'attn_dp']],
47- ['activation_embed_moe', ['model', 'attn_dp']],
48- ['activation_mlp', ['model', 'attn_dp']],
49- ['activation_mlp_moe', ['model', 'attn_dp']],
50- ['activation_kv', ['model']],
51- ['activation_prefill_kv_batch', ['expert', 'attn_dp_expert']],
52- ['activation_kv_batch', ['data']],
53- ['activation_kv_head_dim', ['model']],
54- ['activation_vocab', ['model', 'attn_dp']],
55- ['activation_norm_length', []],
56- ['activation_norm_length_moe', []],
57- ['activation_exp', ['expert', 'attn_dp_expert']],
58- ['decode_batch', ['data']],
59- ['decode_batch_moe', ['data']],
60- ['decode_length', []],
61- ['mlp', ['model', 'attn_dp']],
62- ['mlp_moe', ['model', 'attn_dp']],
63- ['mlp_no_fsdp', ['model', 'attn_dp']],
64- ['vocab', ['model', 'attn_dp']],
65- # Expert is intended to act like TP for attention.
66- # We target two all-reduces, one at the end of attention out projection and one at the end of the feedforward.
49+ ['activation_attn_embed', []],
50+ ['activation_kv', ['model', 'expert']],
51+ ['activation_kv_batch', ['data', 'attn_dp', 'attn_dp_expert']],
52+ ['activation_kv_head_dim', []],
53+ # Attention Weights
6754 ['heads', ['model', 'expert']],
6855 ['q_heads', ['model', 'expert']],
6956 ['kv_heads', ['model', 'expert']],
70- ['kv_head_dim ', []],
57+ ['qkv', []],
7158 ['kv', []],
72- ['embed', ['expert', 'attn_dp_expert']],
73- ['embed', ['attn_dp_expert']],
74- ['embed_vocab', ['expert', 'attn_dp_expert']],
75- ['embed_vocab', ['attn_dp_expert']],
76- ['embed_moe', []],
59+ ['kv_head_dim', []],
60+ ['q_lora', []],
61+ ["q_lora_up_proj", []],
62+ ['kv_lora', []],
63+ ["kv_lora_up_proj", []],
64+ # ==========================================
65+ # Mixture of Experts (MoE)
66+ # ==========================================
67+ # MoE Activations
68+ ['activation_batch_moe', ['data']],
69+ ['activation_embed_moe', ['model']],
70+ ['activation_mlp_moe', []],
71+ ['activation_exp', ['expert', 'attn_dp', 'attn_dp_expert']],
72+ # MoE Weights
73+ ['exp', ['expert', 'attn_dp', 'attn_dp_expert']],
74+ ['mlp_moe', []],
7775 ['embed_moe', []],
78- ['embed_tensor_transpose', ['attn_dp', 'model']],
79- ['q_lora', ['expert', 'attn_dp_expert']],
80- ['kv_lora', ['expert', 'attn_dp_expert']],
76+ # ==========================================
77+ # Standard MLP / Dense Layers / Model Structure
78+ # ==========================================
79+ # Dense Activations
80+ ['activation_mlp', ['model', 'expert', 'attn_dp', 'attn_dp_expert']],
81+ # Note activation batch and length also get used in attention and vocab
82+ ['activation_batch', ['data']],
83+ ['activation_embed', ['model', 'expert', 'attn_dp', 'attn_dp_expert']],
84+ # General Weights
85+ ['mlp', ['model', 'expert', 'attn_dp', 'attn_dp_expert']],
86+ ['embed', []],
8187 ['norm', []],
88+ # ==========================================
89+ # Inference (Prefill, Decode, Cache)
90+ # ==========================================
91+ ['activation_prefill_kv_batch', ['data', 'attn_dp', 'attn_dp_expert']],
92+ ['decode_batch', ['data', 'attn_dp', 'attn_dp_expert']],
93+ ['cache_heads', ['model', 'expert']],
8294 ['cache_heads', ['model']],
83- ['exp', ['expert', 'attn_dp_expert']],
84- ['paged_kv_heads', ['model']],
85- ]
95+ ['paged_kv_heads', ['model', 'expert']],
96+ ['cache_batch_prefill', []],
97+ ['cache_batch', []],
98+ ['cache_heads_none', []],
99+ ['cache_kv', []],
100+ ['cache_sequence', []],
101+ ['num_pages', []],
102+ ['tokens_per_page', []],
103+ ['paged_kv_head_dim_size', []],
104+ ]
86105data_sharding : [['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']]
87106input_data_sharding_logical_axes : ['activation_embed_and_logits_batch']
0 commit comments