Skip to content

Commit 4d486c4

Browse files
committed
add ici_attn_dp_expert_parallelism config
1 parent eb81131 commit 4d486c4

4 files changed

Lines changed: 60 additions & 2 deletions

File tree

src/maxtext/configs/base.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,7 @@ ici_tensor_sequence_parallelism: 1
592592
ici_autoregressive_parallelism: 1
593593
ici_pipeline_parallelism: 1
594594
ici_expert_parallelism: 1
595+
ici_attn_dp_expert_parallelism: 1
595596

596597
# Enable ZeRO-1 optimizer sharding over data axis
597598
shard_optimizer_over_data: False
@@ -985,7 +986,7 @@ xprof_e2e_enable_fw_power_level_event: False
985986
xprof_e2e_enable_fw_thermal_event: False
986987
profile_power_events: False # Set to True to enable TPU-specific power/thermal profiling events. Defaults to False to avoid breaking GPU xplane tracing.
987988

988-
log_config: True # Prints the config (after defaults have been set by pyconfig logic)
989+
log_config: False # Prints the config (after defaults have been set by pyconfig logic)
989990
debug_sharding: False # Prints model weights sharding info
990991

991992
# Checkpoint Structured logging
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
mesh_axes: ['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']
2+
logical_axis_rules: [
3+
['activation_batch', []],
4+
['activation_batch_moe', ['data']],
5+
['activation_embed_and_logits_batch', ['data']],
6+
['activation_embed_and_logits_batch_sequence', ['data']],
7+
['activation_heads', ['model', 'expert']],
8+
['activation_kv_heads', ['model', 'expert']],
9+
['activation_attn_length', []],
10+
['activation_length', []],
11+
['activation_length_moe', []],
12+
['activation_q_length', ['expert']],
13+
['activation_attn_embed', 'model'],
14+
# Expert is deliberately omitted from activation_embed despite using TP.
15+
# We are going for a replicate-AR style of TP as opposed to our typical AG-RS style of TP
16+
# due to the output sharding of the fused_moe_gmm kernel in tpu-inference.
17+
['activation_embed', ['model', 'attn_dp', 'attn_dp_expert']],
18+
['activation_embed_moe', ['model', 'attn_dp', 'attn_dp_expert']],
19+
['activation_mlp', ['model']],
20+
['activation_mlp_moe', ['model']],
21+
['activation_kv', ['model']],
22+
['activation_prefill_kv_batch', ['expert']],
23+
['activation_kv_batch', ['data', 'attn_dp_expert']],
24+
['activation_kv_head_dim', ['model']],
25+
['activation_vocab', ['model', 'attn_dp']],
26+
['activation_norm_length', []],
27+
['activation_norm_length_moe', []],
28+
['activation_exp', ['expert', 'attn_dp_expert']],
29+
['decode_batch', ['data', 'attn_dp_expert']],
30+
['decode_batch_moe', ['data', 'attn_dp_expert']],
31+
['decode_length', []],
32+
['mlp', ['model', 'attn_dp']],
33+
['mlp_moe', ['model', 'attn_dp']],
34+
['mlp_no_fsdp', ['model', 'attn_dp']],
35+
['vocab', ['model', 'attn_dp']],
36+
# Expert is intended to act like TP for attention.
37+
# We target two all-reduces, one at the end of attention out projection and one at the end of the feedforward.
38+
['heads', ['model', 'expert']],
39+
['q_heads', ['model', 'expert']],
40+
['kv_heads', ['model', 'expert']],
41+
['kv_head_dim', []],
42+
['kv', []],
43+
['embed', []],
44+
['embed_moe', []],
45+
['embed_tensor_transpose', ['attn_dp', 'model']],
46+
['q_lora', ['expert']],
47+
['kv_lora', ['expert']],
48+
['norm', []],
49+
['cache_heads', ['model']],
50+
['exp', ['expert', 'attn_dp_expert']],
51+
['paged_kv_heads', ['model']],
52+
]
53+
data_sharding: [['data', 'attn_dp', 'model', 'expert', 'attn_dp_expert']]

src/maxtext/configs/types.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -886,6 +886,7 @@ class IciParallelism(BaseModel):
886886
ici_autoregressive_parallelism: int = Field(1, description="ICI axis for autoregressive parallelism.")
887887
ici_pipeline_parallelism: int = Field(1, description="ICI axis for pipeline parallelism.")
888888
ici_expert_parallelism: int = Field(1, description="ICI axis for expert parallelism.")
889+
ici_attn_dp_expert_parallelism: int = Field(1, description="ICI axis for attn dp expert parallelism.")
889890

890891

891892
class PipelineParallelism(BaseModel):
@@ -2746,7 +2747,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
27462747
"expert": self.ici_expert_parallelism,
27472748
"autoregressive": self.ici_autoregressive_parallelism,
27482749
"attn_dp": 1, # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2749-
"attn_dp_expert": 1, # initialized to 1, vLLM will auto calculate this value based on EP
2750+
"attn_dp_expert": self.ici_attn_dp_expert_parallelism,
27502751
}
27512752
self.ici_parallelism = [ici_map[axis] for axis in self.mesh_axes]
27522753

src/maxtext/inference/vllm_decode.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ def decode_with_vllm(config: Config) -> None:
100100
enable_expert_parallel = config.ici_expert_parallelism > 1
101101
if enable_expert_parallel:
102102
vllm_args["additional_config"]["sharding"]["sharding_strategy"]["expert_parallelism"] = config.ici_expert_parallelism
103+
vllm_args["additional_config"]["sharding"]["sharding_strategy"][
104+
"attention_data_expert_parallelism"
105+
] = config.ici_attn_dp_expert_parallelism
103106
vllm_args["enable_expert_parallel"] = enable_expert_parallel
104107

105108
max_logging.log(

0 commit comments

Comments
 (0)