Skip to content

Commit 968f625

Browse files
committed
Introduce cp-as-ep rule for long-context training or strong scaling
1 parent f67d8b1 commit 968f625

2 files changed

Lines changed: 81 additions & 0 deletions

File tree

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# "cp-as-ep" custom mesh rule: uses data, stage, FSDP, and expert axes.
# The expert axis acts as context parallelism in all components EXCEPT the
# core dMoE part (between the EP all-to-alls), where it shards experts.
# Intended for long-context training or strong scaling.
mesh_axes: ['data', 'stage', 'fsdp', 'context', 'expert']
data_sharding: [['data', 'stage', 'fsdp', 'context', 'expert']]
context_sharding: 'context'
logical_axis_rules: [
  # ==========================================
  # Vocabulary Embedding
  # ==========================================
  # Vocab activations: batch sharded over every axis except context
  # (context shards the sequence dimension instead).
  ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'expert']],
  ['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'expert']],
  # Vocab weights: vocab dim replicated; embed dim sharded FSDP-style
  # (context/expert fold into fsdp for weight sharding).
  ['vocab', []],
  ['embed_vocab', ['fsdp', 'context', 'expert']],
  # ==========================================
  # Attention
  # ==========================================
  # Attention activations: batch over data/fsdp/expert; sequence (query
  # length) over context — this is the CP behavior of the expert axis.
  ['activation_batch_attn', ['data', 'fsdp', 'expert']],
  ['activation_heads', []],
  ['activation_kv_heads', []],
  ['activation_length_attn', ['context']],
  ['activation_q_length', ['context']],
  ['activation_kv_length', []],
  ['activation_embed_attn', []],
  ['activation_kv', []],
  ['activation_kv_batch', ['data', 'fsdp', 'expert']],
  ['activation_kv_head_dim', []],
  # Attention weights: head/projection dims replicated; LoRA-style
  # low-rank dims sharded FSDP-style.
  ['heads', []],
  ['q_heads', []],
  ['kv_heads', []],
  ['qkv', []],
  ['kv', []],
  ['kv_head_dim', []],
  ['q_lora', ['fsdp', 'context', 'expert']],
  ['q_lora_up_proj', []],
  ['kv_lora', ['fsdp', 'context', 'expert']],
  ['kv_lora_up_proj', []],
  # ==========================================
  # Mixture of Experts (MoE)
  # ==========================================
  # MoE activations/weights: inside the dMoE core the context axis joins
  # expert to shard the expert dimension (cp-as-ep).
  ['activation_batch_moe', ['data', 'fsdp']],
  ['activation_exp', ['context', 'expert']],
  ['exp', ['context', 'expert']],
  ['embed_moe', ['fsdp']],
  # ==========================================
  # Standard MLP / Dense Layers / Model Structure
  # ==========================================
  # Dense activations: sequence/norm length over context; batch over
  # data/fsdp/expert; pipeline stages over stage.
  ['activation_mlp', []],
  ['activation_batch', ['data', 'fsdp', 'expert']],
  ['activation_length', ['context']],
  ['activation_norm_length', ['context']],
  ['activation_embed', []],
  ['activation_stage', 'stage'],
  # General weights: layers pipelined over stage; embed dim FSDP-style.
  ['mlp', []],
  ['layers', 'stage'],
  ['embed', ['fsdp', 'context', 'expert']],
]

src/maxtext/layers/moe.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,9 @@ def __init__(
374374

375375
if self.config.attention == "vllm_rpa" and self.config.enable_dp_attention:
376376
self._expert_parallelism_name = "attn_dp_expert"
377+
elif self.config.custom_mesh_and_rule == "cp-as-ep":
378+
# When custom_mesh_and_rule is "cp-as-ep", the context axis is treated the same as the expert axis in the MoE component
379+
self._expert_parallelism_name = ("context", "expert")
377380
else:
378381
self._expert_parallelism_name = "expert"
379382

0 commit comments

Comments
 (0)