AI-Hypercomputer · NuojCheng · Apr 22, 2026
@@ -0,0 +1,78 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
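+# This rule uses the data, stage, FSDP, context, and expert axes. The context axis acts as
+# expert parallelism in the core dMoE part (between the EP all2all), where experts are sharded
+# over both context and expert. Outside of it, the context axis shards the sequence length and
+# the expert axis shards the batch.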
+mesh_axes: ['data', 'stage', 'fsdp', 'context', 'expert']
+data_sharding: [['data', 'stage', 'fsdp', 'context', 'expert']]
+context_sharding: 'context'
+logical_axis_rules: [
+                      # ==========================================
+                      # Vocabulary Embedding
+                      # ==========================================
+                      # Vocab Activations
+                      ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'expert']],
+                      ['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'expert']],
+                      # Vocab Weights
+                      ['vocab', []],
+                      ['embed_vocab', ['fsdp', 'context', 'expert']],
+                      # ==========================================
+                      # Attention
+                      # ==========================================
+                      # Attention Activations
+                      ['activation_batch_attn', ['data', 'fsdp', 'expert']],
+                      ['activation_heads', []],
+                      ['activation_kv_heads', []],
+                      ['activation_length_attn', ['context']],
+                      ['activation_q_length', ['context']],
+                      ['activation_kv_length', []],
+                      ['activation_embed_attn', []],
+                      ['activation_kv', []],
+                      ['activation_kv_batch', ['data', 'fsdp', 'expert']],
+                      ['activation_kv_head_dim', []],
+                      # Attention Weights
+                      ['heads', []],
+                      ['q_heads', []],
+                      ['kv_heads', []],
+                      ['qkv', []],
+                      ['kv', []],
+                      ['kv_head_dim', []],
+                      ['q_lora', ['fsdp', 'context', 'expert']],
+                      ["q_lora_up_proj", []],
+                      ['kv_lora', ['fsdp', 'context', 'expert']],
+                      ["kv_lora_up_proj", []],
+                      # ==========================================
+                      # Mixture of Experts (MoE)
+                      # ==========================================
+                      # MoE Activations
+                      ['activation_batch_moe', ['data', 'fsdp']],
+                      ['activation_exp', ['context', 'expert']],
+                      # MoE Weights
+                      ['exp', ['context', 'expert']],
+                      ['embed_moe', ['fsdp']],
+                      # ==========================================
+                      # Standard MLP / Dense Layers / Model Structure
+                      # ==========================================
+                      # Dense Activations
+                      ['activation_mlp', []],
+                      ['activation_batch', ['data', 'fsdp', 'expert']],
+                      ['activation_length', ['context']],
+                      ['activation_norm_length', ['context']],
+                      ['activation_embed', []],
+                      ['activation_stage', 'stage'],
+                      # General Weights
+                      ['mlp', []],
+                      ['layers', 'stage'],
+                      ['embed', ['fsdp', 'context', 'expert']],
+                  ]
@@ -376,6 +376,9 @@ def __init__(
 
     if self.config.attention == "vllm_rpa" and self.config.enable_dp_attention:
       self._expert_parallelism_name = "attn_dp_expert"
+    elif self.config.custom_mesh_and_rule == "cp-as-ep":
+      # With the "cp-as-ep" custom mesh and rule, the context axis is treated the same as the
+      # expert axis in the MoE component, so expert parallelism spans both axes.
+      self._expert_parallelism_name = ("context", "expert")
     else:
       self._expert_parallelism_name = "expert"
 

@@ -55,6 +55,13 @@
         "ep-as-cp",
         ("ici_fsdp_parallelism=-1", "ici_expert_parallelism=2"),
     ),
+    (
+        "deepseek2-16b",
+        "tpu7x-8",
+        1,
+        "cp-as-ep",
+        ("ici_fsdp_parallelism=-1", "ici_context_parallelism=2", "ici_expert_parallelism=2"),
+    ),
     ("qwen3-0.6b", "tpu7x-16", 1, "", ()),
     ("gpt-oss-20b", "tpu7x-16", 1, "", ()),
     ("gpt-oss-20b", "tpu7x-16", 1, "", ("ici_fsdp_parallelism=-1", "ici_expert_parallelism=2")),

@@ -0,0 +1,178 @@
+{
+  "Activation Sharding Dump": [
+    {
+      "deepseek/inputs: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "deepseek/pre_attention_norm: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "attention_mla/inputs_q: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch_attn', 'activation_length', 'activation_embed')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "attention_mla/inputs_kv: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch_attn', 'activation_length', 'activation_embed')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "attention_mla/q_nope: bfloat16[96,2048,16,128]": {
+        "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None, None)"
+      }
+    },
+    {
+      "attention_mla/q_pe: bfloat16[96,2048,16,64]": {
+        "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None, None)"
+      }
+    },
+    {
+      "attention_mla/query: bfloat16[96,2048,16,192]": {
+        "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None, None)"
+      }
+    },
+    {
+      "attention_mla/key_nope: bfloat16[96,2048,16,128]": {
+        "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None, None)"
+      }
+    },
+    {
+      "attention_mla/key_rope: bfloat16[96,2048,16,64]": {
+        "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None, None)"
+      }
+    },
+    {
+      "attention_mla/key: bfloat16[96,2048,16,192]": {
+        "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None, None)"
+      }
+    },
+    {
+      "attention_mla/value: bfloat16[96,2048,16,128]": {
+        "logic_axes": "('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None, None)"
+      }
+    },
+    {
+      "attention_op/arr: int8[1,4,4]": {
+        "logic_axes": "Unknown",
+        "PartitionSpec": "P(None, 'context')"
+      }
+    },
+    {
+      "attention_op/arr: int32[2048]": {
+        "logic_axes": "Unknown",
+        "PartitionSpec": "P('context',)"
+      }
+    },
+    {
+      "attention_op/query: bfloat16[96,16,2048,192]": {
+        "logic_axes": "Unknown",
+        "PartitionSpec": "P(('fsdp', 'expert'), None, 'context', None)"
+      }
+    },
+    {
+      "attention_op/key: bfloat16[96,16,2048,192]": {
+        "logic_axes": "Unknown",
+        "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)"
+      }
+    },
+    {
+      "attention_op/value: bfloat16[96,16,2048,128]": {
+        "logic_axes": "Unknown",
+        "PartitionSpec": "P(('fsdp', 'expert'), None, None, None)"
+      }
+    },
+    {
+      "attention_mla/out: bfloat16[96,2048,16,128]": {
+        "logic_axes": "('activation_batch_attn', 'activation_length', 'activation_heads', 'activation_kv')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None, None)"
+      }
+    },
+    {
+      "deepseek/attention_result: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "deepseek/post_attention_norm: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "linears/x: bfloat16[96,2048,10944]": {
+        "logic_axes": "('activation_batch', 'activation_length', 'activation_mlp')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "deepseek/mlp: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "deepseek/x: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "moe/inputs: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch', 'activation_norm_length', None)",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "moe/gate_logits: bfloat16[96,2048,64]": {
+        "logic_axes": "('activation_batch', 'activation_norm_length', None)",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "moe/w0_kernel: bfloat16[64,2048,1408]": {
+        "logic_axes": "Unknown",
+        "PartitionSpec": "P(('context', 'expert'), None, None)"
+      }
+    },
+    {
+      "moe/w1_kernel: bfloat16[64,2048,1408]": {
+        "logic_axes": "Unknown",
+        "PartitionSpec": "P(('context', 'expert'), None, None)"
+      }
+    },
+    {
+      "moe/wo_kernel: bfloat16[64,1408,2048]": {
+        "logic_axes": "Unknown",
+        "PartitionSpec": "P(('context', 'expert'), None, None)"
+      }
+    },
+    {
+      "linears/x: bfloat16[96,2048,2816]": {
+        "logic_axes": "('activation_batch', 'activation_length', 'activation_mlp')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    },
+    {
+      "deepseek/mlp_lnx: bfloat16[96,2048,2048]": {
+        "logic_axes": "('activation_batch', 'activation_norm_length', 'activation_embed')",
+        "PartitionSpec": "P(('fsdp', 'expert'), 'context', None)"
+      }
+    }
+  ]
+}