Remove the old 2D FSDP sharding option (use_2d_fsdp_sharding) and the deepseek3-671b-2dfsdp model config

NuojCheng · NuojCheng · commit f28f38c40075 · 2026-05-13T19:03:07.000Z
diff --git a/src/maxtext/configs/base.yml b/src/maxtext/configs/base.yml
@@ -250,8 +250,6 @@ moe_fsdp_use_two_stage_all_gather: false
 # Shard the expert dimension of the MLP weights on the FSDP axis.
 # This configuration is recommended only when num_experts is a multiple of fsdp_parallelism
 shard_exp_on_fsdp: False
-# use fsdp and fsdp_transpose axes for sharding the moe weights
-use_2d_fsdp_sharding: False
 
 # deepseek moe
 first_num_dense_layers: 0 # number of initial dense layers in the model
diff --git a/src/maxtext/configs/models/deepseek3-671b-2dfsdp.yml b/src/maxtext/configs/models/deepseek3-671b-2dfsdp.yml
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -221,7 +221,6 @@ class ProfilerType(str, Enum):
     "deepseek2-16b",
     "deepseek2-236b",
     "deepseek3-671b",
-    "deepseek3-671b-2dfsdp",
     "deepseek3-671b-batchsplit",
     "deepseek3-test",
     "deepseek3-tiny",
@@ -705,10 +704,6 @@ class MoEGeneral(BaseModel):
       description="Shard the expert dimension of the MLP weights on the FSDP axis, "
       "and recommended only when num_experts is a multiple of fsdp_parallelism",
   )
-  use_2d_fsdp_sharding: bool = Field(
-      False,
-      description="Use `fsdp` and `fsdp_transpose` axes for 2D FSDP sharding.",
-  )
   norm_topk_prob: bool = Field(
       False,
       description="Enable top-k probability normalization for router weights (Qwen3-specific).",
diff --git a/src/maxtext/layers/moe.py b/src/maxtext/layers/moe.py
@@ -358,9 +358,6 @@ def __init__(
       # special sharding for dsv3
       self.wi_kernel_axes = ("embed_moe", None, "mlp_moe")
       self.wo_kernel_axes = ("embed_moe", "mlp_moe", None)
-    elif self.config.use_2d_fsdp_sharding:
-      self.wi_kernel_axes = ("embed_moe", "mlp_moe", None)
-      self.wo_kernel_axes = ("embed_moe", "mlp_moe", None)
     elif self.config.use_batch_split_schedule:
       self.wi_kernel_axes, self.wo_kernel_axes = get_batchsplit_init_kernel_axes()
     else:
@@ -1217,10 +1214,6 @@ def get_routed_moe_shardings(is_batch_sharded_by_expert):
           w0_pspec = self._logical_to_mesh_axes(("embed_tensor_transpose", None, "mlp_no_fsdp"))
           w1_pspec = self._logical_to_mesh_axes(("embed_tensor_transpose", None, "mlp_no_fsdp"))
           wo_pspec = self._logical_to_mesh_axes(("embed_tensor_transpose", "mlp_no_fsdp", None))
-      elif self.config.use_2d_fsdp_sharding:
-        w0_pspec = self._logical_to_mesh_axes(("embed_tensor_transpose", "mlp_no_fsdp", None))
-        w1_pspec = self._logical_to_mesh_axes(("embed_tensor_transpose", "mlp_no_fsdp", None))
-        wo_pspec = self._logical_to_mesh_axes(("embed_tensor_transpose", "mlp_no_fsdp", None))
       else:
         # These are the main shardings used by default - they use funky rules to AG over FSDP.
         w0_pspec = self._logical_to_mesh_axes(("exp", "embed_tensor_transpose", "mlp_no_fsdp"))
diff --git a/tests/unit/configs_test.py b/tests/unit/configs_test.py
@@ -200,7 +200,6 @@ def test_gpt_configs(config_file):
     os.path.join(CONFIGS_DIR, "models", "deepseek2-236b.yml"),
     os.path.join(CONFIGS_DIR, "models", "deepseek3-test.yml"),
     os.path.join(CONFIGS_DIR, "models", "deepseek3-671b.yml"),
-    os.path.join(CONFIGS_DIR, "models", "deepseek3-671b-2dfsdp.yml"),
     os.path.join(CONFIGS_DIR, "models", "deepseek3-671b-batchsplit.yml"),
 ]
 

Original file line number	Diff line number	Diff line change
`@@ -200,7 +200,6 @@ def test_gpt_configs(config_file):`
`200`	`200`	`os.path.join(CONFIGS_DIR, "models", "deepseek2-236b.yml"),`
`201`	`201`	`os.path.join(CONFIGS_DIR, "models", "deepseek3-test.yml"),`
`202`	`202`	`os.path.join(CONFIGS_DIR, "models", "deepseek3-671b.yml"),`
`203`		`- os.path.join(CONFIGS_DIR, "models", "deepseek3-671b-2dfsdp.yml"),`
`204`	`203`	`os.path.join(CONFIGS_DIR, "models", "deepseek3-671b-batchsplit.yml"),`
`205`	`204`	`]`
`206`	`205`