Pin tensor parallelism (TP) to 1 and remove the `tp` option from the ViT recipe configs.

cspades · cspades · commit 37c056211d2f · 2025-09-03T09:28:25.000-07:00
Signed-off-by: Cory Ye <cye@nvidia.com>
diff --git a/recipes/vit/config/defaults.yaml b/recipes/vit/config/defaults.yaml
@@ -44,7 +44,6 @@ distributed:
   dp_inter: 1
   dp_shard: 1
   cp: 1
-  tp: 1
 
 fsdp:
   init_model_with_meta_device: true
diff --git a/recipes/vit/config/vit_base_patch16_224.yaml b/recipes/vit/config/vit_base_patch16_224.yaml
@@ -42,7 +42,6 @@ distributed:
   dp_inter: 1
   dp_shard: 1
   cp: 1
-  tp: 1
 
 fsdp:
   init_model_with_meta_device: true
diff --git a/recipes/vit/train.py b/recipes/vit/train.py
@@ -172,20 +172,17 @@ def setup_device_mesh(cfg):
     # TODO(@cspades): Will add TE-backed context parallelism (CP) in the future, just need to
     # modify the ViT model to shard the sequence dimension after tokenization. For now, we
     # setup the CP dimension for demonstrating how to use DeviceMesh and CP with Megatron-FSDP.
-    if (
-        cfg.distributed.dp_inter * cfg.distributed.dp_shard * cfg.distributed.cp * cfg.distributed.tp
-        != torch.distributed.get_world_size()
-    ):
+    if cfg.distributed.dp_inter * cfg.distributed.dp_shard * cfg.distributed.cp != torch.distributed.get_world_size():
         raise ValueError(
-            f"Invalid parallelism sizes: dp_inter({cfg.distributed.dp_inter}) * dp_shard({cfg.distributed.dp_shard}) * cp({cfg.distributed.cp}) * tp({cfg.distributed.tp}) != world_size({torch.distributed.get_world_size()})"
+            f"Invalid parallelism sizes: dp_inter({cfg.distributed.dp_inter}) * dp_shard({cfg.distributed.dp_shard}) * cp({cfg.distributed.cp}) * tp(1) != world_size({torch.distributed.get_world_size()})"
         )
     device_mesh = torch.distributed.device_mesh.init_device_mesh(
         "cuda",
         mesh_shape=(
             cfg.distributed.dp_inter,
             cfg.distributed.dp_shard,
             cfg.distributed.cp,
-            cfg.distributed.tp,
+            1,  # Needed to use TransformerEngine layers with Megatron-FSDP. "TP is always 1."
         ),
         mesh_dim_names=("dp_inter", "dp_shard", "cp", "tp"),
     )