Pipeclean TP with THD, add TP unit tests.

cspades · cspades · commit f539bfaecf12 · 2026-03-18T11:40:00.000-07:00
Signed-off-by: Cory Ye <cye@nvidia.com>
diff --git a/bionemo-recipes/models/llama3/modeling_llama_te.py b/bionemo-recipes/models/llama3/modeling_llama_te.py
@@ -526,9 +526,10 @@ def forward(
         if self.config.tensor_parallel:
             # If using TP, shard your activation across the TP group,
             # to support row-wise tensor parallelism in the LM head.
+            # Index with Ellipsis (`...`) to support both BSHD (3D) and THD (2D) hidden states.
             tp_rank = self.tp_mesh.get_local_rank()
             tp_stride = hidden_states.shape[-1] // self.config.tp_size
-            hidden_states = hidden_states[:, :, tp_rank * tp_stride : (tp_rank + 1) * tp_stride]
+            hidden_states = hidden_states[..., tp_rank * tp_stride : (tp_rank + 1) * tp_stride]
 
         with transformer_engine.pytorch.autocast(enabled=False):
             if hidden_states.ndim == 3:
diff --git a/bionemo-recipes/recipes/llama3_native_te/hydra_config/L0_sanity_tp.yaml b/bionemo-recipes/recipes/llama3_native_te/hydra_config/L0_sanity_tp.yaml
@@ -0,0 +1,15 @@
+defaults:
+  - L0_sanity
+  - _self_
+
+tp_size: 2  # Tensor Parallel sharding factor
+cp_size: 1
+
+use_sequence_packing: false
+
+config_kwargs:
+  attn_input_format: "bshd" # Alternatively "thd" on datacenter hardware.
+  self_attn_mask_type: "causal" # Alternatively "padding_causal" for THD inputs.
+  tensor_parallel: true   # Tensor Parallelism for TE
+  sequence_parallel: false  # Sequence parallelism for LayerNorm on TP ranks.
+  tp_size: ${tp_size}       # Tensor Parallel Size
diff --git a/bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py b/bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py
@@ -532,9 +532,10 @@ def forward(
         if self.config.tensor_parallel:
             # If using TP, shard your activation across the TP group,
             # to support row-wise tensor parallelism in the LM head.
+            # Index with Ellipsis (`...`) to support both BSHD (3D) and THD (2D) hidden states.
             tp_rank = self.tp_mesh.get_local_rank()
             tp_stride = hidden_states.shape[-1] // self.config.tp_size
-            hidden_states = hidden_states[:, :, tp_rank * tp_stride : (tp_rank + 1) * tp_stride]
+            hidden_states = hidden_states[..., tp_rank * tp_stride : (tp_rank + 1) * tp_stride]
 
         with transformer_engine.pytorch.autocast(enabled=False):
             if hidden_states.ndim == 3:
diff --git a/bionemo-recipes/recipes/llama3_native_te/tests/test_train.py b/bionemo-recipes/recipes/llama3_native_te/tests/test_train.py
@@ -480,6 +480,61 @@ def test_sanity_ddp_fp8_stats_logging(tmp_path, recipe_path):
     assert stats_log.stat().st_size > 0, "Statistics log file is empty"
 
 
+def test_sanity_nd_parallel_tp1_bshd(tmp_path, recipe_path):
+    """Test ND-parallel training with tensor_parallel=True and tp_size=1 (trivial TP group), BSHD.
+
+    This test validates that all TP code paths in NVLlamaModel and NVLlamaForCausalLM execute
+    correctly with a single-rank TP mesh:
+    - parallelize_module on embed_tokens (ColwiseParallel)
+    - TransformerLayer TP mode flags
+    - lm_head row-parallel mode and set_tensor_parallel_group
+    - Hidden-state activation slicing in NVLlamaForCausalLM.forward
+    """
+    with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
+        sanity_config = compose(
+            config_name="L0_sanity_tp",
+            overrides=[
+                f"+wandb.dir={tmp_path}",
+                f"checkpoint.ckpt_dir={tmp_path}",
+                "num_train_steps=10",
+                "tp_size=1",
+                "checkpoint.resume_from_checkpoint=false",
+            ],
+        )
+
+    final_loss = main_fsdp2_cp(sanity_config)
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    assert torch.isfinite(torch.tensor(final_loss)), f"Final loss {final_loss} is not finite"
+
+
+def test_sanity_nd_parallel_tp1_sequence_parallel_bshd(tmp_path, recipe_path):
+    """Test ND-parallel training with tensor_parallel=True, sequence_parallel=True, tp_size=1, BSHD.
+
+    Validates that the sequence-parallel RMSNorm (set_device_mesh on the final norm) does not
+    break forward/backward even when the TP group is a single rank.
+    """
+    with initialize_config_dir(config_dir=str(recipe_path / "hydra_config"), version_base="1.2"):
+        sanity_config = compose(
+            config_name="L0_sanity_tp",
+            overrides=[
+                f"+wandb.dir={tmp_path}",
+                f"checkpoint.ckpt_dir={tmp_path}",
+                "num_train_steps=10",
+                "tp_size=1",
+                "config_kwargs.sequence_parallel=true",
+                "checkpoint.resume_from_checkpoint=false",
+            ],
+        )
+
+    final_loss = main_fsdp2_cp(sanity_config)
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    assert torch.isfinite(torch.tensor(final_loss)), f"Final loss {final_loss} is not finite"
+
+
 @requires_fp8
 def test_sanity_fsdp2_fp8_stats_logging(tmp_path, recipe_path):
     """Test that FP8 stats logging works with FSDP2."""
diff --git a/bionemo-recipes/recipes/llama3_native_te/tests/test_train_two_gpu.py b/bionemo-recipes/recipes/llama3_native_te/tests/test_train_two_gpu.py
@@ -238,6 +238,111 @@ def test_multi_gpu_train_te_fsdp2_cp_thd(tmp_path, recipe_path):
     )
 
 
+@requires_multi_gpu
+def test_multi_gpu_train_te_fsdp2_tp_bshd(tmp_path, recipe_path):
+    """Test FSDP2 with tensor parallelism on 2 GPUs using BSHD input format.
+
+    Validates:
+    - The device mesh with only TP active (dp=1, cp=1, tp=2) is created and used correctly
+    - Embedding weights are ColwiseParallel-sharded across 2 TP ranks
+    - TransformerLayer TP mode shards QKV/FFN weights across ranks
+    - Row-wise parallel LM head with hidden-state slicing before forward
+    """
+    run_train_cmd(
+        [
+            "torchrun",
+            "--standalone",
+            "--nproc_per_node=2",
+            "train_fsdp2_nd_parallel.py",
+            "--config-name",
+            "L0_sanity_tp",
+            "num_train_steps=10",
+            f"checkpoint.ckpt_dir={tmp_path}",
+        ],
+        recipe_path,
+    )
+
+
+@requires_multi_gpu
+@requires_datacenter_hardware
+def test_multi_gpu_train_te_fsdp2_tp_thd(tmp_path, recipe_path):
+    """Test FSDP2 with tensor parallelism on 2 GPUs using THD (sequence-packed) input format.
+
+    Validates:
+    - TP=2, CP=1 with sequence-packing / THD attention format
+    - _unpad_input / _pad_input round-trip works alongside TP activation sharding
+    - padding_causal mask type is compatible with row-wise parallel LM head
+    """
+    run_train_cmd(
+        [
+            "torchrun",
+            "--standalone",
+            "--nproc_per_node=2",
+            "train_fsdp2_nd_parallel.py",
+            "--config-name",
+            "L0_sanity_tp",
+            "num_train_steps=10",
+            f"checkpoint.ckpt_dir={tmp_path}",
+            "use_sequence_packing=true",
+            "config_kwargs.attn_input_format=thd",
+            "config_kwargs.self_attn_mask_type=padding_causal",
+        ],
+        recipe_path,
+    )
+
+
+@requires_multi_gpu
+def test_multi_gpu_train_te_fsdp2_tp_sequence_parallel_bshd(tmp_path, recipe_path):
+    """Test FSDP2 with tensor parallelism + sequence parallelism on 2 GPUs, BSHD.
+
+    Validates that sequence parallelism (LayerNorm activations sharded across TP ranks)
+    works alongside standard tensor parallelism without errors.
+    """
+    run_train_cmd(
+        [
+            "torchrun",
+            "--standalone",
+            "--nproc_per_node=2",
+            "train_fsdp2_nd_parallel.py",
+            "--config-name",
+            "L0_sanity_tp",
+            "num_train_steps=10",
+            f"checkpoint.ckpt_dir={tmp_path}",
+            "config_kwargs.sequence_parallel=true",
+        ],
+        recipe_path,
+    )
+
+
+@requires_multi_gpu
+def test_multi_gpu_train_te_fsdp2_tp_bshd_with_checkpointing(tmp_path, recipe_path):
+    """Test FSDP2 TP training on 2 GPUs with checkpoint saving.
+
+    Validates:
+    - Sharded FSDP2 checkpoints are written correctly while TP is active
+    - The expected checkpoint directory structure is present after training
+    """
+    run_train_cmd(
+        [
+            "torchrun",
+            "--standalone",
+            "--nproc_per_node=2",
+            "train_fsdp2_nd_parallel.py",
+            "--config-name",
+            "L0_sanity_tp",
+            "num_train_steps=10",
+            f"checkpoint.ckpt_dir={tmp_path}",
+            "checkpoint.save_every_n_steps=5",
+            "checkpoint.resume_from_checkpoint=false",
+        ],
+        recipe_path,
+    )
+
+    ckpt_dir = tmp_path / "train_fsdp2"
+    assert ckpt_dir.exists(), f"Checkpoint directory not created: {ckpt_dir}"
+    assert (ckpt_dir / "step_5").exists(), "Checkpoint at step 5 not found"
+
+
 nsys_available = subprocess.run(["which", "nsys"], check=False, capture_output=True).returncode == 0
 
 
diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/modeling_llama_te.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/modeling_llama_te.py
@@ -532,9 +532,10 @@ def forward(
         if self.config.tensor_parallel:
             # If using TP, shard your activation across the TP group,
             # to support row-wise tensor parallelism in the LM head.
+            # Index with Ellipsis (`...`) to support both BSHD (3D) and THD (2D) hidden states.
             tp_rank = self.tp_mesh.get_local_rank()
             tp_stride = hidden_states.shape[-1] // self.config.tp_size
-            hidden_states = hidden_states[:, :, tp_rank * tp_stride : (tp_rank + 1) * tp_stride]
+            hidden_states = hidden_states[..., tp_rank * tp_stride : (tp_rank + 1) * tp_stride]
 
         with transformer_engine.pytorch.autocast(enabled=False):
             if hidden_states.ndim == 3: