@@ -238,6 +238,111 @@ def test_multi_gpu_train_te_fsdp2_cp_thd(tmp_path, recipe_path):
238238 )
239239
240240
@requires_multi_gpu
def test_multi_gpu_train_te_fsdp2_tp_bshd(tmp_path, recipe_path):
    """Test FSDP2 with tensor parallelism on 2 GPUs using BSHD input format.

    Launches ``train_fsdp2_nd_parallel.py`` under ``torchrun`` with two processes
    and the ``L0_sanity_tp`` config for 10 training steps.

    Validates:
        - The 1-D TP device mesh (dp=1, cp=1, tp=2) is created and used correctly
        - Embedding weights are ColwiseParallel-sharded across 2 TP ranks
        - TransformerLayer TP mode shards QKV/FFN weights across ranks
        - Row-wise parallel LM head with hidden-state slicing before forward

    Args:
        tmp_path: Directory handed to the run as ``checkpoint.ckpt_dir``.
        recipe_path: Forwarded unchanged to ``run_train_cmd``
            (presumably the recipe's working directory -- confirm against the helper).
    """
    # NOTE(review): the test has no assertions of its own; it relies on
    # run_train_cmd (defined elsewhere) to fail when the launch fails -- confirm.
    run_train_cmd(
        [
            "torchrun",
            "--standalone",
            "--nproc_per_node=2",
            "train_fsdp2_nd_parallel.py",
            "--config-name",
            "L0_sanity_tp",
            "num_train_steps=10",
            # No whitespace around the replacement field: a trailing space would
            # become part of the override value passed on the command line.
            f"checkpoint.ckpt_dir={tmp_path}",
        ],
        recipe_path,
    )
264+
265+
@requires_multi_gpu
@requires_datacenter_hardware
def test_multi_gpu_train_te_fsdp2_tp_thd(tmp_path, recipe_path):
    """Test FSDP2 with tensor parallelism on 2 GPUs using THD (sequence-packed) input format.

    Launches ``train_fsdp2_nd_parallel.py`` under ``torchrun`` with two processes
    and the ``L0_sanity_tp`` config for 10 training steps, overriding the config
    to enable sequence packing, the ``thd`` attention input format and the
    ``padding_causal`` self-attention mask type.

    Validates:
        - TP=2, CP=1 with sequence-packing / THD attention format
        - _unpad_input / _pad_input round-trip works alongside TP activation sharding
        - padding_causal mask type is compatible with row-wise parallel LM head

    Args:
        tmp_path: Directory handed to the run as ``checkpoint.ckpt_dir``.
        recipe_path: Forwarded unchanged to ``run_train_cmd``
            (presumably the recipe's working directory -- confirm against the helper).
    """
    # NOTE(review): the test has no assertions of its own; it relies on
    # run_train_cmd (defined elsewhere) to fail when the launch fails -- confirm.
    run_train_cmd(
        [
            "torchrun",
            "--standalone",
            "--nproc_per_node=2",
            "train_fsdp2_nd_parallel.py",
            "--config-name",
            "L0_sanity_tp",
            "num_train_steps=10",
            # No whitespace around the replacement field: a trailing space would
            # become part of the override value passed on the command line.
            f"checkpoint.ckpt_dir={tmp_path}",
            # THD-specific overrides layered on top of the shared TP sanity config.
            "use_sequence_packing=true",
            "config_kwargs.attn_input_format=thd",
            "config_kwargs.self_attn_mask_type=padding_causal",
        ],
        recipe_path,
    )
292+
293+
@requires_multi_gpu
def test_multi_gpu_train_te_fsdp2_tp_sequence_parallel_bshd(tmp_path, recipe_path):
    """Test FSDP2 with tensor parallelism + sequence parallelism on 2 GPUs, BSHD.

    Launches ``train_fsdp2_nd_parallel.py`` under ``torchrun`` with two processes
    and the ``L0_sanity_tp`` config for 10 training steps, with
    ``config_kwargs.sequence_parallel=true`` added as an override.

    Validates that sequence parallelism (LayerNorm activations sharded across TP ranks)
    works alongside standard tensor parallelism without errors.

    Args:
        tmp_path: Directory handed to the run as ``checkpoint.ckpt_dir``.
        recipe_path: Forwarded unchanged to ``run_train_cmd``
            (presumably the recipe's working directory -- confirm against the helper).
    """
    # NOTE(review): the test has no assertions of its own; it relies on
    # run_train_cmd (defined elsewhere) to fail when the launch fails -- confirm.
    run_train_cmd(
        [
            "torchrun",
            "--standalone",
            "--nproc_per_node=2",
            "train_fsdp2_nd_parallel.py",
            "--config-name",
            "L0_sanity_tp",
            "num_train_steps=10",
            # No whitespace around the replacement field: a trailing space would
            # become part of the override value passed on the command line.
            f"checkpoint.ckpt_dir={tmp_path}",
            "config_kwargs.sequence_parallel=true",
        ],
        recipe_path,
    )
315+
316+
@requires_multi_gpu
def test_multi_gpu_train_te_fsdp2_tp_bshd_with_checkpointing(tmp_path, recipe_path):
    """Test FSDP2 TP training on 2 GPUs with checkpoint saving.

    Launches ``train_fsdp2_nd_parallel.py`` under ``torchrun`` with two processes
    and the ``L0_sanity_tp`` config for 10 training steps, saving a checkpoint
    every 5 steps with resume disabled, then inspects the checkpoint directory.

    Validates:
        - Sharded FSDP2 checkpoints are written correctly while TP is active
        - The expected checkpoint directory structure is present after training

    Args:
        tmp_path: Directory handed to the run as ``checkpoint.ckpt_dir``; the
            checkpoint layout is asserted relative to it.
        recipe_path: Forwarded unchanged to ``run_train_cmd``
            (presumably the recipe's working directory -- confirm against the helper).
    """
    run_train_cmd(
        [
            "torchrun",
            "--standalone",
            "--nproc_per_node=2",
            "train_fsdp2_nd_parallel.py",
            "--config-name",
            "L0_sanity_tp",
            "num_train_steps=10",
            # No whitespace around the replacement field: a trailing space would
            # become part of the checkpoint path and the directory checks below
            # would then look in the wrong place.
            f"checkpoint.ckpt_dir={tmp_path}",
            "checkpoint.save_every_n_steps=5",
            "checkpoint.resume_from_checkpoint=false",
        ],
        recipe_path,
    )

    # Checkpoints are expected under a "train_fsdp2" subdirectory of ckpt_dir.
    ckpt_dir = tmp_path / "train_fsdp2"
    assert ckpt_dir.exists(), f"Checkpoint directory not created: {ckpt_dir}"
    assert (ckpt_dir / "step_5").exists(), "Checkpoint at step 5 not found"
344+
345+
# True when an `nsys` executable is found on PATH. check=False because a non-zero
# exit from `which` just means "not installed", not an error; output is captured
# so the lookup stays silent.
nsys_available = subprocess.run(["which", "nsys"], check=False, capture_output=True).returncode == 0
242347
243348
0 commit comments