@@ -55,13 +55,15 @@ def set_seed():
5555 torch .cuda .manual_seed_all (42 )
5656
5757
58- def test_sanity_convergence_fsdp2_te_bshd (tmp_path , recipe_path ):
58+ def test_sanity_convergence_fsdp2_te_bshd (tmp_path , recipe_path , local_tokenizer_path ):
59+ tokenizer_path = local_tokenizer_path
5960 with initialize_config_dir (config_dir = str (recipe_path / "hydra_config" ), version_base = "1.2" ):
6061 sanity_config = compose (
6162 config_name = "L0_sanity" ,
6263 overrides = [
6364 f"+wandb.dir={ tmp_path } " ,
6465 f"checkpoint.ckpt_dir={ tmp_path } " ,
66+ f"dataset.tokenizer_name_or_path={ tokenizer_path } " ,
6567 "checkpoint.resume_from_checkpoint=false" ,
6668 "num_train_steps=40" ,
6769 "config_kwargs.attn_input_format=bshd" ,
@@ -74,13 +76,15 @@ def test_sanity_convergence_fsdp2_te_bshd(tmp_path, recipe_path):
7476 assert final_loss < 8.5 , f"Final loss { final_loss } is too high, expected < 8.5"
7577
7678
77- def test_sanity_convergence_fsdp2_te_thd (tmp_path , recipe_path ):
79+ def test_sanity_convergence_fsdp2_te_thd (tmp_path , recipe_path , local_tokenizer_path ):
80+ tokenizer_path = local_tokenizer_path
7881 with initialize_config_dir (config_dir = str (recipe_path / "hydra_config" ), version_base = "1.2" ):
7982 sanity_config = compose (
8083 config_name = "L0_sanity" ,
8184 overrides = [
8285 f"+wandb.dir={ tmp_path } " ,
8386 f"checkpoint.ckpt_dir={ tmp_path } " ,
87+ f"dataset.tokenizer_name_or_path={ tokenizer_path } " ,
8488 "checkpoint.resume_from_checkpoint=false" ,
8589 "num_train_steps=40" ,
8690 "use_sequence_packing=true" ,
@@ -95,14 +99,16 @@ def test_sanity_convergence_fsdp2_te_thd(tmp_path, recipe_path):
9599 assert final_loss < 8.5 , f"Final loss { final_loss } is too high, expected < 8.5"
96100
97101
98- def test_sanity_convergence_fsdp2_te_bshd_grad_acc (tmp_path , recipe_path ):
102+ def test_sanity_convergence_fsdp2_te_bshd_grad_acc (tmp_path , recipe_path , local_tokenizer_path ):
99103 """Test FSDP2 training with gradient accumulation."""
104+ tokenizer_path = local_tokenizer_path
100105 with initialize_config_dir (config_dir = str (recipe_path / "hydra_config" ), version_base = "1.2" ):
101106 sanity_config = compose (
102107 config_name = "L0_sanity" ,
103108 overrides = [
104109 f"+wandb.dir={ tmp_path } " ,
105110 f"checkpoint.ckpt_dir={ tmp_path } " ,
111+ f"dataset.tokenizer_name_or_path={ tokenizer_path } " ,
106112 "checkpoint.resume_from_checkpoint=false" ,
107113 "num_train_steps=40" ,
108114 "config_kwargs.attn_input_format=bshd" ,
@@ -117,21 +123,16 @@ def test_sanity_convergence_fsdp2_te_bshd_grad_acc(tmp_path, recipe_path):
117123 assert final_loss < 8.5 , f"Final loss { final_loss } is too high, expected < 8.5"
118124
119125
120- def test_sanity_convergence_ddp_te (tmp_path , recipe_path ):
121- """Test that DDP training converges on sanity-scale data.
122-
123- This test validates:
124- - The train_ddp.py script runs end-to-end without errors
125- - Model, optimizer, and dataloader integrate correctly
126- - Training converges to reasonable loss on small dataset
127- - Uses L0_sanity config with small model and few training steps
128- """
126+ def test_sanity_convergence_ddp_te (tmp_path , recipe_path , local_tokenizer_path ):
127+ """Test that DDP training converges on sanity-scale data."""
128+ tokenizer_path = local_tokenizer_path
129129 with initialize_config_dir (config_dir = str (recipe_path / "hydra_config" ), version_base = "1.2" ):
130130 sanity_config = compose (
131131 config_name = "L0_sanity" ,
132132 overrides = [
133133 f"+wandb.dir={ tmp_path } " ,
134134 f"checkpoint.ckpt_dir={ tmp_path } " ,
135+ f"dataset.tokenizer_name_or_path={ tokenizer_path } " ,
135136 "checkpoint.resume_from_checkpoint=false" ,
136137 "num_train_steps=40" ,
137138 "config_kwargs.attn_input_format=bshd" ,
@@ -144,14 +145,16 @@ def test_sanity_convergence_ddp_te(tmp_path, recipe_path):
144145 assert final_loss < 8.5 , f"Final loss { final_loss } is too high, expected < 8.5"
145146
146147
147- def test_sanity_convergence_ddp_te_grad_acc (tmp_path , recipe_path ):
148+ def test_sanity_convergence_ddp_te_grad_acc (tmp_path , recipe_path , local_tokenizer_path ):
148149 """Test DDP training with gradient accumulation."""
150+ tokenizer_path = local_tokenizer_path
149151 with initialize_config_dir (config_dir = str (recipe_path / "hydra_config" ), version_base = "1.2" ):
150152 sanity_config = compose (
151153 config_name = "L0_sanity" ,
152154 overrides = [
153155 f"+wandb.dir={ tmp_path } " ,
154156 f"checkpoint.ckpt_dir={ tmp_path } " ,
157+ f"dataset.tokenizer_name_or_path={ tokenizer_path } " ,
155158 "checkpoint.resume_from_checkpoint=false" ,
156159 "num_train_steps=40" ,
157160 "config_kwargs.attn_input_format=bshd" ,
@@ -165,20 +168,16 @@ def test_sanity_convergence_ddp_te_grad_acc(tmp_path, recipe_path):
165168 assert final_loss < 8.5 , f"Final loss { final_loss } is too high, expected < 8.5"
166169
167170
168- def test_sanity_convergence_fsdp2_hf (tmp_path , recipe_path ):
169- """Test that FSDP2 training converges with HuggingFace (non-TE) model.
170-
171- This test validates:
172- - The train_fsdp2.py script runs end-to-end without errors using vanilla HF layers
173- - FSDP2 wrapping and sharding work correctly without TransformerEngine
174- - Training converges to reasonable loss on small dataset
175- """
171+ def test_sanity_convergence_fsdp2_hf (tmp_path , recipe_path , local_tokenizer_path ):
172+ """Test that FSDP2 training converges with HuggingFace (non-TE) model."""
173+ tokenizer_path = local_tokenizer_path
176174 with initialize_config_dir (config_dir = str (recipe_path / "hydra_config" ), version_base = "1.2" ):
177175 sanity_config = compose (
178176 config_name = "L0_sanity" ,
179177 overrides = [
180178 f"+wandb.dir={ tmp_path } " ,
181179 f"checkpoint.ckpt_dir={ tmp_path } " ,
180+ f"dataset.tokenizer_name_or_path={ tokenizer_path } " ,
182181 "checkpoint.resume_from_checkpoint=false" ,
183182 "num_train_steps=40" ,
184183 "use_te=false" ,
@@ -194,14 +193,16 @@ def test_sanity_convergence_fsdp2_hf(tmp_path, recipe_path):
194193
195194@requires_fp8
196195@requires_datacenter_hardware
197- def test_sanity_convergence_fsdp2_te_fp8 (tmp_path , recipe_path , fp_recipe ):
196+ def test_sanity_convergence_fsdp2_te_fp8 (tmp_path , recipe_path , local_tokenizer_path , fp_recipe ):
198197 """Test FSDP2 training with FP8 enabled using parametrized FP8 recipes."""
198+ tokenizer_path = local_tokenizer_path
199199 with initialize_config_dir (config_dir = str (recipe_path / "hydra_config" ), version_base = "1.2" ):
200200 sanity_config = compose (
201201 config_name = "L0_sanity" ,
202202 overrides = [
203203 f"+wandb.dir={ tmp_path } " ,
204204 f"checkpoint.ckpt_dir={ tmp_path } " ,
205+ f"dataset.tokenizer_name_or_path={ tokenizer_path } " ,
205206 "checkpoint.resume_from_checkpoint=false" ,
206207 "num_train_steps=40" ,
207208 "config_kwargs.attn_input_format=bshd" ,
0 commit comments