1414# limitations under the License.
1515
1616import os
17+ import random
1718import re
1819import shutil
1920import subprocess
4041)
4142
4243
def extract_final_train_loss(output_text: str) -> float:
    """Parse the training output and return the final ``train_loss`` value.

    The search runs in two stages. First it looks for brace-delimited,
    dictionary-like fragments (without nested braces) that contain a
    ``'train_loss': <number>`` entry. If none are found, it falls back to any
    quoted ``train_loss`` key followed by a number anywhere in the text. In
    both cases the last match in the text is the one returned.

    The number may be a plain decimal (``2.5``), use scientific notation
    (``9.5e-05``), carry a sign, or be ``nan`` / ``inf``, so that very small or
    diverged losses are reported as they were logged instead of being
    truncated or skipped.

    Args:
        output_text: Combined stdout and stderr from the training process.

    Returns:
        The final train_loss value as a float.

    Raises:
        ValueError: If no train_loss entry is found in ``output_text``.
    """
    # A float literal as Python prints it: optional sign, digits with an
    # optional fractional part (or a bare fraction), optional exponent; or the
    # special values nan / inf. Only non-capturing groups are used so that
    # re.findall returns the single outer capture group as a string.
    number = r"[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan|inf)"
    key = r"['\"]train_loss['\"]:\s*"

    # Preferred form: {'key': value, 'train_loss': value, ...}
    dict_pattern = r"\{[^{}]*" + key + "(" + number + r")[^{}]*\}"
    matches = re.findall(dict_pattern, output_text)

    if not matches:
        # Fallback: a quoted train_loss key in any context.
        matches = re.findall(key + "(" + number + ")", output_text)

    if not matches:
        raise ValueError("No train_loss found in training output")

    # The last occurrence in the output is the final reported loss.
    return float(matches[-1])
74+
75+
4376def test_train_can_resume_from_checkpoint (monkeypatch , tmp_path : Path ):
4477 """Test that train.py runs successfully with sanity config and creates expected outputs."""
4578
@@ -51,11 +84,20 @@ def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
5184 monkeypatch .setenv ("RANK" , "0" )
5285 monkeypatch .setenv ("WORLD_SIZE" , "1" )
5386 monkeypatch .setenv ("MASTER_ADDR" , "localhost" )
54- monkeypatch .setenv ("MASTER_PORT" , "29500 " )
87+ monkeypatch .setenv ("MASTER_PORT" , f" { random . randint ( 20000 , 40000 ) } " )
5588 monkeypatch .setenv ("WANDB_MODE" , "disabled" )
5689
5790 with initialize_config_dir (config_dir = str (recipe_dir / "hydra_config" ), version_base = "1.2" ):
58- sanity_config = compose (config_name = "L0_sanity" , overrides = [f"trainer.output_dir={ tmp_path } " ])
91+ sanity_config = compose (
92+ config_name = "L0_sanity" ,
93+ overrides = [
94+ f"trainer.output_dir={ tmp_path } " ,
95+ "stop_after_n_steps=4" ,
96+ "trainer.do_eval=False" ,
97+ "trainer.save_steps=2" ,
98+ f"hydra.run.dir={ tmp_path } /outputs" ,
99+ ],
100+ )
59101
60102 main (sanity_config )
61103
@@ -155,11 +197,15 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
155197 str (accelerate_config_path ),
156198 "--num_processes" ,
157199 "1" ,
200+ "--main_process_port" ,
201+ f"{ random .randint (20000 , 40000 )} " ,
158202 str (train_py ),
159203 "--config-name" ,
160204 "L0_sanity.yaml" ,
161205 f"model_tag={ model_tag } " ,
162206 f"trainer.output_dir={ tmp_path } " ,
207+ f"hydra.run.dir={ tmp_path } /outputs" ,
208+ "trainer.do_eval=False" ,
163209 ]
164210
165211 result = subprocess .run (
@@ -176,6 +222,17 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
176222 print (f"STDERR:\n { result .stderr } " )
177223 pytest .fail (f"Command:\n { ' ' .join (cmd )} \n failed with exit code { result .returncode } " )
178224
225+ # Parse the training output to check final train_loss
226+ combined_output = result .stdout + result .stderr
227+ try :
228+ final_train_loss = extract_final_train_loss (combined_output )
229+ print (f"Final train_loss: { final_train_loss } " )
230+ assert final_train_loss < 3.0 , f"Final train_loss { final_train_loss } should be less than 3.0"
231+ except ValueError as e :
232+ print (f"STDOUT:\n { result .stdout } " )
233+ print (f"STDERR:\n { result .stderr } " )
234+ pytest .fail (f"Failed to extract train_loss from output: { e } " )
235+
179236
180237@requires_multi_gpu
181238@pytest .mark .parametrize (
@@ -186,9 +243,11 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
186243 # modeling_esm_te import seems to fix it.
187244 # ("fsdp1_te.yaml", "nvidia/esm2_t6_8M_UR50D"),
188245 ("fsdp2_te.yaml" , "nvidia/esm2_t6_8M_UR50D" ),
189- ("default.yaml" , "facebook/esm2_t6_8M_UR50D" ),
190- ("fsdp1_hf.yaml" , "facebook/esm2_t6_8M_UR50D" ),
191- ("fsdp2_hf.yaml" , "facebook/esm2_t6_8M_UR50D" ),
246+ # TODO: (BIONEMO-2761). These tests were broken by https://github.com/huggingface/transformers/pull/40370, but
247+ # oddly the single-GPU tests still seem to pass. Changing the attention_backend doesn't seem to help.
248+ # ("default.yaml", "facebook/esm2_t6_8M_UR50D"),
249+ # ("fsdp1_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
250+ # ("fsdp2_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
192251 ],
193252)
194253def test_accelerate_launch_multi_gpu (accelerate_config , model_tag , tmp_path ):
@@ -211,11 +270,15 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
211270 str (accelerate_config_path ),
212271 "--num_processes" ,
213272 "2" ,
273+ "--main_process_port" ,
274+ f"{ random .randint (20000 , 40000 )} " ,
214275 str (train_py ),
215276 "--config-name" ,
216277 "L0_sanity.yaml" ,
217278 f"model_tag={ model_tag } " ,
218279 f"trainer.output_dir={ tmp_path } " ,
280+ f"hydra.run.dir={ tmp_path } /outputs" ,
281+ "trainer.do_eval=False" ,
219282 ]
220283
221284 result = subprocess .run (
@@ -231,3 +294,14 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
231294 print (f"STDOUT:\n { result .stdout } " )
232295 print (f"STDERR:\n { result .stderr } " )
233296 pytest .fail (f"Command:\n { ' ' .join (cmd )} \n failed with exit code { result .returncode } " )
297+
298+ # Parse the training output to check final train_loss
299+ combined_output = result .stdout + result .stderr
300+ try :
301+ final_train_loss = extract_final_train_loss (combined_output )
302+ print (f"Final train_loss: { final_train_loss } " )
303+ assert final_train_loss < 3.0 , f"Final train_loss { final_train_loss } should be less than 3.0"
304+ except ValueError as e :
305+ print (f"STDOUT:\n { result .stdout } " )
306+ print (f"STDERR:\n { result .stderr } " )
307+ pytest .fail (f"Failed to extract train_loss from output: { e } " )
0 commit comments