Skip to content

Commit 0d30652

Browse files
authored
add partial conv tests to esm2_accelerate recipe (#1122)
Adds partial convergence tests to the esm2_accelerate recipe, similar to those used in the mfsdp recipe.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit
- New Features
  - Added configurable warmup steps (default 0) to training.
- Chores
  - Increased default training duration (more steps).
  - Reduced frequency of saving, evaluation, and logging to lower overhead.
- Tests
  - Improved distributed run stability by using dynamic, collision-free ports.
  - Added parsing of final training loss from output and assertions to ensure expected convergence.
  - Streamlined test overrides for faster, deterministic sanity runs.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent a1e2340 commit 0d30652

4 files changed

Lines changed: 88 additions & 11 deletions

File tree

.devcontainer/recipes/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ megatron-fsdp==0.1.0rc0
77
torchmetrics
88
tqdm
99
transformer_engine
10-
transformers @ git+https://github.com/huggingface/transformers.git
10+
transformers
1111
typer
1212
wandb
Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
defaults:
22
- defaults
3+
- _self_
34

45
model_tag: "nvidia/esm2_t6_8M_UR50D"
5-
stop_after_n_steps: 4
6+
stop_after_n_steps: 250
7+
68
trainer:
79
run_name: "esm2_t6_8M_UR50D_sanity"
810
per_device_train_batch_size: 2
911
per_device_eval_batch_size: 2
10-
save_steps: 2
11-
eval_steps: 2
12-
logging_steps: 1
12+
save_steps: 1000
13+
eval_steps: 1000
14+
logging_steps: 10
1315
report_to: "none"
1416
dataloader_num_workers: 0
17+
warmup_steps: 0

recipes/esm2_accelerate/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ datasets
33
deepspeed
44
hydra-core
55
torchmetrics
6-
transformers @ git+https://github.com/huggingface/transformers.git
6+
transformers
77
wandb

recipes/esm2_accelerate/test_train.py

Lines changed: 79 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515

1616
import os
17+
import random
1718
import re
1819
import shutil
1920
import subprocess
@@ -40,6 +41,38 @@
4041
)
4142

4243

44+
def extract_final_train_loss(output_text: str) -> float:
    """Return the last ``train_loss`` value reported in the training output.

    The text is first searched for flat, dictionary-like fragments such as
    ``{'train_runtime': 10.5, 'train_loss': 2.75, 'epoch': 1.0}``. Only when
    no such fragment exists does the search fall back to any bare
    ``'train_loss': <number>`` occurrence.

    Args:
        output_text: Combined stdout and stderr from the training process.

    Returns:
        The final (last-occurring) train_loss value as a float. A value
        printed as ``nan`` or ``inf`` is returned as such so the caller's
        threshold check fails on it instead of it being reported as missing.

    Raises:
        ValueError: If no train_loss entry is found in ``output_text``.
    """
    # A complete float literal: optional sign, then either digits with an
    # optional fraction/exponent, or nan/inf. The previous ``[0-9.]+`` stopped
    # at the exponent, so ``2.5e-05`` was silently read as ``2.5``.
    number = r"[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan\b|inf\b)"
    # The key may be single-quoted (Python dict repr) or double-quoted (JSON).
    key = r"['\"]train_loss['\"]:\s*"

    # Preferred form: the key inside a brace-delimited dict with no nesting.
    dict_pattern = r"\{[^{}]*" + key + "(" + number + r")[^{}]*\}"
    matches = re.findall(dict_pattern, output_text)

    if not matches:
        # Fallback: accept the key/value pair in any surrounding context.
        matches = re.findall(key + "(" + number + ")", output_text)

    if not matches:
        raise ValueError("No train_loss found in training output")

    # The last occurrence is the value reported at the end of training.
    return float(matches[-1])
74+
75+
4376
def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
4477
"""Test that train.py runs successfully with sanity config and creates expected outputs."""
4578

@@ -51,11 +84,20 @@ def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
5184
monkeypatch.setenv("RANK", "0")
5285
monkeypatch.setenv("WORLD_SIZE", "1")
5386
monkeypatch.setenv("MASTER_ADDR", "localhost")
54-
monkeypatch.setenv("MASTER_PORT", "29500")
87+
monkeypatch.setenv("MASTER_PORT", f"{random.randint(20000, 40000)}")
5588
monkeypatch.setenv("WANDB_MODE", "disabled")
5689

5790
with initialize_config_dir(config_dir=str(recipe_dir / "hydra_config"), version_base="1.2"):
58-
sanity_config = compose(config_name="L0_sanity", overrides=[f"trainer.output_dir={tmp_path}"])
91+
sanity_config = compose(
92+
config_name="L0_sanity",
93+
overrides=[
94+
f"trainer.output_dir={tmp_path}",
95+
"stop_after_n_steps=4",
96+
"trainer.do_eval=False",
97+
"trainer.save_steps=2",
98+
f"hydra.run.dir={tmp_path}/outputs",
99+
],
100+
)
59101

60102
main(sanity_config)
61103

@@ -155,11 +197,15 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
155197
str(accelerate_config_path),
156198
"--num_processes",
157199
"1",
200+
"--main_process_port",
201+
f"{random.randint(20000, 40000)}",
158202
str(train_py),
159203
"--config-name",
160204
"L0_sanity.yaml",
161205
f"model_tag={model_tag}",
162206
f"trainer.output_dir={tmp_path}",
207+
f"hydra.run.dir={tmp_path}/outputs",
208+
"trainer.do_eval=False",
163209
]
164210

165211
result = subprocess.run(
@@ -176,6 +222,17 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
176222
print(f"STDERR:\n{result.stderr}")
177223
pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
178224

225+
# Parse the training output to check final train_loss
226+
combined_output = result.stdout + result.stderr
227+
try:
228+
final_train_loss = extract_final_train_loss(combined_output)
229+
print(f"Final train_loss: {final_train_loss}")
230+
assert final_train_loss < 3.0, f"Final train_loss {final_train_loss} should be less than 3.0"
231+
except ValueError as e:
232+
print(f"STDOUT:\n{result.stdout}")
233+
print(f"STDERR:\n{result.stderr}")
234+
pytest.fail(f"Failed to extract train_loss from output: {e}")
235+
179236

180237
@requires_multi_gpu
181238
@pytest.mark.parametrize(
@@ -186,9 +243,11 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
186243
# modeling_esm_te import seems to fix it.
187244
# ("fsdp1_te.yaml", "nvidia/esm2_t6_8M_UR50D"),
188245
("fsdp2_te.yaml", "nvidia/esm2_t6_8M_UR50D"),
189-
("default.yaml", "facebook/esm2_t6_8M_UR50D"),
190-
("fsdp1_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
191-
("fsdp2_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
246+
# TODO: (BIONEMO-2761). These tests were broken by https://github.com/huggingface/transformers/pull/40370, but
247+
# oddly the single-GPU tests still seem to pass. Changing the attention_backend doesn't seem to help.
248+
# ("default.yaml", "facebook/esm2_t6_8M_UR50D"),
249+
# ("fsdp1_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
250+
# ("fsdp2_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
192251
],
193252
)
194253
def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
@@ -211,11 +270,15 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
211270
str(accelerate_config_path),
212271
"--num_processes",
213272
"2",
273+
"--main_process_port",
274+
f"{random.randint(20000, 40000)}",
214275
str(train_py),
215276
"--config-name",
216277
"L0_sanity.yaml",
217278
f"model_tag={model_tag}",
218279
f"trainer.output_dir={tmp_path}",
280+
f"hydra.run.dir={tmp_path}/outputs",
281+
"trainer.do_eval=False",
219282
]
220283

221284
result = subprocess.run(
@@ -231,3 +294,14 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
231294
print(f"STDOUT:\n{result.stdout}")
232295
print(f"STDERR:\n{result.stderr}")
233296
pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
297+
298+
# Parse the training output to check final train_loss
299+
combined_output = result.stdout + result.stderr
300+
try:
301+
final_train_loss = extract_final_train_loss(combined_output)
302+
print(f"Final train_loss: {final_train_loss}")
303+
assert final_train_loss < 3.0, f"Final train_loss {final_train_loss} should be less than 3.0"
304+
except ValueError as e:
305+
print(f"STDOUT:\n{result.stdout}")
306+
print(f"STDERR:\n{result.stderr}")
307+
pytest.fail(f"Failed to extract train_loss from output: {e}")

0 commit comments

Comments
 (0)