Skip to content

Commit 04cb703

Browse files
Merge branch 'main' into pbinder/remove_core
2 parents 82409c1 + 0d30652 commit 04cb703

26 files changed

Lines changed: 2458 additions & 79 deletions

File tree

.devcontainer/recipes/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ megatron-fsdp==0.1.0rc0
77
torchmetrics
88
tqdm
99
transformer_engine
10-
transformers @ git+https://github.com/huggingface/transformers.git
10+
transformers
1111
typer
1212
wandb

.github/workflows/unit-tests-recipes.yml

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,30 @@ jobs:
8989
fi
9090
fi
9191
92-
echo "dirs=$DIRS" >> $GITHUB_OUTPUT
92+
# Assign Docker images to the selected directories
93+
# Currently, AMPLIFY is the only folder that needs a custom base image, since we have to support both TE and
94+
# xformers-based models for golden value testing. The rest of the models use the default pytorch image.
95+
96+
# This uses a squashed version of the pytorch:25.06-py3 image, generated with `docker-squash
97+
# nvcr.io/nvidia/pytorch:25.06-py3 -t svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed --output
98+
# type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15` and pushed
99+
# to the dockerhub registry. Our github actions are able to cache image pulls from dockerhub but not nvcr, so
100+
# hopefully this cuts down slightly on CI time at the expense of having a slightly indirect image location.
101+
102+
DIRS_WITH_IMAGES=$(echo "$DIRS" | jq -c '
103+
map({
104+
dir: .,
105+
image: (
106+
if . == "models/amplify" then
107+
"svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025"
108+
else
109+
"svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed-zstd"
110+
end
111+
)
112+
})
113+
')
114+
echo "dirs=$DIRS_WITH_IMAGES" >> $GITHUB_OUTPUT
115+
93116
- name: Show output
94117
run: |
95118
echo "=== Changed Files Analysis ==="
@@ -106,31 +129,43 @@ jobs:
106129
needs: changed-dirs
107130
runs-on: linux-amd64-gpu-l4-latest-1
108131
if: ${{ needs.changed-dirs.outputs.dirs != '[]' }}
132+
container:
133+
image: ${{ matrix.recipe.image }}
134+
strategy:
135+
matrix:
136+
recipe: ${{ fromJson(needs.changed-dirs.outputs.dirs) }}
137+
fail-fast: false
109138

110139
steps:
140+
141+
- name: Show GPU info
142+
run: nvidia-smi
111143
- name: Setup proxy cache
112144
uses: nv-gha-runners/setup-proxy-cache@main
113145

114146
- name: Checkout repository
115147
uses: actions/checkout@v4
116-
117-
- name: Setup python
118-
uses: actions/setup-python@v5
119148
with:
120-
python-version: "3.12"
149+
sparse-checkout: "${{ matrix.recipe.dir }}"
150+
sparse-checkout-cone-mode: false
121151

122-
- name: Install ci script dependencies
152+
- name: Install dependencies
153+
working-directory: ${{ matrix.recipe.dir }}
123154
run: |
124-
python -m pip install --upgrade pip
125-
pip install platformdirs
155+
if [ -f pyproject.toml ] || [ -f setup.py ]; then
156+
PIP_CONSTRAINT= pip install -e .
157+
echo "Installed ${{ matrix.recipe.dir }} as editable package"
158+
elif [ -f requirements.txt ]; then
159+
PIP_CONSTRAINT= pip install -r requirements.txt
160+
echo "Installed ${{ matrix.recipe.dir }} from requirements.txt"
161+
else
162+
echo "No pyproject.toml, setup.py, or requirements.txt found in ${{ matrix.recipe.dir }}"
163+
exit 1
164+
fi
126165
127166
- name: Run tests
128-
env:
129-
DIRS_JSON: ${{ needs.changed-dirs.outputs.dirs }}
130-
run: |
131-
# Convert JSON array to space-separated arguments
132-
DIRS_ARGS=$(echo "$DIRS_JSON" | jq -r '.[]' | tr '\n' ' ')
133-
./ci/scripts/recipes_local_test.py $DIRS_ARGS
167+
working-directory: ${{ matrix.recipe.dir }}
168+
run: pytest -v .
134169

135170
verify-recipe-tests:
136171
# This job checks the status of the unit-tests matrix and fails if any matrix job failed or was cancelled.

ci/benchmarks/partial-conv/evo2_pretrain.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@ key_segments:
77
lr: False
88
min_lr: False
99
wu_steps: False
10-
artefacts_url: False
10+
pckg_url: False
1111
file_name_wheel: False
1212
script_args:
1313
# All arguments referenced in the script string must be specified here.
1414
# Arguments not referenced in the script string must have the 'arg' field specified.
1515
# See jet/core/configs.py for the specification of the configuration class
1616
workspace: /workspace/bionemo2
1717
data_path: /data/evo2
18-
artefacts_url: https://__token__:${{JET_GITLAB_TOKEN}}@gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple
18+
pckg_url: gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple/
1919
file_name_wheel: subquadratic-ops
2020
model: evo2
2121
variant: train
@@ -40,7 +40,7 @@ script_args:
4040
script: |-
4141
INSTALL_FLAG="/tmp/install_done_${{SLURMD_NODENAME}}";
4242
if [ "$SLURM_LOCALID" = "0" ]; then
43-
pip install ${file_name_wheel} --index-url ${artefacts_url}
43+
pip install ${file_name_wheel} --index-url https://oauth2:$JET_GITLAB_TOKEN@${pckg_url} --extra-index-url https://pypi.org/simple/
4444
touch $INSTALL_FLAG
4545
fi
4646
# All ranks wait until install flag file appears

models/amplify/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ dependencies = [
1919
"nvidia_resiliency_ext",
2020
"omegaconf",
2121
"pytest",
22-
"torch",
23-
# "transformer_engine[pytorch]",
22+
"torch==2.6.0a0+ecf3bae40a.nv25.01",
23+
"transformer_engine[pytorch]",
2424
"transformers",
2525
"xformers",
2626
]
Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
defaults:
22
- defaults
3+
- _self_
34

45
model_tag: "nvidia/esm2_t6_8M_UR50D"
5-
stop_after_n_steps: 4
6+
stop_after_n_steps: 250
7+
68
trainer:
79
run_name: "esm2_t6_8M_UR50D_sanity"
810
per_device_train_batch_size: 2
911
per_device_eval_batch_size: 2
10-
save_steps: 2
11-
eval_steps: 2
12-
logging_steps: 1
12+
save_steps: 1000
13+
eval_steps: 1000
14+
logging_steps: 10
1315
report_to: "none"
1416
dataloader_num_workers: 0
17+
warmup_steps: 0

recipes/esm2_accelerate/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ datasets
33
deepspeed
44
hydra-core
55
torchmetrics
6-
transformers @ git+https://github.com/huggingface/transformers.git
6+
transformers
77
wandb

recipes/esm2_accelerate/test_train.py

Lines changed: 79 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515

1616
import os
17+
import random
1718
import re
1819
import shutil
1920
import subprocess
@@ -40,6 +41,38 @@
4041
)
4142

4243

44+
def extract_final_train_loss(output_text: str) -> float:
    """Parse the training output to extract the final train_loss value.

    The function first looks for dictionary-like log records such as
    ``{'train_runtime': 1.0, 'train_loss': 2.3, ...}``. If none are found it
    falls back to any ``'train_loss': <number>`` occurrence in the text. When
    several values are present, the last one is returned.

    Args:
        output_text: Combined stdout and stderr from training process

    Returns:
        Final train_loss value as float. May be ``nan`` or ``inf`` if that is
        what the training process printed.

    Raises:
        ValueError: If no train_loss found or parsing fails
    """
    # A full float literal as produced by Python's float repr: optional sign, digits with an optional fraction,
    # an optional exponent (e.g. 2.5e-05), or the special values nan / inf. Capturing only `[0-9.]+` would
    # truncate `2.5e-05` to `2.5` and silently report a wrong loss.
    number = r"[-+]?(?:(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?|nan|inf(?:inity)?)"

    # Look for dictionary-like patterns containing train_loss
    # Pattern matches: {'key': value, 'train_loss': value, ...}
    dict_pattern = r"\{[^{}]*['\"]train_loss['\"]:\s*(" + number + r")[^{}]*\}"
    matches = re.findall(dict_pattern, output_text)

    if not matches:
        # Fallback: try to find train_loss in any context
        simple_pattern = r"['\"]train_loss['\"]:\s*(" + number + r")"
        matches = re.findall(simple_pattern, output_text)

    if not matches:
        raise ValueError("No train_loss found in training output")

    # Return the last (final) train_loss value found
    return float(matches[-1])
74+
75+
4376
def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
4477
"""Test that train.py runs successfully with sanity config and creates expected outputs."""
4578

@@ -51,11 +84,20 @@ def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
5184
monkeypatch.setenv("RANK", "0")
5285
monkeypatch.setenv("WORLD_SIZE", "1")
5386
monkeypatch.setenv("MASTER_ADDR", "localhost")
54-
monkeypatch.setenv("MASTER_PORT", "29500")
87+
monkeypatch.setenv("MASTER_PORT", f"{random.randint(20000, 40000)}")
5588
monkeypatch.setenv("WANDB_MODE", "disabled")
5689

5790
with initialize_config_dir(config_dir=str(recipe_dir / "hydra_config"), version_base="1.2"):
58-
sanity_config = compose(config_name="L0_sanity", overrides=[f"trainer.output_dir={tmp_path}"])
91+
sanity_config = compose(
92+
config_name="L0_sanity",
93+
overrides=[
94+
f"trainer.output_dir={tmp_path}",
95+
"stop_after_n_steps=4",
96+
"trainer.do_eval=False",
97+
"trainer.save_steps=2",
98+
f"hydra.run.dir={tmp_path}/outputs",
99+
],
100+
)
59101

60102
main(sanity_config)
61103

@@ -155,11 +197,15 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
155197
str(accelerate_config_path),
156198
"--num_processes",
157199
"1",
200+
"--main_process_port",
201+
f"{random.randint(20000, 40000)}",
158202
str(train_py),
159203
"--config-name",
160204
"L0_sanity.yaml",
161205
f"model_tag={model_tag}",
162206
f"trainer.output_dir={tmp_path}",
207+
f"hydra.run.dir={tmp_path}/outputs",
208+
"trainer.do_eval=False",
163209
]
164210

165211
result = subprocess.run(
@@ -176,6 +222,17 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
176222
print(f"STDERR:\n{result.stderr}")
177223
pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
178224

225+
# Parse the training output to check final train_loss
226+
combined_output = result.stdout + result.stderr
227+
try:
228+
final_train_loss = extract_final_train_loss(combined_output)
229+
print(f"Final train_loss: {final_train_loss}")
230+
assert final_train_loss < 3.0, f"Final train_loss {final_train_loss} should be less than 3.0"
231+
except ValueError as e:
232+
print(f"STDOUT:\n{result.stdout}")
233+
print(f"STDERR:\n{result.stderr}")
234+
pytest.fail(f"Failed to extract train_loss from output: {e}")
235+
179236

180237
@requires_multi_gpu
181238
@pytest.mark.parametrize(
@@ -186,9 +243,11 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
186243
# modeling_esm_te import seems to fix it.
187244
# ("fsdp1_te.yaml", "nvidia/esm2_t6_8M_UR50D"),
188245
("fsdp2_te.yaml", "nvidia/esm2_t6_8M_UR50D"),
189-
("default.yaml", "facebook/esm2_t6_8M_UR50D"),
190-
("fsdp1_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
191-
("fsdp2_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
246+
# TODO: (BIONEMO-2761). These tests were broken by https://github.com/huggingface/transformers/pull/40370, but
247+
# oddly the single-GPU tests still seem to pass. Changing the attention_backend doesn't seem to help.
248+
# ("default.yaml", "facebook/esm2_t6_8M_UR50D"),
249+
# ("fsdp1_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
250+
# ("fsdp2_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
192251
],
193252
)
194253
def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
@@ -211,11 +270,15 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
211270
str(accelerate_config_path),
212271
"--num_processes",
213272
"2",
273+
"--main_process_port",
274+
f"{random.randint(20000, 40000)}",
214275
str(train_py),
215276
"--config-name",
216277
"L0_sanity.yaml",
217278
f"model_tag={model_tag}",
218279
f"trainer.output_dir={tmp_path}",
280+
f"hydra.run.dir={tmp_path}/outputs",
281+
"trainer.do_eval=False",
219282
]
220283

221284
result = subprocess.run(
@@ -231,3 +294,14 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
231294
print(f"STDOUT:\n{result.stdout}")
232295
print(f"STDERR:\n{result.stderr}")
233296
pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
297+
298+
# Parse the training output to check final train_loss
299+
combined_output = result.stdout + result.stderr
300+
try:
301+
final_train_loss = extract_final_train_loss(combined_output)
302+
print(f"Final train_loss: {final_train_loss}")
303+
assert final_train_loss < 3.0, f"Final train_loss {final_train_loss} should be less than 3.0"
304+
except ValueError as e:
305+
print(f"STDOUT:\n{result.stdout}")
306+
print(f"STDERR:\n{result.stderr}")
307+
pytest.fail(f"Failed to extract train_loss from output: {e}")

recipes/esm2_native_te_nvfsdp_thd/hydra_config/L1_3B_ddp.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ defaults:
22
- defaults
33

44
# Training config
5-
model_name: esm2_t33_650M_UR50D
5+
model_name: esm2_t36_3B_UR50D
66
micro_batch_size: 32
77
num_train_steps: 10_000
88

recipes/esm2_native_te_nvfsdp_thd/test_thd_format.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -226,10 +226,10 @@ def test_mlm_data_collator_integration():
226226
if mlm_prob == 0.0:
227227
# No masking - all labels should be -100
228228
assert (sample["labels"] == -100).all(), "With mlm_probability=0.0, all labels should be -100"
229-
else:
230-
# Some masking should occur
231-
masked_count = (sample["labels"] != -100).sum()
232-
assert masked_count > 0, f"With mlm_probability={mlm_prob}, some tokens should be masked"
229+
# TODO: This test is very flaky with such a small input batch; we should make it larger if we want to ensure a
230+
# token is masked
231+
# else: # Some masking should occur masked_count = (sample["labels"] != -100).sum() assert
232+
# masked_count > 0, f"With mlm_probability={mlm_prob}, some tokens should be masked"
233233

234234

235235
if __name__ == "__main__":

recipes/esm2_native_te_nvfsdp_thd/test_train.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,16 @@
1616
from pathlib import Path
1717

1818
import pytest
19+
import torch
1920
from hydra import compose, initialize_config_dir
2021

2122
from train import main
2223

2324

24-
@pytest.mark.xfail(reason="CUDNN padded packed sequences not supported on all hardware currently.")
25+
@pytest.mark.xfail(
26+
torch.cuda.get_device_capability() == (12, 0),
27+
reason="CUDNN padded packed sequences not supported on all hardware currently (nvbugs/5458694).",
28+
)
2529
def test_main_invocation(monkeypatch, tmp_path):
2630
"""Test that the main function can be invoked with the correct arguments."""
2731

@@ -43,7 +47,10 @@ def test_main_invocation(monkeypatch, tmp_path):
4347
main(sanity_config)
4448

4549

46-
@pytest.mark.xfail(reason="CUDNN padded packed sequences not supported on all hardware currently.")
50+
@pytest.mark.xfail(
51+
torch.cuda.get_device_capability() == (12, 0),
52+
reason="CUDNN padded packed sequences not supported on all hardware currently (nvbugs/5458694).",
53+
)
4754
def test_main_invocation_ddp(monkeypatch, tmp_path):
4855
"""Test that the main function can be invoked wrapping the model in DDP."""
4956

0 commit comments

Comments
 (0)