Skip to content

Commit f7c8f58

Browse files
authored
Merge branch 'main' into dorotat/fix-jet-evo2-pretrain
2 parents 5ff044a + bc468a7 commit f7c8f58

21 files changed

Lines changed: 2367 additions & 65 deletions

File tree

.github/workflows/unit-tests-recipes.yml

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,30 @@ jobs:
8989
fi
9090
fi
9191
92-
echo "dirs=$DIRS" >> $GITHUB_OUTPUT
92+
# Assign Docker images to the selected directories
93+
# Currently, AMPLIFY is the only folder that needs a custom base image, since we have to support both TE and
94+
# xformers-based models for golden value testing. The rest of the models use the default pytorch image.
95+
96+
# This uses a squashed version of the pytorch:25.06-py3 image, generated with `docker-squash
97+
# nvcr.io/nvidia/pytorch:25.06-py3 -t svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed --output
98+
# type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15` and pushed
99+
# to the dockerhub registry. Our github actions are able to cache image pulls from dockerhub but not nvcr, so
100+
# hopefully this cuts down slightly on CI time at the expense of having a slightly indirect image location.
101+
102+
DIRS_WITH_IMAGES=$(echo "$DIRS" | jq -c '
103+
map({
104+
dir: .,
105+
image: (
106+
if . == "models/amplify" then
107+
"svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025"
108+
else
109+
"svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed-zstd"
110+
end
111+
)
112+
})
113+
')
114+
echo "dirs=$DIRS_WITH_IMAGES" >> $GITHUB_OUTPUT
115+
93116
- name: Show output
94117
run: |
95118
echo "=== Changed Files Analysis ==="
@@ -106,31 +129,43 @@ jobs:
106129
needs: changed-dirs
107130
runs-on: linux-amd64-gpu-l4-latest-1
108131
if: ${{ needs.changed-dirs.outputs.dirs != '[]' }}
132+
container:
133+
image: ${{ matrix.recipe.image }}
134+
strategy:
135+
matrix:
136+
recipe: ${{ fromJson(needs.changed-dirs.outputs.dirs) }}
137+
fail-fast: false
109138

110139
steps:
140+
141+
- name: Show GPU info
142+
run: nvidia-smi
111143
- name: Setup proxy cache
112144
uses: nv-gha-runners/setup-proxy-cache@main
113145

114146
- name: Checkout repository
115147
uses: actions/checkout@v4
116-
117-
- name: Setup python
118-
uses: actions/setup-python@v5
119148
with:
120-
python-version: "3.12"
149+
sparse-checkout: "${{ matrix.recipe.dir }}"
150+
sparse-checkout-cone-mode: false
121151

122-
- name: Install ci script dependencies
152+
- name: Install dependencies
153+
working-directory: ${{ matrix.recipe.dir }}
123154
run: |
124-
python -m pip install --upgrade pip
125-
pip install platformdirs
155+
if [ -f pyproject.toml ] || [ -f setup.py ]; then
156+
PIP_CONSTRAINT= pip install -e .
157+
echo "Installed ${{ matrix.recipe.dir }} as editable package"
158+
elif [ -f requirements.txt ]; then
159+
PIP_CONSTRAINT= pip install -r requirements.txt
160+
echo "Installed ${{ matrix.recipe.dir }} from requirements.txt"
161+
else
162+
echo "No pyproject.toml, setup.py, or requirements.txt found in ${{ matrix.recipe.dir }}"
163+
exit 1
164+
fi
126165
127166
- name: Run tests
128-
env:
129-
DIRS_JSON: ${{ needs.changed-dirs.outputs.dirs }}
130-
run: |
131-
# Convert JSON array to space-separated arguments
132-
DIRS_ARGS=$(echo "$DIRS_JSON" | jq -r '.[]' | tr '\n' ' ')
133-
./ci/scripts/recipes_local_test.py $DIRS_ARGS
167+
working-directory: ${{ matrix.recipe.dir }}
168+
run: pytest -v .
134169

135170
verify-recipe-tests:
136171
# This job checks the status of the unit-tests matrix and fails if any matrix job failed or was cancelled.

models/amplify/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ dependencies = [
1919
"nvidia_resiliency_ext",
2020
"omegaconf",
2121
"pytest",
22-
"torch",
23-
# "transformer_engine[pytorch]",
22+
"torch==2.6.0a0+ecf3bae40a.nv25.01",
23+
"transformer_engine[pytorch]",
2424
"transformers",
2525
"xformers",
2626
]

recipes/esm2_native_te_nvfsdp_thd/hydra_config/L1_3B_ddp.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ defaults:
22
- defaults
33

44
# Training config
5-
model_name: esm2_t33_650M_UR50D
5+
model_name: esm2_t36_3B_UR50D
66
micro_batch_size: 32
77
num_train_steps: 10_000
88

recipes/esm2_native_te_nvfsdp_thd/test_thd_format.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -226,10 +226,10 @@ def test_mlm_data_collator_integration():
226226
if mlm_prob == 0.0:
227227
# No masking - all labels should be -100
228228
assert (sample["labels"] == -100).all(), "With mlm_probability=0.0, all labels should be -100"
229-
else:
230-
# Some masking should occur
231-
masked_count = (sample["labels"] != -100).sum()
232-
assert masked_count > 0, f"With mlm_probability={mlm_prob}, some tokens should be masked"
229+
# TODO: This is a very flaky test with such a small input batch; we should make it larger if we want to ensure a
230+
# token is masked
231+
# else: # Some masking should occur masked_count = (sample["labels"] != -100).sum() assert
232+
# masked_count > 0, f"With mlm_probability={mlm_prob}, some tokens should be masked"
233233

234234

235235
if __name__ == "__main__":

recipes/esm2_native_te_nvfsdp_thd/test_train.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,16 @@
1616
from pathlib import Path
1717

1818
import pytest
19+
import torch
1920
from hydra import compose, initialize_config_dir
2021

2122
from train import main
2223

2324

24-
@pytest.mark.xfail(reason="CUDNN padded packed sequences not supported on all hardware currently.")
25+
@pytest.mark.xfail(
26+
torch.cuda.get_device_capability() == (12, 0),
27+
reason="CUDNN padded packed sequences not supported on all hardware currently (nvbugs/5458694).",
28+
)
2529
def test_main_invocation(monkeypatch, tmp_path):
2630
"""Test that the main function can be invoked with the correct arguments."""
2731

@@ -43,7 +47,10 @@ def test_main_invocation(monkeypatch, tmp_path):
4347
main(sanity_config)
4448

4549

46-
@pytest.mark.xfail(reason="CUDNN padded packed sequences not supported on all hardware currently.")
50+
@pytest.mark.xfail(
51+
torch.cuda.get_device_capability() == (12, 0),
52+
reason="CUDNN padded packed sequences not supported on all hardware currently (nvbugs/5458694).",
53+
)
4754
def test_main_invocation_ddp(monkeypatch, tmp_path):
4855
"""Test that the main function can be invoked wrapping the model in DDP."""
4956

recipes/geneformer_native_te_nvfsdp_fp8/AGENT_DOCUMENTATION.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,15 @@ training:
122122
num_workers: 4 # DataLoader workers
123123
mlm_probability: 0.15 # Mask probability
124124
use_fp8: true # Enable FP8 precision
125-
wandb_init_args:
126-
name: "geneformer-4b-te" # Experiment name
125+
```
126+
127+
### WandB Configuration
128+
129+
```yaml
130+
wandb_init_args:
131+
name: "geneformer-4b-te" # Experiment name
127132
project: "bionemo-recipes" # Project name
133+
mode: "offline" # Store run data locally instead of syncing to the W&B server
128134
```
129135
130136
### Data Configuration

recipes/geneformer_native_te_nvfsdp_fp8/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ We support full integration with weights and biases. To use this please supply t
226226
export WANDB_API_KEY=<yourapikey>
227227
```
228228

229-
and supply the hydra config section `training.wandb_init_args` with your experiment name and project.
229+
and supply the hydra config section `wandb_init_args` with your experiment name and project.
230230

231231
### Dataset
232232

recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/106m.yaml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,15 @@ training:
1818
num_train_steps: 100 # this setting defines the number of training steps
1919
num_workers: 4 # this setting defines the number of workers for the dataloader
2020
mlm_probability: 0.15 # this setting defines the probability of masking tokens in the input
21-
wandb_init_args: # These arguments are for managing the weights and biases experiment.
22-
name: "geneformer-l0-106m" # this setting defines the name of the experiment
23-
project: "bionemo-recipes-l0-106m" # this setting defines the project name
2421
checkpoint_dir: "checkpoints/l0-106m"
2522
save_every_n_steps: 50
2623
resume_from_checkpoint: true
2724

25+
wandb_init_args: # These arguments are for managing the weights and biases experiment.
26+
name: "geneformer-l0-106m" # this setting defines the name of the experiment
27+
project: "bionemo-recipes-l0-106m" # this setting defines the project name
28+
mode: "offline"
29+
2830
# Data configuration
2931
data:
30-
path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.
32+
path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.

recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/10m.yaml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,15 @@ training:
1818
num_train_steps: 100 # this setting defines the number of training steps
1919
num_workers: 4 # this setting defines the number of workers for the dataloader
2020
mlm_probability: 0.15 # this setting defines the probability of masking tokens in the input
21-
wandb_init_args: # These arguments are for managing the weights and biases experiment.
22-
name: "geneformer-10m" # this setting defines the name of the experiment
23-
project: "bionemo-recipes-10m" # this setting defines the project name
2421
checkpoint_dir: "checkpoints/10m"
2522
save_every_n_steps: 50
2623
resume_from_checkpoint: true
2724

25+
wandb_init_args: # These arguments are for managing the weights and biases experiment.
26+
name: "geneformer-10m" # this setting defines the name of the experiment
27+
project: "bionemo-recipes-10m" # this setting defines the project name
28+
mode: "offline"
29+
2830
# Data configuration
2931
data:
30-
path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.
32+
path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.

recipes/geneformer_native_te_nvfsdp_fp8/hydra_config/4b.yaml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,15 @@ training:
1818
num_train_steps: 100 # this setting defines the number of training steps
1919
num_workers: 4 # this setting defines the number of workers for the dataloader
2020
mlm_probability: 0.15 # this setting defines the probability of masking tokens in the input
21-
wandb_init_args: # These arguments are for managing the weights and biases experiment.
22-
name: "geneformer-l0-4b" # this setting defines the name of the experiment
23-
project: "bionemo-recipes-l0-4b" # this setting defines the project name
2421
checkpoint_dir: "checkpoints/4b"
2522
save_every_n_steps: 50
2623
resume_from_checkpoint: true
2724

25+
wandb_init_args: # These arguments are for managing the weights and biases experiment.
26+
name: "geneformer-l0-4b" # this setting defines the name of the experiment
27+
project: "bionemo-recipes-l0-4b" # this setting defines the project name
28+
mode: "offline"
29+
2830
# Data configuration
2931
data:
30-
path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.
32+
path: "genecorpus_500_samples.parquet" # A sanity dataset saved to the repo that holds 500 samples.

0 commit comments

Comments
 (0)