Skip to content

Commit b7a7e1f

Browse files
authored
Merge branch 'main' into jwilber/add-dashboard-build-action
2 parents 54d2301 + 1ac89fc commit b7a7e1f

130 files changed

Lines changed: 4640 additions & 725 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.devcontainer/recipes/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ megatron-fsdp==0.1.0rc0
77
torchmetrics
88
tqdm
99
transformer_engine
10-
transformers @ git+https://github.com/huggingface/transformers.git
10+
transformers
1111
typer
1212
wandb
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
name: "BioNeMo Model Convergence Tests"
2+
3+
on:
4+
workflow_dispatch:
5+
6+
# run lepton tests
7+
# update dashboard
8+
9+
jobs:
10+
submit-lepton-jobs:
11+
runs-on: ubuntu-latest
12+
steps:
13+
- name: Checkout
14+
uses: actions/checkout@v4
15+
16+
- name: Submit Lepton Jobs
17+
run: |
18+
python ci/lepton/model_convergence/scripts/launch_job.py --config-name "evo2_finetune_lora"

.github/workflows/unit-tests-framework.yml

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,18 @@ jobs:
3030
fetch-depth: 0
3131
submodules: "recursive"
3232

33+
- name: Get merge-base commit
34+
id: merge-base
35+
run: |
36+
# Get the merge-base between current branch and main
37+
MERGE_BASE=$(git merge-base HEAD origin/main)
38+
echo "merge-base=$MERGE_BASE" >> $GITHUB_OUTPUT
39+
echo "Merge-base commit: $MERGE_BASE"
40+
3341
- uses: step-security/changed-files@v46
3442
id: changed-files
3543
with:
36-
base_sha: main
44+
base_sha: ${{ steps.merge-base.outputs.merge-base }}
3745
files: |
3846
**
3947
!models/**
@@ -42,6 +50,7 @@ jobs:
4250
!.github/**
4351
!.gitignore
4452
!.devcontainer/**
53+
!ci/scripts/recipes_local_test.py
4554
.github/workflows/unit-tests-framework.yml
4655
4756
- name: Show output
@@ -239,8 +248,6 @@ jobs:
239248
- name: Run notebook tests
240249
env:
241250
BIONEMO_DATA_SOURCE: ngc
242-
# this variable should be used in the notebooks to run a subset of the model layers or a smaller model/dataset
243-
FAST_CI_MODE: true
244251
run: |
245252
chmod +x ./ci/scripts/run_pytest_notebooks.sh
246253
./ci/scripts/run_pytest_notebooks.sh

.github/workflows/unit-tests-recipes.yml

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,21 @@ jobs:
3535
with:
3636
fetch-depth: 0
3737

38+
- name: Get merge-base commit
39+
id: merge-base
40+
run: |
41+
# Get the merge-base between current branch and main
42+
MERGE_BASE=$(git merge-base HEAD origin/main)
43+
echo "merge-base=$MERGE_BASE" >> $GITHUB_OUTPUT
44+
echo "Merge-base commit: $MERGE_BASE"
45+
3846
- name: Get changed files
3947
id: changed-files
4048
uses: step-security/changed-files@v46
4149
with:
4250
json: true
4351
matrix: true
44-
base_sha: main
52+
base_sha: ${{ steps.merge-base.outputs.merge-base }}
4553
dir_names: true
4654
dir_names_max_depth: 2
4755
files: |
@@ -84,21 +92,35 @@ jobs:
8492
# Assign Docker images to the selected directories
8593
# Currently, AMPLIFY is the only folder that needs a custom base image, since we have to support both TE and
8694
# xformers-based models for golden value testing. The rest of the models use the default pytorch image.
95+
96+
# This uses a squashed version of the pytorch:25.06-py3 image, generated with `docker-squash
97+
# nvcr.io/nvidia/pytorch:25.06-py3 -t svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed --output
98+
# type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15` and pushed
99+
# to the dockerhub registry. Our github actions are able to cache image pulls from dockerhub but not nvcr, so
100+
# hopefully this cuts down slightly on CI time at the expense of having a slightly indirect image location.
101+
87102
DIRS_WITH_IMAGES=$(echo "$DIRS" | jq -c '
88103
map({
89104
dir: .,
90105
image: (
91106
if . == "models/amplify" then
92107
"svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025"
93108
else
94-
"nvcr.io/nvidia/pytorch:25.06-py3"
109+
"svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed-zstd"
95110
end
96111
)
97112
})
98113
')
99114
echo "dirs=$DIRS_WITH_IMAGES" >> $GITHUB_OUTPUT
115+
100116
- name: Show output
101117
run: |
118+
echo "=== Changed Files Analysis ==="
119+
echo "Current branch: ${{ github.ref_name }}"
120+
echo "Merge-base commit: ${{ steps.merge-base.outputs.merge-base }}"
121+
echo "Changed files compared to merge-base:"
122+
echo '${{ steps.changed-files.outputs.all_changed_files }}' | jq -r '.[]' | sed 's/^/ - /'
123+
echo "Total changed files: $(echo '${{ steps.changed-files.outputs.all_changed_files }}' | jq '. | length')"
102124
echo '${{ toJSON(steps.changed-files.outputs) }}'
103125
echo '${{ toJSON(steps.set-dirs.outputs) }}'
104126
shell: bash
@@ -115,8 +137,12 @@ jobs:
115137
fail-fast: false
116138

117139
steps:
140+
141+
- name: Show GPU info
142+
run: nvidia-smi
118143
- name: Setup proxy cache
119144
uses: nv-gha-runners/setup-proxy-cache@main
145+
120146
- name: Checkout repository
121147
uses: actions/checkout@v4
122148
with:
@@ -125,7 +151,6 @@ jobs:
125151

126152
- name: Install dependencies
127153
working-directory: ${{ matrix.recipe.dir }}
128-
#
129154
run: |
130155
if [ -f pyproject.toml ] || [ -f setup.py ]; then
131156
PIP_CONSTRAINT= pip install -e .

3rdparty/NeMo

Submodule NeMo updated 63 files

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ apt-get install -qyy \
4949
curl \
5050
pre-commit \
5151
sudo \
52+
emacs-nox \
5253
gnupg \
5354
unzip \
5455
libsqlite3-dev

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ With a locally cloned repository and initialized submodules, build the BioNeMo c
8282
docker buildx build . -t my-container-tag
8383
```
8484

85+
If you see an error message like `No file descriptors available (os error 24)`, add the option `--ulimit nofile=65535:65535` to the docker build command.
86+
8587
#### VSCode Devcontainer for Interactive Debugging
8688

8789
We distribute a [development container](https://devcontainers.github.io/) configuration for vscode

bionemo-recipes.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ The biological AI community is actively prototyping model architectures and need
88

99
- **Flexible scaling**: Scale from single-GPU prototyping to multi-node training without complex parallelism configurations
1010
- **Framework compatibility**: Works with popular frameworks like HuggingFace Accelerate, PyTorch Lightning, and vanilla PyTorch
11-
- **Performance optimization**: Leverages TransformerEngine and nvFSDP for state-of-the-art training efficiency
11+
- **Performance optimization**: Leverages TransformerEngine and megatron-fsdp for state-of-the-art training efficiency
1212
- **Research-friendly**: Hackable, readable code that researchers can easily adapt for their experiments
1313

1414
### Use Cases
@@ -35,7 +35,7 @@ Example models include ESM-2, Geneformer, and AMPLIFY.
3535
Self-contained training examples demonstrating best practices for scaling biological foundation models. Each recipe is a complete Docker container with:
3636

3737
- **Framework examples**: Vanilla PyTorch, HuggingFace Accelerate, PyTorch Lightning
38-
- **Feature demonstrations**: FP8 training, nvFSDP, context parallelism, sequence packing
38+
- **Feature demonstrations**: FP8 training, megatron-fsdp, context parallelism, sequence packing
3939
- **Scaling strategies**: Single-GPU to multi-node training patterns
4040
- **Benchmarked performance**: Validated throughput and convergence metrics
4141

@@ -57,7 +57,7 @@ tokenizer = AutoTokenizer.from_pretrained("nvidia/AMPLIFY_120M")
5757

5858
```bash
5959
# Navigate to a recipe
60-
cd recipes/esm2_native_te_nvfsdp
60+
cd recipes/esm2_native_te_mfsdp
6161

6262
# Build and run
6363
docker build -t esm2_recipe .
@@ -191,4 +191,4 @@ For technical support and questions:
191191

192192
- Check existing issues before opening a new one
193193
- Review our training recipes for implementation examples
194-
- Consult the TransformerEngine and nvFSDP documentation for underlying technologies
194+
- Consult the TransformerEngine and megatron-fsdp documentation for underlying technologies

ci/benchmarks/partial-conv/evo2_finetuning.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,11 @@ script: |-
8989
--devices=${gpus} \
9090
--num-nodes=${nodes} \
9191
--val-check-interval=${val_check_interval} \
92-
--wandb-project=${wandb_project_name} \
93-
--wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
9492
--create-tensorboard-logger \
9593
--activation-checkpoint-recompute-num-layers=${activation_checkpoint_layers} \
9694
--disable-checkpointing \
9795
--early-stop-on-step=${stop_steps} \
96+
--wandb-project=${wandb_project_name} \
97+
--wandb-group=${model}_${variant}_${config_name}_${task}_${target} \
98+
--wandb-job-type=${pipeline_label} \
9899
--garbage-collect-at-inference;

ci/benchmarks/partial-conv/evo2_pretrain.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@ key_segments:
77
lr: False
88
min_lr: False
99
wu_steps: False
10-
artefacts_url: False
10+
pckg_url: False
1111
file_name_wheel: False
1212
script_args:
1313
# All arguments referenced in the script string must be specified here.
1414
# Arguments not referenced in the script string must have the 'arg' field specified.
1515
# See jet/core/configs.py for the specification of the configuration class
1616
workspace: /workspace/bionemo2
1717
data_path: /data/evo2
18-
artefacts_url: https://__token__:${{JET_GITLAB_TOKEN}}@gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple
18+
pckg_url: gitlab-master.nvidia.com/api/v4/projects/180496/packages/pypi/simple/
1919
file_name_wheel: subquadratic-ops
2020
model: evo2
2121
variant: train
@@ -40,7 +40,7 @@ script_args:
4040
script: |-
4141
INSTALL_FLAG="/tmp/install_done_${{SLURMD_NODENAME}}";
4242
if [ "$SLURM_LOCALID" = "0" ]; then
43-
pip install ${file_name_wheel} --index-url ${artefacts_url}
43+
pip install ${file_name_wheel} --index-url https://oauth2:$JET_GITLAB_TOKEN@${pckg_url} --extra-index-url https://pypi.org/simple/
4444
touch $INSTALL_FLAG
4545
fi
4646
# All ranks wait until install flag file appears

0 commit comments

Comments
 (0)