Skip to content

Commit 5ff044a

Browse files
committed
Merge remote-tracking branch 'origin/main' into dorotat/fix-jet-evo2-pretrain
2 parents c482697 + d91788a commit 5ff044a

17 files changed

Lines changed: 306 additions & 135 deletions

File tree

.github/workflows/unit-tests-framework.yml

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,18 @@ jobs:
3030
fetch-depth: 0
3131
submodules: "recursive"
3232

33+
- name: Get merge-base commit
34+
id: merge-base
35+
run: |
36+
# Get the merge-base between the current HEAD and origin/main
37+
MERGE_BASE=$(git merge-base HEAD origin/main)
38+
echo "merge-base=$MERGE_BASE" >> $GITHUB_OUTPUT
39+
echo "Merge-base commit: $MERGE_BASE"
40+
3341
- uses: step-security/changed-files@v46
3442
id: changed-files
3543
with:
36-
base_sha: main
44+
base_sha: ${{ steps.merge-base.outputs.merge-base }}
3745
files: |
3846
**
3947
!models/**
@@ -42,6 +50,7 @@ jobs:
4250
!.github/**
4351
!.gitignore
4452
!.devcontainer/**
53+
!ci/scripts/recipes_local_test.py
4554
.github/workflows/unit-tests-framework.yml
4655
4756
- name: Show output
@@ -239,8 +248,6 @@ jobs:
239248
- name: Run notebook tests
240249
env:
241250
BIONEMO_DATA_SOURCE: ngc
242-
# this variable should be used in the notebooks to run a subset of the model layers or a smaller model/dataset
243-
FAST_CI_MODE: true
244251
run: |
245252
chmod +x ./ci/scripts/run_pytest_notebooks.sh
246253
./ci/scripts/run_pytest_notebooks.sh

.github/workflows/unit-tests-recipes.yml

Lines changed: 30 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,21 @@ jobs:
3535
with:
3636
fetch-depth: 0
3737

38+
- name: Get merge-base commit
39+
id: merge-base
40+
run: |
41+
# Get the merge-base between the current HEAD and origin/main
42+
MERGE_BASE=$(git merge-base HEAD origin/main)
43+
echo "merge-base=$MERGE_BASE" >> $GITHUB_OUTPUT
44+
echo "Merge-base commit: $MERGE_BASE"
45+
3846
- name: Get changed files
3947
id: changed-files
4048
uses: step-security/changed-files@v46
4149
with:
4250
json: true
4351
matrix: true
44-
base_sha: main
52+
base_sha: ${{ steps.merge-base.outputs.merge-base }}
4553
dir_names: true
4654
dir_names_max_depth: 2
4755
files: |
@@ -81,24 +89,15 @@ jobs:
8189
fi
8290
fi
8391
84-
# Assign Docker images to the selected directories
85-
# Currently, AMPLIFY is the only folder that needs a custom base image, since we have to support both TE and
86-
# xformers-based models for golden value testing. The rest of the models use the default pytorch image.
87-
DIRS_WITH_IMAGES=$(echo "$DIRS" | jq -c '
88-
map({
89-
dir: .,
90-
image: (
91-
if . == "models/amplify" then
92-
"svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025"
93-
else
94-
"nvcr.io/nvidia/pytorch:25.06-py3"
95-
end
96-
)
97-
})
98-
')
99-
echo "dirs=$DIRS_WITH_IMAGES" >> $GITHUB_OUTPUT
92+
echo "dirs=$DIRS" >> $GITHUB_OUTPUT
10093
- name: Show output
10194
run: |
95+
echo "=== Changed Files Analysis ==="
96+
echo "Current branch: ${{ github.ref_name }}"
97+
echo "Merge-base commit: ${{ steps.merge-base.outputs.merge-base }}"
98+
echo "Changed files compared to merge-base:"
99+
echo '${{ steps.changed-files.outputs.all_changed_files }}' | jq -r '.[]' | sed 's/^/ - /'
100+
echo "Total changed files: $(echo '${{ steps.changed-files.outputs.all_changed_files }}' | jq '. | length')"
102101
echo '${{ toJSON(steps.changed-files.outputs) }}'
103102
echo '${{ toJSON(steps.set-dirs.outputs) }}'
104103
shell: bash
@@ -107,40 +106,31 @@ jobs:
107106
needs: changed-dirs
108107
runs-on: linux-amd64-gpu-l4-latest-1
109108
if: ${{ needs.changed-dirs.outputs.dirs != '[]' }}
110-
container:
111-
image: ${{ matrix.recipe.image }}
112-
strategy:
113-
matrix:
114-
recipe: ${{ fromJson(needs.changed-dirs.outputs.dirs) }}
115-
fail-fast: false
116109

117110
steps:
118111
- name: Setup proxy cache
119112
uses: nv-gha-runners/setup-proxy-cache@main
113+
120114
- name: Checkout repository
121115
uses: actions/checkout@v4
116+
117+
- name: Setup python
118+
uses: actions/setup-python@v5
122119
with:
123-
sparse-checkout: "${{ matrix.recipe.dir }}"
124-
sparse-checkout-cone-mode: false
120+
python-version: "3.12"
125121

126-
- name: Install dependencies
127-
working-directory: ${{ matrix.recipe.dir }}
128-
#
122+
- name: Install ci script dependencies
129123
run: |
130-
if [ -f pyproject.toml ] || [ -f setup.py ]; then
131-
PIP_CONSTRAINT= pip install -e .
132-
echo "Installed ${{ matrix.recipe.dir }} as editable package"
133-
elif [ -f requirements.txt ]; then
134-
PIP_CONSTRAINT= pip install -r requirements.txt
135-
echo "Installed ${{ matrix.recipe.dir }} from requirements.txt"
136-
else
137-
echo "No pyproject.toml, setup.py, or requirements.txt found in ${{ matrix.recipe.dir }}"
138-
exit 1
139-
fi
124+
python -m pip install --upgrade pip
125+
pip install platformdirs
140126
141127
- name: Run tests
142-
working-directory: ${{ matrix.recipe.dir }}
143-
run: pytest -v .
128+
env:
129+
DIRS_JSON: ${{ needs.changed-dirs.outputs.dirs }}
130+
run: |
131+
# Convert JSON array to space-separated arguments
132+
DIRS_ARGS=$(echo "$DIRS_JSON" | jq -r '.[]' | tr '\n' ' ')
133+
./ci/scripts/recipes_local_test.py $DIRS_ARGS
144134
145135
verify-recipe-tests:
146136
# This job checks the status of the unit-tests matrix and fails if any matrix job failed or was cancelled.

3rdparty/NeMo

Submodule NeMo updated 63 files

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ apt-get install -qyy \
4949
curl \
5050
pre-commit \
5151
sudo \
52+
emacs-nox \
5253
gnupg \
5354
unzip \
5455
libsqlite3-dev
Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,26 +23,39 @@
2323
from pathlib import Path
2424
from typing import List, Optional
2525

26+
from platformdirs import user_cache_dir
27+
28+
29+
PIP_CACHE_DIR = user_cache_dir(appname="bionemo-pip-cache", appauthor="nvidia")
2630

2731
logger = logging.getLogger(__name__)
2832
logger.setLevel(logging.DEBUG)
2933

3034
DOCKER_RUN_ARGS = [
3135
"--rm",
32-
"-it",
3336
"--gpus",
3437
"all",
3538
"--ipc=host",
3639
"--ulimit",
3740
"memlock=-1",
3841
"--ulimit",
3942
"stack=67108864",
43+
"-v",
44+
f"{PIP_CACHE_DIR}:/workspace/.cache/pip",
4045
]
4146

4247
CUSTOM_CONTAINERS = {
4348
"models/amplify": "svcbionemo023/bionemo-framework:amplify-model-devcontainer-082025",
4449
}
45-
DEFAULT_CONTAINER = "nvcr.io/nvidia/pytorch:25.06-py3"
50+
51+
# DEFAULT_CONTAINER = "nvcr.io/nvidia/pytorch:25.06-py3"
52+
53+
# This is a squashed version of the pytorch:25.06-py3 image, generated with
54+
# docker-squash nvcr.io/nvidia/pytorch:25.06-py3 -t svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed
55+
# --output type=registry,compression=zstd,force-compression=true,oci-mediatypes=true,compression-level=15
56+
# and pushed to the Docker Hub registry. Our GitHub Actions are able to cache image pulls from Docker Hub but not nvcr, so
57+
# hopefully this cuts down slightly on CI time at the expense of having a slightly more indirect image location.
58+
DEFAULT_CONTAINER = "svcbionemo023/bionemo-framework:pytorch25.06-py3-squashed"
4659

4760

4861
def get_git_root() -> str:
@@ -89,24 +102,26 @@ def run_tests_in_docker(work_dir: str) -> bool:
89102
install_and_test_script = textwrap.dedent("""
90103
set -e # Exit on any error
91104
92-
echo "Checking for dependency files..."
105+
# Ensure image-embedded constraints do not leak into local recipe installs
106+
unset PIP_CONSTRAINT || true
93107
108+
echo "Checking for dependency files..."
94109
# Install dependencies based on available files
95110
if [ -f pyproject.toml ] || [ -f setup.py ]; then
96111
echo "Installing package in editable mode..."
97-
PIP_CONSTRAINT= pip install -e .
112+
PIP_CACHE_DIR=/workspace/.cache/pip pip install -e .
98113
echo "Installed package as editable package"
99114
elif [ -f requirements.txt ]; then
100115
echo "Installing from requirements.txt..."
101-
PIP_CONSTRAINT= pip install -r requirements.txt
116+
PIP_CACHE_DIR=/workspace/.cache/pip pip install -r requirements.txt
102117
echo "Installed from requirements.txt"
103118
else
104119
echo "No pyproject.toml, setup.py, or requirements.txt found"
105120
exit 1
106121
fi
107122
108123
echo "Running tests..."
109-
pytest -v .
124+
python -m pytest -v .
110125
""")
111126

112127
relative_path = Path(work_dir).relative_to(git_root).as_posix()
@@ -166,6 +181,8 @@ def main():
166181
if args.debug:
167182
logging.getLogger().setLevel(logging.DEBUG)
168183

184+
logger.info(f"Caching pip installations to: {PIP_CACHE_DIR}")
185+
169186
# Get directories to test
170187
test_dirs = get_test_directories(args.directories)
171188

ci/scripts/run_pytest_notebooks.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@
1919
# Enable strict mode with better error handling
2020
set -euox pipefail
2121

22-
pytest -v --nbval-lax -x -p no:python docs/ sub-packages/
22+
FAST_CI_MODE=true pytest -v --nbval-lax -x -p no:python docs/ sub-packages/

recipes/esm2_native_te_mfsdp/hydra_config/L0_sanity.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,3 @@ wandb_init_args:
1414
# Learning rate scheduler config
1515
lr_scheduler_kwargs:
1616
num_warmup_steps: 0
17-
18-
adamw_kwargs:
19-
lr: 1e-2

recipes/esm2_native_te_mfsdp/hydra_config/L1_3B_ddp.yaml renamed to recipes/esm2_native_te_mfsdp/hydra_config/L1_3B.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ defaults:
22
- defaults
33

44
# Training config
5-
model_name: esm2_t33_650M_UR50D
6-
micro_batch_size: 32
5+
model_name: nvidia/esm2_t36_3B_UR50D
6+
micro_batch_size: 16
77
num_train_steps: 10_000
88

99
# WandB config

recipes/esm2_native_te_mfsdp/hydra_config/L1_650M.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@ defaults:
33
- _self_
44

55
# Training config
6-
model_name: esm2_t33_650M_UR50D
7-
micro_batch_size: 16
6+
model_name: nvidia/esm2_t33_650M_UR50D
7+
micro_batch_size: 4
88
num_train_steps: 200
99

1010
# WandB config
1111
wandb_init_args:
12-
name: "esm2_t33_650M_UR50D_nvfsdp"
12+
name: "esm2_t33_650M_UR50D_mfsdp"
1313
project: "bionemo-recipes-pstjohn"
1414
mode: "offline"

recipes/esm2_native_te_mfsdp/hydra_config/defaults.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,19 @@ max_seq_length: 1024
55
data_path: .
66
num_train_steps: ???
77

8+
# TODO: Once BIONEMO-2583 and BIONEMO-2719 are fixed, enable this by default and simplify training scripts to remove the
9+
# meta-device conditional.
10+
use_meta_device: false
11+
812
# WandB config
913
wandb_init_args:
1014
name: ???
1115

12-
# nvFSDP config
16+
# mFSDP config
1317
fully_shard_kwargs:
1418
zero_dp_strategy: "optim_grads_params"
1519
calculate_per_token_loss: false
16-
init_model_with_meta_device: false
20+
init_model_with_meta_device: ${use_meta_device}
1721
check_for_nan_in_grad: true
1822
grad_reduce_in_fp32: false
1923
preserve_fp32_weights: true

0 commit comments

Comments
 (0)