-
Notifications
You must be signed in to change notification settings - Fork 153
Expand file tree
/
Copy pathunit-tests-mbridge-recipes.yaml
More file actions
286 lines (257 loc) · 10.5 KB
/
unit-tests-mbridge-recipes.yaml
File metadata and controls
286 lines (257 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# Workflow-level settings: display name, triggers, default shell and
# concurrency policy. The job definitions follow under `jobs:`.
name: BioNeMo MBridge Recipes CI

on:
  # Pushes to branches named pull-request/<digits> or anything under dependabot/.
  push:
    branches:
      - 'pull-request/[0-9]+'
      - 'dependabot/**'
  merge_group:
    types:
      - checks_requested
  schedule:
    # Runs at 9 AM UTC daily (2 AM MST)
    - cron: '0 9 * * *'

defaults:
  run:
    # Applies to every `run:` step that does not set its own `shell:`:
    # trace commands (-x), stop on errors (-e), on unset variables (-u)
    # and on a failure anywhere in a pipeline (pipefail).
    shell: bash -x -e -u -o pipefail {0}

concurrency:
  # One group per workflow and PR number; when the event payload carries no
  # pull_request number the expression falls back to the git ref.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  # A newer run in the same group cancels the one still in progress.
  cancel-in-progress: true

jobs:
# Decides which *megatron recipe directories need testing and publishes the
# selection (plus the PR labels) as job outputs for the downstream jobs.
changed-dirs:
  runs-on: ubuntu-latest
  outputs:
    any_changed: ${{ steps.changed-files.outputs.any_changed }}
    all_changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
    # JSON array of {dir, name, image} objects; '[]' when nothing should run.
    dirs: ${{ steps.set-dirs.outputs.dirs }}
    # JSON array of PR label names; '[]' when no PR info is available.
    labels: ${{ steps.set-dirs.outputs.labels }}
  steps:
    # PR metadata is only looked up for pull-request/* refs.
    - id: get-pr-info
      if: ${{ startsWith(github.ref_name, 'pull-request/') }}
      uses: nv-gha-runners/get-pr-info@main
    # Full history (fetch-depth: 0) so the merge-base step below can resolve
    # the common ancestor with origin/main.
    - uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Get merge-base commit
      id: merge-base
      run: |
        # Get the merge-base between current branch and main
        MERGE_BASE=$(git merge-base HEAD origin/main)
        echo "merge-base=$MERGE_BASE" >> "$GITHUB_OUTPUT"
        echo "Merge-base commit: $MERGE_BASE"
    - name: Get changed files
      id: changed-files
      uses: step-security/changed-files@v46
      with:
        json: true
        matrix: true
        base_sha: ${{ steps.merge-base.outputs.merge-base }}
        dir_names: true
        dir_names_max_depth: 3
        files: |
          bionemo-recipes/recipes/*megatron/**
          sub-packages/bionemo-recipeutils/**
          sub-packages/bionemo-core/**
    - id: set-dirs
      name: Determine which directories to run
      # Expression values are handed to the script through the environment,
      # never pasted into the script text.
      env:
        EVENT_NAME: ${{ github.event_name }}
        PR_INFO: ${{ steps.get-pr-info.outputs.pr-info }}
        CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
      run: |
        # Get all *megatron recipe directories
        ALL_DIRS=$(ls -d bionemo-recipes/recipes/*megatron/ 2>/dev/null | jq -R -s -c 'split("\n")[:-1] | map(rtrimstr("/"))')

        # Helper to check for a PR label.
        # The label is bound with --arg instead of being spliced into the jq
        # program, so it is always compared as a plain string.
        has_label() {
          [[ "$PR_INFO" != "null" && "$PR_INFO" != "" ]] && \
            echo "$PR_INFO" | jq -e --arg label "$1" '.labels[]? | select(.name == $label)' > /dev/null 2>&1
        }

        # --- Shared sub-package dependency handling ---
        # MBridge recipes depend on shared sub-packages (bionemo-core, bionemo-recipeutils)
        # installed from git in pyproject.toml. When those sub-packages change, we need to
        # test all megatron recipes against the local version. Each recipe's .ci_build.sh
        # handles reinstalling from the local checkout if present.
        #
        # To add a new megatron recipe that depends on shared sub-packages:
        # 1. Add it as a directory under bionemo-recipes/recipes/ with a *megatron suffix
        # 2. Ensure its .ci_build.sh includes the local sub-package override
        #    (see eden_megatron/.ci_build.sh for the pattern)

        # Determine which directories to run
        if [[ "$EVENT_NAME" == "schedule" ]]; then
          echo "Scheduled run - running all megatron recipes"
          DIRS="$ALL_DIRS"
        elif has_label "ciflow:skip"; then
          echo "Found 'ciflow:skip' label - skipping all recipe tests"
          DIRS="[]"
        elif has_label "ciflow:all-recipes"; then
          echo "Found 'ciflow:all-recipes' label - running all megatron recipes"
          DIRS="$ALL_DIRS"
        else
          # Start with megatron recipe directories that have direct changes
          DIRS=$(echo "$ALL_DIRS" | jq -c --argjson changed "$CHANGED_FILES" '
            map(select(. as $dir | $changed | index($dir) != null))
          ')
          # If a shared sub-package changed, run ALL megatron recipes
          SHARED_DEP_CHANGED=$(echo "$CHANGED_FILES" | jq 'map(select(startswith("sub-packages/bionemo-recipeutils") or startswith("sub-packages/bionemo-core"))) | length > 0')
          if [[ "$SHARED_DEP_CHANGED" == "true" ]]; then
            echo "Shared sub-package changed - running all megatron recipes"
            DIRS="$ALL_DIRS"
          fi
        fi

        # Assign Docker images to the selected directories
        DIRS_WITH_IMAGES=$(echo "$DIRS" | jq -c '
          map({
            dir: .,
            name: (. | sub("^bionemo-recipes/"; "")),
            image: "svcbionemo023/bionemo-framework:pytorch26.04-py3-squashed"
          })
        ')
        echo "dirs=$DIRS_WITH_IMAGES" >> "$GITHUB_OUTPUT"

        # Emit PR labels as a JSON array so downstream jobs can gate on ciflow:* labels.
        if [[ "$PR_INFO" != "null" && "$PR_INFO" != "" ]]; then
          LABELS=$(echo "$PR_INFO" | jq -c '[.labels[]?.name]' 2>/dev/null || echo "[]")
        else
          LABELS="[]"
        fi
        echo "labels=$LABELS" >> "$GITHUB_OUTPUT"
    - name: Show output
      # Changed paths, the ref name and step outputs can contain quotes or
      # other shell metacharacters. They are exposed as environment variables
      # and expanded by bash at run time, so they are treated as data and can
      # never become part of the script itself.
      env:
        REF_NAME: ${{ github.ref_name }}
        MERGE_BASE: ${{ steps.merge-base.outputs.merge-base }}
        CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
        CHANGED_FILES_OUTPUTS: ${{ toJSON(steps.changed-files.outputs) }}
        SET_DIRS_OUTPUTS: ${{ toJSON(steps.set-dirs.outputs) }}
      run: |
        echo "=== Changed Files Analysis ==="
        echo "Current branch: $REF_NAME"
        echo "Merge-base commit: $MERGE_BASE"
        echo "Changed files compared to merge-base:"
        echo "$CHANGED_FILES" | jq -r '.[]' | sed 's/^/ - /'
        echo "Total changed files: $(echo "$CHANGED_FILES" | jq '. | length')"
        echo "$CHANGED_FILES_OUTPUTS"
        echo "$SET_DIRS_OUTPUTS"
      shell: bash
# Runs pytest for every recipe directory selected by the changed-dirs job,
# one matrix entry per recipe, inside that recipe's container image.
unit-tests:
  needs: changed-dirs
  runs-on: linux-amd64-gpu-l4-latest-1
  # Skipped entirely when changed-dirs selected no directories.
  if: ${{ needs.changed-dirs.outputs.dirs != '[]' }}
  name: "mbridge-unit-tests (${{ matrix.recipe.name }})"
  container:
    image: ${{ matrix.recipe.image }}
    options: --shm-size=16G
  env:
    CI: true
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    HF_HOME: /cache/huggingface
  strategy:
    matrix:
      recipe: ${{ fromJson(needs.changed-dirs.outputs.dirs) }}
    # Let the remaining recipes finish even if one of them fails.
    fail-fast: false
  steps:
    - name: Show GPU info
      run: nvidia-smi
    - name: Setup proxy cache
      uses: nv-gha-runners/setup-proxy-cache@main
    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        # Only the recipe under test and the shared sub-packages are checked out.
        sparse-checkout: |
          ${{ matrix.recipe.dir }}
          sub-packages/bionemo-recipeutils
          sub-packages/bionemo-core
        sparse-checkout-cone-mode: false
    - name: Cache Hugging Face models
      uses: actions/cache@v4
      with:
        path: /cache/huggingface
        key: ${{ runner.os }}-huggingface-${{ matrix.recipe.name }}-${{ github.sha }}
        restore-keys: |
          ${{ runner.os }}-huggingface-${{ matrix.recipe.name }}-
          ${{ runner.os }}-huggingface-
    - name: Install dependencies
      working-directory: ${{ matrix.recipe.dir }}
      # The directory name comes from the repository tree. It is passed through
      # the environment so that characters such as $( ), backticks or quotes in
      # a directory name are printed literally instead of being evaluated by
      # the shell in a job that holds HF_TOKEN.
      env:
        MATRIX_RECIPE_DIR: ${{ matrix.recipe.dir }}
      run: |
        # Preference order: recipe-provided build script, then an editable
        # install, then a plain requirements file.
        if [ -f .ci_build.sh ]; then
          bash .ci_build.sh
        elif [ -f pyproject.toml ] || [ -f setup.py ]; then
          PIP_CONSTRAINT= pip install -e .
          echo "Installed $MATRIX_RECIPE_DIR as editable package"
        elif [ -f requirements.txt ]; then
          PIP_CONSTRAINT= pip install -r requirements.txt
          echo "Installed $MATRIX_RECIPE_DIR from requirements.txt"
        else
          echo "No pyproject.toml, setup.py, or requirements.txt found in $MATRIX_RECIPE_DIR"
          exit 1
        fi
    - name: Run tests
      working-directory: ${{ matrix.recipe.dir }}
      run: |
        # Optional per-recipe environment setup.
        if [ -f .ci_test_env.sh ]; then
          source .ci_test_env.sh
        fi
        pytest -v .
# Executes the evo2_megatron example notebook under pytest + nbval.
run-tests-notebooks:
  name: mbridge-notebook-tests (evo2_megatron)
  needs: changed-dirs
  # Mirrors the framework workflow's notebook-trigger pattern (label-only on PRs,
  # auto on merge_group + nightly schedule). Currently scoped to evo2_megatron --
  # the only megatron recipe with example notebooks.
  # Both halves must hold: evo2_megatron is among the selected dirs, AND the run
  # is a schedule/merge_group event or carries one of the two ciflow labels.
  if: |
    contains(needs.changed-dirs.outputs.dirs, 'bionemo-recipes/recipes/evo2_megatron') &&
    (
      (github.event_name == 'schedule') ||
      (github.event_name == 'merge_group') ||
      contains(fromJSON(needs.changed-dirs.outputs.labels || '[]'), 'ciflow:all-recipes') ||
      contains(fromJSON(needs.changed-dirs.outputs.labels || '[]'), 'ciflow:notebooks')
    )
  runs-on: linux-amd64-gpu-l4-latest-1
  container:
    image: svcbionemo023/bionemo-framework:pytorch26.04-py3-squashed
    options: --shm-size=16G
  env:
    CI: true
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    HF_HOME: /cache/huggingface
    BIONEMO_DATA_SOURCE: ngc
  steps:
    - name: Show GPU info
      run: nvidia-smi

    - name: Setup proxy cache
      uses: nv-gha-runners/setup-proxy-cache@main

    - name: Checkout repository
      uses: actions/checkout@v4
      with:
        sparse-checkout-cone-mode: false
        # Only the recipe and the shared sub-packages are checked out.
        sparse-checkout: |
          bionemo-recipes/recipes/evo2_megatron
          sub-packages/bionemo-recipeutils
          sub-packages/bionemo-core

    - name: Cache Hugging Face models
      uses: actions/cache@v4
      with:
        path: /cache/huggingface
        key: ${{ runner.os }}-huggingface-evo2_megatron-notebooks-${{ github.sha }}
        # Fallback prefixes, most specific first.
        restore-keys: |
          ${{ runner.os }}-huggingface-evo2_megatron-notebooks-
          ${{ runner.os }}-huggingface-evo2_megatron-
          ${{ runner.os }}-huggingface-

    - name: Install dependencies
      working-directory: bionemo-recipes/recipes/evo2_megatron
      run: |
        bash .ci_build.sh
        source .ci_test_env.sh
        pip install nbval

    - name: Run notebook tests
      working-directory: bionemo-recipes/recipes/evo2_megatron
      run: |
        source .ci_test_env.sh
        FAST_CI_MODE=1 pytest -v -s --nbval-lax -x -p no:python examples/lora-fine-tuning-tutorial.ipynb
# Single aggregate status for the mbridge test jobs: fails when any needed job
# failed or was cancelled, passes when they succeeded or were skipped.
verify-mbridge-recipe-tests:
  # always() keeps this job running even when a needed job failed or was skipped.
  if: always()
  runs-on: ubuntu-latest
  needs:
    - changed-dirs
    - unit-tests
    - run-tests-notebooks
  steps:
    - name: Check test job statuses
      env:
        # Evaluates to the string "true" or "false".
        ANY_FAILED_OR_CANCELLED: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
      run: |
        if [[ "$ANY_FAILED_OR_CANCELLED" != "true" ]]; then
          echo "All mbridge test jobs have completed successfully or were skipped!"
          exit 0
        fi
        echo "Some mbridge test jobs have failed or been cancelled!"
        exit 1