-
Notifications
You must be signed in to change notification settings - Fork 713
411 lines (365 loc) · 16.2 KB
/
Copy pathgithub-nightly-uv.yml
File metadata and controls
411 lines (365 loc) · 16.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This CI runs nightly to generate the coverage report and testmon database.
# It runs ALL tests and caches the testmon database for use by PR workflows.
# The tests run here will only use UV. This is meant to be nightly functionality
# testing AND a baseline dependency graph for PRs.
#
# ----------------------------------------------------------------------------
# Cache design (see .github/CACHE_CONTRACT.md for the full contract):
#
#   uv download cache (~/.cache/uv)
#     key   : <UV_CACHE_KEY_PREFIX>-latest
#     prefix: container + python + uv version
#     scope : additive wheel store; survives lockfile changes; refreshed
#             via delete-before-save when the cache is cold. Restored
#             fail-open. This is the ONLY cross-run cache for the Python
#             environment; the realized .venv is rebuilt in every job from
#             the committed lockfile (deterministic given a pinned
#             container + --frozen + the pinned uv version).
#
# Consumer contract for PR workflows:
#   * Restore the uv download cache fail-open (speed only).
#   * Always `uv sync --frozen --group dev --extra cu12` (accelerated by
#     the restored uv download cache).
#   * Run tests via `.venv/bin/python` or `uv run --no-sync` so the
#     realized env cannot be mutated mid-job.
# ----------------------------------------------------------------------------
# TODO: The coverage limit is very low because this workflow is not running
# the GPU tests or the data-driven tests. Raise it again eventually.
name: Nightly Github UV Workflow

on:
  schedule:
    # Fires every night at 02:00 UTC.
    - cron: '0 2 * * *'
  # Also lets a run be started by hand.
  workflow_dispatch:

# NOTE(review): `actions: write` is presumably what allows the replace-cache
# action to delete the previous cache entry, and `checks: write` presumably
# serves the test-report steps -- confirm before narrowing.
permissions:
  contents: read
  actions: write
  checks: write

# All runs of this workflow share one concurrency group. Two overlapping
# nightlies (schedule + manual, or two manuals) would otherwise race on the
# static `-latest` uv download cache key and break the delete-before-save
# sequence. In-progress runs are deliberately NOT cancelled: PR workflows
# consume the nightly testmon DB, so a slow nightly beats a missing one.
concurrency:
  group: nightly-github-uv
  cancel-in-progress: false

# sh is the default shell in the CUDA container and it does not support
# `set -o pipefail`, so every `run:` step is forced onto bash.
defaults:
  run:
    shell: bash

env:
  # ---- Container baseline identity ---------------------------------------
  # Changing ANY of these invalidates the uv cache through the key prefix.
  # CONTAINER_ID must stay in sync with the `image:` tag used by the jobs.
  PYTHON_VERSION: "3.12"
  UV_VERSION: "0.11.7"
  CONTAINER_ID: "cuda12.8.1-cudnn-devel-ubuntu24.04"

  # Every feature extra + the cu12 backend + the matching natten wheel
  # index. These back the @requires_module / pytest.importorskip tests that
  # would otherwise be skipped for lack of optional dependencies. Modules
  # without an extras home (moto, scikit-image, pyg_lib, earth2grid, ...)
  # are installed by the `Install CI-only test dependencies` step inside
  # the setup-uv-env composite action.
  EXTRAS_TAG: "cu12,natten-cu12,utils-extras,mesh-extras,nn-extras,model-extras,datapipes-extras,uq-extras,gnns,sym,transformer-engine-cu12"

  # ---- Cache key prefixes ------------------------------------------------
  # Written out literally because GitHub Actions cannot reference one env
  # entry from another inside the same env: block. Bump these in lockstep
  # with the baseline values above. The `-fullextras` suffix was bumped
  # relative to the previous prefix so that the first run under the
  # expanded EXTRAS_TAG rebuilds the wheel cache from scratch rather than
  # layering on top of a stale, narrower cache.
  UV_CACHE_KEY_PREFIX: "uv-cache-nightly-cuda12.8.1-cudnn-devel-ubuntu24.04-py3.12-uv0.11.7-fullextras"
  TESTMON_CACHE_KEY_PREFIX: "testmon-nightly"
  COVERAGE_CACHE_KEY_PREFIX: "coverage-nightly"
  JIT_CACHE_KEY_PREFIX: "jit-cache-cuda12.8.1-cudnn-devel-ubuntu24.04-py3.12"
  JIT_CACHE_DIR: "/root/.cache/jit"

  # ---- uv read-only defaults --------------------------------------------
  # Guard against the historical bug class in which an unguarded `uv run`
  # (no --frozen, no cu12 extra) silently re-syncs the venv to a different
  # CUDA variant and rewrites uv.lock.
  #
  #   UV_FROZEN=1  -> every uv invocation refuses to mutate the lockfile.
  #   UV_NO_SYNC=1 -> `uv run` never syncs implicitly. The explicit
  #                   `uv sync` inside setup-uv-env is unaffected.
  UV_FROZEN: "1"
  UV_NO_SYNC: "1"

  PYVISTA_OFF_SCREEN: "true"
jobs:
  # Stage 1: Warm the uv download cache
  #
  # Exists only to make sure ~/.cache/uv already holds the wheels implied
  # by the current lockfile before the downstream GPU jobs start. Each
  # downstream job still runs its own `uv sync --frozen`; that sync is fast
  # because it hits the warm cache this job publishes.
  build-environment:
    name: Build Environment
    runs-on: linux-amd64-cpu8
    container:
      image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04
      # /dev/shm defaults to 64 MiB in docker, which DALI's multiprocess
      # worker pool exhausts via SemLock allocations and trips ENOSPC
      # ("No space left on device") in datapipes tests. 2 GiB is plenty
      # for the test suite and matches the PyTorch container default.
      options: --shm-size=2g
    steps:
      - uses: actions/checkout@v5

      - name: Bootstrap cuDNN CI container
        uses: ./.github/actions/bootstrap-cudnn-ci
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          uv-version: ${{ env.UV_VERSION }}

      - name: Setup uv environment from cache
        id: setup-uv-env
        uses: ./.github/actions/setup-uv-env
        with:
          uv-cache-key-prefix: ${{ env.UV_CACHE_KEY_PREFIX }}
          uv-cache-key-suffix: "latest"
          extras: ${{ env.EXTRAS_TAG }}

      - name: Report setup action outputs
        run: |
          echo "setup-uv-env.uv_cache_hit=${{ steps.setup-uv-env.outputs.uv_cache_hit }}"

      # --- uv download cache (static key, delete-before-save) ---
      #
      # The two steps below run only on a cold cache (first run, prefix
      # bump, or manual purge). In steady state uv_cache_hit is true and
      # they are skipped: the warm cache already contains every wheel the
      # frozen sync needed. The replace-cache action centralises the
      # delete-before-save + verify dance shared by all four mutable-slot
      # caches in this workflow.
      - name: Prune uv cache
        if: steps.setup-uv-env.outputs.uv_cache_hit != 'true'
        run: |
          set -euo pipefail
          uv cache prune
          echo "uv cache after prune:"
          du -sh ~/.cache/uv 2>/dev/null || echo " (not present)"

      - name: Replace uv download cache
        if: steps.setup-uv-env.outputs.uv_cache_hit != 'true'
        uses: ./.github/actions/replace-cache
        with:
          path: ~/.cache/uv
          key: ${{ env.UV_CACHE_KEY_PREFIX }}-latest
          description: uv download cache
          github-token: ${{ secrets.GITHUB_TOKEN }}

  # Stage 2: Run testmon tests and cache the database
  testmon:
    name: Testmon
    needs: build-environment
    runs-on: linux-amd64-gpu-h100-latest-1
    container:
      image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04
      # /dev/shm defaults to 64 MiB in docker, which DALI's multiprocess
      # worker pool exhausts via SemLock allocations and trips ENOSPC
      # ("No space left on device") in datapipes tests. 2 GiB is plenty
      # for the test suite and matches the PyTorch container default.
      options: --shm-size=2g
    steps:
      - uses: actions/checkout@v5

      - name: Bootstrap cuDNN CI container
        uses: ./.github/actions/bootstrap-cudnn-ci
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          uv-version: ${{ env.UV_VERSION }}

      # Restore the warm uv download cache (published by build-environment
      # earlier in this same workflow run) and rebuild .venv from the
      # frozen lockfile. With the cache warm the sync is dominated by
      # local file copies, not network I/O.
      - name: Setup uv environment from cache
        uses: ./.github/actions/setup-uv-env
        with:
          uv-cache-key-prefix: ${{ env.UV_CACHE_KEY_PREFIX }}
          uv-cache-key-suffix: "latest"
          extras: ${{ env.EXTRAS_TAG }}

      # Restore compiled JIT artifacts (warp, triton, inductor) from the
      # previous nightly so kernel compilation is skipped when source
      # hasn't changed. Fail-open: a miss only costs compilation time.
      - name: Restore JIT compilation cache
        id: jit-cache-restore
        uses: actions/cache/restore@v5
        with:
          path: ${{ env.JIT_CACHE_DIR }}
          key: ${{ env.JIT_CACHE_KEY_PREFIX }}-latest

      - name: Download CI test data
        uses: ./.github/actions/download-ci-data
        with:
          hf-token: ${{ secrets.HF_CI_DATA_TOKEN }}

      - name: Run core tests (collect all for testmon)
        env:
          # Point each JIT compiler at its own subdirectory of the cached
          # JIT directory restored above.
          WARP_CACHE_PATH: ${{ env.JIT_CACHE_DIR }}/warp
          TRITON_CACHE_DIR: ${{ env.JIT_CACHE_DIR }}/triton
          TORCHINDUCTOR_CACHE_DIR: ${{ env.JIT_CACHE_DIR }}/inductor
        run: |
          # Workflow-level UV_NO_SYNC=1 + UV_FROZEN=1 keep `uv run` strictly
          # read-only, so the .venv cannot be mutated mid-job.
          uv run --no-sync python -m pytest --testmon --ignore-glob="*docs*" --ignore-glob="*examples*"

      # --- JIT compilation cache (static key, delete-before-save) ---
      #
      # Same pattern as the uv download cache: the `-latest` key is a
      # mutable slot refreshed via replace-cache. The cache is additive
      # and each compiler handles its own source-hash invalidation, so it
      # survives lockfile and kernel-source changes safely.
      #
      # `!cancelled()` (rather than `always()`) still publishes the warm
      # JIT artifacts after a flaky-but-non-fatal pytest exit, but skips
      # the step on a cancelled run so a delete-before-save cannot be
      # interrupted half-way and leave the slot empty. This also matches
      # the condition used by the upload steps in the coverage job.
      - name: Replace JIT compilation cache
        if: ${{ !cancelled() }}
        uses: ./.github/actions/replace-cache
        with:
          path: ${{ env.JIT_CACHE_DIR }}
          key: ${{ env.JIT_CACHE_KEY_PREFIX }}-latest
          description: JIT compilation cache
          github-token: ${{ secrets.GITHUB_TOKEN }}

      # --- Testmon database cache (mutable -latest slot) ---
      #
      # Previously keyed on hashFiles('uv.lock', 'pyproject.toml'), which
      # collided with the previous nightly's save whenever the lockfile
      # was unchanged (the common case): GitHub Actions caches are
      # immutable, so the second save logged "Failed to save: Unable to
      # reserve cache" as a *warning* and the stale DB persisted for
      # days. PRs then restored the stale DB and testmon invalidated
      # everything because the env fingerprint had drifted. Switching to
      # a -latest mutable slot via replace-cache fixes both: the slot is
      # always overwritten, and silent save failures become hard job
      # failures via the embedded verify step.
      #
      # `!cancelled()` so a flaky-but-non-fatal pytest exit still updates
      # the DB with whatever progress was made, while a cancelled run
      # leaves the previously published DB untouched.
      - name: Replace testmon database cache
        if: ${{ !cancelled() }}
        uses: ./.github/actions/replace-cache
        with:
          path: |
            .testmondata
            .testmondata-shm
            .testmondata-wal
          key: ${{ env.TESTMON_CACHE_KEY_PREFIX }}-latest
          description: testmon database
          github-token: ${{ secrets.GITHUB_TOKEN }}

  # Stage 3: Run coverage tests and upload artifacts
  coverage:
    name: Coverage
    needs: build-environment
    runs-on: linux-amd64-gpu-h100-latest-1
    container:
      image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04
      # /dev/shm defaults to 64 MiB in docker, which DALI's multiprocess
      # worker pool exhausts via SemLock allocations and trips ENOSPC
      # ("No space left on device") in datapipes tests. 2 GiB is plenty
      # for the test suite and matches the PyTorch container default.
      options: --shm-size=2g
    steps:
      - uses: actions/checkout@v5

      - name: Bootstrap cuDNN CI container
        uses: ./.github/actions/bootstrap-cudnn-ci
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          uv-version: ${{ env.UV_VERSION }}

      - name: Setup uv environment from cache
        uses: ./.github/actions/setup-uv-env
        with:
          uv-cache-key-prefix: ${{ env.UV_CACHE_KEY_PREFIX }}
          uv-cache-key-suffix: "latest"
          extras: ${{ env.EXTRAS_TAG }}

      - name: Download CI test data
        uses: ./.github/actions/download-ci-data
        with:
          hf-token: ${{ secrets.HF_CI_DATA_TOKEN }}

      - name: Run core tests for coverage report
        run: |
          # See note in testmon job re: workflow-level UV_NO_SYNC / UV_FROZEN.
          uv run --no-sync coverage run --rcfile='test/coverage.pytest.rc' -m pytest --ignore-glob="*docs*" --ignore-glob="*examples*" --junitxml=coverage-core-report.xml

      - name: Run doc tests (testmon not supported for doctests)
        run: |
          uv run --no-sync coverage run --rcfile='test/coverage.docstring.rc' -m pytest --doctest-modules physicsnemo/ --ignore-glob="*internal*" --ignore-glob="*experimental*" --junitxml=coverage-doctest-report.xml

      # The JUnit uploads run unless the job was cancelled, so the XML is
      # still published when a test step above has failed.
      - name: Upload core test JUnit XML
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: junit-coverage-core
          path: coverage-core-report.xml

      - name: Upload doctest JUnit XML
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: junit-coverage-doctest
          path: coverage-doctest-report.xml

      # --- Coverage baseline cache (mutable -latest slot) ---
      #
      # Same immutable-key bug as the testmon cache: the previous lockhash
      # suffix could not be re-saved on consecutive nightlies with an
      # unchanged lockfile. Migrated to a -latest slot via replace-cache
      # for parity with testmon and JIT.
      - name: Replace coverage baseline cache
        uses: ./.github/actions/replace-cache
        with:
          path: .coverage*
          key: ${{ env.COVERAGE_CACHE_KEY_PREFIX }}-latest
          description: coverage baseline
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Merge coverage reports
        run: |
          uv run --no-sync coverage combine
          # -i / --ignore-errors downgrades coverage's fatal "No source for
          # code" error to a warning, matching the PR workflow. Kept in
          # lockstep with github-pr.yml per .github/CACHE_CONTRACT.md.
          #
          # --omit accepts ONE comma-separated pattern list. When the flag
          # is repeated only the last occurrence is honoured, so all three
          # patterns are passed in a single option.
          # NOTE(review): mirror this in github-pr.yml if it also repeats
          # --omit, and re-check the --fail-under threshold afterwards.
          uv run --no-sync coverage report -i --show-missing --omit="*test*,*internal*,*experimental*" --fail-under=45
          uv run --no-sync coverage html -i
          # Also create an XML report for potential CI integrations
          uv run --no-sync coverage xml -i -o coverage.xml

      - name: Upload coverage HTML report
        uses: actions/upload-artifact@v4
        with:
          name: coverage-report-nightly
          path: htmlcov/
          retention-days: 7

      - name: Upload combined coverage data
        uses: actions/upload-artifact@v4
        with:
          name: coverage-data-nightly
          path: |
            .coverage
            coverage.xml
          retention-days: 30

  # Stage 4: Generate browsable test reports from JUnit XML
  #
  # Runs after the coverage job unless the run was cancelled, so reports
  # are produced for failing nightlies as well.
  test-reports:
    name: Test Reports
    needs: [coverage]
    if: ${{ !cancelled() }}
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      # Fetches every artifact whose name matches junit-*; the report
      # steps below read each XML from a directory named after its
      # artifact.
      - name: Download JUnit artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: junit-*

      - name: Core test report
        uses: dorny/test-reporter@v3
        with:
          name: Core Test Results
          path: junit-coverage-core/coverage-core-report.xml
          reporter: java-junit
          fail-on-error: 'false'

      - name: Doctest report
        uses: dorny/test-reporter@v3
        with:
          name: Doctest Results
          path: junit-coverage-doctest/coverage-doctest-report.xml
          reporter: java-junit
          fail-on-error: 'false'