Skip to content

Commit c7aae0d

Browse files
committed
Merge remote-tracking branch 'origin/main' into sj/fine-grained-activation-offload
2 parents 55bc355 + 9114a1d commit c7aae0d

255 files changed

Lines changed: 27918 additions & 2689 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.agents/skills

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
../skills

.claude/settings.json

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,14 @@
1+
{
2+
"hooks": {
3+
"UserPromptSubmit": [
4+
{
5+
"hooks": [
6+
{
7+
"type": "command",
8+
"command": "printf '{\"hookSpecificOutput\":{\"hookEventName\":\"UserPromptSubmit\",\"additionalContext\":\"MANDATORY WORKFLOW — never skip or reorder: (1) Read the artifact first (commit, file, error, PR). (2) Identify and invoke the relevant skill via the Skill tool BEFORE forming any answer or plan — even when the answer seems obvious. (3) Only then answer using the skill context. Skipping step 2 is not allowed.\"}}'"
9+
}
10+
]
11+
}
12+
]
13+
}
14+
}

.claude/skills

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
../skills

.github/actions/test-template/action.yml

Lines changed: 48 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -81,14 +81,21 @@ runs:
8181
if: ${{ inputs.has-azure-credentials == 'true' }}
8282
shell: bash
8383
run: |
84-
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
84+
for i in 1 2 3; do
85+
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash && break
86+
echo "Attempt $i failed, retrying in 10s..."
87+
sleep 10
88+
done
8589
8690
- name: Install uuidgen
8791
shell: bash -x -e -u -o pipefail {0}
8892
if: ${{ contains(inputs.runner, 'gcp') }}
8993
run: |
90-
apt-get update
91-
apt-get install -y uuid-runtime
94+
for i in 1 2 3; do
95+
apt-get update && apt-get install -y uuid-runtime && break
96+
echo "Attempt $i failed, retrying in 10s..."
97+
sleep 10
98+
done
9299
93100
- name: Docker system cleanup
94101
shell: bash
@@ -122,6 +129,12 @@ runs:
122129
COVERAGE_PREFIX=$([[ "${{ inputs.is_doc_test }}" == "true" ]] && echo "doc-test" || ([[ "${{ inputs.is_unit_test }}" == "true" ]] && echo "unit-test" || echo "e2e"))
123130
echo "coverage-prefix=$COVERAGE_PREFIX" | tee -a "$GITHUB_OUTPUT"
124131
132+
echo -e "\033[1;34m┌─ launching test ─────────────────────────────────────────────────────────┐\033[0m"
133+
echo -e "\033[1;34m│ script : ${{ inputs.script }}\033[0m"
134+
echo -e "\033[1;34m│ runner : ${{ inputs.runner }}\033[0m"
135+
echo -e "\033[1;34m│ container : ${{ inputs.registry }}/${{ inputs.image }}:${{ inputs.image-tag || github.run_id }}\033[0m"
136+
echo -e "\033[1;34m└──────────────────────────────────────────────────────────────────────────┘\033[0m"
137+
echo "::group::Logs"
125138
docker run --rm -u root --runtime=nvidia --gpus all \
126139
--shm-size=64g \
127140
--env TRANSFORMERS_OFFLINE=0 \
@@ -152,28 +165,43 @@ runs:
152165
fi
153166
bash tests/${{ inputs.is_doc_test == 'true' && 'docs' || (inputs.is_unit_test == 'true' && 'unit' || 'functional') }}/${{ inputs.script }}.sh && \
154167
echo "Finished successfully." || echo "Did not finish."' 2>&1 | tee err.log
168+
echo "::endgroup::"
155169
156170
- name: Check result
157171
id: check
158-
shell: bash
172+
shell: bash -e -u -o pipefail {0}
173+
if: always()
159174
run: |
160175
coverage_report=coverage-${{ steps.test.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
161-
echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
176+
echo "coverage_report=$coverage_report" | tee -a "$GITHUB_OUTPUT"
162177
163178
IS_SUCCESS=$(tail -n 1 err.log | grep -q "Finished successfully." && echo "true" || echo "false")
164179
165-
if [[ "$IS_SUCCESS" == "false" && "{% raw %}${{ inputs.is_optional }}" == "true" ]]; then
166-
echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
180+
if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is_optional }}" == "true" ]]; then
181+
echo "::warning::Test failed but is marked optional — treating as success."
167182
IS_SUCCESS=true
168183
fi
169184
170-
if [[ "$IS_SUCCESS" == "false" ]]; then
171-
echo Test did not finish successfully.
185+
if [[ "$IS_SUCCESS" == "true" ]]; then
186+
echo -e "\033[1;32m╔══════════════════════════════════════════════════════════════════════════╗\033[0m"
187+
echo -e "\033[1;32m║ ║\033[0m"
188+
echo -e "\033[1;32m║ ✅ PASSED ║\033[0m"
189+
echo -e "\033[1;32m║ ${{ inputs.script }}\033[0m"
190+
echo -e "\033[1;32m║ ║\033[0m"
191+
echo -e "\033[1;32m╚══════════════════════════════════════════════════════════════════════════╝\033[0m"
192+
echo "::notice title=Result::✅ ${{ inputs.script }} — PASSED"
193+
exit 0
194+
else
195+
echo -e "\033[1;31m╔══════════════════════════════════════════════════════════════════════════╗\033[0m"
196+
echo -e "\033[1;31m║ ║\033[0m"
197+
echo -e "\033[1;31m║ ❌ FAILED ║\033[0m"
198+
echo -e "\033[1;31m║ ${{ inputs.script }}\033[0m"
199+
echo -e "\033[1;31m║ ║\033[0m"
200+
echo -e "\033[1;31m╚══════════════════════════════════════════════════════════════════════════╝\033[0m"
201+
echo "::error title=Result::❌ ${{ inputs.script }} — FAILED"
172202
exit 1
173203
fi
174204
175-
exit $EXIT_CODE
176-
177205
- name: Upload artifacts
178206
uses: actions/upload-artifact@v6
179207
if: ${{ steps.check.outputs.coverage_report != 'none' }}
@@ -183,6 +211,15 @@ runs:
183211
${{ github.workspace }}/${{ github.run_id }}/${{steps.uuid.outputs.id }}/nemo-rl/tests/.coverage
184212
include-hidden-files: true
185213

214+
- name: Upload nemo_gym actual test data
215+
uses: actions/upload-artifact@v6
216+
if: always()
217+
with:
218+
name: actual_test_nemo_gym_sanity-${{ github.run_id }}
219+
path: |
220+
${{ github.workspace }}/${{ github.run_id }}/${{steps.uuid.outputs.id }}/nemo-rl/tests/unit/environments/nemo_gym_test_data/actual_test_nemo_gym_sanity.json
221+
if-no-files-found: ignore
222+
186223
- name: Container shutdown
187224
if: always()
188225
shell: bash

.github/workflows/build-test-publish-wheel.yml

Lines changed: 0 additions & 39 deletions
This file was deleted.

.github/workflows/cicd-main.yml

Lines changed: 66 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ on:
3939
default: ""
4040

4141
concurrency:
42-
group: ${{ github.workflow }}-${{ github.ref }}
42+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
4343
cancel-in-progress: true
4444

4545
jobs:
@@ -291,16 +291,50 @@ jobs:
291291
build-ref: ${{ needs.pre-flight.outputs.test_sha }}
292292
image-name: ${{ vars.CI_CONTAINER_NAME }}
293293
dockerfile: docker/Dockerfile
294-
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2
294+
runner: ${{ contains(needs.org-member-pre-flight.outputs.runner_prefix, 'azure') && format('{0}-gpu-x2', needs.org-member-pre-flight.outputs.runner_prefix) || contains(needs.org-member-pre-flight.outputs.runner_prefix, 'gcp') && format('{0}-gpu-x4', needs.org-member-pre-flight.outputs.runner_prefix) }}
295295
image-label: ${{ vars.CI_CONTAINER_NAME }}
296296
target: release
297297
registry: ${{ needs.org-member-pre-flight.outputs.registry }}
298298
build-contexts: |
299299
nemo-rl=${{ github.run_id }}/
300+
${{ vars.UV_BUILD_CACHE == 'enabled' && format('uv-cache-seed=docker-image://{0}/{1}-uv-cache:latest', needs.org-member-pre-flight.outputs.registry, vars.CI_CONTAINER_NAME) || '' }}
300301
build-args: |
301302
MAX_JOBS=4
302303
NEMO_RL_COMMIT=${{ needs.pre-flight.outputs.test_sha }}
303304
305+
update-uv-cache:
306+
name: Update uv build cache
307+
needs: [build-container, org-member-pre-flight]
308+
if: >-
309+
${{
310+
github.ref == 'refs/heads/main' &&
311+
needs.build-container.result == 'success'
312+
}}
313+
runs-on: ${{ format('{0}-gpu-x2', needs.org-member-pre-flight.outputs.runner_prefix) }}
314+
environment: nemo-ci
315+
env:
316+
REGISTRY: ${{ needs.org-member-pre-flight.outputs.registry }}
317+
IMAGE_NAME: ${{ vars.CI_CONTAINER_NAME }}
318+
steps:
319+
- name: Extract and push uv cache image
320+
run: |
321+
set -euo pipefail
322+
SRC="${REGISTRY}/${IMAGE_NAME}:${{ github.run_id }}"
323+
DST="${REGISTRY}/${IMAGE_NAME}-uv-cache:latest"
324+
325+
docker pull "${SRC}"
326+
CID=$(docker create "${SRC}" true)
327+
mkdir -p /tmp/uv-cache
328+
docker cp "${CID}:/root/.cache/uv/." /tmp/uv-cache/
329+
docker rm "${CID}"
330+
331+
printf 'FROM scratch\nCOPY uv-cache/ /\n' > /tmp/Dockerfile.uv-cache
332+
docker build -t "${DST}" -f /tmp/Dockerfile.uv-cache /tmp
333+
docker push "${DST}"
334+
335+
docker rmi "${SRC}" "${DST}" 2>/dev/null || true
336+
rm -rf /tmp/uv-cache /tmp/Dockerfile.uv-cache
337+
304338
cicd-doc-tests:
305339
strategy:
306340
fail-fast: false
@@ -480,6 +514,24 @@ jobs:
480514
SUMMARY=$(echo $JOB_RESULTS | jq 'to_entries[] | .key + ": " + .value.result' | tr -d '"')
481515
echo '🤖: CICD Result for test level: ${{ needs.pre-flight.outputs.test_level }}' >> $GITHUB_STEP_SUMMARY
482516
echo "$SUMMARY" >> $GITHUB_STEP_SUMMARY
517+
518+
if [[ "$TEST_LEVEL" == "none" ]]; then
519+
echo "" >> $GITHUB_STEP_SUMMARY
520+
echo "---" >> $GITHUB_STEP_SUMMARY
521+
echo "⚠️ **No tests were run.** This PR does not have a CI label." >> $GITHUB_STEP_SUMMARY
522+
echo "" >> $GITHUB_STEP_SUMMARY
523+
echo "To trigger tests, add one of the following labels to your PR:" >> $GITHUB_STEP_SUMMARY
524+
echo "| Label | What it runs |" >> $GITHUB_STEP_SUMMARY
525+
echo "|-------|-------------|" >> $GITHUB_STEP_SUMMARY
526+
echo "| \`CI:docs\` | Doc tests only |" >> $GITHUB_STEP_SUMMARY
527+
echo "| \`CI:Lfast\` | Fast subset (reuses main container) |" >> $GITHUB_STEP_SUMMARY
528+
echo "| \`CI:L0\` | Unit tests + docs + lint |" >> $GITHUB_STEP_SUMMARY
529+
echo "| \`CI:L1\` | L0 + functional tests |" >> $GITHUB_STEP_SUMMARY
530+
echo "| \`CI:L2\` | L1 + convergence tests |" >> $GITHUB_STEP_SUMMARY
531+
echo "" >> $GITHUB_STEP_SUMMARY
532+
echo "This check will remain failed until a CI label is added." >> $GITHUB_STEP_SUMMARY
533+
fi
534+
483535
test "$ALL_SUCCESS" = "true" || test "$CI_SKIP" = "true"
484536
485537
notify-nightly-failure:
@@ -519,19 +571,25 @@ jobs:
519571
matrix:
520572
flag: [doc-test, unit-test, e2e]
521573
steps:
574+
- name: Get PR info
575+
id: get-pr-info
576+
if: startsWith(github.ref, 'refs/heads/pull-request/')
577+
uses: nv-gha-runners/get-pr-info@main
578+
522579
- name: Checkout
523580
uses: actions/checkout@v6
524581

525582
- name: Download coverage reports of current branch
526583
uses: actions/download-artifact@v7
527584
with:
528585
pattern: coverage-${{ matrix.flag }}-*
586+
path: coverage-downloads
529587

530588
- name: Check if artifacts were downloaded
531589
id: check-artifacts
532590
run: |
533-
# Check if any coverage directories were downloaded
534-
if ls coverage-* 1> /dev/null 2>&1; then
591+
# Check if any .coverage files were downloaded
592+
if find coverage-downloads -name ".coverage" -type f 2>/dev/null | grep -q .; then
535593
echo "artifacts-found=true" >> $GITHUB_OUTPUT
536594
echo "Found coverage artifacts for ${{ matrix.flag }}"
537595
else
@@ -545,12 +603,10 @@ jobs:
545603
run: |
546604
pip install coverage
547605
548-
ls -al .
549-
ls -al coverage-*/
550-
coverage combine --keep $(ls coverage-*/.coverage)
606+
find coverage-downloads -name ".coverage" -type f
607+
coverage combine --keep $(find coverage-downloads -name ".coverage" -type f)
551608
coverage report -i --show-missing
552-
rm -rf coverage-*
553-
ls -al
609+
rm -rf coverage-downloads
554610
555611
- name: Skip coverage processing
556612
if: ${{ steps.check-artifacts.outputs.artifacts-found == 'false' }}
@@ -564,6 +620,7 @@ jobs:
564620
token: ${{ secrets.CODECOV_TOKEN }}
565621
verbose: true
566622
flags: ${{ matrix.flag }}
623+
base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').base.sha }}
567624

568625
- name: Upload artifacts
569626
if: ${{ steps.check-artifacts.outputs.artifacts-found == 'true' }}

.github/workflows/claude-review.yml

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,15 @@ jobs:
1616
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_claude_review.yml@v0.79.0
1717
with:
1818
prompt: |
19-
You are doing a light code review. Keep it concise and actionable.
19+
Mandatory workflow — never skip or reorder:
20+
1. Read the PR diff first.
21+
2. Based on the changed files and areas, identify relevant skills from skills/<name>/SKILL.md.
22+
Common skill names: linting-and-formatting, testing, cicd, build-and-dependency,
23+
contributing, copyright, docs, error-handling, config-conventions, launch-nemo-rl.
24+
3. Read the SKILL.md files for all relevant areas using the Read tool.
25+
4. Only then perform the review using the skill context.
2026
21-
Read and follow the review guidelines in CLAUDE.md and the coding guideline skills in .claude/skills/ at the repository root.
27+
You are doing a light code review. Keep it concise and actionable.
2228
2329
Focus ONLY on:
2430
- Critical bugs or logic errors

0 commit comments

Comments
 (0)