Skip to content

Commit bda0f89

Browse files
authored
Merge branch 'main' into ernie-upstream
2 parents 6dccad0 + 6a56e15 commit bda0f89

10 files changed

Lines changed: 308 additions & 61 deletions

File tree

.github/workflows/cicd-main.yml

Lines changed: 48 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,12 @@ jobs:
8383
runs-on: ubuntu-latest
8484
needs: [pre-flight]
8585
outputs:
86-
needs_more_tests: ${{ steps.configure.outputs.needs_more_tests }}
87-
full_test_suite: ${{ steps.configure.outputs.full_test_suite }}
88-
expect_l0: ${{ steps.configure.outputs.expect_l0 }}
89-
expect_l1: ${{ steps.configure.outputs.expect_l1 }}
90-
expect_l2: ${{ steps.configure.outputs.expect_l2 }}
86+
needs_more_tests: ${{ steps.configure.outputs.needs_more_tests }}
87+
full_test_suite: ${{ steps.configure.outputs.full_test_suite }}
88+
expect_l0: ${{ steps.configure.outputs.expect_l0 }}
89+
expect_l1: ${{ steps.configure.outputs.expect_l1 }}
90+
expect_l2: ${{ steps.configure.outputs.expect_l2 }}
91+
perf_scripts_only: ${{ steps.configure.outputs.perf_scripts_only }}
9192
steps:
9293
- name: Get PR info
9394
id: get-pr-info
@@ -113,8 +114,18 @@ jobs:
113114
NEEDS_MORE_TESTS=$(echo "$LABELS" | jq 'any(. == "needs-more-tests")')
114115
FULL_TEST_SUITE=$(echo "$LABELS" | jq 'any(. == "full-test-suite")')
115116
116-
# Tests are expected on every run except docs-only and deployment
117-
if [[ "$DOCS_ONLY" == "true" || "$IS_DEPLOYMENT" == "true" ]]; then
117+
# Detect if every changed file lives under scripts/performance/
118+
PERF_SCRIPTS_ONLY=false
119+
if [[ -n "$PR_NUMBER" ]]; then
120+
CHANGED_FILES=$(gh pr diff "$PR_NUMBER" --repo ${{ github.repository }} --name-only 2>/dev/null) || CHANGED_FILES=""
121+
if [[ -n "$CHANGED_FILES" ]]; then
122+
NON_PERF=$(echo "$CHANGED_FILES" | grep -v '^scripts/performance/' || true)
123+
[[ -z "$NON_PERF" ]] && PERF_SCRIPTS_ONLY=true
124+
fi
125+
fi
126+
127+
# Tests are expected on every run except docs-only, deployment, and perf-scripts-only
128+
if [[ "$DOCS_ONLY" == "true" || "$IS_DEPLOYMENT" == "true" || "$PERF_SCRIPTS_ONLY" == "true" ]]; then
118129
RUN_TESTS=false
119130
else
120131
RUN_TESTS=true
@@ -142,24 +153,26 @@ jobs:
142153
EXPECT_L2=true
143154
fi
144155
145-
echo "needs_more_tests=$NEEDS_MORE_TESTS" | tee -a "$GITHUB_OUTPUT"
146-
echo "full_test_suite=$FULL_TEST_SUITE" | tee -a "$GITHUB_OUTPUT"
147-
echo "expect_l0=$EXPECT_L0" | tee -a "$GITHUB_OUTPUT"
148-
echo "expect_l1=$EXPECT_L1" | tee -a "$GITHUB_OUTPUT"
149-
echo "expect_l2=$EXPECT_L2" | tee -a "$GITHUB_OUTPUT"
156+
echo "needs_more_tests=$NEEDS_MORE_TESTS" | tee -a "$GITHUB_OUTPUT"
157+
echo "full_test_suite=$FULL_TEST_SUITE" | tee -a "$GITHUB_OUTPUT"
158+
echo "expect_l0=$EXPECT_L0" | tee -a "$GITHUB_OUTPUT"
159+
echo "expect_l1=$EXPECT_L1" | tee -a "$GITHUB_OUTPUT"
160+
echo "expect_l2=$EXPECT_L2" | tee -a "$GITHUB_OUTPUT"
161+
echo "perf_scripts_only=$PERF_SCRIPTS_ONLY" | tee -a "$GITHUB_OUTPUT"
150162
151163
# Active row markers for step summary decision tree
152-
_L0=$( [[ "$EXPECT_L0" == "true" ]] && echo "**→**" || echo "" )
153-
_L1=$( [[ "$EXPECT_L1" == "true" ]] && echo "**→**" || echo "" )
154-
_L2=$( [[ "$EXPECT_L2" == "true" ]] && echo "**→**" || echo "" )
155-
_SKIP_DOCS=$( [[ "$DOCS_ONLY" == "true" ]] && echo "**→**" || echo "" )
156-
_SKIP_DEPLOY=$( [[ "$IS_DEPLOYMENT" == "true" ]] && echo "**→**" || echo "" )
157-
_MG=$( [[ "$EVENT_NAME" == "merge_group" ]] && echo "**→**" || echo "" )
158-
_MAIN=$( [[ "$REF" == "refs/heads/main" ]] && echo "**→**" || echo "" )
159-
_SCHED=$( [[ "$EVENT_NAME" == "schedule" ]] && echo "**→**" || echo "" )
160-
_WD=$( [[ "$EVENT_NAME" == "workflow_dispatch" ]] && echo "**→**" || echo "" )
161-
_NMT=$( [[ "$NEEDS_MORE_TESTS" == "true" ]] && echo "**→**" || echo "" )
162-
_FTS=$( [[ "$FULL_TEST_SUITE" == "true" ]] && echo "**→**" || echo "" )
164+
_L0=$( [[ "$EXPECT_L0" == "true" ]] && echo "**→**" || echo "" )
165+
_L1=$( [[ "$EXPECT_L1" == "true" ]] && echo "**→**" || echo "" )
166+
_L2=$( [[ "$EXPECT_L2" == "true" ]] && echo "**→**" || echo "" )
167+
_SKIP_DOCS=$( [[ "$DOCS_ONLY" == "true" ]] && echo "**→**" || echo "" )
168+
_SKIP_DEPLOY=$([[ "$IS_DEPLOYMENT" == "true" ]] && echo "**→**" || echo "" )
169+
_SKIP_PERF=$( [[ "$PERF_SCRIPTS_ONLY" == "true" ]] && echo "**→**" || echo "" )
170+
_MG=$( [[ "$EVENT_NAME" == "merge_group" ]] && echo "**→**" || echo "" )
171+
_MAIN=$( [[ "$REF" == "refs/heads/main" ]] && echo "**→**" || echo "" )
172+
_SCHED=$( [[ "$EVENT_NAME" == "schedule" ]] && echo "**→**" || echo "" )
173+
_WD=$( [[ "$EVENT_NAME" == "workflow_dispatch" ]] && echo "**→**" || echo "" )
174+
_NMT=$( [[ "$NEEDS_MORE_TESTS" == "true" ]] && echo "**→**" || echo "" )
175+
_FTS=$( [[ "$FULL_TEST_SUITE" == "true" ]] && echo "**→**" || echo "" )
163176
164177
cat <<SUMMARY >> "$GITHUB_STEP_SUMMARY"
165178
## CI Configuration
@@ -170,14 +183,15 @@ jobs:
170183
|---|---|
171184
| \`docs_only\` | \`$DOCS_ONLY\` |
172185
| \`is_deployment_workflow\` | \`$IS_DEPLOYMENT\` |
186+
| \`perf_scripts_only\` | \`$PERF_SCRIPTS_ONLY\` |
173187
| \`needs_more_tests\` | \`$NEEDS_MORE_TESTS\` |
174188
| \`full_test_suite\` | \`$FULL_TEST_SUITE\` |
175189
176190
### Expected test tiers
177191
178192
| | Tier | Condition |
179193
|---|---|---|
180-
| $_L0 | **L0** | any non-docs/deployment run |
194+
| $_L0 | **L0** | any non-docs/deployment/perf-scripts run |
181195
| $_L1 | **L1** | \`main\` / \`schedule\` / \`workflow_dispatch\` / \`merge_group\` / label _needs-more-tests_ |
182196
| $_L2 | **L2** | \`schedule\` / \`workflow_dispatch\` / label _full-test-suite_ |
183197
@@ -189,6 +203,7 @@ jobs:
189203
|---|---|
190204
| $_SKIP_DOCS | Docs-only change (no src files modified) |
191205
| $_SKIP_DEPLOY | Deployment workflow (\`deploy-release/*\` branch) |
206+
| $_SKIP_PERF | Perf-scripts-only change (all changes under \`scripts/performance/\`) |
192207
193208
**L1/L2 active trigger**
194209
@@ -245,21 +260,22 @@ jobs:
245260
pre-commit run --all-files --show-diff-on-failure --color=always
246261
247262
cicd-wait-in-queue:
248-
needs: [pre-flight, lint-check]
263+
needs: [pre-flight, lint-check, configure]
249264
runs-on: ubuntu-latest
250265
environment: test
251266
if: |
252267
!(needs.pre-flight.outputs.is_ci_workload == 'true'
253268
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
254-
|| needs.pre-flight.outputs.docs_only == 'true')
269+
|| needs.pre-flight.outputs.docs_only == 'true'
270+
|| needs.configure.outputs.perf_scripts_only == 'true')
255271
&& github.event_name != 'merge_group'
256272
steps:
257273
- name: Running CI tests
258274
run: |
259275
echo "Running CI tests"
260276
261277
cicd-compute-build-matrix:
262-
needs: [pre-flight, cicd-wait-in-queue]
278+
needs: [pre-flight, configure, cicd-wait-in-queue]
263279
runs-on: ubuntu-latest
264280
if: |
265281
(
@@ -269,6 +285,7 @@ jobs:
269285
|| github.event_name == 'merge_group'
270286
)
271287
&& !cancelled()
288+
&& needs.configure.outputs.perf_scripts_only != 'true'
272289
outputs:
273290
matrix: ${{ steps.compute.outputs.matrix }}
274291
steps:
@@ -293,7 +310,7 @@ jobs:
293310
echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT"
294311
295312
cicd-container-build:
296-
needs: [pre-flight, cicd-wait-in-queue, cicd-compute-build-matrix]
313+
needs: [pre-flight, configure, cicd-wait-in-queue, cicd-compute-build-matrix]
297314
strategy:
298315
fail-fast: false
299316
matrix: ${{ fromJson(needs.cicd-compute-build-matrix.outputs.matrix) }}
@@ -307,6 +324,7 @@ jobs:
307324
|| github.event_name == 'merge_group'
308325
)
309326
&& !cancelled()
327+
&& needs.configure.outputs.perf_scripts_only != 'true'
310328
steps:
311329
- name: Get PR info
312330
id: get-pr-info
@@ -1010,14 +1028,15 @@ jobs:
10101028
10111029
Coverage_Fake:
10121030
runs-on: ubuntu-latest
1013-
needs: [Nemo_CICD_Test, pre-flight]
1031+
needs: [Nemo_CICD_Test, pre-flight, configure]
10141032
if: |
10151033
always()
10161034
&& !cancelled()
10171035
&& needs.pre-flight.outputs.is_ci_workload == 'false'
10181036
&& (
10191037
needs.pre-flight.outputs.docs_only == 'true'
10201038
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
1039+
|| needs.configure.outputs.perf_scripts_only == 'true'
10211040
)
10221041
steps:
10231042
- name: Generate fake coverage report

docs/releases/release-process.md

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,32 @@ From RC3 onward, RCs are cut **more frequently and as needed**, rather than stri
2828

2929
## Golden Values
3030

31-
Golden values are reference outputs used to validate model behavior in CI.
31+
Golden values are reference outputs used to validate model behavior in CI. They live in the **internal CI repository** and are the baseline for the internal regression tracker — keeping them current and accurate is therefore critical for meaningful signal.
32+
33+
### When to update golden values
34+
35+
Any PR that can affect performance metrics (e.g. changes to model code, training loop, optimizer, or numerical kernels) **must be accompanied by a corresponding internal PR that updates the golden values** before merging. Do not wait until after the PR lands.
36+
37+
### Updating golden values for PRs targeting `main`
38+
39+
1. **Rebase the MBridge PR against `main`** so it is at top-of-tree before launching CI.
40+
2. **Launch an internal CI run** using:
41+
- The **latest nightly container** as the base image.
42+
- The **latest MCore commit** on `main`.
43+
- The **MBridge PR commit** (the head of your MBridge branch).
44+
3. Collect the outputs and open a PR against the **internal CI repository's `main` branch** with the updated golden values.
45+
4. Merge the MBridge PR and the internal golden-values PR together, or merge the golden-values PR first.
46+
47+
### Updating golden values during a release
48+
49+
When golden values need to be refreshed on the release branch (e.g. at the start of code-freeze or after an accepted regression):
50+
51+
1. **Rebase the MBridge PR against the MBridge release branch** so it is at the head of that branch.
52+
2. **Launch an internal CI run** using:
53+
- The **latest internal RC container** for the release.
54+
- The **MCore commit pinned on the release branch**.
55+
- The **MBridge PR commit** (the head of your MBridge PR branch, rebased onto the MBridge release branch).
56+
3. Open a PR against the **internal CI repository's release branch** with the updated golden values.
3257

3358
### During the RC Phase (before code-freeze)
3459

@@ -41,24 +66,26 @@ This means golden values are not automatically updated with every run — a deli
4166

4267
### On the Release Branch (during code-freeze)
4368

44-
When the release branch is created at code-freeze, all golden values are updated **unconditionally**. Whatever the current output is becomes the new reference baseline for the release.
69+
When the release branch is created at code-freeze, all golden values are updated **unconditionally** — whatever the current output is becomes the new reference baseline for the release.
70+
71+
In **Week 5**, the last bulk update of golden values is performed. After that point, engineers are individually responsible for updating any remaining golden values on the release branch, reviewing discrepancies and ensuring the suite is clean ahead of the release.
4572

4673
-----
4774

4875
## Code-Freeze
4976

5077
Code-freeze lasts **two weeks** and begins when RC3 is cut. This is the **stabilization phase** — no new features are landed.
5178

52-
### First Half
79+
### First Half (Weeks 3–5)
5380

5481
- **Release branches are created.**
5582
- All golden values on the release branch are updated unconditionally (see above).
56-
- The **last bulk CI run** occurs one week into the code-freeze period.
83+
- The **last bulk update of golden values** happens in **Week 5**.
5784
- RCs continue to be cut as needed.
5885

59-
### Second Half
86+
### Second Half (Weeks 6–7)
6087

61-
- **Engineers are responsible for updating golden values** on the release branch — reviewing any remaining discrepancies and ensuring the suite is in a clean state ahead of release.
88+
- **Engineers are individually responsible for updating golden values** on the release branch — reviewing any remaining discrepancies and ensuring the suite is in a clean state ahead of release.
6289
- RCs continue to be cut as needed.
6390

6491
### Release Day
@@ -67,6 +94,23 @@ The release goes out on the **first Wednesday after the code-freeze window ends*
6794

6895
-----
6996

97+
## Patch Release
98+
99+
After the main release ships (Week 7, typically mid-month), the release branches are **reopened** for contributions targeting the patch release.
100+
101+
### Patch Release Timeline
102+
103+
| Period | Approximate Timing | Key Activity |
104+
|--------|--------------------|--------------|
105+
| Reopening | Release day (Week 7) | Branches accept contributions; patch development begins |
106+
| Lockdown | First Monday of the following month (~2 weeks later) | Release branches locked; **patch RC0 (`XX.YY.01.RC0`) shipped internally** |
107+
| Stabilization | Weeks 1–2 after RC0 | Bug fixes and small improvements only |
108+
| | End of Week 2 | QA exit, patch release |
109+
110+
The patch stabilization flow mirrors the main release's code-freeze phase, but compressed into approximately two weeks.
111+
112+
-----
113+
70114
## CI and Known Failures
71115

72116
### Ticket-Annotated Tests

docs/skills-index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ and verification steps.
1010
:maxdepth: 1
1111
1212
skills/developer-guide/SKILL
13+
skills/test-system/SKILL
1314
skills/mlm-bridge-training/SKILL
1415
skills/recipe-recommender/SKILL
1516
```

scripts/performance/configs/nemotronh/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@
1212
nemotron_3_nano_pretrain_config_gb200,
1313
nemotron_3_nano_pretrain_config_gb300,
1414
nemotron_3_nano_pretrain_config_h100,
15+
nemotron_3_nano_pretrain_config_vr200,
1516
nemotron_3_super_pretrain_config_b200,
1617
nemotron_3_super_pretrain_config_b300,
1718
nemotron_3_super_pretrain_config_gb200,
1819
nemotron_3_super_pretrain_config_gb300,
20+
nemotron_3_super_pretrain_config_vr200,
1921
)
2022
from .nemotronh_llm_pretrain import (
2123
nemotronh_56b_pretrain_config_b200,
@@ -40,6 +42,9 @@
4042
NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1,
4143
NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1,
4244
NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1,
45+
NEMOTRON_3_NANO_PRETRAIN_CONFIG_VR200_BF16_V1,
46+
NEMOTRON_3_NANO_PRETRAIN_CONFIG_VR200_FP8_MX_V1,
47+
NEMOTRON_3_NANO_PRETRAIN_CONFIG_VR200_NVFP4_V1,
4348
NEMOTRON_3_SUPER_PRETRAIN_CONFIG_B200_BF16_V1,
4449
NEMOTRON_3_SUPER_PRETRAIN_CONFIG_B200_FP8_MX_V1,
4550
NEMOTRON_3_SUPER_PRETRAIN_CONFIG_B200_NVFP4_V1,
@@ -52,6 +57,9 @@
5257
NEMOTRON_3_SUPER_PRETRAIN_CONFIG_GB300_BF16_V1,
5358
NEMOTRON_3_SUPER_PRETRAIN_CONFIG_GB300_FP8_MX_V1,
5459
NEMOTRON_3_SUPER_PRETRAIN_CONFIG_GB300_NVFP4_V1,
60+
NEMOTRON_3_SUPER_PRETRAIN_CONFIG_VR200_BF16_V1,
61+
NEMOTRON_3_SUPER_PRETRAIN_CONFIG_VR200_FP8_MX_V1,
62+
NEMOTRON_3_SUPER_PRETRAIN_CONFIG_VR200_NVFP4_V1,
5563
)
5664
from .nemotronh_workload_base_configs import (
5765
NEMOTRONH_56B_PRETRAIN_CONFIG_B200_FP8_CS_V1,
@@ -82,6 +90,9 @@
8290
"NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1",
8391
"NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1",
8492
"NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1",
93+
"NEMOTRON_3_NANO_PRETRAIN_CONFIG_VR200_BF16_V1",
94+
"NEMOTRON_3_NANO_PRETRAIN_CONFIG_VR200_FP8_MX_V1",
95+
"NEMOTRON_3_NANO_PRETRAIN_CONFIG_VR200_NVFP4_V1",
8596
"NEMOTRON_3_SUPER_PRETRAIN_CONFIG_GB300_BF16_V1",
8697
"NEMOTRON_3_SUPER_PRETRAIN_CONFIG_GB300_FP8_MX_V1",
8798
"NEMOTRON_3_SUPER_PRETRAIN_CONFIG_GB300_NVFP4_V1",
@@ -94,6 +105,9 @@
94105
"NEMOTRON_3_SUPER_PRETRAIN_CONFIG_B200_BF16_V1",
95106
"NEMOTRON_3_SUPER_PRETRAIN_CONFIG_B200_FP8_MX_V1",
96107
"NEMOTRON_3_SUPER_PRETRAIN_CONFIG_B200_NVFP4_V1",
108+
"NEMOTRON_3_SUPER_PRETRAIN_CONFIG_VR200_BF16_V1",
109+
"NEMOTRON_3_SUPER_PRETRAIN_CONFIG_VR200_FP8_MX_V1",
110+
"NEMOTRON_3_SUPER_PRETRAIN_CONFIG_VR200_NVFP4_V1",
97111
]
98112

99113
if HAVE_MEGATRON_BRIDGE:
@@ -106,11 +120,13 @@
106120
"nemotronh_56b_pretrain_config_h100",
107121
"nemotron_3_nano_pretrain_config_gb300",
108122
"nemotron_3_nano_pretrain_config_gb200",
123+
"nemotron_3_nano_pretrain_config_vr200",
109124
"nemotron_3_nano_pretrain_config_b300",
110125
"nemotron_3_nano_pretrain_config_b200",
111126
"nemotron_3_nano_pretrain_config_h100",
112127
"nemotron_3_super_pretrain_config_gb300",
113128
"nemotron_3_super_pretrain_config_gb200",
129+
"nemotron_3_super_pretrain_config_vr200",
114130
"nemotron_3_super_pretrain_config_b300",
115131
"nemotron_3_super_pretrain_config_b200",
116132
]

0 commit comments

Comments
 (0)