Closed
Changes from all commits
436 commits
87ac13d
update API compat check baseline to 274e04d (#2548)
pablo-garay Dec 4, 2025
f0c1b55
feat: mcore trigger mbridge (#2340) (#2552)
pablo-garay Dec 5, 2025
8de5a7f
[Dev] Optimize TE CUDA Graph capturing time (#2483)
buptzyb Dec 5, 2025
1f08ceb
[Dev] Feature: linear cross entropy fusion (#2256)
Jianbing-D Dec 5, 2025
9cf6838
Fix gpt_layer_spec for frequently linear attention (#2481)
yuzhongw-nvidia Dec 5, 2025
89fe895
Skip trainloader when `args.skip_train` is True (#2501)
Niccolo-Ajroldi Dec 5, 2025
a6d86a6
[DEV] fixes for muon(qwen3-next, ep multi-adam) (#2564)
FDecaYed Dec 5, 2025
aee4a74
[Dev] remove fp16 assert in moe_grouped_gemm & EP (#2494)
HaochenYuan Dec 8, 2025
dfe4da2
Update tp support in muon (#2385)
skyw Dec 8, 2025
1d462bd
[DEV] Update GitHub MoE functional test cases (#2449)
Victarry Dec 8, 2025
23e092f
Fix: don't enter branch if mtp_num_layers == 0 (#2581)
rj42 Dec 9, 2025
c60d5c2
[Dev] fix(moe): Support HybridEP and reduce memory overhead for 1F1B …
lhb8125 Dec 10, 2025
4db2f11
Merge branch 'main' into dev
FDecaYed Dec 10, 2025
ed804b4
[dev] pull main 1201 (#2448)
ko3n1g Dec 11, 2025
2d398b4
chore: Bump baseline (#2626)
ko3n1g Dec 11, 2025
e8a9275
[Dev] Use the latest Hybrid-EP (#2424)
Autumn1998 Dec 12, 2025
305957a
API compat: ignore ParameterMovedBreakage for __init__ methods (#2649)
pablo-garay Dec 12, 2025
e93814b
[training migration] add training config dataclass and arg generation…
maanug-nv Dec 16, 2025
288b8ea
[Dev] Optimize TE CUDA Graph _get_sample_arguments() Time (#2568)
buptzyb Dec 17, 2025
0eec631
Reopen qwen3next functional test in lightweight mode (#2493)
yuzhongw-nvidia Dec 17, 2025
2ebff67
[Dev] Fix CUDA RNG Tracker (#2640)
buptzyb Dec 17, 2025
368e580
[Dev] Mark API backwards compatibility checks as OPTIONAL (non-blocki…
pablo-garay Dec 17, 2025
3714d81
[Dev] FP8 params support for megatron-fsdp (MXFP8/Blockwise) (#2086)
kunlunl Dec 18, 2025
a935008
[Dev] Feat(moe): Gated delta net context parallel (CP) (#2614)
yuzhongw-nvidia Dec 19, 2025
fd932c9
ci: Gridify test configs (#2707)
ko3n1g Dec 19, 2025
2b1fc70
Revert "[dev] Add assertion for mxfp8 params without dp overlap (#2270)"
ko3n1g Dec 22, 2025
4665be4
Revert "[Dev] Use the latest Hybrid-EP (#2424)" (#2732)
ko3n1g Dec 22, 2025
46b5505
[Dev] Fix ep overlap missing final layernorm (#2691)
Wohox Dec 23, 2025
0b6714e
[Dev] Remove calculation of padding token in moe routing loss (#2121)
HaochenYuan Dec 24, 2025
1068d77
Revert "[Dev] Remove calculation of padding token in moe routing loss…
chtruong814 Dec 24, 2025
9885ddb
[Dev] Disable ep overlap memory optimization (#2750)
Wohox Dec 30, 2025
14c35dc
Merge branch 'main' into dev
FDecaYed Dec 30, 2025
929e77f
feat: Cherry-pick PR of PR!2661 for dev branch (#2757)
youngeunkwon0405 Dec 30, 2025
b361561
Merge branch 'dev' into deyuf/dev_pull_main_1217_test
FDecaYed Dec 31, 2025
922e8e9
cp: Allow disabling external contributors (#2784) (#2786)
chtruong814 Dec 31, 2025
5455f0a
build: Pin down `nvidia-nvshmem-cu13` (#2798)
ko3n1g Jan 3, 2026
71d5c84
[dev] Fix bug of reuse_grad_buf_for_mxfp8_param_ag (#2801)
kunlunl Jan 5, 2026
8b93e0d
[Dev] Partial CUDA Graph support for EP Overlap (#2168)
Wohox Jan 5, 2026
c1045f6
Revert "[Dev] FP8 params support for megatron-fsdp (MXFP8/Blockwise) …
ko3n1g Jan 5, 2026
bd06945
Revert "[Dev] Partial CUDA Graph support for EP Overlap (#2168)"
ko3n1g Jan 5, 2026
29ffe43
Merge branch 'dev' into deyuf/dev_pull_main_1217_test
FDecaYed Jan 5, 2026
d8464fc
PR for testing pull main 1217 (#2716)
ko3n1g Jan 5, 2026
dfa6cc1
[Dev] Remove calculation of padding token in moe routing loss (#2754)
HaochenYuan Jan 6, 2026
5823534
[dev] Reapply fsdp mxfp8 (#2828)
kunlunl Jan 6, 2026
1ec0beb
[Dev] Partial CUDA Graph support for EP Overlap (#2810)
Wohox Jan 6, 2026
0bc4114
[Dev] fix EP Overlap Partial Cuda Graph Unit Test hang issue (#2838)
Wohox Jan 7, 2026
28c586e
build: Bump jet-client (#2877)
ko3n1g Jan 8, 2026
46d1f47
FP8 attention knob for nvFP4 recipe (#2818)
vasunvidia Jan 9, 2026
ed6ebff
[DEV][NVFP4][MOE] 128 Zero Padding for Grouped Quantization kernels a…
zhongbozhu Jan 9, 2026
ebe7079
Add check for full_iteration scope before instantiating CudaGraphMana…
vasunvidia Jan 9, 2026
736da3c
Reapply "[Dev] Use the latest Hybrid-EP (#2423)" (#2867)
ko3n1g Jan 9, 2026
9d741cf
build: Main dependency bump for 26.02 (#2682)
ko3n1g Jan 12, 2026
de866fa
ci(fix): Update golden values (#2921)
ko3n1g Jan 13, 2026
ae3dbc0
ci(hotfix): Re-add `gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8…
ko3n1g Jan 13, 2026
583dd58
ci: Skip broken tests after dependency update (#2935)
chtruong814 Jan 13, 2026
b0a702b
Cherry-pick optimizer override refactor from #2723 (#2835)
yaoyu-33 Jan 14, 2026
1964d39
ci(hotfix): Disable gpt_grpo_tp1_pp1_dp8_583m_throughputtest
ko3n1g Jan 14, 2026
383505c
[dev]: ci: Onboard GB200 (#2922)
ko3n1g Jan 14, 2026
ab3ae8a
ci(hotfix): Repair recipe
ko3n1g Jan 14, 2026
dce8e88
Fix clip_qk for virtual pipeline size > 1 (#2776)
juntaowww Jan 15, 2026
748ab80
ci(hotfix): GB200 to nightly
ko3n1g Jan 15, 2026
a32b198
ci(fix): GB200 racecondition (#2962)
ko3n1g Jan 15, 2026
7c6c4e9
Revert "ci(fix): GB200 racecondition (#2962)"
ko3n1g Jan 15, 2026
619115a
ci: Fix GB200 change (#2969) (#2974)
ko3n1g Jan 16, 2026
b395016
[Dev] TE cudagraph recompute (#2694)
buptzyb Jan 16, 2026
b927e1f
[Dev] docs(megatron-fsdp): add Megatron-FSDP user guide (#2397)
xuwchen Jan 16, 2026
6b157e0
[Dev] Optimizer State and Master Weight Offloading (#2760)
hxbai Jan 16, 2026
8ac3a9f
Revert "[Dev] Optimizer State and Master Weight Offloading (#2760)" (…
ko3n1g Jan 16, 2026
bd8411c
Forced load imbalance (#2917)
nanz-nv Jan 19, 2026
0a2e01f
[Dev] [Reapply] Optimizer State and Master Weight Offloading (#2987)
hxbai Jan 19, 2026
8abc086
ci(fix): CI_COMMIT_BRANCH on forks (#2982) (#2989)
ko3n1g Jan 19, 2026
5b17f19
[Dev] Update MoE readme. (#2808)
Victarry Jan 19, 2026
9ea50a9
feat: add routing replay for Mcore (#2693)
litianjian Jan 20, 2026
ac9f665
[dev] feat(moe): Support apply wd to qk layernorm for Qwen3-Next (#2825)
yuzhongw-nvidia Jan 21, 2026
6e2153b
[dev] feat(moe): Cherry-pick #1989 back to dev (#3011)
yuzhongw-nvidia Jan 21, 2026
68e5fec
[Dev]feat(moe): code refactor for fine grained activation offloading …
lhb8125 Jan 22, 2026
6807df4
[Dev] [fix] Bug fix for offloading in evaluate() (#3041)
lhb8125 Jan 22, 2026
b3bba3f
ci: Log node name (#3081) (#3082)
ko3n1g Jan 26, 2026
a4e3fb3
[dev] pull main 260122 (#3045)
FDecaYed Jan 27, 2026
420aa6a
ci: Skip test_precision_aware_optimizer (#3062)
thomasdhc Jan 23, 2026
da56650
Merge branch 'main' into deyuf/dev_pull_main_260122_fix_git
FDecaYed Jan 27, 2026
08357d8
[dev] fix git history for dev pull main 260122 (#3094)
ko3n1g Jan 27, 2026
0f82f05
[dev] fixes for pull main 260122 (#3103)
FDecaYed Jan 28, 2026
0ceb698
ci: Disable broken test (#3121)
ko3n1g Jan 28, 2026
f6f2abe
[Dev] Param offset in _ParamAndGradBucket should be aligned (#3010)
BestJuly Jan 29, 2026
d587dd1
[Dev] fix cg missing wgrad hook (#2999)
Wohox Jan 29, 2026
8f8f735
[Megatron-FSDP] Add fsdp_all_gather_in_start_param_sync option in DDP…
shjwudp Jan 29, 2026
bde9e32
[Dev] Support EP with HSDP (#2800)
wplf Jan 29, 2026
27fcfb2
Cherrypick CI improvements to dev branch (#3118)
ko3n1g Jan 29, 2026
a9fb6c8
Merge branch 'main' into deyuf/dev_pull_main_260130
FDecaYed Jan 30, 2026
55e3a0a
[dev] ci: Add DSv3 proxy (#3144)
ko3n1g Jan 30, 2026
a78ae49
[dev] ci: Fix DSv3 (#3187)
ko3n1g Jan 31, 2026
9375be4
Fix: nccl-ub in ddp path (#3181)
youngeunkwon0405 Feb 1, 2026
0f73a8a
[dev] perf(moe): Refine gated delta net implementation (#3040)
yuzhongw-nvidia Feb 2, 2026
5035cbe
[Dev] Add the missing part to support 1F1B overlap for Qwen3-Next (#2…
BestJuly Feb 2, 2026
4aac3fe
Use the latest hybrid-ep (#3092)
Autumn1998 Feb 2, 2026
bfa1d31
[BUG FIX] Try to enable cuda graph ut (#3192)
Autumn1998 Feb 2, 2026
13ad653
[Dev] Fix Linear-Cross-Entropy Convergence Issue (#2739)
shjwudp Feb 3, 2026
b8b8662
Revert "[Dev] Fix Linear-Cross-Entropy Convergence Issue (#2739)" (#3…
chtruong814 Feb 3, 2026
2ab74ab
Fix missing PackedSeqParams import (#3215)
parthmannan Feb 3, 2026
20e8ac8
fix merge main issues
FDecaYed Jan 30, 2026
77b5a3d
[dev] pull main 260130 (#3166)
ko3n1g Feb 3, 2026
c5b282b
ci(hotfix): Pin uv (#3233) (#3234)
ko3n1g Feb 3, 2026
8a29fd5
[DEV] Reapply fix Linear CE Fusion (#3226)
shjwudp Feb 4, 2026
dd17acc
Missing import fix (#3242)
parthmannan Feb 4, 2026
fa5bcf6
[Dev] Fix EP Overlap Bugs for Full-Iter CG (#3163)
Wohox Feb 4, 2026
a592819
[Refactor] Decouple topk and loss from DSA Indexer (#3013)
laixinn Feb 4, 2026
54f4feb
cp: Fix uv install for GH actions (#3259) (#3261)
chtruong814 Feb 5, 2026
ef336ca
[Dev] Fix EP Overlap missing record stream for shared expert (#3244)
Wohox Feb 5, 2026
ec94d63
Restore missing linear-cross-entropy option accidentally removed from…
shjwudp Feb 6, 2026
500e080
Fix reload_model_params failure when loading MoE models with explicit…
eternally-z Feb 9, 2026
433c169
ci: Disable moe20 tests (#3312)
ko3n1g Feb 9, 2026
fd4801e
ci: Pin down setuptools to lt 82 (#3316)
ko3n1g Feb 9, 2026
52eabf0
[None][Fix] Prevent resource leak warnings (#3216)
IanBoyanZhang Feb 10, 2026
c0030d6
[Dev] Fix backward dw dependency (#3338)
Wohox Feb 10, 2026
2c2e749
ci: Rely exclusively on GitHub CI (#3341)
ko3n1g Feb 10, 2026
98f6f81
[dev] ci: skip queue in merge-gate (#3344)
ko3n1g Feb 10, 2026
28b130f
Revert "[None][Fix] Prevent resource leak warnings (#3216)" (#3366)
ko3n1g Feb 11, 2026
e868e8f
ci: Fix dev branch merge queue (#3397)
chtruong814 Feb 13, 2026
c4b910f
[Dev] Add Qwen3-VL support with Megatron-FSDP (#2842)
xuwchen Feb 13, 2026
6059f36
Add absorbed-mla (#3193)
kunlunl Feb 13, 2026
9f2ca96
cp: Remove gpu sanity check (#3420) into dev (#3421)
chtruong814 Feb 13, 2026
1dcf0da
[dev] ci: Fix merge queue (#3385)
ko3n1g Feb 14, 2026
cd1c215
[dev] `cp: Cherrypick CI changes` (#3543)
ko3n1g Feb 23, 2026
aa86018
[Dev] Fix MoE aux loss tracker hang with MTP enabled (#3400)
Victarry Feb 25, 2026
2b4b9c4
ci: Remove multi-approval action from dev branch (#3576)
chtruong814 Feb 25, 2026
0ab47fa
Merge branch 'main' into dev
FDecaYed Feb 26, 2026
a1a73f8
[dev] pull main 260220 (#3574)
ko3n1g Feb 26, 2026
2e4a5d4
[dev] fix(moe): fix the bug where gate was not sliced when kv_head < …
LiuXTao Feb 27, 2026
d0e0cf0
Add unit test for THD (#3608)
kunlunl Feb 28, 2026
bc9298c
[Dev] feat(checkpoint): zero-copy storage sharing in CheckpointWithou…
Victarry Mar 2, 2026
5c613ab
[Dev] Add E2E support for THD format (#2924)
xiaoyao0115 Mar 3, 2026
5dadaf1
fix: skip FSDP DTensor boundary validation under fake process group (…
Victarry Mar 4, 2026
2176c4a
ci: Remove cudagraph codeowners entry in dev branch (#3712)
chtruong814 Mar 5, 2026
31f5294
[dev] refactor to support emerging optimizers beyond muon (#3618)
FDecaYed Mar 5, 2026
a268231
[Dev] Move some processing into a function so can be compiled (#3220)
BestJuly Mar 5, 2026
f983b21
[Dev] Refactor MoE loss logging (#2569)
yanring Mar 5, 2026
0b0074e
[dev] feat(mHC): Add basic pytorch implementation of manifold hyper c…
jingqiny-99 Mar 6, 2026
597f0d8
[Dev] Cherry-pick: M-FSDP: Cancel erroneous grad accumulation check (…
Victarry Mar 6, 2026
3d097e5
[dev] fix(moe): Fix DSA spec and rope. (#3402)
yuzhongw-nvidia Mar 6, 2026
1edfbd6
Fix split_state_dict function for MoE models (#3667)
eternally-z Mar 10, 2026
28a0aef
Exposing interleave argument for fused_apply_rotary_pos_emb_thd (#3759)
huvunvidia Mar 10, 2026
15fb557
build: Move fast-hadmard-transform (#3786)
ko3n1g Mar 11, 2026
dbf6c4c
fix ddp bug when --overlap-grad-reduce and --num-distributed-optimi f…
wplf Mar 11, 2026
cde56a4
[Dev] Fix for rope when enabling THD + Dynamic-CP; and use the naming…
xiaoyao0115 Mar 11, 2026
9374a4d
Continue emerging optimizer refactoring (#3737)
skyw Mar 12, 2026
f47ad91
Fix emerging optimizer init_group for ckpt loading (#3897)
FDecaYed Mar 17, 2026
74124ba
fix cg acess issue by using dict instead of list to iteratively acces…
ilml Mar 17, 2026
51299c5
Enhance rotary positional embedding version checks (#3887)
huvunvidia Mar 17, 2026
7c3eea6
[DEV] fix(megatron-fsdp): build expt_device_mesh only for MoE models …
xuwchen Mar 17, 2026
a9e5bf9
[Fix][Dev] Missing Assertion for moe layer recomptue in A2A Overlap (…
Wohox Mar 18, 2026
ebf1508
ci: Fix sso users check (#3937)
chtruong814 Mar 19, 2026
8ae70d4
Add more emerging optimizers (#3907)
skyw Mar 19, 2026
c72c459
Support GEMM + Swiglu fused MLP (#3890)
ksivaman Mar 20, 2026
0296101
[Dev] Support EP Overlap's Dynamic Computation Stream For Full-Iter C…
Wohox Mar 25, 2026
4108d68
[dev] mHC kernel fusion (#3828)
jingqiny-99 Mar 25, 2026
79aeecf
Merge remote-tracking branch 'upstream/main' into tolong/sync-main-to…
ilml Mar 25, 2026
0e53b30
fix: correct H2->H4 header skips in router_replay.md
ilml Mar 25, 2026
076d20f
fix: add missing tensor_parallel import in absorbed_mla.py
ilml Mar 25, 2026
0961196
fix: correct import ordering for tensor_parallel in absorbed_mla
ilml Mar 25, 2026
6823637
fix layerwise related merge error due to dev refactor
FDecaYed Mar 30, 2026
0c306dc
[Dev][feat] Support CUDA Graph capture offloading modules (#3219)
lhb8125 Mar 30, 2026
9c0b6ef
update golden value for gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_tor…
FDecaYed Mar 30, 2026
f36257c
Merge branch 'dev' into pull-request/4031
FDecaYed Mar 30, 2026
4ef64eb
Sync main into dev (#4031)
ko3n1g Mar 30, 2026
2bb0d38
[Dev] Fix golden values mismatch and dependency error due to last pul…
Victarry Apr 3, 2026
8d1fd3c
[Dev] Skip routed expert padding for graph-safe MoE (#4071)
zhongbozhu Apr 3, 2026
74751c9
[DEV] Minor update optimizer (#4082)
skyw Apr 7, 2026
ab6c0ff
TE fused grouped mlp with grouped bias and delayed wgrad (#4095)
ksivaman Apr 7, 2026
37a4cee
[Dev][feat] Support overlapping A2A Combine backprop with wgrad GEMM …
Wohox Apr 7, 2026
b6193a3
[dev] feat(moe): Support packed sequence for gated delta net (GDN) (#…
yuzhongw-nvidia Apr 7, 2026
07b9fb2
Revert unintended code owner change (#4172)
kunlunl Apr 7, 2026
2d6e946
[Dev] feat: Dynamic CP (part 2) (#2000)
xiaoyao0115 Apr 7, 2026
16a985f
Update golden values nightly (#4185)
ko3n1g Apr 8, 2026
7fec528
[Dev][MoE] Add a new score function to the router (#4193)
yaox12 Apr 8, 2026
0b121cf
[Dev] feat(fsdp): use TE general_gemm for mixed-precision wgrad in FS…
Victarry Apr 8, 2026
06ecac3
ci: update dev nightly golden values (#4201)
ko3n1g Apr 8, 2026
3beeaa6
ci: Update gb200 golden value test after merge from main to dev (#4216)
chtruong814 Apr 9, 2026
5c54484
Enabled fused grouped MLP for `quick_gelu` and add config for grouped…
ksivaman Apr 9, 2026
0410104
[Dev] Update golden values for inference (#4246)
yaox12 Apr 10, 2026
7226640
[dev] fix: Support mHC with cuda graph and activation offloading (#4190)
jingqiny-99 Apr 10, 2026
7960a31
Add training code to MCore wheel (#3573) (#4255)
maanug-nv Apr 10, 2026
0f6fcb0
[dev] fix(ssm): handle alignment padding in GDN packed seq + CP (#4230)
yxs Apr 13, 2026
c0c4fdc
[Dev] Paged Stashing (#2690)
nanz-nv Apr 13, 2026
1fc9200
[Dev] Fix TE version check for retain_pinned_cpu_buffers in cpu offlo…
BestJuly Apr 13, 2026
7ff046b
[Dev] Add diagnostic warnings to TEGroupedMLP fused impl checks (#4269)
Victarry Apr 14, 2026
66e55f9
Merge remote-tracking branch 'origin/main' into main2dev/14_04_2026
svcnvidia-nemo-ci Apr 14, 2026
11d78a1
chore: nightly sync main into dev (14_04_2026)
svcnvidia-nemo-ci Apr 14, 2026
1d7f3a9
fix: take main's pyproject.toml and uv.lock for lock-file consistency
svcnvidia-nemo-ci Apr 14, 2026
74ec0f9
fix: re-run black formatting on 4 files missed in initial commit
svcnvidia-nemo-ci Apr 14, 2026
913ca3d
fix: add missing docstrings to MegatronGradScaler abstract methods
svcnvidia-nemo-ci Apr 14, 2026
861c840
fix: revert to dev's pyproject.toml, uv.lock, and Dockerfile.ci.dev
svcnvidia-nemo-ci Apr 14, 2026
4012662
fix: use main's pyproject.toml, uv.lock, and Dockerfile.ci.dev
svcnvidia-nemo-ci Apr 14, 2026
e763da8
fix: restore RMSNorm import order in legacy model __init__
svcnvidia-nemo-ci Apr 14, 2026
9f69681
fix: remove stale sequence_packing parametrize and use dev's TE revision
svcnvidia-nemo-ci Apr 14, 2026
2be925c
fix: restore missing CudaGraphScope import, take dev's gated_delta_ne…
Phlip79 Apr 14, 2026
c203444
fix: add fast-hadamard-transform dependency from dev for DSA test
Phlip79 Apr 14, 2026
e7e7a3e
fix: remove fast-hadamard-transform from no-build-isolation-package t…
Phlip79 Apr 14, 2026
6b8f089
fix: splice fast-hadamard-transform into pyproject.toml and uv.lock
Phlip79 Apr 14, 2026
bd24b5c
fix: disambiguate packaging version in uv.lock for fast-hadamard-tran…
Phlip79 Apr 14, 2026
98b2076
fix: take dev's uv.lock — main's lockfile is missing dev-only depende…
Phlip79 Apr 14, 2026
0ba88b5
fix: take dev's pyproject.toml and uv.lock together — they must be co…
Phlip79 Apr 14, 2026
7c27108
revert: restore pyproject.toml, uv.lock, Dockerfile.ci.dev to last kn…
Phlip79 Apr 14, 2026
22e70ca
fix: add fast-hadamard-transform and regenerate uv.lock in CUDA conta…
Phlip79 Apr 14, 2026
09a76f8
fix: take dev's Dockerfile.ci.dev — it must match dev's pyproject.tom…
Phlip79 Apr 14, 2026
e9f1020
fix: take dev's pyproject.toml, uv.lock, and Dockerfile.ci.dev
Phlip79 Apr 14, 2026
9e33263
fix: add nvidia-resiliency-ext git source from main to fix ImportError
Phlip79 Apr 14, 2026
9a7c5dd
fix mfsdp unwrap stuck at MegatronFSDP [dev] (#4273)
wplf Apr 15, 2026
76371d4
Fix UT timeout (#4310)
kunlunl Apr 15, 2026
817b2c4
fix: restore dev's GroupedQuantizedTensor handling in distrib_optimizer
Phlip79 Apr 15, 2026
e56a6c0
fix: remove double-remove in fine_grained_activation_offload bulk_off…
Phlip79 Apr 15, 2026
2a68a9c
fix: resolve fine-grained offloading API mismatches from merge
Phlip79 Apr 15, 2026
01b70a1
[dev] fix(ssm): handle alignment padding in GDN packed seq + CP (#4230)
yxs Apr 13, 2026
a2e673f
chore: nightly sync main into dev (14_04_2026) (#4291)
ko3n1g Apr 16, 2026
6f0795c
ci(action): improve GitHub Actions output UX (#4336)
ko3n1g Apr 16, 2026
c2c7f0f
fix(dev): correct params->parameters typo in ChainedOptimizer.step() …
Phlip79 Apr 17, 2026
f145c98
[Dev] Revert code owner changes from pull main (#4354)
yaox12 Apr 17, 2026
ac6ca5b
build: bump TransformerEngine to release_v2.14 (dev) (#4332)
ko3n1g Apr 17, 2026
f2a40ef
[Dev] Add permute/unpermute fusion with dispatch/combine in Hybrid-EP…
Autumn1998 Apr 17, 2026
bd698d1
Fix fused grouped MLP wgrad hooks for DDP reduce-scatter (#4311)
gdengk Apr 17, 2026
73cc2ce
Fix activation_func check and MLP sharded_state_dict (#4325)
gdengk Apr 17, 2026
6efb083
Allow fine-grained offloading with MC impl of full-CG. (#4253)
rapatel Apr 17, 2026
be3b874
Add TEFusedDenseMLP for Dense+Grouped GEMM fusion on SM100+ (#4318)
sraman-rgb Apr 20, 2026
1b47bc0
[Dev] Fix docs build from main sync (#4356)
Victarry Apr 20, 2026
55df4e5
[Dev] overload factor logging (#4110)
nanz-nv Apr 20, 2026
13557a2
Revert "fix mfsdp unwrap stuck at MegatronFSDP [dev] (#4273)" (#4393)
wplf Apr 21, 2026
546a448
M4 leftover for TE cuda graph (dev) (#4369)
Phlip79 Apr 21, 2026
85bced0
[Dev] Add high-priority a2a comm stream option and hybridep preproces…
gdengk Apr 22, 2026
57005c8
Merge remote-tracking branch 'origin/main' into main2dev/22_04_2026
github-actions[bot] Apr 22, 2026
8add4e4
chore: post-merge fixes for nightly sync main into dev (22_04_2026)
github-actions[bot] Apr 22, 2026
baa3df4
fix: revert nvidia-resiliency-ext revision to match uv.lock
github-actions[bot] Apr 22, 2026
bbb06e2
fix: reformat 4 files with correct black==24.4.2 and isort==5.13.2
github-actions[bot] Apr 22, 2026
89798f3
fix: restore missing ArgumentGroupFactory import in arguments.py
svcnvidia-nemo-ci Apr 22, 2026
db5ade5
Reorder mtp_post_process after attn backward in 1F1B schedule plan (#…
gdengk Apr 23, 2026
8cf5458
chore: keep CODEOWNERS unchanged in main→dev sync
Phlip79 Apr 23, 2026
fba3a80
chore: update gpt3_mcore_te_tp2_pp2_mhc golden values for main→dev sync
Phlip79 Apr 23, 2026
78858b2
[Dev] Fix mis-set decoupled gradient for Megatron-FSDP. (#4426)
cspades Apr 25, 2026
64d2e0a
chore: nightly sync main into dev (22_04_2026) (#4436)
ko3n1g Apr 28, 2026
8821e6f
fix(ci): re-enable scoped_cudagraph MoE test with TE 2.14 golden valu…
buptzyb Apr 29, 2026
66a2ff8
[Dev] remove dead manual_release_grads code path in 1F1B overlap sche…
Wohox Apr 29, 2026
fe729e9
[dev] [DeepSeek-v4] Part 2: Hash MoE and SwiGLU clamp (#4481)
hxbai Apr 30, 2026
bf4e1db
[dev] [DeepSeek-v4] Part 1: Hybrid Attention with CSA and HCA (#4458)
hxbai Apr 30, 2026
a2d7153
feat(attention): Add rotary_base_per_layer for Step-3.5-Flash (#4473)
shifangx May 3, 2026
03df3a8
MTP support with mHC; new mHC contract
hxbai Apr 29, 2026
da85042
minor fix
hxbai Apr 29, 2026
42d79c0
fix new contract dtype
hxbai Apr 30, 2026
fb32fe2
format and add tests
hxbai Apr 30, 2026
d9d5eaa
fix mscale
yuzhongw-nvidia May 1, 2026
2f9160d
Add hadamard_transform fallback and fix mask dtype for aarch64 container
weijiac0619 May 7, 2026
0a55633
hc_head fix
weijiac0619 May 13, 2026
64 changes: 5 additions & 59 deletions .github/CODEOWNERS
@@ -1,67 +1,13 @@
megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt

megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal

megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba

megatron/core/models/hybrid/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-model

megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets

megatron/core/tokenizers/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/tokenizers

megatron/core/distributed/fsdp/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp

megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp

megatron/core/dist_checkpointing/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-checkpointing

megatron/core/optimizer/distrib_optimizer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-optimizer

megatron/core/inference/modelopt_support @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/quantization-and-inference

megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets

megatron/core/pipeline_parallel/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/pipeline-parallelism

megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/transformer

megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/mixture-of-experts-adlr @NVIDIA/mixture-of-experts-devtech

megatron/core/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference

megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training

megatron/post_training/ @NVIDIA/post-training

megatron/core/transformer/cuda_graphs.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/cuda-graphs

megatron/training/ @NVIDIA/training-adlr @NVIDIA/training-nemo
megatron/training/arguments.py
* @NVIDIA/core-nemo @NVIDIA/core-devtech

.gitlab/ @NVIDIA/ci
.github/ @NVIDIA/ci
.github/oncall_schedule.json @NVIDIA/mcore-oncall-rotation
.gitlab-ci.yml @NVIDIA/ci
docker/ @NVIDIA/ci
tests/unit_tests/run_ci_test.sh @NVIDIA/ci
tests/test_utils/python_scripts/
tests/functional_tests/python_test_utils/ @NVIDIA/ci
tests/functional_tests/shell_test_utils/ @NVIDIA/ci
tests/test_utils/recipes/ @NVIDIA/ci
tests/unit_tests/run_ci_test.sh @NVIDIA/ci

# API Backwards Compatibility Check
scripts/check_api_backwards_compatibility.py @NVIDIA/ci
scripts/README_API_COMPAT.md @NVIDIA/ci
.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci
docs/api-backwards-compatibility-check.md @NVIDIA/ci
tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci

megatron/rl/ @NVIDIA/reinforcement-learning
examples/rl/ @NVIDIA/reinforcement-learning
test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
train_rl.py @NVIDIA/reinforcement-learning
pyproject.toml @NVIDIA/ci
uv.lock @NVIDIA/ci
43 changes: 11 additions & 32 deletions .github/scripts/sync_team_usergroups.py
@@ -19,12 +19,12 @@
 Slack user groups to match.
 """
 
+import argparse
 import os
 import re
 import sys
-import argparse
-import requests
 
+import requests
 from slack_sdk import WebClient
 from slack_sdk.errors import SlackApiError
 
@@ -53,10 +53,7 @@ def get_headers():
         print("Error: GH_TOKEN or GITHUB_TOKEN not set")
         sys.exit(1)
 
-    return {
-        "Authorization": f"token {token}",
-        "Accept": "application/vnd.github.v3+json",
-    }
+    return {"Authorization": f"token {token}", "Accept": "application/vnd.github.v3+json"}
 
 
 def get_org():
@@ -215,9 +212,7 @@ def get_user_email(username):
 
         # Check Signed-off-by lines in the commit message for @nvidia.com emails
         message = commit_data.get('message', '')
-        sob_matches = re.findall(
-            r'Signed-off-by:.*<([^>]+@nvidia\.com)>', message
-        )
+        sob_matches = re.findall(r'Signed-off-by:.*<([^>]+@nvidia\.com)>', message)
         if sob_matches:
             _email_cache[username] = sob_matches[0]
             print(f"Found @nvidia.com email for {username} from Signed-off-by")
@@ -339,21 +334,14 @@ def create_slack_usergroup(slack_client, handle, team_slug):
 
     try:
         print(f"Creating Slack usergroup '@{handle}' with name '{name}'...")
-        response = slack_client.usergroups_create(
-            name=name,
-            handle=handle,
-            description=description,
-        )
+        response = slack_client.usergroups_create(name=name, handle=handle, description=description)
         usergroup = response.get("usergroup", {})
         usergroup_id = usergroup.get("id")
 
         if usergroup_id:
             # Update cache with new usergroup
             if _usergroups_cache is not None:
-                _usergroups_cache[handle] = {
-                    "id": usergroup_id,
-                    "users": [],
-                }
+                _usergroups_cache[handle] = {"id": usergroup_id, "users": []}
             print(f"Successfully created Slack usergroup '@{handle}'")
             return usergroup_id
         else:
@@ -446,9 +434,7 @@ def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False):
 
     # 5. Update the usergroup
     try:
-        slack_client.usergroups_users_update(
-            usergroup=usergroup_id, users=slack_user_ids
-        )
+        slack_client.usergroups_users_update(usergroup=usergroup_id, users=slack_user_ids)
         print(f"\nSuccessfully updated '@{usergroup_handle}' with {len(slack_user_ids)} members")
         return True
     except SlackApiError as e:
@@ -530,18 +516,12 @@ def sync_all_teams(dry_run=False, parent_teams=None, direct_teams=None):
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="Sync GitHub team membership to Slack user groups"
-    )
+    parser = argparse.ArgumentParser(description="Sync GitHub team membership to Slack user groups")
     parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Show what would be done without making changes",
+        "--dry-run", action="store_true", help="Show what would be done without making changes"
     )
     parser.add_argument(
-        "--list",
-        action="store_true",
-        help="List all configured team-to-usergroup mappings",
+        "--list", action="store_true", help="List all configured team-to-usergroup mappings"
     )
     parser.add_argument(
         "--parent-team",
@@ -559,8 +539,7 @@
         dest="direct_teams",
         metavar="SLUG",
         help=(
-            "Sync this GitHub team directly (can be repeated). "
-            f"Defaults to: {DIRECT_TEAM_SLUGS}"
+            "Sync this GitHub team directly (can be repeated). " f"Defaults to: {DIRECT_TEAM_SLUGS}"
         ),
     )
 
4 changes: 2 additions & 2 deletions .github/workflows/cicd-main.yml
@@ -78,8 +78,8 @@ jobs:
           IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
           SCHEDULED_JOB: ${{ github.event_name == 'schedule' }}
         run: |
-          # Skip SSO check for scheduled jobs, main branch, or merge groups
-          if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then
+          # Skip SSO check for scheduled jobs, main branch, dev branch, or merge groups
+          if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_DEV_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then
             echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
             exit 0
           fi
129 changes: 129 additions & 0 deletions .github/workflows/mirror-to-main.yml
@@ -0,0 +1,129 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Mirror Dev to Main

on:
  push:
    branches:
      - "pull-request/[0-9]+"

jobs:
  cherry-pick-to-main:
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          token: ${{ secrets.PAT }}

      - name: Get PR info
        id: get-pr-info
        uses: nv-gha-runners/get-pr-info@main

      - name: Configure Git
        run: |
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git config --global user.name "GitHub Actions Bot"

      - name: Cherry-pick to main
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          set -x

          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}"
          HAS_MIRROR_MAIN_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "mirror-to-main")' || echo "false")
          TARGET_BRANCH="cherry-pick-$PR_NUMBER-into-main"

          # Skip if not labeled with mirror-to-main
          if [ "$HAS_MIRROR_MAIN_LABEL" != "true" ]; then
            echo "PR is not labeled with mirror-to-main, will not mirror to main."
            exit 0
          fi

          # Skip if not targeting dev
          if [ "$BASE_REF" != "dev" ]; then
            echo "PR is not targeting dev, will not mirror to main."
            exit 0
          fi

          # Check if target branch already exists
          if git ls-remote --heads origin "refs/heads/$TARGET_BRANCH" | grep -q .; then
            echo "Target branch already exists, will not cherry-pick again."
            exit 0
          fi

          # Get PR details
          PR_AUTHOR="${{ fromJSON(steps.get-pr-info.outputs.pr-info).user.login }}"
          PR_TITLE="${{ fromJSON(steps.get-pr-info.outputs.pr-info).title }}"
          SOURCE_BRANCH="${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.ref }}"
          SOURCE_REPO="${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.repo.full_name }}"

          # Fetch all branches
          git fetch origin dev

          # Handle forks vs same repo
          if [ "$SOURCE_REPO" = "${{ github.repository }}" ]; then
            git fetch origin "$SOURCE_BRANCH"
            git checkout "$SOURCE_BRANCH"
          else
            git fetch "https://github.com/$SOURCE_REPO.git" "$SOURCE_BRANCH"
            git checkout FETCH_HEAD
          fi

          # Find commit range to cherry-pick
          START_COMMIT=$(git merge-base origin/dev HEAD)
          END_COMMIT=$(git rev-parse HEAD)

          # Create cherry-pick branch from main
          git fetch origin main
          git checkout main
          git checkout -b "$TARGET_BRANCH"

          # Cherry-pick commits
          if ! git cherry-pick "$START_COMMIT..$END_COMMIT"; then
            # Comment on the original PR about the failure
            COMMENT_BODY=$(cat <<'EOF'
          ❌ **Cherry-pick to main failed**

          The cherry-pick encountered conflicts and could not be completed automatically.

          **Next steps:**
          1. Manually create a PR with these changes to main
          2. Resolve any conflicts
          EOF
          )

            gh pr comment $PR_NUMBER --body "$COMMENT_BODY"
            exit 1
          fi

          # Push branch
          git push -u origin "$TARGET_BRANCH"

          # Create PR to main
          gh pr create \
            --base main \
            --head "$TARGET_BRANCH" \
            --title "cp: \`$PR_TITLE ($PR_NUMBER)\` into \`main\`" \
            --body "[🤖]: Hi @$PR_AUTHOR 👋<br><br>We've cherry-picked \`$PR_TITLE (#$PR_NUMBER)\` into \`main\` for you! 🚀<br><br>Please review and approve this cherry-pick at your convenience!" \
            --label "cherry-pick" \
            --reviewer "$PR_AUTHOR"
74 changes: 0 additions & 74 deletions .github/workflows/multi-approval-bot.yml

This file was deleted.
