Skip to content

Commit becc4b7

Browse files
committed
Add Moonlight AutoEP config files (AdamW and Muon)
1 parent bc2f020 commit becc4b7

2 files changed

Lines changed: 66 additions & 0 deletions

File tree

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
+{
+  "train_batch_size": 16,
+  "bf16": {
+    "enabled": true
+  },
+  "zero_optimization": {
+    "stage": 2
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": 2e-06,
+      "betas": [0.9, 0.999],
+      "eps": 1e-08,
+      "weight_decay": 0.01
+    }
+  },
+  "expert_parallel": {
+    "enabled": true,
+    "autoep_size": 8,
+    "expert_w1": "gate_proj",
+    "expert_w2": "down_proj",
+    "expert_w3": "up_proj",
+    "route_scale": 2.446,
+    "load_balance_coeff": null
+  },
+  "gradient_accumulation_steps": 2,
+  "gradient_clipping": 1.0,
+  "zero_allow_untested_optimizer": true
+}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
+{
+  "train_batch_size": 16,
+  "bf16": {
+    "enabled": true
+  },
+  "zero_optimization": {
+    "stage": 2
+  },
+  "optimizer": {
+    "type": "Muon",
+    "params": {
+      "muon_lr": 0.002,
+      "adam_lr": 2e-06,
+      "momentum": 0.95,
+      "ns_method": "gram",
+      "betas": [
+        0.9,
+        0.999
+      ],
+      "eps": 1e-08,
+      "weight_decay": 0.01
+    }
+  },
+  "expert_parallel": {
+    "enabled": true,
+    "autoep_size": 8,
+    "expert_w1": "gate_proj",
+    "expert_w2": "down_proj",
+    "expert_w3": "up_proj",
+    "route_scale": 2.446,
+    "load_balance_coeff": null
+  },
+  "gradient_accumulation_steps": 2,
+  "gradient_clipping": 1.0,
+  "zero_allow_untested_optimizer": true
+}

0 commit comments

Comments
 (0)