
Commit 2332d20

feat: Add YaRN rope scaling support on Megatron-Bridge (#2188)

Signed-off-by: ruit <ruit@nvidia.com>

1 parent 5085f14 commit 2332d20

25 files changed

Lines changed: 424 additions & 14 deletions

docs/design-docs/training-backends.md

Lines changed: 13 additions & 1 deletion
````diff
@@ -75,4 +75,16 @@ export HF_HOME="/shared/nfs/huggingface"
 - **Mount in checkpoint directory**: If you are using Docker, make sure the Megatron checkpoint path is covered by `-v`/`--mount`. Similarly, if you are using SLURM+pyxis, ensure `--container-mounts` includes this path.
 - **Use shared storage**: Ensure the checkpoint directory is accessible from all nodes (e.g., NFS, shared filesystem).
 - **Prefer HF_HOME**: If you already have `HF_HOME` mounted across nodes, this reduces the number of environment variables to manage.
-- **Sufficient space**: Ensure adequate disk space for the converted model checkpoints.
+- **Sufficient space**: Ensure adequate disk space for the converted model checkpoints.
+
+### Force Reconvert
+
+By default, NeMo RL skips the HF → Megatron conversion if a converted checkpoint already exists at the target path. If you need to force a fresh conversion (e.g., after updating megatron-bridge or changing `hf_config_overrides`), set the following option in your config:
+
+```yaml
+policy:
+  megatron_cfg:
+    force_reconvert_from_hf: True # Default: False
+```
+
+This is equivalent to deleting the converted checkpoint directory and rerunning: the old checkpoint is overwritten with a freshly converted one.
````
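As a minimal sketch of how this option interacts with the rest of this commit: forcing reconversion matters mainly when `hf_config_overrides` changes (e.g., the YaRN settings added below), since a previously converted checkpoint no longer matches the overridden HF config. The keys are taken from this commit; combining them this way is illustrative, not prescribed by the diff:

```yaml
policy:
  megatron_cfg:
    force_reconvert_from_hf: True  # rerun HF -> Megatron conversion even if a converted checkpoint exists
  hf_config_overrides:             # changing RoPE settings invalidates the old converted checkpoint
    rope_scaling:
      rope_type: yarn
```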

examples/configs/distillation_math.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -87,6 +87,7 @@ policy: &POLICY_BASE
 
   megatron_cfg: &MEGATRON_BASE
     enabled: false
+    force_reconvert_from_hf: False # Set to True to force reconversion of the model from Hugging Face
     empty_unused_memory_level: 0
     activation_checkpointing: false
     converter_type: "Qwen3ForCausalLM"
@@ -196,6 +197,7 @@ policy: &POLICY_BASE
       num_last_layers_in_bf16: 0
       num_first_layers_in_bf16: 0
       distributed_executor_backend: null
+      vllm_kwargs: {}
 
     colocated:
       # true: generation shares training GPUs
```
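The new `vllm_kwargs: {}` key defaults to an empty mapping. Assuming it is forwarded verbatim to the vLLM engine constructor (the diff only adds the empty default, so the forwarding behavior is an assumption), it could carry standard vLLM engine arguments, for example:

```yaml
# Hypothetical usage; max_num_seqs and enforce_eager are real vLLM engine
# arguments, but passing them through this key is illustrative only.
vllm_kwargs:
  max_num_seqs: 64
  enforce_eager: true
```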

examples/configs/distillation_math_megatron.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -145,6 +145,7 @@ policy: &POLICY_BASE
       num_last_layers_in_bf16: 0
       num_first_layers_in_bf16: 0
       distributed_executor_backend: null
+      vllm_kwargs: {}
 
     colocated:
       # true: generation shares training GPUs
```

examples/configs/dpo.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -121,6 +121,7 @@ policy:
     enabled: false
     use_linear_ce_fusion_loss: false
     linear_ce_fusion_chunk_size: 256
+    force_reconvert_from_hf: False # Set to True to force reconversion of the model from Hugging Face
     empty_unused_memory_level: 1
     activation_checkpointing: false
     tensor_model_parallel_size: 2
```

examples/configs/grpo_math_1B.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -128,6 +128,7 @@ policy:
 
   megatron_cfg:
     enabled: false
+    force_reconvert_from_hf: False # Set to True to force reconversion of the model from Hugging Face
     empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
     converter_type: "Qwen2ForCausalLM"
```

examples/configs/grpo_math_1B_megatron.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -78,6 +78,7 @@ policy:
 
   megatron_cfg:
     enabled: true
+    force_reconvert_from_hf: False # Set to True to force reconversion of the model from Hugging Face
     empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
     converter_type: "Qwen2ForCausalLM"
```
Lines changed: 32 additions & 0 deletions
```diff
@@ -0,0 +1,32 @@
+defaults: ../../grpo_math_1B_megatron.yaml
+grpo:
+  max_num_steps: 100
+  num_prompts_per_step: 2
+checkpointing:
+  checkpoint_dir: results/grpo-qwen2.5-1.5B-4n8g-megatron-yarn-256k
+  save_period: 20
+policy:
+  train_global_batch_size: 32
+  train_micro_batch_size: 1
+  max_total_sequence_length: 262144
+  make_sequence_length_divisible_by: 64
+  megatron_cfg:
+    context_parallel_size: 32
+  hf_config_overrides:
+    rope_scaling:
+      rope_type: yarn
+      rope_theta: 1000000
+      factor: ${div:${policy.max_total_sequence_length},${policy.hf_config_overrides.rope_scaling.original_max_position_embeddings}}
+      original_max_position_embeddings: 131072
+      truncate: true
+      beta_fast: 32
+      beta_slow: 1
+      mscale: 1
+      mscale_all_dim: 0
+logger:
+  wandb:
+    project: yarn
+    name: grpo-qwen2.5-1.5B-4n8g-megatron-yarn-256k
+cluster:
+  gpus_per_node: 8
+  num_nodes: 4
```
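For reference, the `${div:...}` interpolation above (assuming `div` is an arithmetic-division resolver registered by NeMo RL) evaluates to `262144 / 131072 = 2.0`, so the computed entry is equivalent to writing:

```yaml
rope_scaling:
  rope_type: yarn
  factor: 2.0  # 262144 / 131072: scales the 128K native context window to 256K
```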
Lines changed: 58 additions & 0 deletions
```diff
@@ -0,0 +1,58 @@
+defaults: ../../sft.yaml
+sft:
+  max_num_steps: 100
+checkpointing:
+  checkpoint_dir: results/sft-qwen3-0.6B-1n8g-megatron-yarn-64k
+  save_period: 20
+policy:
+  model_name: Qwen/Qwen3-0.6B
+  train_global_batch_size: 16
+  max_total_sequence_length: 65536
+  dtensor_cfg:
+    enabled: false
+  megatron_cfg:
+    enabled: true
+    context_parallel_size: 8
+    distributed_data_parallel_config:
+      grad_reduce_in_fp32: true
+    optimizer:
+      lr: 2.0e-05
+      min_lr: 2.0e-05
+      weight_decay: 0.01
+      adam_eps: 1.0e-08
+      clip_grad: 0
+      params_dtype: bfloat16
+      use_precision_aware_optimizer: false
+    scheduler:
+      lr_warmup_iters: 1
+      lr_warmup_init: 1.999999e-05
+  sequence_packing:
+    enabled: true
+  make_sequence_length_divisible_by: 16
+  optimizer: null
+  hf_config_overrides:
+    rope_scaling:
+      rope_type: yarn
+      rope_theta: 1000000
+      factor: 1.6
+      original_max_position_embeddings: 40960
+      truncate: true
+      beta_fast: 32
+      beta_slow: 1
+      mscale: 1
+      mscale_all_dim: 0
+data:
+  add_generation_prompt: true
+  train:
+    dataset_name: Nemotron-Cascade-2-SFT-Math
+    split_validation_size: 0.05
+    max_samples: 100000
+  validation: null
+logger:
+  wandb:
+    project: yarn
+    name: sft-qwen3-0.6B-1n8g-megatron-yarn-64k
+  tensorboard:
+    log_dir: tb_logs-sft-qwen3-0.6B-1n8g-megatron-yarn-64k
+cluster:
+  gpus_per_node: 8
```
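Here the scaling factor is hard-coded rather than interpolated, but it follows the same ratio of target context to native context:

```yaml
# factor = max_total_sequence_length / original_max_position_embeddings
#        = 65536 / 40960
#        = 1.6   (64K target context over Qwen3-0.6B's 40960 native positions)
factor: 1.6
```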

examples/configs/rm.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -77,6 +77,7 @@ policy:
   ## ignored since enabled=false, but needed for testing purposes
   megatron_cfg:
     enabled: false
+    force_reconvert_from_hf: False # Set to True to force reconversion of the model from Hugging Face
     empty_unused_memory_level: 1
     activation_checkpointing: false
     tensor_model_parallel_size: 2
```

examples/configs/sft.yaml

Lines changed: 1 addition & 0 deletions
```diff
@@ -101,6 +101,7 @@ policy:
     enabled: false
     use_linear_ce_fusion_loss: false
     linear_ce_fusion_chunk_size: 256
+    force_reconvert_from_hf: False # Set to True to force reconversion of the model from Hugging Face
     env_vars: {}
     empty_unused_memory_level: 1
     activation_checkpointing: false
```
