#!/bin/bash
# Launch DisCO training (recipe.disco.main_disco) for
# DeepSeek-R1-Distill-Qwen-1.5B on a single node with 4x A100-80GB GPUs.
# Extra Hydra overrides may be passed as script arguments; they are forwarded
# to the trainer via "$@" below.
set -x

# For better performance, it's recommended to have:
#   (ppo_micro_batch_size_per_gpu * nnodes * n_gpus_per_node) % rollout.n = 0
# Settings below target training on 4x A100-80GB GPUs.
nnodes=1
n_gpus_per_node=4
ppo_micro_batch_size_per_gpu=4
rollout_n=8

loss_mode='disco'
# Score function selection for disco.
score_func='logL'   # Options: 'logL', 'Lratio'
tau=10              # tau=10 is recommended for 'logL', tau=1 for 'Lratio'

MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Train over a single node, 4 A100-80GB GPUs.
python3 -m recipe.disco.main_disco \
    algorithm.adv_estimator=disco \
    algorithm.filter_groups.enable=False \
    data.train_files=./recipe/disco/data/deepscaler_preview.parquet \
    data.val_files=./recipe/disco/data/aime24.parquet \
    data.train_batch_size=128 \
    data.val_batch_size=512 \
    data.max_prompt_length=1024 \
    data.max_response_length=8192 \
    data.filter_overlong_prompts=True \
    actor_rollout_ref.model.path="$MODEL_PATH" \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu="$ppo_micro_batch_size_per_gpu" \
    actor_rollout_ref.actor.use_dynamic_bsz=False \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=36864 \
    actor_rollout_ref.actor.ppo_epochs=1 \
    +actor_rollout_ref.ref.enable=False \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.policy_loss.loss_mode="$loss_mode" \
    actor_rollout_ref.actor.policy_loss.score_func="$score_func" \
    actor_rollout_ref.actor.policy_loss.delta=1e-4 \
    actor_rollout_ref.actor.policy_loss.beta=1e3 \
    actor_rollout_ref.actor.policy_loss.tau="$tau" \
    actor_rollout_ref.actor.entropy_coeff=0.0 \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu="$ppo_micro_batch_size_per_gpu" \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.temperature=0.6 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
    actor_rollout_ref.rollout.n="$rollout_n" \
    actor_rollout_ref.rollout.val_kwargs.temperature=0.6 \
    actor_rollout_ref.rollout.val_kwargs.top_p=0.95 \
    actor_rollout_ref.rollout.val_kwargs.top_k=-1 \
    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
    actor_rollout_ref.rollout.val_kwargs.n=16 \
    actor_rollout_ref.rollout.max_num_batched_tokens=10240 \
    actor_rollout_ref.rollout.max_num_seqs=1024 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    trainer.critic_warmup=0 \
    "trainer.logger=['console','wandb']" \
    trainer.project_name='verl-disco' \
    trainer.experiment_name='1.5B-disco-logL' \
    trainer.balance_batch=False \
    trainer.val_before_train=True \
    trainer.n_gpus_per_node="$n_gpus_per_node" \
    trainer.nnodes="$nnodes" \
    trainer.save_freq=20 \
    trainer.test_freq=20 \
    trainer.default_hdfs_dir=null \
    trainer.total_epochs=30 "$@" \
    trainer.resume_mode=auto