diff --git a/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md b/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md index 17c26b9af9..3ea5b99cfe 100644 --- a/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md +++ b/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md @@ -56,7 +56,8 @@ importance_weights = torch.exp(log_importance_weights) ```bash --epsilon 3e-4 # from paper section 5.1 --epsilon_high 4e-4 # from paper section 5.1 - --steps_per_generation 4 # from paper section 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates) + --gradient_accumulation_steps 8 + --steps_per_generation 32 # = 4 * gradient_accumulation_steps, since steps_per_generation counts micro-batches; from paper section 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates) --beta 0 # zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306 ``` diff --git a/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md b/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md index e79ca791a5..f2e6e03fc5 100644 --- a/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md +++ b/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md @@ -58,7 +58,8 @@ Other hyperparameters in the paper ```bash --epsilon 3e-4 # from paper section 5.1 --epsilon_high 4e-4 # from paper section 5.1 - --steps_per_generation 4 # from paper section 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates) + --gradient_accumulation_steps 8 + --steps_per_generation 32 # = 4 * gradient_accumulation_steps, since steps_per_generation counts micro-batches; from paper section 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates) --beta 0 # zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306 ``` diff --git a/examples/train/grpo/internal/chord.sh b/examples/train/grpo/internal/chord.sh index cad12e0947..aac7fb69f8 100644 --- a/examples/train/grpo/internal/chord.sh +++ b/examples/train/grpo/internal/chord.sh @@ -19,7 +19,6 @@ swift rlhf \ --load_from_cache_file true \ 
--torch_dtype bfloat16 \ --beta 0.0 \ - --steps_per_generation 4 \ --num_train_epochs 1 \ --per_device_train_batch_size 4 \ --gradient_accumulation_steps 8 \ diff --git a/examples/train/grpo/internal/gspo.sh b/examples/train/grpo/internal/gspo.sh index e8a51bd955..e5b9ed4ff3 100644 --- a/examples/train/grpo/internal/gspo.sh +++ b/examples/train/grpo/internal/gspo.sh @@ -3,7 +3,7 @@ # hyperparameter # - epsilon = 3e-4 from paper serction 5.1 # - epsilon_high = 4e-4 from paper serction 5.1 -# - steps_per_generation = 4 from paper serction 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates) +# - steps_per_generation = 32 (= 4 * gradient_accumulation_steps): paper section 5.1 partitions each batch of rollout data into four minibatches for gradient updates; in swift HF GRPO, steps_per_generation counts micro-batches, so it must be multiplied by gradient_accumulation_steps # - beta = 0: zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ @@ -17,7 +17,7 @@ swift rlhf \ --beta 0.0 \ --epsilon 3e-4 \ --epsilon_high 4e-4 \ - --steps_per_generation 4 \ + --steps_per_generation 32 \ --importance_sampling_level sequence \ --num_train_epochs 1 \ --per_device_train_batch_size 2 \