modelscope · hjh0119 · May 15, 2026 · May 15, 2026
diff --git a/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md b/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md
@@ -56,7 +56,8 @@ importance_weights = torch.exp(log_importance_weights)
 ```bash
     --epsilon 3e-4 # from paper section 5.1
     --epsilon_high 4e-4 # from paper section 5.1
-    --steps_per_generation 4 # from paper section 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates)
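+    --gradient_accumulation_steps 8
+    --steps_per_generation 32 # = 4 * gradient_accumulation_steps: paper section 5.1 partitions each batch of rollout data into four minibatches for gradient updates; steps_per_generation counts micro-batches, so the paper's 4 is multiplied by gradient_accumulation_steps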
     --beta 0 # zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306
 ```
 

diff --git a/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md b/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md
@@ -58,7 +58,8 @@ Other hyperparameters in the paper
 ```bash
     --epsilon 3e-4 # from paper section 5.1
     --epsilon_high 4e-4 # from paper section 5.1
-    --steps_per_generation 4 # from paper section 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates)
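+    --gradient_accumulation_steps 8
+    --steps_per_generation 32 # = 4 * gradient_accumulation_steps: paper section 5.1 partitions each batch of rollout data into four minibatches for gradient updates; steps_per_generation counts micro-batches, so the paper's 4 is multiplied by gradient_accumulation_steps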
     --beta 0 # zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306
 ```
 

diff --git a/examples/train/grpo/internal/chord.sh b/examples/train/grpo/internal/chord.sh
@@ -19,7 +19,6 @@ swift rlhf \
     --load_from_cache_file true \
     --torch_dtype bfloat16 \
     --beta 0.0 \
-    --steps_per_generation 4 \
     --num_train_epochs 1 \
     --per_device_train_batch_size 4 \
     --gradient_accumulation_steps 8 \

diff --git a/examples/train/grpo/internal/gspo.sh b/examples/train/grpo/internal/gspo.sh
@@ -3,7 +3,7 @@
 # hyperparameter
 # - epsilon = 3e-4 from paper serction 5.1
 # - epsilon_high = 4e-4 from paper serction 5.1
-# - steps_per_generation = 4 from paper serction 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates)
+# - steps_per_generation = 32 (= 4 * gradient_accumulation_steps): paper section 5.1 partitions each batch of rollout data into four minibatches for gradient updates; in swift HF GRPO, steps_per_generation counts micro-batches, so it must be multiplied by gradient_accumulation_steps
 # - beta = 0: zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306
 
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
@@ -17,7 +17,7 @@ swift rlhf \
     --beta 0.0 \
     --epsilon 3e-4 \
     --epsilon_high 4e-4 \
-    --steps_per_generation 4 \
+    --steps_per_generation 32 \
     --importance_sampling_level sequence \
     --num_train_epochs 1 \
     --per_device_train_batch_size 2 \