From 8073a0ab453811d92d5aa0a442701c746740f83e Mon Sep 17 00:00:00 2001
From: hjh <hujinghan.hjh@alibaba-inc.com>
Date: Fri, 15 May 2026 16:04:33 +0800
Subject: [PATCH 1/2] fix steps_per_generation in GRPO chord and gspo examples

---
 examples/train/grpo/internal/chord.sh | 1 -
 examples/train/grpo/internal/gspo.sh  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/train/grpo/internal/chord.sh b/examples/train/grpo/internal/chord.sh
index cad12e0947..aac7fb69f8 100644
--- a/examples/train/grpo/internal/chord.sh
+++ b/examples/train/grpo/internal/chord.sh
@@ -19,7 +19,6 @@ swift rlhf \
     --load_from_cache_file true \
     --torch_dtype bfloat16 \
     --beta 0.0 \
-    --steps_per_generation 4 \
     --num_train_epochs 1 \
     --per_device_train_batch_size 4 \
     --gradient_accumulation_steps 8 \
diff --git a/examples/train/grpo/internal/gspo.sh b/examples/train/grpo/internal/gspo.sh
index e8a51bd955..1a6ba776e5 100644
--- a/examples/train/grpo/internal/gspo.sh
+++ b/examples/train/grpo/internal/gspo.sh
@@ -17,7 +17,7 @@ swift rlhf \
     --beta 0.0 \
     --epsilon 3e-4 \
     --epsilon_high 4e-4 \
-    --steps_per_generation 4 \
+    --steps_per_generation 32 \
     --importance_sampling_level sequence \
     --num_train_epochs 1 \
     --per_device_train_batch_size 2 \

From 5c2374e7a58caafc8059d84ac2eca43e49ca3aa2 Mon Sep 17 00:00:00 2001
From: hjh <hujinghan.hjh@alibaba-inc.com>
Date: Fri, 15 May 2026 16:12:45 +0800
Subject: [PATCH 2/2] update GSPO docs and example comment for steps_per_generation

---
 docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md    | 3 ++-
 docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md | 3 ++-
 examples/train/grpo/internal/gspo.sh                     | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md b/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md
index 17c26b9af9..3ea5b99cfe 100644
--- a/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md
+++ b/docs/source/Instruction/GRPO/AdvancedResearch/GSPO.md
@@ -56,7 +56,8 @@ importance_weights = torch.exp(log_importance_weights)
 ```bash
     --epsilon 3e-4 # from paper section 5.1
     --epsilon_high 4e-4 # from paper section 5.1
-    --steps_per_generation 4 # from paper section 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates)
+    --gradient_accumulation_steps 8
+    --steps_per_generation 32 # = 4 * gradient_accumulation_steps: paper section 5.1 partitions each batch of rollout data into four minibatches for gradient updates, and steps_per_generation counts micro-batches
     --beta 0 # zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306
 ```
 
diff --git a/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md b/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md
index e79ca791a5..f2e6e03fc5 100644
--- a/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md
+++ b/docs/source_en/Instruction/GRPO/AdvancedResearch/GSPO.md
@@ -58,7 +58,8 @@ Other hyperparameters in the paper
 ```bash
     --epsilon 3e-4 # from paper section 5.1
     --epsilon_high 4e-4 # from paper section 5.1
-    --steps_per_generation 4 # from paper section 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates)
+    --gradient_accumulation_steps 8
+    --steps_per_generation 32 # = 4 * gradient_accumulation_steps: paper section 5.1 partitions each batch of rollout data into four minibatches for gradient updates, and steps_per_generation counts micro-batches
     --beta 0 # zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306
 ```
 
diff --git a/examples/train/grpo/internal/gspo.sh b/examples/train/grpo/internal/gspo.sh
index 1a6ba776e5..e5b9ed4ff3 100644
--- a/examples/train/grpo/internal/gspo.sh
+++ b/examples/train/grpo/internal/gspo.sh
@@ -3,7 +3,7 @@
 # hyperparameter
 # - epsilon = 3e-4 from paper serction 5.1
 # - epsilon_high = 4e-4 from paper serction 5.1
-# - steps_per_generation = 4 from paper serction 5.1 (each batch of rollout data is partitioned into four minibatches for gradient updates)
+# - steps_per_generation = 32 (= 4 * gradient_accumulation_steps): paper section 5.1 partitions each batch of rollout data into four minibatches for gradient updates; in swift HF GRPO, steps_per_generation counts micro-batches, so it must be multiplied by gradient_accumulation_steps
 # - beta = 0: zero kl regularization https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306
 
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \