NVIDIA-NeMo
diff --git a/examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-cispo.yaml b/examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-cispo.yaml
Lines changed: 0 additions & 87 deletions
diff --git a/examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-grpo.yaml b/examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-grpo.yaml
Lines changed: 0 additions & 77 deletions
diff --git a/examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-cispo.yaml b/examples/configs/recipes/llm/cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-cispo.yaml
rename from examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-cispo.yaml
rename to examples/configs/recipes/llm/cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-cispo.yaml
Lines changed: 4 additions & 4 deletions
diff --git a/examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-grpo.yaml b/examples/configs/recipes/llm/cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-grpo.yaml
rename from examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-grpo.yaml
rename to examples/configs/recipes/llm/cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-grpo.yaml
Lines changed: 8 additions & 8 deletions
diff --git a/examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-dapo.yaml b/examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-dapo.yaml
Lines changed: 0 additions & 88 deletions
diff --git a/examples/configs/recipes/llm/cispo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/cispo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml
Lines changed: 0 additions & 59 deletions
@@ -1,4 +1,4 @@
-# MiniMax-M1 replication study (https://arxiv.org/abs/2506.13585), CISPO arm.
+# MiniMax-M1 high-off-policy study (https://arxiv.org/abs/2506.13585), CISPO arm.
 # Minimal-diff copy of workspace-4's proven 2n8g SAPO recipe. Only the
 # loss_fn block differs across arms.
 #
@@ -25,7 +25,7 @@ grpo:
 
 policy:
   model_name: Qwen/Qwen3-30B-A3B
-  train_global_batch_size: 128
+  train_global_batch_size: 32
   train_micro_batch_size: 1
   logprob_batch_size: 1
   max_total_sequence_length: 4096
@@ -82,13 +82,13 @@ checkpointing:
   enabled: false
 
 logger:
-  log_dir: logs/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-cispo
+  log_dir: logs/cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-cispo
   wandb_enabled: true
   tensorboard_enabled: true
   monitor_gpus: false
   wandb:
     project: nemo-rl
-    name: cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-cispo
+    name: cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-cispo
 
 cluster:
   gpus_per_node: 8
 
@@ -1,15 +1,15 @@
-# MiniMax-M1 replication study (https://arxiv.org/abs/2506.13585), GRPO arm.
+# MiniMax-M1 high-off-policy study (https://arxiv.org/abs/2506.13585), GRPO arm.
 # Minimal-diff copy of workspace-4's proven 2n8g SAPO recipe:
 #   grpo-qwen3-30ba3b-2n8g-megatron-sapo-asym.yaml
 # Only the loss_fn block (and logger names) differs.
 #
 # Three-way A/B/C: this is the GRPO baseline; DAPO and CISPO are at
-#   cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-dapo.yaml
-#   cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-cispo.yaml
+#   cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-dapo.yaml  (NOTE(review): no DAPO recipe is renamed in this change; confirm this path exists)
+#   cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-cispo.yaml
 # Submit via cispo_mm1_replica.slurm with ARM=grpo|dapo|cispo.
 #
-# Off-policy regime: 32 x 16 = 512 trajectories, train_global_batch_size=128
-# -> 4 gradient updates per rollout (SAPO/GSPO Sec 5.1 setting). KL beta=0,
+# Off-policy regime: 32 x 16 = 512 trajectories, train_global_batch_size=32
+# -> 16 gradient updates per rollout (4x the SAPO/GSPO Sec 5.1 setting of 4). KL beta=0,
 # token-level loss, sampling temperature 1.0.
 # NOT in the PR; local research artifact.
 defaults: ../../grpo_math_qwen30ba3b_megatron.yaml
@@ -26,7 +26,7 @@ grpo:
 
 policy:
   model_name: Qwen/Qwen3-30B-A3B
-  train_global_batch_size: 128
+  train_global_batch_size: 32
   train_micro_batch_size: 1
   logprob_batch_size: 1
   max_total_sequence_length: 4096
@@ -85,13 +85,13 @@ checkpointing:
   enabled: false
 
 logger:
-  log_dir: logs/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-grpo
+  log_dir: logs/cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-grpo
   wandb_enabled: true
   tensorboard_enabled: true
   monitor_gpus: false
   wandb:
     project: nemo-rl
-    name: cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-grpo
+    name: cispo-mm1-highoffpolicy-qwen3-30ba3b-2n8g-megatron-grpo
 
 cluster:
   gpus_per_node: 8