NVIDIA-NeMo
diff --git a/‎examples/configs/cispo_math_8B.yaml‎
Lines changed: 0 additions & 25 deletions b/‎examples/configs/cispo_math_8B.yaml‎
Lines changed: 0 additions & 25 deletions
diff --git a/‎examples/configs/grpo_math_1B.yaml‎
Lines changed: 6 additions & 0 deletions b/‎examples/configs/grpo_math_1B.yaml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-cispo.yaml‎
Lines changed: 87 additions & 0 deletions b/‎examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-cispo.yaml‎
Lines changed: 87 additions & 0 deletions
diff --git a/‎examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-grpo.yaml‎
Lines changed: 77 additions & 0 deletions b/‎examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-grpo.yaml‎
Lines changed: 77 additions & 0 deletions
diff --git a/‎examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-cispo.yaml‎
Lines changed: 95 additions & 0 deletions b/‎examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-cispo.yaml‎
Lines changed: 95 additions & 0 deletions
diff --git a/‎examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-dapo.yaml‎
Lines changed: 88 additions & 0 deletions b/‎examples/configs/recipes/llm/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-dapo.yaml‎
Lines changed: 88 additions & 0 deletions
@@ -71,6 +71,12 @@ loss_fn:
   token_level_loss: true
   force_on_policy_ratio: false  # Set to true to force ratio=1.0 (requires train_global_batch_size == num_prompts_per_step * num_generations_per_prompt)
   use_kl_in_reward: false  # Reinforce++: add KL penalty to reward instead of loss
+  use_cispo: false  # CISPO (https://arxiv.org/abs/2506.13585): clipped IS-weight policy optimization
+  # Optional CISPO-style diagnostics. Cheap; works on GRPO/DAPO/CISPO arms.
+  # See ClippedPGLossConfig in nemo_rl/algorithms/loss/loss_functions.py.
+  cispo_diagnostics: false
+  cispo_diag_grpo_eps: 0.2          # baseline GRPO eps for would_clip_frac
+  cispo_diag_low_prob_threshold: 0.05  # proxy threshold for rare reflective tokens
 
 checkpointing:
   enabled: true
 
@@ -0,0 +1,87 @@
+# A/B treatment arm: CISPO (Clipped IS-weight Policy Optimization,
+# arXiv:2506.13585) on Qwen2.5-Math-1.5B-Instruct.
+#
+# CISPO replaces GRPO's PPO-style clipped surrogate (ratio x advantage) with a
+# stop-gradient clipped importance weight that scales the log-probability:
+#
+#     L_CISPO = -A_t * sg(clip(r_t, 1 - eps_low, 1 + eps_high)) * log pi(a_t)
+#
+# Pair with:
+#   examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-grpo.yaml
+# Only the loss_fn block and the logger names differ between the two arms.
+#
+# Off-policy regime (where CISPO and the hard PPO clip diverge most):
+#   * 32 prompts x 16 generations = 512 trajectories per step
+#   * train_global_batch_size = 128 -> 4 gradient updates per rollout
+#     (matches the GSPO Sec 5.1 reference setting, arXiv:2507.18071)
+#   * KL beta = 0 (CISPO paper Sec 5.1; kept identical in both arms so the
+#     KL regularizer is not a confounder)
+#
+# NOT in the CISPO PR - this is a local research-validation artifact.
+# (The PR ships the on-policy machinery-smoke recipe at
+#  examples/configs/recipes/llm/cispo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml.)
+defaults: ../../grpo_math_1B.yaml
+
+grpo:
+  max_num_steps: 100
+  val_period: 10
+  val_at_start: true
+  val_at_end: true
+  max_val_samples: 256
+  val_batch_size: 256
+  seed: 42                              # matched-pair: identical RNG to GRPO arm
+
+policy:
+  model_name: Qwen/Qwen2.5-Math-1.5B-Instruct
+  tokenizer:
+    name: Qwen/Qwen2.5-Math-1.5B-Instruct
+  train_global_batch_size: 128         # off-policy: 4 grad updates / rollout
+  train_micro_batch_size: 4
+  logprob_batch_size: 4
+  max_total_sequence_length: 1024
+  dynamic_batching:
+    enabled: true
+  sequence_packing:
+    enabled: false
+  make_sequence_length_divisible_by: 1
+  generation:
+    max_new_tokens: 512
+    vllm_cfg:
+      max_model_len: 1024
+
+data:
+  max_input_seq_length: 512
+
+loss_fn:
+  # CISPO treatment arm. Paper-recommended clip: very loose lower (no
+  # effective lower clip), tighter upper. With nemo-rl's parameterisation
+  # (lower = 1 - ratio_clip_min, upper = 1 + ratio_clip_max):
+  #   ratio_clip_min = 1.0 -> lower bound = 0.0 (ratios are positive, so this
+  #                           is effectively unclipped below)
+  #   ratio_clip_max = 0.8 -> upper bound = 1.8
+  use_cispo: true
+  reference_policy_kl_penalty: 0.0     # matched to the GRPO arm for fairness
+  reference_policy_kl_type: k3
+  ratio_clip_min: 1.0
+  ratio_clip_max: 0.8
+  ratio_clip_c: null                   # dual clipping MUST be off for CISPO
+  token_level_loss: true
+  use_importance_sampling_correction: false
+  sequence_level_importance_ratios: false
+  force_on_policy_ratio: false
+
+checkpointing:
+  enabled: false                       # research run; skip checkpoint I/O
+
+logger:
+  log_dir: logs/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-cispo
+  wandb_enabled: true
+  tensorboard_enabled: true
+  monitor_gpus: true
+  wandb:
+    project: nemo-rl
+    name: cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-cispo
+
+cluster:
+  gpus_per_node: 8
+  num_nodes: 1
@@ -0,0 +1,77 @@
+# A/B baseline arm: vanilla GRPO with the standard hard PPO clip.
+#
+# This recipe is the *control* arm in a back-to-back A/B comparison meant to
+# isolate the effect of swapping the hard PPO clip for CISPO's clipped
+# IS-weight surrogate (arXiv:2506.13585). Pair with:
+#   examples/configs/recipes/llm/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-cispo.yaml
+# Only the loss_fn block and the logger names differ between the two arms.
+#
+# Off-policy regime (where CISPO and the hard PPO clip diverge most):
+#   * 32 prompts x 16 generations = 512 trajectories per step
+#   * train_global_batch_size = 128 -> 4 gradient updates per rollout
+#     (matches the GSPO Sec 5.1 reference setting, arXiv:2507.18071)
+#   * KL beta = 0 (CISPO paper Sec 5.1; kept identical in both arms so the
+#     KL regularizer is not a confounder)
+#   * token-level loss, sampling temperature inherited from base
+#
+# NOT in the CISPO PR - this is a local research-validation artifact.
+defaults: ../../grpo_math_1B.yaml
+
+grpo:
+  max_num_steps: 100
+  val_period: 10
+  val_at_start: true
+  val_at_end: true
+  max_val_samples: 256
+  val_batch_size: 256
+  seed: 42
+
+policy:
+  model_name: Qwen/Qwen2.5-Math-1.5B-Instruct
+  tokenizer:
+    name: Qwen/Qwen2.5-Math-1.5B-Instruct
+  train_global_batch_size: 128         # off-policy: 4 grad updates / rollout
+  train_micro_batch_size: 4
+  logprob_batch_size: 4
+  max_total_sequence_length: 1024
+  dynamic_batching:
+    enabled: true
+  sequence_packing:
+    enabled: false
+  make_sequence_length_divisible_by: 1
+  generation:
+    max_new_tokens: 512
+    vllm_cfg:
+      max_model_len: 1024
+
+data:
+  max_input_seq_length: 512
+
+loss_fn:
+  # GRPO control arm: standard hard PPO clip at +/- 0.2.
+  use_cispo: false
+  reference_policy_kl_penalty: 0.0     # matched to the CISPO arm for fairness
+  reference_policy_kl_type: k3
+  ratio_clip_min: 0.2                  # PPO clip lower bound = 1 - 0.2 = 0.8
+  ratio_clip_max: 0.2                  # PPO clip upper bound = 1 + 0.2 = 1.2
+  ratio_clip_c: null
+  token_level_loss: true
+  use_importance_sampling_correction: false
+  sequence_level_importance_ratios: false
+  force_on_policy_ratio: false
+
+checkpointing:
+  enabled: false                       # research run; skip checkpoint I/O
+
+logger:
+  log_dir: logs/cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-grpo
+  wandb_enabled: true
+  tensorboard_enabled: true
+  monitor_gpus: true
+  wandb:
+    project: nemo-rl
+    name: cispo-ab-qwen2.5-math-1.5b-instruct-1n8g-grpo
+
+cluster:
+  gpus_per_node: 8
+  num_nodes: 1
@@ -0,0 +1,95 @@
+# MiniMax-M1 replication study (https://arxiv.org/abs/2506.13585), CISPO arm.
+# Minimal-diff copy of workspace-4's proven 2n8g SAPO recipe. Only the
+# loss_fn block and the logger names differ across arms.
+#
+# CISPO (MiniMax-M1 §3.1) clips the IS weight as a stop-gradient
+# coefficient instead of clipping the policy ratio. Gradients flow
+# through log pi for *every* token, including the rare reflective tokens
+# ("However", "Wait", "Recheck") whose gradients GRPO/DAPO clip to zero.
+#
+#   L_CISPO = -A * sg(clip(r, 1 - eps_low, 1 + eps_high)) * log pi(a)
+#
+# Following ms-swift's CISPO recipe and ScaleRL (arXiv:2510.13786), we use
+# eps_low = 1.0 (no effective lower clip) and eps_high = 5.0: r in [0, 6].
+defaults: ../../grpo_math_qwen30ba3b_megatron.yaml
+
+grpo:
+  num_prompts_per_step: 32
+  num_generations_per_prompt: 16
+  max_num_steps: 200
+  val_period: 20
+  val_at_start: true
+  val_at_end: true
+  max_val_samples: 128
+  val_batch_size: 128
+
+policy:
+  model_name: Qwen/Qwen3-30B-A3B
+  train_global_batch_size: 128
+  train_micro_batch_size: 1
+  logprob_batch_size: 1
+  max_total_sequence_length: 4096
+  sequence_packing:
+    enabled: true
+    algorithm: modified_first_fit_decreasing
+    sequence_length_round: 64
+  megatron_cfg:
+    enabled: true
+    converter_type: LlamaForCausalLM
+    tensor_model_parallel_size: 2
+    pipeline_model_parallel_size: 1
+    expert_model_parallel_size: 8
+    sequence_parallel: true
+    empty_unused_memory_level: 1
+    freeze_moe_router: true
+    moe_router_dtype: fp64
+    moe_router_load_balancing_type: none
+    moe_router_bias_update_rate: 0.0
+    optimizer:
+      lr: 3.0e-7
+      min_lr: 3.0e-8
+    scheduler:
+      lr_decay_iters: 500
+      lr_warmup_iters: 10
+      lr_warmup_init: 3.0e-8
+    env_vars:
+      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:False
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 4
+      gpu_memory_utilization: 0.7
+      enforce_eager: false
+    colocated:
+      enabled: true
+
+loss_fn:
+  # ---- CISPO arm ----
+  reference_policy_kl_penalty: 0.0
+  reference_policy_kl_type: k3
+  ratio_clip_min: 1.0                  # lower bound = 0; effectively unclipped
+  ratio_clip_max: 5.0                  # eps_high = 5.0 (ms-swift / ScaleRL)
+  ratio_clip_c: null                   # dual clipping MUST be off for CISPO
+  token_level_loss: true
+  use_importance_sampling_correction: false
+  sequence_level_importance_ratios: false
+  force_on_policy_ratio: false
+  use_cispo: true
+  cispo_diagnostics: true
+  cispo_diag_grpo_eps: 0.2             # measure GRPO-equivalent clip rate
+  cispo_diag_low_prob_threshold: 0.05
+
+checkpointing:
+  enabled: false
+
+logger:
+  log_dir: logs/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-cispo
+  wandb_enabled: true
+  tensorboard_enabled: true
+  monitor_gpus: false
+  wandb:
+    project: nemo-rl
+    name: cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-cispo
+
+cluster:
+  gpus_per_node: 8
+  num_nodes: 2
@@ -0,0 +1,88 @@
+# MiniMax-M1 replication study (https://arxiv.org/abs/2506.13585), DAPO arm.
+# Minimal-diff copy of workspace-4's proven 2n8g SAPO recipe. Only the
+# loss_fn block and the logger names differ across arms.
+#
+# DAPO ("Clip-Higher", https://arxiv.org/abs/2503.14476): asymmetric clip
+# with a tighter lower bound and a looser upper bound.
+defaults: ../../grpo_math_qwen30ba3b_megatron.yaml
+
+grpo:
+  num_prompts_per_step: 32
+  num_generations_per_prompt: 16
+  max_num_steps: 200
+  val_period: 20
+  val_at_start: true
+  val_at_end: true
+  max_val_samples: 128
+  val_batch_size: 128
+
+policy:
+  model_name: Qwen/Qwen3-30B-A3B
+  train_global_batch_size: 128
+  train_micro_batch_size: 1
+  logprob_batch_size: 1
+  max_total_sequence_length: 4096
+  sequence_packing:
+    enabled: true
+    algorithm: modified_first_fit_decreasing
+    sequence_length_round: 64
+  megatron_cfg:
+    enabled: true
+    converter_type: LlamaForCausalLM
+    tensor_model_parallel_size: 2
+    pipeline_model_parallel_size: 1
+    expert_model_parallel_size: 8
+    sequence_parallel: true
+    empty_unused_memory_level: 1
+    freeze_moe_router: true
+    moe_router_dtype: fp64
+    moe_router_load_balancing_type: none
+    moe_router_bias_update_rate: 0.0
+    optimizer:
+      lr: 3.0e-7
+      min_lr: 3.0e-8
+    scheduler:
+      lr_decay_iters: 500
+      lr_warmup_iters: 10
+      lr_warmup_init: 3.0e-8
+    env_vars:
+      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:False
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 4
+      gpu_memory_utilization: 0.7
+      enforce_eager: false
+    colocated:
+      enabled: true
+
+loss_fn:
+  # ---- DAPO ("Clip-Higher") arm ----
+  reference_policy_kl_penalty: 0.0
+  reference_policy_kl_type: k3
+  ratio_clip_min: 0.2                  # eps_low  - identical to GRPO
+  ratio_clip_max: 0.28                 # eps_high - DAPO "Clip-Higher"
+  ratio_clip_c: null
+  token_level_loss: true
+  use_importance_sampling_correction: false
+  sequence_level_importance_ratios: false
+  force_on_policy_ratio: false
+  use_cispo: false
+  cispo_diagnostics: true
+  cispo_diag_grpo_eps: 0.2
+  cispo_diag_low_prob_threshold: 0.05
+
+checkpointing:
+  enabled: false
+
+logger:
+  log_dir: logs/cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-dapo
+  wandb_enabled: true
+  tensorboard_enabled: true
+  monitor_gpus: false
+  wandb:
+    project: nemo-rl
+    name: cispo-mm1-replica-qwen3-30ba3b-2n8g-megatron-dapo
+
+cluster:
+  gpus_per_node: 8
+  num_nodes: 2