PrimeIntellect-ai · samsja · May 24, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 Documenting **breaking** configuration changes — renamed, removed, or moved fields that require users to update existing configs.
 
+- **`sampling.min_tokens`, `sampling.repetition_penalty`, `sampling.seed` removed**: Dropped from both `TrainSamplingConfig` and `EvalSamplingConfig` (group-level `[orchestrator.train.sampling]` / `[orchestrator.eval.sampling]` and per-env `[[orchestrator.train.env.sampling]]` / `[[orchestrator.eval.env.sampling]]`). `min_tokens` suppressed natural EOS, `repetition_penalty` distorts the on-policy sampling distribution, and `seed` wasn't pulling its weight — none belonged on the supported config surface. Existing configs setting any of these must delete the field. Hard-deprecation, no migration window. (2026-05-27)
+- **`wandb.shared` removed**: The deprecation shim that popped `wandb.shared` from input dicts with a `FutureWarning` (introduced in #2649) is gone. The `rl` entrypoint always uses shared W&B mode now, and existing configs that still set `wandb.shared = true` (or `false`) will fail validation. Drop the field from your config. (2026-05-27)
+- **`max_async_level` and `strict_async_level` removed**: The async-execution semantics between trainer and orchestrator are now design invariants, not config knobs. The trainer always runs exactly one step ahead of inference, and the orchestrator always adopts the freshest checkpoint that doesn't violate the one-step barrier. The shared top-level `max_async_level`, the per-sub-config `trainer.max_async_level` / `orchestrator.max_async_level`, and `orchestrator.strict_async_level` have all been removed. Existing configs setting any of these must drop the field; the previous defaults (`max_async_level = 1`, `strict_async_level = false`) match the new hardcoded behavior. Bench mode no longer bypasses the weight-ckpt wait (the `int(1e9)` workaround is gone) and `multimodal/rl_color_codeword_feat_renderer.toml`'s prior `max_async_level = 0` (fully synchronous on-policy) is no longer expressible. (2026-05-25)
+- **`teacher_inference` removed from RL entrypoint**: The `[teacher_inference]` config block and the `deployment.num_teacher_gpus` / `deployment.num_teacher_nodes` fields have been removed. The `rl` entrypoint now only manages the student-policy inference server. External teachers (used by OPD and local-vLLM SFT) must be started manually (e.g. `CUDA_VISIBLE_DEVICES=1 uv run inference --model.name <teacher> --server.port 8001 ...`) and pointed at via `[orchestrator.teacher.client]`. Existing configs using `[teacher_inference]` or `deployment.num_teacher_gpus` / `deployment.num_teacher_nodes` must drop those fields and bring up the teacher out-of-band. (2026-05-25)
 - **`rollouts_per_example` → `group_size`**: The orchestrator-level field, the group-level `[orchestrator.eval]` field, and the per-env `[[orchestrator.eval.env]]` field have all been renamed. The old name still parses as a validation alias (in both TOML and CLI), so existing configs keep working without changes; new configs should prefer `group_size`. (2026-05-22)
 - **`AdvantageInputs` / `AdvantageOutputs` are now per-group, and `AdvantageOutputs.advantages` is a plain `list[float]`** (second breaking change to this API in three weeks). `AdvantageInputs.rollouts` is now `list[vf.RolloutOutput]` (a single group) instead of `list[list[vf.RolloutOutput]]`, and `AdvantageOutputs.advantages` is now `list[float]` instead of a 2D `Float[Tensor, "num_examples rollouts_per_example"]`. `compute_advantages` calls `advantage_fn` once per group, which lets partial-group training (groups smaller than `rollouts_per_example` after rollout errors) round-trip without the previous bucket-by-size workaround. Custom advantage functions must drop the outer list dimension and return a list of floats — e.g. `AdvantageOutputs(advantages=(rewards - rewards.mean(dim=1, keepdim=True)).tolist())` becomes `AdvantageOutputs(advantages=[r - mean for r in rewards])` (or `.tolist()` if you keep torch internally). (2026-05-22)
 - **`[model.vlm]` requires `orchestrator.use_renderer = true`**: VLMs must go through the renderer path; the `vlm_requires_renderer` validator rejects `use_renderer = false` when `[model.vlm]` is set. The renderer owns the HF processor per-slot and ships generic `mm_kwargs` keyed by the model's forward signature. Since `use_renderer` already defaults to `true`, most VLM configs need no change. (2026-05-19)

diff --git a/configs/debug/qwen3_30b_a3b_pd_rlm_swe_router_replay.toml b/configs/debug/qwen3_30b_a3b_pd_rlm_swe_router_replay.toml
@@ -0,0 +1,126 @@
+output_dir = "/beegfs/outputs/qwen3-30b-a3b-pd-rlm-swe-router-replay-r3-delta-5step"
+clean_output_dir = true
+max_steps = 5
+seq_len = 8192
+max_async_level = 1
+
+[log]
+level = "debug"
+
+[model]
+name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+
+[deployment]
+type = "multi_node"
+num_train_nodes = 1
+num_infer_nodes = 2
+gpus_per_node = 8
+
+[slurm]
+job_name = "qwen3-pd-rlm-swe-r3-delta-5"
+partition = "cluster"
+time = "08:00:00"
+exclude = "ltc-idc3-hgx8-h200-[53,88]"
+
+[wandb]
+project = "qwen3-router-replay-e2e"
+name = "qwen3-30b-a3b-pd-rlm-swe-r3-delta-5step"
+group = "router-replay-e2e"
+offline = true
+shared = false
+
+[weight_broadcast]
+type = "nccl"
+timeout = 3600
+
+[trainer]
+enable_router_replay = true
+max_concurrent_runs = 1
+dist_timeout_seconds = 3600
+
+[trainer.model]
+impl = "custom"
+attn = "flash_attention_3"
+optim_cpu_offload = true
+ep = 8
+
+[trainer.model.ac]
+mode = "full"
+freq = 1
+
+[trainer.model.ac_offloading]
+max_inflight_activations = 5
+
+[trainer.model.compile]
+
+[trainer.optim]
+type = "adamw"
+lr = 1e-6
+
+[inference]
+gpu_memory_utilization = 0.9
+enable_return_routed_experts = true
+enable_eplb = false
+
+[inference.parallel]
+tp = 8
+
+[inference.model]
+max_model_len = 8192
+
+[inference.deployment]
+type = "disaggregated"
+num_prefill_nodes = 1
+num_decode_nodes = 1
+
+[orchestrator]
+filters = []
+batch_size = 2
+max_inflight_rollouts = 2
+rollouts_per_example = 1
+max_off_policy_steps = 1
+
+[orchestrator.train.sampling]
+temperature = 1.0
+repetition_penalty = 1.0
+max_completion_tokens = 256
+min_tokens = 0
+
+[orchestrator.client]
+extra_headers_from_state = { "X-Session-ID" = "example_id" }
+
+[[orchestrator.train.env]]
+id = "rlm_swe"
+name = "rlm-swe-low"
+num_workers = 1
+max_retries = 0
+max_total_completion_tokens = 1536
+
+[orchestrator.train.env.args]
+task_type = "swebench"
+dataset_name = "PrimeIntellect/SWE-Bench-Verified-Quick"
+max_turns = 6
+rlm_max_turns = 6
+timeout_seconds = 1800
+poll_interval = 5
+sandbox_cpu_cores = 2
+sandbox_memory_gb = 4
+sandbox_disk_size_gb = 4
+sandbox_client_max_workers = 16
+rlm_ref = "f466fccb6bc682092c88edf2b344951d7cbbd000"
+labels = ["rlm-r3-delta-swe"]
+
+[orchestrator.student.client]
+timeout = 1800
+wait_for_ready_timeout = 1800
+
+[orchestrator.renderer]
+name = "qwen3"
+preserve_all_thinking = true
+
+[orchestrator.eval]
+env = []
+
+[orchestrator.buffer]
+easy_threshold = 1.0
+hard_threshold = 0.0
diff --git a/configs/debug/qwen3_30b_a3b_pd_rlm_swe_router_replay_start_audit.toml b/configs/debug/qwen3_30b_a3b_pd_rlm_swe_router_replay_start_audit.toml
@@ -0,0 +1,126 @@
+output_dir = "/beegfs/outputs/qwen3-30b-a3b-pd-rlm-swe-router-replay-start-audit"
+clean_output_dir = true
+max_steps = 5
+seq_len = 8192
+max_async_level = 1
+
+[log]
+level = "debug"
+
+[model]
+name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+
+[deployment]
+type = "multi_node"
+num_train_nodes = 1
+num_infer_nodes = 2
+gpus_per_node = 8
+
+[slurm]
+job_name = "qwen3-pd-rlm-swe-start-audit"
+partition = "cluster"
+time = "08:00:00"
+exclude = "ltc-idc3-hgx8-h200-[53,88]"
+
+[wandb]
+project = "qwen3-router-replay-e2e"
+name = "qwen3-30b-a3b-pd-rlm-swe-start-audit"
+group = "router-replay-e2e"
+offline = true
+shared = false
+
+[weight_broadcast]
+type = "nccl"
+timeout = 3600
+
+[trainer]
+enable_router_replay = true
+max_concurrent_runs = 1
+dist_timeout_seconds = 3600
+
+[trainer.model]
+impl = "custom"
+attn = "flash_attention_3"
+optim_cpu_offload = true
+ep = 8
+
+[trainer.model.ac]
+mode = "full"
+freq = 1
+
+[trainer.model.ac_offloading]
+max_inflight_activations = 5
+
+[trainer.model.compile]
+
+[trainer.optim]
+type = "adamw"
+lr = 1e-6
+
+[inference]
+gpu_memory_utilization = 0.9
+enable_return_routed_experts = true
+enable_eplb = false
+
+[inference.parallel]
+tp = 8
+
+[inference.model]
+max_model_len = 8192
+
+[inference.deployment]
+type = "disaggregated"
+num_prefill_nodes = 1
+num_decode_nodes = 1
+
+[orchestrator]
+filters = []
+batch_size = 2
+max_inflight_rollouts = 2
+rollouts_per_example = 1
+max_off_policy_steps = 1
+
+[orchestrator.train.sampling]
+temperature = 1.0
+repetition_penalty = 1.0
+max_completion_tokens = 256
+min_tokens = 0
+
+[orchestrator.client]
+extra_headers_from_state = { "X-Session-ID" = "example_id" }
+
+[[orchestrator.train.env]]
+id = "rlm_swe"
+name = "rlm-swe-low"
+num_workers = 1
+max_retries = 0
+max_total_completion_tokens = 1536
+
+[orchestrator.train.env.args]
+task_type = "swebench"
+dataset_name = "PrimeIntellect/SWE-Bench-Verified-Quick"
+max_turns = 6
+rlm_max_turns = 6
+timeout_seconds = 1800
+poll_interval = 5
+sandbox_cpu_cores = 2
+sandbox_memory_gb = 4
+sandbox_disk_size_gb = 4
+sandbox_client_max_workers = 16
+rlm_ref = "f466fccb6bc682092c88edf2b344951d7cbbd000"
+labels = ["rlm-r3-delta-swe-start-audit"]
+
+[orchestrator.student.client]
+timeout = 1800
+wait_for_ready_timeout = 1800
+
+[orchestrator.renderer]
+name = "qwen3"
+preserve_all_thinking = true
+
+[orchestrator.eval]
+env = []
+
+[orchestrator.buffer]
+easy_threshold = 1.0
+hard_threshold = 0.0
diff --git a/configs/debug/qwen3_30b_a3b_pd_wordle_router_replay.toml b/configs/debug/qwen3_30b_a3b_pd_wordle_router_replay.toml
@@ -0,0 +1,113 @@
+output_dir = "/beegfs/outputs/qwen3-30b-a3b-pd-wordle-router-replay-r3-delta-10step"
+clean_output_dir = true
+max_steps = 10
+seq_len = 4096
+max_async_level = 1
+
+[log]
+level = "debug"
+
+[model]
+name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+
+[deployment]
+type = "multi_node"
+num_train_nodes = 1
+num_infer_nodes = 2
+gpus_per_node = 8
+
+[slurm]
+job_name = "qwen3-pd-wordle-r3-delta-10"
+partition = "cluster"
+time = "06:00:00"
+exclude = "ltc-idc3-hgx8-h200-[53,88]"
+
+[wandb]
+project = "qwen3-router-replay-e2e"
+name = "qwen3-30b-a3b-pd-wordle-r3-delta-10step"
+group = "router-replay-e2e"
+offline = true
+shared = false
+
+[weight_broadcast]
+type = "nccl"
+timeout = 3600
+
+[trainer]
+enable_router_replay = true
+max_concurrent_runs = 1
+dist_timeout_seconds = 3600
+
+[trainer.model]
+impl = "custom"
+attn = "flash_attention_3"
+optim_cpu_offload = true
+ep = 8
+
+[trainer.model.ac]
+mode = "full"
+freq = 1
+
+[trainer.model.ac_offloading]
+max_inflight_activations = 5
+
+[trainer.model.compile]
+
+[trainer.optim]
+type = "adamw"
+lr = 1e-6
+
+[inference]
+gpu_memory_utilization = 0.9
+enable_return_routed_experts = true
+enable_eplb = false
+
+[inference.parallel]
+tp = 8
+
+[inference.model]
+max_model_len = 4096
+
+[inference.deployment]
+type = "disaggregated"
+num_prefill_nodes = 1
+num_decode_nodes = 1
+
+[orchestrator]
+filters = []
+batch_size = 8
+max_inflight_rollouts = 8
+rollouts_per_example = 1
+max_off_policy_steps = 2
+
+[orchestrator.train.sampling]
+temperature = 1.0
+repetition_penalty = 1.0
+max_completion_tokens = 512
+min_tokens = 0
+
+[[orchestrator.train.env]]
+id = "wordle"
+name = "wordle"
+num_workers = 1
+max_retries = 0
+max_total_completion_tokens = 1024
+
+[orchestrator.train.env.args]
+num_train_examples = 8
+num_eval_examples = 4
+
+[orchestrator.train.env.extra_env_kwargs]
+max_total_completion_tokens = 1024
+max_seq_len = 4096
+
+[orchestrator.student.client]
+timeout = 1200
+wait_for_ready_timeout = 1800
+
+[orchestrator.renderer]
+preserve_all_thinking = true
+
+[orchestrator.buffer]
+easy_threshold = 1.0
+hard_threshold = 0.0