diff --git a/configs/debug/training_modes/README.md b/configs/debug/training_modes/README.md index 96ccebb009..fd4b60009f 100644 --- a/configs/debug/training_modes/README.md +++ b/configs/debug/training_modes/README.md @@ -10,6 +10,7 @@ Minimal end-to-end configs for the three training modes (`rl` / `opd` / `sft`) a | `sft.toml` | `sft` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | | | `sft_lora.toml` | `sft` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) | | `sft_external.toml` | `sft` | PI inference (`openai/gpt-5-mini`) | external OAI endpoint; no local teacher | +| `sft_replay.toml` | `sft` | none | replays saved message traces through `sft-replay` | The student inference server is auto-launched on GPU 0 at `http://localhost:8000/v1` with `gpu_memory_utilization=0.5`. The local teacher (used by everything except `rl.toml` and `sft_external.toml`) is **not** auto-launched — start it manually on GPU 1. @@ -42,6 +43,9 @@ uv run rl @ configs/debug/training_modes/sft_lora.toml # SFT hard distill from openai/gpt-5-mini via PI inference # (requires PRIME_API_KEY + PRIME_TEAM_ID in env; no local teacher needed) uv run rl @ configs/debug/training_modes/sft_external.toml + +# SFT from replayed dataset traces (no teacher) +uv run rl @ configs/debug/training_modes/sft_replay.toml ``` See [docs/training.md](../../docs/training.md#training-modes-rl--opd--sft-via-orchestrator) for what each mode does. diff --git a/configs/debug/training_modes/sft_replay.toml b/configs/debug/training_modes/sft_replay.toml new file mode 100644 index 0000000000..eae996c1af --- /dev/null +++ b/configs/debug/training_modes/sft_replay.toml @@ -0,0 +1,41 @@ +# Static trace SFT through the RL orchestrator. No teacher server is needed: +# sft-replay turns dataset message rows into replayed rollout trajectories. 
+ +max_steps = 20 +seq_len = 2048 + +[model] +name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT" + +[wandb] +project = "reverse-text-debug" +name = "debug-sft-replay" + +[orchestrator] +training_mode = "sft" +batch_size = 128 +group_size = 1 + +[[orchestrator.train.env]] +id = "sft-replay" + +[orchestrator.train.env.args.taskset] +dataset = "PrimeIntellect/Reverse-Text-SFT" + +[orchestrator.eval] +interval = 1 +num_examples = 128 + +[orchestrator.eval.sampling] +max_completion_tokens = 128 + +[[orchestrator.eval.env]] +id = "reverse-text" + +[trainer.optim] +lr = 3e-6 + +[ckpt] + +[inference] +gpu_memory_utilization = 0.5 diff --git a/deps/verifiers b/deps/verifiers index 05c66c2358..14be5ee386 160000 --- a/deps/verifiers +++ b/deps/verifiers @@ -1 +1 @@ -Subproject commit 05c66c235875d785754f2b7078db0e7deeddbeae +Subproject commit 14be5ee38619290968701fb8af5c87b061556e94 diff --git a/docs/training.md b/docs/training.md index 53fc7fa7aa..a0d39ca1f6 100644 --- a/docs/training.md +++ b/docs/training.md @@ -89,16 +89,18 @@ The RL entrypoint supports three training modes, switched via `orchestrator.trai |---|---|---|---| | `rl` | Required | Forbidden | Standard RL | | `opd` | Required | Required, must be vLLM (needs `prompt_logprobs`) | [On-policy distillation](https://thinkingmachines.ai/blog/on-policy-distillation/): student generates rollouts, trainer minimizes KL to teacher logprobs | -| `sft` | Required | Required, any OpenAI-compatible endpoint | Hard-distill: teacher generates rollouts, student trains on them | +| `sft` | Required | Optional | Hard-distill from teacher-generated rollouts, or train from replayed message traces via `sft-replay` | -The `rl` entrypoint only manages student-policy inference. For OPD and (local-vLLM) SFT, start the teacher inference server manually and point `[orchestrator.teacher.client]` at it: +The `rl` entrypoint only manages student-policy inference. 
For OPD and teacher-backed local-vLLM SFT, start the teacher inference server manually and point `[orchestrator.teacher.client]` at it: ```bash CUDA_VISIBLE_DEVICES=1 uv run inference \ --model.name <teacher-model> --server.port 8001 ``` -The standalone `uv run sft` entrypoint is the more traditional SFT path — pure dataset-based, no teacher, no orchestrator. Use `orchestrator.training_mode = "sft"` only when you want a teacher to generate the supervision on the fly. +Teacherless orchestrator SFT is valid only when every train env is `sft-replay` and each env config sets `args.taskset.dataset` (or `args.config.taskset.dataset`). In that path, the env replays stored assistant messages into trajectories without model calls, then prime-rl tokenizes them for the trainer. + +The standalone `uv run sft` entrypoint is the more traditional SFT path — pure dataset-based, no teacher, no orchestrator. Use `orchestrator.training_mode = "sft"` when you want teacher-generated supervision or replayed env trajectories inside the RL orchestrator/eval pipeline. 
### Important Metrics diff --git a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py index be5fe249f3..689133857f 100644 --- a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py @@ -20,6 +20,31 @@ from prime_rl.configs.trainer import TokenizerConfig from prime_rl.utils.config import BaseConfig +SFT_REPLAY_ENV_ID = "sft-replay" + + +def _is_sft_replay_env_id(env_id: str) -> bool: + stripped = env_id.split("@")[0] + return stripped == SFT_REPLAY_ENV_ID or stripped.endswith(f"/{SFT_REPLAY_ENV_ID}") + + +def _sft_replay_dataset_arg(env_args: dict) -> object | None: + taskset = env_args.get("taskset") + if isinstance(taskset, dict): + dataset = taskset.get("dataset") + if dataset: + return dataset + + config = env_args.get("config") + if isinstance(config, dict): + taskset = config.get("taskset") + if isinstance(taskset, dict): + dataset = taskset.get("dataset") + if dataset: + return dataset + + return None + class OptimizerConfig(BaseConfig): lr: float = Field(1e-4, ge=0) @@ -501,13 +526,13 @@ class RolloutModelConfig(BaseConfig): class OrchestratorConfig(BaseConfig): training_mode: Literal["rl", "opd", "sft"] = "rl" - """Training mode. ``rl``: student generates rollouts, no teacher. ``opd``: student generates rollouts, teacher computes logprobs (teacher_tau > 0). ``sft``: teacher generates rollouts, student inference pool used for evals and weight sync.""" + """Training mode. ``rl``: student generates rollouts, no teacher. ``opd``: student generates rollouts, teacher computes logprobs (teacher_tau > 0). 
``sft``: teacher generates rollouts when configured, otherwise train envs must provide replayed traces.""" student: RolloutModelConfig = Field(RolloutModelConfig(), validation_alias=AliasChoices("student", "model")) """Student rollout participant (model + client) — the model being trained.""" teacher: RolloutModelConfig | None = Field(None, validation_alias=AliasChoices("teacher", "teacher_model")) - """Teacher rollout participant (model + client). Role depends on ``training_mode``: ``opd`` — teacher computes logprobs; ``sft`` — teacher generates rollouts.""" + """Teacher rollout participant (model + client). Role depends on ``training_mode``: ``opd`` — teacher computes logprobs; ``sft`` — teacher generates rollouts when configured.""" train: TrainConfig = TrainConfig() @@ -752,10 +777,16 @@ def validate_unique_filter_types(self): ) return self + @model_validator(mode="after") + def _drop_default_sft_zero_advantage_filter(self): + if self.training_mode == "sft" and "post_batch_filters" not in self.model_fields_set: + self.post_batch_filters = [f for f in self.post_batch_filters if f.type != "zero_advantage"] + return self + @model_validator(mode="after") def _force_no_renderer_for_sft(self): - """SFT rolls out via the teacher's plain chat-completions endpoint; the - renderer client doesn't apply. Force ``renderer=None`` so the user + """SFT train rollouts use teacher chat completions or replayed traces; + the renderer client doesn't apply. Force ``renderer=None`` so the user doesn't have to remember to set it. 
Declared before the renderer validators below so they see the corrected value.""" if self.training_mode == "sft": @@ -768,8 +799,30 @@ def validate_training_mode(self): has_teacher = self.teacher is not None if self.training_mode == "rl" and has_teacher: raise ValueError("orchestrator.teacher must not be set when training_mode = 'rl'.") - if self.training_mode in ("opd", "sft") and not has_teacher: - raise ValueError(f"orchestrator.teacher must be configured when training_mode = '{self.training_mode}'.") + if self.training_mode == "opd" and not has_teacher: + raise ValueError("orchestrator.teacher must be configured when training_mode = 'opd'.") + return self + + @model_validator(mode="after") + def validate_teacherless_sft_uses_sft_replay(self): + """Teacherless SFT is only valid when train envs replay existing data.""" + if self.training_mode != "sft" or self.teacher is not None: + return self + + non_replay_envs = [env.id for env in self.train.env if not _is_sft_replay_env_id(env.id)] + if non_replay_envs: + raise ValueError( + "orchestrator.teacher must be configured for SFT unless every train env uses " + f"{SFT_REPLAY_ENV_ID!r}; got non-replay train env(s): {non_replay_envs}." + ) + + missing_dataset = [env.resolved_name for env in self.train.env if _sft_replay_dataset_arg(env.args) is None] + if missing_dataset: + raise ValueError( + f"teacherless SFT with {SFT_REPLAY_ENV_ID!r} requires an explicit " + "env.args.taskset.dataset or env.args.config.taskset.dataset for " + f"each train env; missing for: {missing_dataset}." 
+ ) return self @model_validator(mode="after") diff --git a/pyproject.toml b/pyproject.toml index cc931e1446..4341b1a80d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,6 +88,7 @@ envs = [ "rlm-swe", "science-env", "simpleqa-verified", + "sft-replay", "tau2-bench", "wiki-search", "wordle", @@ -150,6 +151,8 @@ override-dependencies = [ "transformers==5.6.2", "torch>=2.9.0", "openenv-core", + "harnesses>=0.1.0", + "tasksets>=0.1.0", ] # ModelExpress 0.3.0 publishes protobuf<6 metadata, but its generated proto is @@ -224,6 +227,9 @@ reverse-text = { path = "deps/verifiers/environments/reverse_text", editable = t rlm-swe = { path = "deps/research-environments/environments/rlm_swe", editable = true } science-env = { path = "deps/research-environments/environments/science_env", editable = true } simpleqa-verified = { path = "deps/research-environments/environments/simpleqa_verified", editable = true } +sft-replay = { path = "deps/verifiers/environments/sft_replay", editable = true } +harnesses = { path = "deps/verifiers/packages/harnesses", editable = true } +tasksets = { path = "deps/verifiers/packages/tasksets", editable = true } tau2-bench = { path = "deps/research-environments/environments/tau2_bench", editable = true } wiki-search = { path = "deps/verifiers/environments/wiki_search", editable = true } wordle = { path = "deps/verifiers/environments/wordle", editable = true } diff --git a/skills/configs/SKILL.md b/skills/configs/SKILL.md index 83f7dd8d47..8a7f94bb53 100644 --- a/skills/configs/SKILL.md +++ b/skills/configs/SKILL.md @@ -60,6 +60,23 @@ CLI: `--env.0.id reverse-text --env.1.id math-env`. In TOML, an empty section header (`[ckpt]`) does the same. +## Replay-backed SFT config + +For teacherless SFT through the `rl` orchestrator, set `orchestrator.training_mode = "sft"` and use only `sft-replay` train envs. 
Each train env must provide the dataset under the taskset config: + +```toml +[orchestrator] +training_mode = "sft" + +[[orchestrator.train.env]] +id = "sft-replay" + +[orchestrator.train.env.args.taskset] +dataset = "PrimeIntellect/Reverse-Text-SFT" +``` + +Do not pass the dataset at `env.args.dataset`; config validation rejects that shape because replay data belongs to the taskset. + ## RL trainer token exports For rollout debugging, enable trainer-side token export under `trainer.experimental.token_export` (or `experimental.token_export` when running the trainer entrypoint directly). It writes one JSONL record per exported sequence under `output_dir/token_exports/step_<step>/rank_<rank>.jsonl`. Each record stores aligned per-token arrays for token ids, loss mask, advantage, reward, entropy, mismatch KL, inference/trainer logprobs, importance ratios, probability deltas, and masking diagnostics. It does not decode token text in the trainer. diff --git a/skills/training/start-run/SKILL.md b/skills/training/start-run/SKILL.md index 415778c437..31308eabb9 100644 --- a/skills/training/start-run/SKILL.md +++ b/skills/training/start-run/SKILL.md @@ -34,6 +34,7 @@ uv run rl @ examples/reverse_text/rl.toml --dry-run - Config: `RLConfig` (`packages/prime-rl-configs/src/prime_rl/configs/rl.py`) - Entrypoint: `src/prime_rl/entrypoints/rl.py` - SLURM: single- and multi-node +- Training modes: `orchestrator.training_mode = "rl"` (default), `"opd"` (requires teacher), or `"sft"`. SFT can use a configured teacher, or teacherless replay when every train env is `sft-replay` with `args.taskset.dataset` set. - Environment packages: before launching a config with a non-core verifier env id, verify the package imports under `uv run` (for example `uv run python -c "import importlib.util; print(importlib.util.find_spec('rlm_swe'))"`). 
@@ -82,7 +83,7 @@ curl http://localhost:8000/v1/chat/completions \ | Command | Purpose | Typical use | |---------|---------|-------------| -| `rl` | Full RL pipeline | Production RL training | +| `rl` | Full orchestrator pipeline | RL, OPD, and orchestrator-backed SFT | | `sft` | Supervised fine-tuning | SFT and hard-distill | | `inference` | vLLM server | Standalone serving / debugging | diff --git a/src/prime_rl/orchestrator/dispatcher.py b/src/prime_rl/orchestrator/dispatcher.py index 133bc08da0..4dd9f6b98a 100644 --- a/src/prime_rl/orchestrator/dispatcher.py +++ b/src/prime_rl/orchestrator/dispatcher.py @@ -135,8 +135,9 @@ def __init__( self.policy = policy self.train_envs = train_envs self.eval_envs = eval_envs - # Train rollouts go to ``inference`` (the teacher in SFT mode); - # eval always evaluates the student, so it uses ``eval_inference``. + # Train rollouts go to ``inference`` (teacher in teacher-SFT, student + # otherwise); eval always evaluates the student, so it uses + # ``eval_inference``. self.inference = inference self.eval_inference = eval_inference self.train_source = train_source @@ -173,9 +174,9 @@ def __init__( @property def train_model_name(self) -> str: - """Model name for *train* rollouts. In SFT mode train data comes from - the teacher pool, so use its model name; otherwise the live student - policy. (Eval always uses ``policy.model_name`` — the student.)""" + """Model name for *train* rollouts. Teacher-SFT uses the teacher pool + name; replay SFT receives the student name but ignores it. 
Eval always + uses ``policy.model_name`` — the student.""" if self.training_mode == "sft": return self.inference.model_name return self.policy.model_name diff --git a/src/prime_rl/orchestrator/orchestrator.py b/src/prime_rl/orchestrator/orchestrator.py index 902c8b963b..9b1135884d 100644 --- a/src/prime_rl/orchestrator/orchestrator.py +++ b/src/prime_rl/orchestrator/orchestrator.py @@ -345,10 +345,10 @@ async def setup(self) -> None: else: get_logger().info("Training from scratch") - # SFT generates rollouts via the teacher (the student is trained on - # the teacher's outputs); RL / OPD generate via the student - if config.training_mode == "sft": - assert self.teacher_inference is not None, "sft mode requires teacher inference" + # SFT train rollouts come from the teacher when configured. Teacherless + # SFT is validated at config parse time to use replay envs, which ignore + # the client/model passed by the dispatcher. + if config.training_mode == "sft" and self.teacher_inference is not None: rollout_inference = self.teacher_inference else: rollout_inference = self.student_inference diff --git a/src/prime_rl/orchestrator/trajectories.py b/src/prime_rl/orchestrator/trajectories.py index 3e8431c12a..7171901ad2 100644 --- a/src/prime_rl/orchestrator/trajectories.py +++ b/src/prime_rl/orchestrator/trajectories.py @@ -164,6 +164,24 @@ def _tokenize_step_with_renderer( return build_trajectory_step(renderer, prompt, completion, tools=tools) +def _set_token_usage_from_trajectory(output: vf.RolloutOutput) -> None: + trajectory = output.get("trajectory") or [] + tokenized_steps = [step for step in trajectory if step.get("tokens") is not None] + if not tokenized_steps: + return + + prompt_tokens = [len(step["tokens"]["prompt_ids"]) for step in tokenized_steps] + completion_tokens = [len(step["tokens"]["completion_ids"]) for step in tokenized_steps] + total_completion = sum(completion_tokens) + last_total = prompt_tokens[-1] + completion_tokens[-1] + output["token_usage"] = { 
+ "input_tokens": float(sum(prompt_tokens)), + "output_tokens": float(total_completion), + "final_input_tokens": float(max(0, last_total - total_completion)), + "final_output_tokens": float(total_completion), + } + + def backfill_rollout_tokens( output: vf.RolloutOutput, tokenizer: PreTrainedTokenizer, @@ -175,6 +193,9 @@ def backfill_rollout_tokens( Otherwise falls back to the tokenizer + apply_chat_template path. """ if all(step["tokens"] is not None for step in output["trajectory"]): + token_usage = output.get("token_usage") or {} + if "final_input_tokens" not in token_usage or "final_output_tokens" not in token_usage: + _set_token_usage_from_trajectory(output) return True logger = get_logger() @@ -198,6 +219,10 @@ def backfill_rollout_tokens( reconstructed.pop("original_prompt_len") step["tokens"] = reconstructed + token_usage = output.get("token_usage") or {} + if "final_input_tokens" not in token_usage or "final_output_tokens" not in token_usage: + _set_token_usage_from_trajectory(output) + return True diff --git a/tests/unit/orchestrator/test_sft_replay_env.py b/tests/unit/orchestrator/test_sft_replay_env.py new file mode 100644 index 0000000000..3b69ca5d9d --- /dev/null +++ b/tests/unit/orchestrator/test_sft_replay_env.py @@ -0,0 +1,90 @@ +import pytest +import verifiers as vf +from verifiers.clients import Client + +from prime_rl.orchestrator.trajectories import backfill_rollout_tokens, interleave_rollout + + +class NoopClient(Client): + def setup_client(self, config): + return object() + + async def to_native_tool(self, tool): + raise AssertionError("sft-replay must not convert tools") + + async def to_native_prompt(self, messages): + raise AssertionError("sft-replay must not render prompts through a client") + + async def get_native_response(self, prompt, model, sampling_args, tools=None, **kwargs): + raise AssertionError("sft-replay must not request model responses") + + async def raise_from_native_response(self, response) -> None: + raise 
AssertionError("sft-replay must not handle native responses") + + async def from_native_response(self, response): + raise AssertionError("sft-replay must not parse native responses") + + async def close(self) -> None: + return None + + +class SimpleChatTokenizer: + def __init__(self): + self._tok2id: dict[str, int] = {} + self._next_id = 1 + + def _id(self, token: str) -> int: + if token not in self._tok2id: + self._tok2id[token] = self._next_id + self._next_id += 1 + return self._tok2id[token] + + def apply_chat_template(self, messages, add_generation_prompt=False, return_dict=False, tools=None): + del return_dict, tools + ids = [] + for message in messages: + role = message.get("role", "unknown") + ids.append(self._id(f"<|{role}|>")) + content = message.get("content", "") + if isinstance(content, str): + if content: + ids.append(self._id(content)) + else: + ids.append(self._id(str(content))) + if add_generation_prompt: + ids.append(self._id("<|assistant|>")) + return ids + + +def role_content(messages) -> list[tuple[str, object]]: + return [(message["role"], message["content"]) for message in messages] + + +@pytest.mark.asyncio +async def test_sft_replay_env_replays_messages_for_prime_rl_training_path(): + env = vf.load_environment("sft-replay", taskset={}) + row = dict(env.get_dataset()[0]) + + output = await env.run_rollout( + row, + client=NoopClient(vf.ClientConfig()), + model="unused-student", + sampling_args={}, + state_columns=["trajectory", "sampling_args"], + ) + + assert output["error"] is None + assert output["stop_condition"] == "replayed_messages" + assert len(output["trajectory"]) == 1 + assert output["trajectory"][0]["tokens"] is None + assert role_content(output["trajectory"][0]["prompt"]) == [("user", "Reverse abc.")] + assert role_content(output["trajectory"][0]["completion"]) == [("assistant", "cba")] + + backfill_rollout_tokens(output, SimpleChatTokenizer()) + samples = interleave_rollout(output, env_name="sft-replay") + + assert samples is 
not None + assert len(samples) == 1 + assert any(samples[0].completion_mask) + assert output["token_usage"]["final_input_tokens"] > 0 + assert output["token_usage"]["final_output_tokens"] > 0 diff --git a/tests/unit/test_configs.py b/tests/unit/test_configs.py index fcdee7a843..9942ada866 100644 --- a/tests/unit/test_configs.py +++ b/tests/unit/test_configs.py @@ -202,6 +202,66 @@ def test_orchestrator_vlm_requires_renderer(): assert config.renderer is not None +def test_sft_training_mode_allows_missing_teacher(): + config = OrchestratorConfig.model_validate( + { + "training_mode": "sft", + "renderer": None, + "train": {"env": [{"id": "sft-replay", "args": {"taskset": {"dataset": "local.jsonl"}}}]}, + } + ) + + assert config.teacher is None + assert "zero_advantage" not in {f.type for f in config.post_batch_filters} + + +def test_sft_training_mode_preserves_explicit_post_filters(): + config = OrchestratorConfig.model_validate( + { + "training_mode": "sft", + "renderer": None, + "train": {"env": [{"id": "prime/sft-replay@0.1.0", "args": {"taskset": {"dataset": "local.jsonl"}}}]}, + "post_batch_filters": [{"type": "zero_advantage", "enforce": True}], + } + ) + + assert [f.type for f in config.post_batch_filters] == ["zero_advantage"] + + +def test_sft_training_mode_accepts_nested_env_config_dataset(): + config = OrchestratorConfig.model_validate( + { + "training_mode": "sft", + "renderer": None, + "train": {"env": [{"id": "sft-replay", "args": {"config": {"taskset": {"dataset": "local.jsonl"}}}}]}, + } + ) + + assert config.teacher is None + + +def test_teacherless_sft_requires_sft_replay_env(): + with pytest.raises(ValidationError, match="every train env uses 'sft-replay'"): + OrchestratorConfig.model_validate( + { + "training_mode": "sft", + "renderer": None, + "train": {"env": [{"id": "reverse-text"}]}, + } + ) + + +def test_teacherless_sft_requires_sft_replay_taskset_dataset(): + with pytest.raises(ValidationError, match="requires an explicit 
env.args.taskset.dataset"): + OrchestratorConfig.model_validate( + { + "training_mode": "sft", + "renderer": None, + "train": {"env": [{"id": "sft-replay", "args": {"dataset": "local.jsonl"}}]}, + } + ) + + def test_selective_activation_checkpointing_requires_custom_impl(): with pytest.raises(ValidationError, match="Selective activation checkpointing requires model.impl='custom'"): TrainerModelConfig.model_validate({"impl": "hf", "ac": {"mode": "selective"}}) diff --git a/uv.lock b/uv.lock index 0fe7a4e076..7cd5141124 100644 --- a/uv.lock +++ b/uv.lock @@ -34,9 +34,11 @@ prime = false [manifest] overrides = [ + { name = "harnesses", editable = "deps/verifiers/packages/harnesses" }, { name = "nvidia-cudnn-cu12", specifier = ">=9.15" }, { name = "nvidia-cutlass-dsl", specifier = ">=4.4.1" }, { name = "openenv-core" }, + { name = "tasksets", editable = "deps/verifiers/packages/tasksets" }, { name = "torch", specifier = ">=2.9.0", index = "https://download.pytorch.org/whl/cu128" }, { name = "transformers", specifier = "==5.6.2" }, ] @@ -1461,6 +1463,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" }, ] +[[package]] +name = "harnesses" +source = { editable = "deps/verifiers/packages/harnesses" } +dependencies = [ + { name = "verifiers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiohttp", marker = "extra == 'nemogym'", specifier = ">=3.9.0" }, + { name = "nemo-gym", marker = "python_full_version >= '3.12' and extra == 'nemogym'", specifier = ">=0.2.1,<0.3" }, + { name = "verifiers", specifier = ">=0.1.15.dev11" }, +] +provides-extras = ["nemogym"] + [[package]] name = "hf-xet" 
version = "1.5.0" @@ -3576,6 +3593,7 @@ envs = [ { name = "reverse-text", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "rlm-swe", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "science-env", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "sft-replay", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "simpleqa-verified", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "tau2-bench", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "wiki-search", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -3677,6 +3695,7 @@ requires-dist = [ { name = "rlm-swe", marker = "extra == 'envs'", editable = "deps/research-environments/environments/rlm_swe" }, { name = "science-env", marker = "extra == 'envs'", editable = "deps/research-environments/environments/science_env" }, { name = "setproctitle", specifier = ">=1.3.0" }, + { name = "sft-replay", marker = "extra == 'envs'", editable = "deps/verifiers/environments/sft_replay" }, { name = "simpleqa-verified", marker = "extra == 'envs'", editable = "deps/research-environments/environments/simpleqa_verified" }, { name = "tau2-bench", marker = "extra == 'envs'", editable = "deps/research-environments/environments/tau2_bench" }, { name = "tenacity", specifier = ">=8.2.0" }, @@ -4543,6 +4562,23 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/94/b8/f1f62a5e3c0ad2ff1d189590bfa4c46b4f3b6e49cef6f26c6ee4e575394d/setuptools-80.10.2-py3-none-any.whl", hash = "sha256:95b30ddfb717250edb492926c92b5221f7ef3fbcc2b07579bcd4a27da21d0173", size = 1064234, upload-time = "2026-01-25T22:38:15.216Z" }, ] +[[package]] +name = "sft-replay" +version = "0.1.0" +source = { editable = "deps/verifiers/environments/sft_replay" } +dependencies = [ + { name = "harnesses", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "tasksets", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "verifiers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] + +[package.metadata] +requires-dist = [ + { name = "harnesses", specifier = ">=0.1.2" }, + { name = "tasksets", specifier = ">=0.1.5" }, + { name = "verifiers", specifier = ">=0.1.15.dev11" }, +] + [[package]] name = "shellingham" version = "1.5.4" @@ -4725,6 +4761,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/55/db07de81b5c630da5cbf5c7df646580ca26dfaefa593667fc6f2fe016d2e/tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3", size = 39814, upload-time = "2026-03-04T18:55:31.284Z" }, ] +[[package]] +name = "tasksets" +source = { editable = "deps/verifiers/packages/tasksets" } +dependencies = [ + { name = "verifiers", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] + +[package.metadata] +requires-dist = [ + { name = "nemo-gym", marker = "python_full_version >= '3.12' and extra == 'nemogym'", specifier = ">=0.2.1,<0.3" }, + { name = "nltk", marker = "extra == 'ta'" }, + { name = "openenv-core", marker 
= "extra == 'openenv'", specifier = ">=0.3.0" }, + { name = "openreward", marker = "python_full_version >= '3.11' and extra == 'openreward'", specifier = ">=0.1.123" }, + { name = "textarena", marker = "extra == 'ta'" }, + { name = "verifiers", specifier = ">=0.1.15.dev11" }, +] +provides-extras = ["nemogym", "openenv", "openreward", "ta"] + [[package]] name = "tau2" version = "0.2.1.dev0"