Skip to content

Commit 4027dce

Browse files
committed
Move agent versions to harness config
1 parent 48ac9e8 commit 4027dce

11 files changed

Lines changed: 96 additions & 77 deletions

File tree

packages/harnesses/README.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,13 @@ Command agents use `name@version` specs where their installer supports a
6666
versioned package or release. Use `@latest` to track the newest release instead of a pinned version:
6767

6868
```toml
69-
[eval.harness.program]
70-
# OpenCode
71-
install_spec = "PrimeIntellect-ai/opencode@latest"
69+
[eval.harness]
70+
id = "harnesses.opencode"
71+
version = "PrimeIntellect-ai/opencode@latest"
72+
```
7273

73-
# MiniSWEAgent or Pi
74-
install_spec = "mini-swe-agent@2.2.8"
74+
```toml
75+
[eval.harness]
76+
id = "harnesses.mini_swe_agent"
77+
version = "mini-swe-agent@2.2.8"
7578
```

packages/harnesses/harnesses/mini_swe_agent.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import shlex
22
from pathlib import PurePosixPath
33

4+
from pydantic import PrivateAttr, model_validator
45
import verifiers as vf
56
from verifiers.v1.utils.sandbox_python_utils import python_runtime_setup_command
67

@@ -16,23 +17,23 @@
1617
MINI_SWE_AGENT_DEFAULT_SYSTEM_PROMPT_PATH = "/mini-swe-agent/system.txt"
1718
MINI_SWE_AGENT_DEFAULT_LOG_PATH = "/logs/agent/mini-swe-agent.log"
1819
MINI_SWE_AGENT_DEFAULT_TRAJECTORY_PATH = "/logs/agent/mini-swe-agent.traj.json"
19-
MINI_SWE_AGENT_DEFAULT_INSTALL_SPEC = "mini-swe-agent@2.2.8"
20+
MINI_SWE_AGENT_DEFAULT_VERSION = "mini-swe-agent@2.2.8"
2021
MINI_SWE_AGENT_DEFAULT_CONFIG_SPEC = "mini"
2122
MINI_SWE_AGENT_DEFAULT_MODEL_CLASS = "litellm"
2223
MINI_SWE_AGENT_DEFAULT_ENVIRONMENT_TIMEOUT = 120
2324

2425

2526
def build_mini_swe_agent_install_script(
26-
install_spec: str = MINI_SWE_AGENT_DEFAULT_INSTALL_SPEC,
27+
version: str = MINI_SWE_AGENT_DEFAULT_VERSION,
2728
prefix_dir: str = DEFAULT_PREFIX_DIR,
2829
) -> str:
2930
root = shlex.quote(str(PurePosixPath(prefix_dir).parent))
3031
prefix = shlex.quote(prefix_dir)
3132
site_packages = shlex.quote(f"{prefix_dir.rstrip('/')}/site-packages")
32-
name, version = split_versioned_agent_spec(install_spec)
33+
name, pin = split_versioned_agent_spec(version)
3334
requirement = name
34-
if version and version != "latest":
35-
requirement = f"{name}=={version}"
35+
if pin and pin != "latest":
36+
requirement = f"{name}=={pin}"
3637
return f"""\
3738
set -e
3839
{python_runtime_setup_command()}
@@ -51,12 +52,13 @@ def build_mini_swe_agent_install_script(
5152

5253

5354
class MiniSWEAgentProgramConfig(vf.ProgramConfig):
55+
_version: str = PrivateAttr(MINI_SWE_AGENT_DEFAULT_VERSION)
56+
5457
agent_workdir: str = MINI_SWE_AGENT_DEFAULT_AGENT_WORKDIR
5558
instruction_path: str = MINI_SWE_AGENT_DEFAULT_INSTRUCTION_PATH
5659
system_prompt_path: str = MINI_SWE_AGENT_DEFAULT_SYSTEM_PROMPT_PATH
5760
log_path: str = MINI_SWE_AGENT_DEFAULT_LOG_PATH
5861
trajectory_path: str = MINI_SWE_AGENT_DEFAULT_TRAJECTORY_PATH
59-
install_spec: str = MINI_SWE_AGENT_DEFAULT_INSTALL_SPEC
6062
config_spec: str = MINI_SWE_AGENT_DEFAULT_CONFIG_SPEC
6163
model_class: str = MINI_SWE_AGENT_DEFAULT_MODEL_CLASS
6264
environment_timeout: int = MINI_SWE_AGENT_DEFAULT_ENVIRONMENT_TIMEOUT
@@ -112,7 +114,7 @@ def resolve(self) -> vf.ProgramConfig:
112114
config_args.extend(["-c", shlex.quote(spec)])
113115

114116
setup = build_mini_swe_agent_install_script(
115-
install_spec=self.install_spec,
117+
version=self._version,
116118
)
117119
log_dir = str(PurePosixPath(self.log_path).parent)
118120
trajectory_dir = str(PurePosixPath(self.trajectory_path).parent)
@@ -155,9 +157,15 @@ def resolve(self) -> vf.ProgramConfig:
155157

156158

157159
class MiniSWEAgentConfig(vf.HarnessConfig):
160+
version: str = MINI_SWE_AGENT_DEFAULT_VERSION
158161
program: MiniSWEAgentProgramConfig = MiniSWEAgentProgramConfig()
159162
max_turns: int = 4
160163

164+
@model_validator(mode="after")
165+
def apply_version(self):
166+
self.program._version = self.version
167+
return self
168+
161169

162170
class MiniSWEAgent(vf.Harness[MiniSWEAgentConfig]):
163171
config: MiniSWEAgentConfig

packages/harnesses/harnesses/opencode.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22
import shlex
33
from pathlib import PurePosixPath
44

5+
from pydantic import PrivateAttr, model_validator
56
import verifiers as vf
67
from verifiers.v1.utils.mcp_proxy_utils import proxy_command
78

89
from .utils import split_versioned_agent_spec
910

10-
OPENCODE_DEFAULT_INSTALL_SPEC = "PrimeIntellect-ai/opencode@1.1.63-rl2"
11+
OPENCODE_DEFAULT_VERSION = "PrimeIntellect-ai/opencode@1.1.63-rl2"
1112
OPENCODE_DEFAULT_AGENT_WORKDIR = "/app"
1213
OPENCODE_DEFAULT_INSTRUCTION_PATH = "/opencode/instruction.txt"
1314
OPENCODE_DEFAULT_SYSTEM_PROMPT_PATH = "/opencode/system.txt"
@@ -43,14 +44,15 @@
4344

4445

4546
class OpenCodeProgramConfig(vf.ProgramConfig):
47+
_version: str = PrivateAttr(OPENCODE_DEFAULT_VERSION)
48+
4649
agent_workdir: str = OPENCODE_DEFAULT_AGENT_WORKDIR
4750
instruction_path: str = OPENCODE_DEFAULT_INSTRUCTION_PATH
4851
system_prompt_path: str = OPENCODE_DEFAULT_SYSTEM_PROMPT_PATH
4952
log_path: str = OPENCODE_DEFAULT_LOG_PATH
5053
disabled_tools: list[str] = OPENCODE_DEFAULT_DISABLED_TOOLS
5154
allow_git: bool = False
5255
disable_compaction: bool = True
53-
install_spec: str = OPENCODE_DEFAULT_INSTALL_SPEC
5456
install_ripgrep: bool = True
5557
provider_timeout_ms: int = 3_600_000
5658

@@ -75,7 +77,7 @@ def resolve(self) -> vf.ProgramConfig:
7577
if self.install_ripgrep
7678
else ""
7779
)
78-
repo, version = split_versioned_agent_spec(self.install_spec)
80+
repo, version = split_versioned_agent_spec(self._version)
7981
path = "releases/latest/download"
8082
if version and version != "latest":
8183
tag = version if version.startswith("v") else f"v{version}"
@@ -197,9 +199,15 @@ class OpenCodeConfig(vf.HarnessConfig):
197199
system_prompt: vf.PromptInput | vf.SystemPromptConfig | None = (
198200
OPENCODE_DEFAULT_SYSTEM_PROMPT
199201
)
202+
version: str = OPENCODE_DEFAULT_VERSION
200203
program: OpenCodeProgramConfig = OpenCodeProgramConfig()
201204
max_turns: int = 4
202205

206+
@model_validator(mode="after")
207+
def apply_version(self):
208+
self.program._version = self.version
209+
return self
210+
203211

204212
class OpenCode(vf.Harness[OpenCodeConfig]):
205213
config: OpenCodeConfig

packages/harnesses/harnesses/pi.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
import shlex
33
from pathlib import PurePosixPath
44

5+
from pydantic import PrivateAttr, model_validator
56
import verifiers as vf
67
from verifiers.v1.utils.mcp_proxy_utils import proxy_command
78

8-
PI_DEFAULT_INSTALL_SPEC = "@earendil-works/pi-coding-agent@latest"
9+
PI_DEFAULT_VERSION = "@earendil-works/pi-coding-agent@latest"
910
PI_DEFAULT_WORKDIR = "/app"
1011
PI_DEFAULT_INSTRUCTION_PATH = "/pi/instruction.txt"
1112
PI_DEFAULT_SYSTEM_PROMPT_PATH = "/pi/system.txt"
@@ -14,11 +15,12 @@
1415

1516

1617
class PiProgramConfig(vf.ProgramConfig):
18+
_version: str = PrivateAttr(PI_DEFAULT_VERSION)
19+
1720
agent_workdir: str = PI_DEFAULT_WORKDIR
1821
instruction_path: str = PI_DEFAULT_INSTRUCTION_PATH
1922
system_prompt_path: str = PI_DEFAULT_SYSTEM_PROMPT_PATH
2023
log_path: str = PI_DEFAULT_LOG_PATH
21-
install_spec: str = PI_DEFAULT_INSTALL_SPEC
2224
install_mcp_adapter: bool = True
2325
sandbox: vf.SandboxConfig | None = vf.SandboxConfig()
2426

@@ -82,7 +84,7 @@ def resolve(self) -> vf.ProgramConfig:
8284
npm install -g --ignore-scripts n
8385
n 22.19.0
8486
hash -r
85-
npm install -g --ignore-scripts {shlex.quote(self.install_spec)}
87+
npm install -g --ignore-scripts {shlex.quote(self._version)}
8688
"""
8789
artifacts = vf.ArtifactsConfig.model_validate(
8890
{
@@ -127,9 +129,15 @@ class PiConfig(vf.HarnessConfig):
127129
system_prompt: vf.PromptInput | vf.SystemPromptConfig | None = (
128130
PI_DEFAULT_SYSTEM_PROMPT
129131
)
132+
version: str = PI_DEFAULT_VERSION
130133
program: PiProgramConfig = PiProgramConfig()
131134
max_turns: int = 4
132135

136+
@model_validator(mode="after")
137+
def apply_version(self):
138+
self.program._version = self.version
139+
return self
140+
133141

134142
class Pi(vf.Harness[PiConfig]):
135143
config: PiConfig

packages/harnesses/harnesses/terminus_2.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,27 @@
11
import shlex
22
from pathlib import PurePosixPath
33

4+
from pydantic import PrivateAttr, model_validator
45
import verifiers as vf
56
from verifiers.v1.utils.sandbox_python_utils import SANDBOX_BIN_DIR, uv_setup_command
67

78
TERMINUS_2_DEFAULT_AGENT_WORKDIR = "/app"
89
TERMINUS_2_DEFAULT_INSTRUCTION_PATH = "/terminus_2/instruction.md"
910
TERMINUS_2_DEFAULT_SYSTEM_PROMPT_PATH = "/terminus_2/system_prompt.txt"
1011
TERMINUS_2_DEFAULT_LOG_PATH = "/logs/agent/terminus_2.log"
11-
TERMINUS_2_DEFAULT_HARBOR_INSTALL_SPEC = "harbor==0.6.6"
12+
TERMINUS_2_DEFAULT_VERSION = "harbor==0.6.6"
1213
TERMINUS_2_DEFAULT_PYTHON_VERSION = "3.12"
1314
TERMINUS_2_DEFAULT_MODEL_NAME = "openai/gpt-4.1-mini"
1415
TERMINUS_2_DEFAULT_API_BASE_URL = "https://api.pinference.ai/api/v1"
1516

1617

1718
class Terminus2ProgramConfig(vf.ProgramConfig):
19+
_version: str = PrivateAttr(TERMINUS_2_DEFAULT_VERSION)
20+
1821
agent_workdir: str = TERMINUS_2_DEFAULT_AGENT_WORKDIR
1922
instruction_path: str = TERMINUS_2_DEFAULT_INSTRUCTION_PATH
2023
system_prompt_path: str = TERMINUS_2_DEFAULT_SYSTEM_PROMPT_PATH
2124
log_path: str = TERMINUS_2_DEFAULT_LOG_PATH
22-
harbor_install_spec: str = TERMINUS_2_DEFAULT_HARBOR_INSTALL_SPEC
2325
python_version: str = TERMINUS_2_DEFAULT_PYTHON_VERSION
2426
model_name: str = TERMINUS_2_DEFAULT_MODEL_NAME
2527
api_base_url: str = TERMINUS_2_DEFAULT_API_BASE_URL
@@ -171,7 +173,7 @@ async def main() -> None:
171173
cd "$TERMINUS_2_WORKDIR"
172174
uv --no-config run --no-project --quiet \
173175
--python {shlex.quote(self.python_version)} \
174-
--with {shlex.quote(self.harbor_install_spec)} \
176+
--with {shlex.quote(self._version)} \
175177
python - <<'PY' 2>&1 | tee -a {shlex.quote(self.log_path)}
176178
{agent_script}
177179
PY
@@ -186,8 +188,14 @@ async def main() -> None:
186188

187189

188190
class Terminus2Config(vf.HarnessConfig):
191+
version: str = TERMINUS_2_DEFAULT_VERSION
189192
program: Terminus2ProgramConfig = Terminus2ProgramConfig()
190193

194+
@model_validator(mode="after")
195+
def apply_version(self):
196+
self.program._version = self.version
197+
return self
198+
191199

192200
class Terminus2(vf.Harness[Terminus2Config]):
193201
config: Terminus2Config

tests/test_composable_env.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,8 +193,8 @@ def test_taskset_repr():
193193
assert "3" in repr(ts)
194194

195195

196-
def test_composable_mini_swe_agent_unversioned_install_spec_uses_unpinned_requirement():
197-
setup = build_mini_swe_agent_install_script(install_spec=" mini-swe-agent ")
196+
def test_composable_mini_swe_agent_unversioned_version_uses_unpinned_requirement():
197+
setup = build_mini_swe_agent_install_script(version=" mini-swe-agent ")
198198

199199
assert (
200200
"vf_python_install --target /opt/mini-swe-agent/prefix/site-packages mini-swe-agent"
@@ -203,9 +203,9 @@ def test_composable_mini_swe_agent_unversioned_install_spec_uses_unpinned_requir
203203
assert "mini-swe-agent==mini-swe-agent" not in setup
204204

205205

206-
def test_composable_opencode_unversioned_install_spec_uses_latest_download_url():
206+
def test_composable_opencode_unversioned_version_uses_latest_download_url():
207207
setup = build_opencode_install_script(
208-
install_spec=" PrimeIntellect-ai/opencode ",
208+
version=" PrimeIntellect-ai/opencode ",
209209
install_ripgrep=False,
210210
)
211211

tests/test_v1_harbor_cli.py

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@
2525
Terminus2Config,
2626
Terminus2ProgramConfig,
2727
)
28-
from harnesses.pi import PI_DEFAULT_INSTALL_SPEC
28+
from harnesses.pi import PI_DEFAULT_VERSION
2929
from harnesses.terminus_2 import (
3030
TERMINUS_2_DEFAULT_API_BASE_URL,
31-
TERMINUS_2_DEFAULT_HARBOR_INSTALL_SPEC,
31+
TERMINUS_2_DEFAULT_VERSION,
3232
TERMINUS_2_DEFAULT_MODEL_NAME,
3333
Terminus2,
3434
)
@@ -287,18 +287,18 @@ def test_opencode_config_owns_opencode_harness_fields() -> None:
287287

288288

289289
@pytest.mark.parametrize(
290-
"install_spec",
290+
"version",
291291
["PrimeIntellect-ai/opencode@latest", " PrimeIntellect-ai/opencode "],
292292
)
293-
def test_opencode_latest_install_spec_uses_latest_download_url(
294-
install_spec: str,
293+
def test_opencode_latest_version_uses_latest_download_url(
294+
version: str,
295295
) -> None:
296296
harness = OpenCode(
297297
config=OpenCodeConfig(
298+
version=version,
298299
program=OpenCodeProgramConfig(
299-
install_spec=install_spec,
300300
install_ripgrep=False,
301-
)
301+
),
302302
)
303303
)
304304
program = cast(dict[str, object], harness.config.program.data())
@@ -308,12 +308,10 @@ def test_opencode_latest_install_spec_uses_latest_download_url(
308308
assert "OPENCODE_RELEASE_PATH=releases/latest/download" in setup
309309

310310

311-
def test_opencode_custom_install_spec_uses_versioned_release() -> None:
311+
def test_opencode_custom_version_uses_versioned_release() -> None:
312312
harness = OpenCode(
313313
config=OpenCodeConfig(
314-
program=OpenCodeProgramConfig(
315-
install_spec="Example/open-code@v2.0.0",
316-
)
314+
version="Example/open-code@v2.0.0",
317315
)
318316
)
319317
program = cast(dict[str, object], harness.config.program.data())
@@ -395,9 +393,9 @@ def test_pi_harness_writes_intercepted_model_and_mcp_config() -> None:
395393

396394
assert "apt-get -o Acquire::Retries=3 update" in setup
397395
assert "apt-get -o Acquire::Retries=3 install" in setup
398-
assert harness.config.program.install_spec == PI_DEFAULT_INSTALL_SPEC
399-
assert PI_DEFAULT_INSTALL_SPEC == "@earendil-works/pi-coding-agent@latest"
400-
assert f"npm install -g --ignore-scripts {PI_DEFAULT_INSTALL_SPEC}" in setup
396+
assert harness.config.version == PI_DEFAULT_VERSION
397+
assert PI_DEFAULT_VERSION == "@earendil-works/pi-coding-agent@latest"
398+
assert f"npm install -g --ignore-scripts {PI_DEFAULT_VERSION}" in setup
401399
assert "mariozechner" not in setup
402400
assert '"baseUrl": "${OPENAI_BASE_URL}"' in mcp_setup
403401
assert '"api": "openai-completions"' in mcp_setup
@@ -407,12 +405,8 @@ def test_pi_harness_writes_intercepted_model_and_mcp_config() -> None:
407405
assert f'"command": "{SANDBOX_PYTHON}"' in mcp_setup
408406

409407

410-
def test_pi_harness_preserves_scoped_npm_install_specs() -> None:
411-
harness = Pi(
412-
config=PiConfig(
413-
program=PiProgramConfig(install_spec="@anthropic-ai/claude-code@1.2.3")
414-
)
415-
)
408+
def test_pi_harness_preserves_scoped_npm_versions() -> None:
409+
harness = Pi(config=PiConfig(version="@anthropic-ai/claude-code@1.2.3"))
416410
program = cast(dict[str, object], harness.config.program.data())
417411
setup = cast(str, program["setup"])
418412

@@ -448,7 +442,7 @@ def test_terminus_2_harness_builds_sandbox_program() -> None:
448442

449443
run_script = cast(str, command[2])
450444
assert "TERMINUS_2_WORKDIR=/workspace" in run_script
451-
assert f"--with {TERMINUS_2_DEFAULT_HARBOR_INSTALL_SPEC}" in run_script
445+
assert f"--with {TERMINUS_2_DEFAULT_VERSION}" in run_script
452446
assert "git+https://github.com" not in run_script
453447
assert "max_turns=7" in run_script
454448

0 commit comments

Comments
 (0)