Skip to content

Commit 2a8e281

Browse files
committed
Move agent versions to harness config
1 parent 48ac9e8 commit 2a8e281

11 files changed

Lines changed: 92 additions & 77 deletions

File tree

packages/harnesses/README.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,13 @@ Command agents use `name@version` specs where their installer supports a
6666
versioned package or release. Use `@latest` for a moving latest install:
6767

6868
```toml
69-
[eval.harness.program]
70-
# OpenCode
71-
install_spec = "PrimeIntellect-ai/opencode@latest"
69+
[eval.harness]
70+
id = "harnesses.opencode"
71+
version = "PrimeIntellect-ai/opencode@latest"
72+
```
7273

73-
# MiniSWEAgent or Pi
74-
install_spec = "mini-swe-agent@2.2.8"
74+
```toml
75+
[eval.harness]
76+
id = "harnesses.mini_swe_agent"
77+
version = "mini-swe-agent@2.2.8"
7578
```

packages/harnesses/harnesses/mini_swe_agent.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import shlex
22
from pathlib import PurePosixPath
33

4+
from pydantic import model_validator
45
import verifiers as vf
56
from verifiers.v1.utils.sandbox_python_utils import python_runtime_setup_command
67

@@ -16,23 +17,23 @@
1617
MINI_SWE_AGENT_DEFAULT_SYSTEM_PROMPT_PATH = "/mini-swe-agent/system.txt"
1718
MINI_SWE_AGENT_DEFAULT_LOG_PATH = "/logs/agent/mini-swe-agent.log"
1819
MINI_SWE_AGENT_DEFAULT_TRAJECTORY_PATH = "/logs/agent/mini-swe-agent.traj.json"
19-
MINI_SWE_AGENT_DEFAULT_INSTALL_SPEC = "mini-swe-agent@2.2.8"
20+
MINI_SWE_AGENT_DEFAULT_VERSION = "mini-swe-agent@2.2.8"
2021
MINI_SWE_AGENT_DEFAULT_CONFIG_SPEC = "mini"
2122
MINI_SWE_AGENT_DEFAULT_MODEL_CLASS = "litellm"
2223
MINI_SWE_AGENT_DEFAULT_ENVIRONMENT_TIMEOUT = 120
2324

2425

2526
def build_mini_swe_agent_install_script(
26-
install_spec: str = MINI_SWE_AGENT_DEFAULT_INSTALL_SPEC,
27+
version: str = MINI_SWE_AGENT_DEFAULT_VERSION,
2728
prefix_dir: str = DEFAULT_PREFIX_DIR,
2829
) -> str:
2930
root = shlex.quote(str(PurePosixPath(prefix_dir).parent))
3031
prefix = shlex.quote(prefix_dir)
3132
site_packages = shlex.quote(f"{prefix_dir.rstrip('/')}/site-packages")
32-
name, version = split_versioned_agent_spec(install_spec)
33+
name, parsed_version = split_versioned_agent_spec(version)
3334
requirement = name
34-
if version and version != "latest":
35-
requirement = f"{name}=={version}"
35+
if parsed_version and parsed_version != "latest":
36+
requirement = f"{name}=={parsed_version}"
3637
return f"""\
3738
set -e
3839
{python_runtime_setup_command()}
@@ -51,12 +52,12 @@ def build_mini_swe_agent_install_script(
5152

5253

5354
class MiniSWEAgentProgramConfig(vf.ProgramConfig):
55+
version: str = MINI_SWE_AGENT_DEFAULT_VERSION
5456
agent_workdir: str = MINI_SWE_AGENT_DEFAULT_AGENT_WORKDIR
5557
instruction_path: str = MINI_SWE_AGENT_DEFAULT_INSTRUCTION_PATH
5658
system_prompt_path: str = MINI_SWE_AGENT_DEFAULT_SYSTEM_PROMPT_PATH
5759
log_path: str = MINI_SWE_AGENT_DEFAULT_LOG_PATH
5860
trajectory_path: str = MINI_SWE_AGENT_DEFAULT_TRAJECTORY_PATH
59-
install_spec: str = MINI_SWE_AGENT_DEFAULT_INSTALL_SPEC
6061
config_spec: str = MINI_SWE_AGENT_DEFAULT_CONFIG_SPEC
6162
model_class: str = MINI_SWE_AGENT_DEFAULT_MODEL_CLASS
6263
environment_timeout: int = MINI_SWE_AGENT_DEFAULT_ENVIRONMENT_TIMEOUT
@@ -112,7 +113,7 @@ def resolve(self) -> vf.ProgramConfig:
112113
config_args.extend(["-c", shlex.quote(spec)])
113114

114115
setup = build_mini_swe_agent_install_script(
115-
install_spec=self.install_spec,
116+
version=self.version,
116117
)
117118
log_dir = str(PurePosixPath(self.log_path).parent)
118119
trajectory_dir = str(PurePosixPath(self.trajectory_path).parent)
@@ -155,9 +156,15 @@ def resolve(self) -> vf.ProgramConfig:
155156

156157

157158
class MiniSWEAgentConfig(vf.HarnessConfig):
159+
version: str = MINI_SWE_AGENT_DEFAULT_VERSION
158160
program: MiniSWEAgentProgramConfig = MiniSWEAgentProgramConfig()
159161
max_turns: int = 4
160162

163+
@model_validator(mode="after")
164+
def apply_version(self):
165+
self.program.version = self.version
166+
return self
167+
161168

162169
class MiniSWEAgent(vf.Harness[MiniSWEAgentConfig]):
163170
config: MiniSWEAgentConfig

packages/harnesses/harnesses/opencode.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22
import shlex
33
from pathlib import PurePosixPath
44

5+
from pydantic import model_validator
56
import verifiers as vf
67
from verifiers.v1.utils.mcp_proxy_utils import proxy_command
78

89
from .utils import split_versioned_agent_spec
910

10-
OPENCODE_DEFAULT_INSTALL_SPEC = "PrimeIntellect-ai/opencode@1.1.63-rl2"
11+
OPENCODE_DEFAULT_VERSION = "PrimeIntellect-ai/opencode@1.1.63-rl2"
1112
OPENCODE_DEFAULT_AGENT_WORKDIR = "/app"
1213
OPENCODE_DEFAULT_INSTRUCTION_PATH = "/opencode/instruction.txt"
1314
OPENCODE_DEFAULT_SYSTEM_PROMPT_PATH = "/opencode/system.txt"
@@ -43,14 +44,14 @@
4344

4445

4546
class OpenCodeProgramConfig(vf.ProgramConfig):
47+
version: str = OPENCODE_DEFAULT_VERSION
4648
agent_workdir: str = OPENCODE_DEFAULT_AGENT_WORKDIR
4749
instruction_path: str = OPENCODE_DEFAULT_INSTRUCTION_PATH
4850
system_prompt_path: str = OPENCODE_DEFAULT_SYSTEM_PROMPT_PATH
4951
log_path: str = OPENCODE_DEFAULT_LOG_PATH
5052
disabled_tools: list[str] = OPENCODE_DEFAULT_DISABLED_TOOLS
5153
allow_git: bool = False
5254
disable_compaction: bool = True
53-
install_spec: str = OPENCODE_DEFAULT_INSTALL_SPEC
5455
install_ripgrep: bool = True
5556
provider_timeout_ms: int = 3_600_000
5657

@@ -75,7 +76,7 @@ def resolve(self) -> vf.ProgramConfig:
7576
if self.install_ripgrep
7677
else ""
7778
)
78-
repo, version = split_versioned_agent_spec(self.install_spec)
79+
repo, version = split_versioned_agent_spec(self.version)
7980
path = "releases/latest/download"
8081
if version and version != "latest":
8182
tag = version if version.startswith("v") else f"v{version}"
@@ -197,9 +198,15 @@ class OpenCodeConfig(vf.HarnessConfig):
197198
system_prompt: vf.PromptInput | vf.SystemPromptConfig | None = (
198199
OPENCODE_DEFAULT_SYSTEM_PROMPT
199200
)
201+
version: str = OPENCODE_DEFAULT_VERSION
200202
program: OpenCodeProgramConfig = OpenCodeProgramConfig()
201203
max_turns: int = 4
202204

205+
@model_validator(mode="after")
206+
def apply_version(self):
207+
self.program.version = self.version
208+
return self
209+
203210

204211
class OpenCode(vf.Harness[OpenCodeConfig]):
205212
config: OpenCodeConfig

packages/harnesses/harnesses/pi.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
import shlex
33
from pathlib import PurePosixPath
44

5+
from pydantic import model_validator
56
import verifiers as vf
67
from verifiers.v1.utils.mcp_proxy_utils import proxy_command
78

8-
PI_DEFAULT_INSTALL_SPEC = "@earendil-works/pi-coding-agent@latest"
9+
PI_DEFAULT_VERSION = "@earendil-works/pi-coding-agent@latest"
910
PI_DEFAULT_WORKDIR = "/app"
1011
PI_DEFAULT_INSTRUCTION_PATH = "/pi/instruction.txt"
1112
PI_DEFAULT_SYSTEM_PROMPT_PATH = "/pi/system.txt"
@@ -14,11 +15,11 @@
1415

1516

1617
class PiProgramConfig(vf.ProgramConfig):
18+
version: str = PI_DEFAULT_VERSION
1719
agent_workdir: str = PI_DEFAULT_WORKDIR
1820
instruction_path: str = PI_DEFAULT_INSTRUCTION_PATH
1921
system_prompt_path: str = PI_DEFAULT_SYSTEM_PROMPT_PATH
2022
log_path: str = PI_DEFAULT_LOG_PATH
21-
install_spec: str = PI_DEFAULT_INSTALL_SPEC
2223
install_mcp_adapter: bool = True
2324
sandbox: vf.SandboxConfig | None = vf.SandboxConfig()
2425

@@ -82,7 +83,7 @@ def resolve(self) -> vf.ProgramConfig:
8283
npm install -g --ignore-scripts n
8384
n 22.19.0
8485
hash -r
85-
npm install -g --ignore-scripts {shlex.quote(self.install_spec)}
86+
npm install -g --ignore-scripts {shlex.quote(self.version)}
8687
"""
8788
artifacts = vf.ArtifactsConfig.model_validate(
8889
{
@@ -127,9 +128,15 @@ class PiConfig(vf.HarnessConfig):
127128
system_prompt: vf.PromptInput | vf.SystemPromptConfig | None = (
128129
PI_DEFAULT_SYSTEM_PROMPT
129130
)
131+
version: str = PI_DEFAULT_VERSION
130132
program: PiProgramConfig = PiProgramConfig()
131133
max_turns: int = 4
132134

135+
@model_validator(mode="after")
136+
def apply_version(self):
137+
self.program.version = self.version
138+
return self
139+
133140

134141
class Pi(vf.Harness[PiConfig]):
135142
config: PiConfig

packages/harnesses/harnesses/terminus_2.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,26 @@
11
import shlex
22
from pathlib import PurePosixPath
33

4+
from pydantic import model_validator
45
import verifiers as vf
56
from verifiers.v1.utils.sandbox_python_utils import SANDBOX_BIN_DIR, uv_setup_command
67

78
TERMINUS_2_DEFAULT_AGENT_WORKDIR = "/app"
89
TERMINUS_2_DEFAULT_INSTRUCTION_PATH = "/terminus_2/instruction.md"
910
TERMINUS_2_DEFAULT_SYSTEM_PROMPT_PATH = "/terminus_2/system_prompt.txt"
1011
TERMINUS_2_DEFAULT_LOG_PATH = "/logs/agent/terminus_2.log"
11-
TERMINUS_2_DEFAULT_HARBOR_INSTALL_SPEC = "harbor==0.6.6"
12+
TERMINUS_2_DEFAULT_VERSION = "harbor==0.6.6"
1213
TERMINUS_2_DEFAULT_PYTHON_VERSION = "3.12"
1314
TERMINUS_2_DEFAULT_MODEL_NAME = "openai/gpt-4.1-mini"
1415
TERMINUS_2_DEFAULT_API_BASE_URL = "https://api.pinference.ai/api/v1"
1516

1617

1718
class Terminus2ProgramConfig(vf.ProgramConfig):
19+
version: str = TERMINUS_2_DEFAULT_VERSION
1820
agent_workdir: str = TERMINUS_2_DEFAULT_AGENT_WORKDIR
1921
instruction_path: str = TERMINUS_2_DEFAULT_INSTRUCTION_PATH
2022
system_prompt_path: str = TERMINUS_2_DEFAULT_SYSTEM_PROMPT_PATH
2123
log_path: str = TERMINUS_2_DEFAULT_LOG_PATH
22-
harbor_install_spec: str = TERMINUS_2_DEFAULT_HARBOR_INSTALL_SPEC
2324
python_version: str = TERMINUS_2_DEFAULT_PYTHON_VERSION
2425
model_name: str = TERMINUS_2_DEFAULT_MODEL_NAME
2526
api_base_url: str = TERMINUS_2_DEFAULT_API_BASE_URL
@@ -171,7 +172,7 @@ async def main() -> None:
171172
cd "$TERMINUS_2_WORKDIR"
172173
uv --no-config run --no-project --quiet \
173174
--python {shlex.quote(self.python_version)} \
174-
--with {shlex.quote(self.harbor_install_spec)} \
175+
--with {shlex.quote(self.version)} \
175176
python - <<'PY' 2>&1 | tee -a {shlex.quote(self.log_path)}
176177
{agent_script}
177178
PY
@@ -186,8 +187,14 @@ async def main() -> None:
186187

187188

188189
class Terminus2Config(vf.HarnessConfig):
190+
version: str = TERMINUS_2_DEFAULT_VERSION
189191
program: Terminus2ProgramConfig = Terminus2ProgramConfig()
190192

193+
@model_validator(mode="after")
194+
def apply_version(self):
195+
self.program.version = self.version
196+
return self
197+
191198

192199
class Terminus2(vf.Harness[Terminus2Config]):
193200
config: Terminus2Config

tests/test_composable_env.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,8 +193,8 @@ def test_taskset_repr():
193193
assert "3" in repr(ts)
194194

195195

196-
def test_composable_mini_swe_agent_unversioned_install_spec_uses_unpinned_requirement():
197-
setup = build_mini_swe_agent_install_script(install_spec=" mini-swe-agent ")
196+
def test_composable_mini_swe_agent_unversioned_version_uses_unpinned_requirement():
197+
setup = build_mini_swe_agent_install_script(version=" mini-swe-agent ")
198198

199199
assert (
200200
"vf_python_install --target /opt/mini-swe-agent/prefix/site-packages mini-swe-agent"
@@ -203,9 +203,9 @@ def test_composable_mini_swe_agent_unversioned_install_spec_uses_unpinned_requir
203203
assert "mini-swe-agent==mini-swe-agent" not in setup
204204

205205

206-
def test_composable_opencode_unversioned_install_spec_uses_latest_download_url():
206+
def test_composable_opencode_unversioned_version_uses_latest_download_url():
207207
setup = build_opencode_install_script(
208-
install_spec=" PrimeIntellect-ai/opencode ",
208+
version=" PrimeIntellect-ai/opencode ",
209209
install_ripgrep=False,
210210
)
211211

tests/test_v1_harbor_cli.py

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@
2525
Terminus2Config,
2626
Terminus2ProgramConfig,
2727
)
28-
from harnesses.pi import PI_DEFAULT_INSTALL_SPEC
28+
from harnesses.pi import PI_DEFAULT_VERSION
2929
from harnesses.terminus_2 import (
3030
TERMINUS_2_DEFAULT_API_BASE_URL,
31-
TERMINUS_2_DEFAULT_HARBOR_INSTALL_SPEC,
31+
TERMINUS_2_DEFAULT_VERSION,
3232
TERMINUS_2_DEFAULT_MODEL_NAME,
3333
Terminus2,
3434
)
@@ -287,18 +287,18 @@ def test_opencode_config_owns_opencode_harness_fields() -> None:
287287

288288

289289
@pytest.mark.parametrize(
290-
"install_spec",
290+
"version",
291291
["PrimeIntellect-ai/opencode@latest", " PrimeIntellect-ai/opencode "],
292292
)
293-
def test_opencode_latest_install_spec_uses_latest_download_url(
294-
install_spec: str,
293+
def test_opencode_latest_version_uses_latest_download_url(
294+
version: str,
295295
) -> None:
296296
harness = OpenCode(
297297
config=OpenCodeConfig(
298+
version=version,
298299
program=OpenCodeProgramConfig(
299-
install_spec=install_spec,
300300
install_ripgrep=False,
301-
)
301+
),
302302
)
303303
)
304304
program = cast(dict[str, object], harness.config.program.data())
@@ -308,12 +308,10 @@ def test_opencode_latest_install_spec_uses_latest_download_url(
308308
assert "OPENCODE_RELEASE_PATH=releases/latest/download" in setup
309309

310310

311-
def test_opencode_custom_install_spec_uses_versioned_release() -> None:
311+
def test_opencode_custom_version_uses_versioned_release() -> None:
312312
harness = OpenCode(
313313
config=OpenCodeConfig(
314-
program=OpenCodeProgramConfig(
315-
install_spec="Example/open-code@v2.0.0",
316-
)
314+
version="Example/open-code@v2.0.0",
317315
)
318316
)
319317
program = cast(dict[str, object], harness.config.program.data())
@@ -395,9 +393,9 @@ def test_pi_harness_writes_intercepted_model_and_mcp_config() -> None:
395393

396394
assert "apt-get -o Acquire::Retries=3 update" in setup
397395
assert "apt-get -o Acquire::Retries=3 install" in setup
398-
assert harness.config.program.install_spec == PI_DEFAULT_INSTALL_SPEC
399-
assert PI_DEFAULT_INSTALL_SPEC == "@earendil-works/pi-coding-agent@latest"
400-
assert f"npm install -g --ignore-scripts {PI_DEFAULT_INSTALL_SPEC}" in setup
396+
assert harness.config.version == PI_DEFAULT_VERSION
397+
assert PI_DEFAULT_VERSION == "@earendil-works/pi-coding-agent@latest"
398+
assert f"npm install -g --ignore-scripts {PI_DEFAULT_VERSION}" in setup
401399
assert "mariozechner" not in setup
402400
assert '"baseUrl": "${OPENAI_BASE_URL}"' in mcp_setup
403401
assert '"api": "openai-completions"' in mcp_setup
@@ -407,12 +405,8 @@ def test_pi_harness_writes_intercepted_model_and_mcp_config() -> None:
407405
assert f'"command": "{SANDBOX_PYTHON}"' in mcp_setup
408406

409407

410-
def test_pi_harness_preserves_scoped_npm_install_specs() -> None:
411-
harness = Pi(
412-
config=PiConfig(
413-
program=PiProgramConfig(install_spec="@anthropic-ai/claude-code@1.2.3")
414-
)
415-
)
408+
def test_pi_harness_preserves_scoped_npm_versions() -> None:
409+
harness = Pi(config=PiConfig(version="@anthropic-ai/claude-code@1.2.3"))
416410
program = cast(dict[str, object], harness.config.program.data())
417411
setup = cast(str, program["setup"])
418412

@@ -448,7 +442,7 @@ def test_terminus_2_harness_builds_sandbox_program() -> None:
448442

449443
run_script = cast(str, command[2])
450444
assert "TERMINUS_2_WORKDIR=/workspace" in run_script
451-
assert f"--with {TERMINUS_2_DEFAULT_HARBOR_INSTALL_SPEC}" in run_script
445+
assert f"--with {TERMINUS_2_DEFAULT_VERSION}" in run_script
452446
assert "git+https://github.com" not in run_script
453447
assert "max_turns=7" in run_script
454448

0 commit comments

Comments
 (0)