Skip to content

Commit f81b6f2

Browse files
authored
Add versioned agent install specs (#1506)
1 parent f54f189 commit f81b6f2

12 files changed

Lines changed: 182 additions & 78 deletions

File tree

packages/harnesses/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,17 @@ own a reusable execution mechanism.
5959
Harness implementations resolve to one `ProgramConfig` shape. Command harness
6060
configs may expose task-relevant execution knobs, but the harness owns command
6161
construction, channel wiring, sandbox placement, and artifacts.
62+
63+
## Agent Versions
64+
65+
Command agents use `name@version` specs where their installer supports a
66+
versioned package or release. Use `@latest` (or omit the version) to install the newest available release:
67+
68+
```toml
69+
[eval.harness.program]
70+
# OpenCode
71+
release = "PrimeIntellect-ai/opencode@latest"
72+
73+
# MiniSWEAgent or Pi
74+
package = "mini-swe-agent@2.2.8"
75+
```

packages/harnesses/harnesses/mini_swe_agent.py

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import verifiers as vf
55
from verifiers.v1.utils.sandbox_python_utils import python_runtime_setup_command
66

7+
from .utils import split_versioned_agent_spec
8+
79
DEFAULT_INSTALL_DIR = "/opt/mini-swe-agent"
810
DEFAULT_PREFIX_DIR = f"{DEFAULT_INSTALL_DIR}/prefix"
911
DEFAULT_SITE_PACKAGES_DIR = f"{DEFAULT_PREFIX_DIR}/site-packages"
@@ -14,41 +16,30 @@
1416
MINI_SWE_AGENT_DEFAULT_SYSTEM_PROMPT_PATH = "/mini-swe-agent/system.txt"
1517
MINI_SWE_AGENT_DEFAULT_LOG_PATH = "/logs/agent/mini-swe-agent.log"
1618
MINI_SWE_AGENT_DEFAULT_TRAJECTORY_PATH = "/logs/agent/mini-swe-agent.traj.json"
17-
MINI_SWE_AGENT_DEFAULT_PACKAGE_VERSION = "2.2.8"
18-
MINI_SWE_AGENT_DEFAULT_PACKAGE_SHA256 = (
19-
"694df4de1337e665e3cd82e99f93374f573bf52b8e7c362ac5d8045ad9f7c37c"
20-
)
19+
MINI_SWE_AGENT_DEFAULT_PACKAGE = "mini-swe-agent@2.2.8"
2120
MINI_SWE_AGENT_DEFAULT_CONFIG_SPEC = "mini"
2221
MINI_SWE_AGENT_DEFAULT_MODEL_CLASS = "litellm"
2322
MINI_SWE_AGENT_DEFAULT_ENVIRONMENT_TIMEOUT = 120
2423

2524

2625
def build_mini_swe_agent_install_script(
27-
package_version: str = MINI_SWE_AGENT_DEFAULT_PACKAGE_VERSION,
28-
package_sha256: str = MINI_SWE_AGENT_DEFAULT_PACKAGE_SHA256,
26+
package: str = MINI_SWE_AGENT_DEFAULT_PACKAGE,
2927
prefix_dir: str = DEFAULT_PREFIX_DIR,
3028
) -> str:
3129
install_dir = str(PurePosixPath(prefix_dir).parent)
3230
site_packages_dir = f"{prefix_dir.rstrip('/')}/site-packages"
33-
wheel_filename = f"mini_swe_agent-{package_version}-py3-none-any.whl"
34-
wheel_url = (
35-
f"https://files.pythonhosted.org/packages/py3/m/mini-swe-agent/{wheel_filename}"
36-
)
3731
setup_prefix_dir = shlex.quote(prefix_dir)
3832
setup_site_packages_dir = shlex.quote(site_packages_dir)
33+
package_name, package_version = split_versioned_agent_spec(package)
34+
package_requirement = package_name
35+
if package_version and package_version != "latest":
36+
package_requirement = f"{package_name}=={package_version}"
3937
return f"""\
4038
set -e
4139
{python_runtime_setup_command()}
4240
rm -rf {setup_prefix_dir}
4341
mkdir -p {shlex.quote(install_dir)} {setup_prefix_dir}/bin {setup_site_packages_dir} {shlex.quote(DEFAULT_LOG_DIR)} /mini-swe-agent
44-
MINI_SWE_AGENT_WHEEL_DIR="$(mktemp -d)"
45-
trap 'rm -rf "$MINI_SWE_AGENT_WHEEL_DIR"' EXIT
46-
MINI_SWE_AGENT_WHEEL="$MINI_SWE_AGENT_WHEEL_DIR/{wheel_filename}"
47-
MINI_SWE_AGENT_WHEEL_URL={shlex.quote(wheel_url)}
48-
export MINI_SWE_AGENT_WHEEL MINI_SWE_AGENT_WHEEL_URL
49-
"$VF_PYTHON" -c 'import os, urllib.request; urllib.request.urlretrieve(os.environ["MINI_SWE_AGENT_WHEEL_URL"], os.environ["MINI_SWE_AGENT_WHEEL"])'
50-
echo "{package_sha256} $MINI_SWE_AGENT_WHEEL" | sha256sum -c -
51-
vf_python_install --target {setup_site_packages_dir} "$MINI_SWE_AGENT_WHEEL"
42+
vf_python_install --target {setup_site_packages_dir} {shlex.quote(package_requirement)}
5243
echo "$VF_PYTHON" > {setup_prefix_dir}/python
5344
cat > {setup_prefix_dir}/bin/mini <<'EOF'
5445
#!/usr/bin/env sh
@@ -66,8 +57,7 @@ class MiniSWEAgentProgramConfig(vf.ProgramConfig):
6657
system_prompt_path: str = MINI_SWE_AGENT_DEFAULT_SYSTEM_PROMPT_PATH
6758
log_path: str = MINI_SWE_AGENT_DEFAULT_LOG_PATH
6859
trajectory_path: str = MINI_SWE_AGENT_DEFAULT_TRAJECTORY_PATH
69-
package_version: str = MINI_SWE_AGENT_DEFAULT_PACKAGE_VERSION
70-
package_sha256: str = MINI_SWE_AGENT_DEFAULT_PACKAGE_SHA256
60+
package: str = MINI_SWE_AGENT_DEFAULT_PACKAGE
7161
config_spec: str = MINI_SWE_AGENT_DEFAULT_CONFIG_SPEC
7262
model_class: str = MINI_SWE_AGENT_DEFAULT_MODEL_CLASS
7363
environment_timeout: int = MINI_SWE_AGENT_DEFAULT_ENVIRONMENT_TIMEOUT
@@ -125,8 +115,7 @@ def resolve(self) -> vf.ProgramConfig:
125115
config_args.extend(["-c", shlex.quote(spec)])
126116

127117
setup = build_mini_swe_agent_install_script(
128-
package_version=self.package_version,
129-
package_sha256=self.package_sha256,
118+
package=self.package,
130119
)
131120
log_dir = str(PurePosixPath(self.log_path).parent)
132121
trajectory_dir = str(PurePosixPath(self.trajectory_path).parent)

packages/harnesses/harnesses/opencode.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,9 @@
55
import verifiers as vf
66
from verifiers.v1.utils.mcp_proxy_utils import proxy_command
77

8-
OPENCODE_DEFAULT_RELEASE_REPO = "PrimeIntellect-ai/opencode"
9-
OPENCODE_DEFAULT_RELEASE_VERSION = "1.1.63-rl2"
10-
OPENCODE_DEFAULT_RELEASE_SHA256 = (
11-
"47f4102796da50769e27d2c9ea6a9cf7941f76898390cb497278cab39c4b6ed4"
12-
)
8+
from .utils import split_versioned_agent_spec
9+
10+
OPENCODE_DEFAULT_RELEASE = "PrimeIntellect-ai/opencode@1.1.63-rl2"
1311
OPENCODE_DEFAULT_AGENT_WORKDIR = "/app"
1412
OPENCODE_DEFAULT_INSTRUCTION_PATH = "/opencode/instruction.txt"
1513
OPENCODE_DEFAULT_SYSTEM_PROMPT_PATH = "/opencode/system.txt"
@@ -52,9 +50,7 @@ class OpenCodeProgramConfig(vf.ProgramConfig):
5250
disabled_tools: list[str] = OPENCODE_DEFAULT_DISABLED_TOOLS
5351
allow_git: bool = False
5452
disable_compaction: bool = True
55-
release_repo: str = OPENCODE_DEFAULT_RELEASE_REPO
56-
release_version: str = OPENCODE_DEFAULT_RELEASE_VERSION
57-
release_sha256: str = OPENCODE_DEFAULT_RELEASE_SHA256
53+
release: str = OPENCODE_DEFAULT_RELEASE
5854
install_ripgrep: bool = True
5955
provider_timeout_ms: int = 3_600_000
6056

@@ -79,18 +75,24 @@ def resolve(self) -> vf.ProgramConfig:
7975
if self.install_ripgrep
8076
else ""
8177
)
82-
sha256_check = (
83-
f'echo "{self.release_sha256} /tmp/opencode.tar.gz" | sha256sum -c -'
84-
)
78+
release_repo, release_version = split_versioned_agent_spec(self.release)
79+
release_path = "releases/latest/download"
80+
if release_version and release_version != "latest":
81+
release_tag = (
82+
release_version
83+
if release_version.startswith("v")
84+
else f"v{release_version}"
85+
)
86+
release_path = f"releases/download/{release_tag}"
8587
# Acquire::Retries=3 mitigates transient archive.ubuntu.com CDN sync
8688
# mismatches that fail fresh-sandbox apt-get calls mid-rollout.
8789
setup = f"""\
8890
set -e
8991
apt-get -o Acquire::Retries=3 update -qq && apt-get -o Acquire::Retries=3 install -y -qq curl tar ca-certificates > /dev/null 2>&1
9092
{rg_install}
9193
92-
OPENCODE_RELEASE_REPO={shlex.quote(self.release_repo)}
93-
OPENCODE_RELEASE_VERSION={shlex.quote(self.release_version)}
94+
OPENCODE_RELEASE_REPO={shlex.quote(release_repo)}
95+
OPENCODE_RELEASE_PATH={shlex.quote(release_path)}
9496
9597
case "$(uname -m)" in
9698
x86_64) OPENCODE_ARCH=x64 ;;
@@ -99,15 +101,13 @@ def resolve(self) -> vf.ProgramConfig:
99101
esac
100102
101103
OPENCODE_ASSET="opencode-linux-$OPENCODE_ARCH.tar.gz"
102-
OPENCODE_RELEASE_TAG="${{OPENCODE_RELEASE_VERSION#v}}"
103-
OPENCODE_RELEASE_URL="https://github.com/$OPENCODE_RELEASE_REPO/releases/download/v$OPENCODE_RELEASE_TAG/$OPENCODE_ASSET"
104+
OPENCODE_RELEASE_URL="https://github.com/$OPENCODE_RELEASE_REPO/$OPENCODE_RELEASE_PATH/$OPENCODE_ASSET"
104105
105106
mkdir -p "$HOME/.opencode/bin"
106107
if [ -x "$HOME/.opencode/bin/opencode" ]; then
107108
echo "OpenCode already installed, skipping download"
108109
else
109110
curl -fsSL "$OPENCODE_RELEASE_URL" -o /tmp/opencode.tar.gz
110-
{sha256_check}
111111
tar -xzf /tmp/opencode.tar.gz -C /tmp
112112
install -m 755 /tmp/opencode "$HOME/.opencode/bin/opencode"
113113
rm -f /tmp/opencode.tar.gz /tmp/opencode

packages/harnesses/harnesses/pi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import verifiers as vf
66
from verifiers.v1.utils.mcp_proxy_utils import proxy_command
77

8-
PI_DEFAULT_PACKAGE = "@earendil-works/pi-coding-agent"
8+
PI_DEFAULT_PACKAGE = "@earendil-works/pi-coding-agent@latest"
99
PI_DEFAULT_WORKDIR = "/app"
1010
PI_DEFAULT_INSTRUCTION_PATH = "/pi/instruction.txt"
1111
PI_DEFAULT_SYSTEM_PROMPT_PATH = "/pi/system.txt"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,10 @@
11
"""Internal harness utilities."""
2+
3+
4+
def split_versioned_agent_spec(spec: str) -> tuple[str, str | None]:
5+
"""Split an agent install spec written as name[@version]."""
6+
spec = spec.strip()
7+
name, _, version = spec.rpartition("@")
8+
if not name:
9+
return spec, None
10+
return name, version or None

tests/test_composable_env.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
TaskSet,
1818
discover_sibling_dir,
1919
)
20+
from verifiers.envs.experimental.composable.harnesses.mini_swe_agent import (
21+
build_mini_swe_agent_install_script,
22+
)
23+
from verifiers.envs.experimental.composable.harnesses.opencode import (
24+
build_install_script as build_opencode_install_script,
25+
)
2026

2127

2228
# ── Mock Rubrics ──────────────────────────────────────────────────────
@@ -187,6 +193,27 @@ def test_taskset_repr():
187193
assert "3" in repr(ts)
188194

189195

196+
def test_composable_mini_swe_agent_unversioned_package_uses_unpinned_requirement():
197+
setup = build_mini_swe_agent_install_script(package=" mini-swe-agent ")
198+
199+
assert (
200+
"vf_python_install --target /opt/mini-swe-agent/prefix/site-packages mini-swe-agent"
201+
in setup
202+
)
203+
assert "mini-swe-agent==mini-swe-agent" not in setup
204+
205+
206+
def test_composable_opencode_unversioned_release_uses_latest_download_url():
207+
setup = build_opencode_install_script(
208+
release=" PrimeIntellect-ai/opencode ",
209+
install_ripgrep=False,
210+
)
211+
212+
assert "OPENCODE_RELEASE_REPO=PrimeIntellect-ai/opencode" in setup
213+
assert "OPENCODE_RELEASE_PATH=releases/latest/download" in setup
214+
assert "releases/download/vPrimeIntellect-ai/opencode" not in setup
215+
216+
190217
@pytest.mark.asyncio
191218
async def test_composable_env_exports_task_workdir():
192219
taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")

tests/test_multiturn_env.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,15 +189,15 @@ async def env_response(self, messages, state, **kwargs): # type: ignore[overrid
189189

190190
async def add_model_response(self, state, prompt_messages, response): # type: ignore[override]
191191
await super().add_model_response(state, prompt_messages, response)
192-
await asyncio.sleep(0.05)
192+
await asyncio.sleep(1)
193193

194194
env = SlowMultiTurnEnv(
195195
client=mock_client,
196196
model="test-model",
197197
dataset=sample_chat_dataset,
198198
parser=Parser(),
199199
rubric=Rubric(),
200-
timeout_seconds=0.01,
200+
timeout_seconds=0.2,
201201
)
202202
mock_client.set_default_response("Still going")
203203

tests/test_v1_harbor_cli.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,11 +279,47 @@ def test_opencode_config_owns_opencode_harness_fields() -> None:
279279
assert harness.config.max_turns == 2
280280
assert "apt-get -o Acquire::Retries=3 update" in setup
281281
assert "apt-get -o Acquire::Retries=3 install" in setup
282+
assert "OPENCODE_RELEASE_REPO=PrimeIntellect-ai/opencode" in setup
283+
assert "OPENCODE_RELEASE_PATH=releases/download/v1.1.63-rl2" in setup
282284
assert "/workspace" in cast(str, command[2])
283285
assert '"webfetch": false' in cast(str, mcp_setup)
284286
assert "/opencode/system.txt" in cast(dict[str, object], program["files"])
285287

286288

289+
@pytest.mark.parametrize(
290+
"release", ["PrimeIntellect-ai/opencode@latest", " PrimeIntellect-ai/opencode "]
291+
)
292+
def test_opencode_latest_release_uses_latest_download_url(release: str) -> None:
293+
harness = OpenCode(
294+
config=OpenCodeConfig(
295+
program=OpenCodeProgramConfig(
296+
release=release,
297+
install_ripgrep=False,
298+
)
299+
)
300+
)
301+
program = cast(dict[str, object], harness.config.program.data())
302+
setup = cast(str, program["setup"])
303+
304+
assert "OPENCODE_RELEASE_REPO=PrimeIntellect-ai/opencode" in setup
305+
assert "OPENCODE_RELEASE_PATH=releases/latest/download" in setup
306+
307+
308+
def test_opencode_custom_release_uses_versioned_spec() -> None:
309+
harness = OpenCode(
310+
config=OpenCodeConfig(
311+
program=OpenCodeProgramConfig(
312+
release="Example/open-code@v2.0.0",
313+
)
314+
)
315+
)
316+
program = cast(dict[str, object], harness.config.program.data())
317+
setup = cast(str, program["setup"])
318+
319+
assert "OPENCODE_RELEASE_REPO=Example/open-code" in setup
320+
assert "OPENCODE_RELEASE_PATH=releases/download/v2.0.0" in setup
321+
322+
287323
@pytest.mark.parametrize(
288324
("harness_cls", "config_cls", "program_cls"),
289325
[
@@ -357,7 +393,7 @@ def test_pi_harness_writes_intercepted_model_and_mcp_config() -> None:
357393
assert "apt-get -o Acquire::Retries=3 update" in setup
358394
assert "apt-get -o Acquire::Retries=3 install" in setup
359395
assert harness.config.program.package == PI_DEFAULT_PACKAGE
360-
assert PI_DEFAULT_PACKAGE == "@earendil-works/pi-coding-agent"
396+
assert PI_DEFAULT_PACKAGE == "@earendil-works/pi-coding-agent@latest"
361397
assert f"npm install -g --ignore-scripts {PI_DEFAULT_PACKAGE}" in setup
362398
assert "mariozechner" not in setup
363399
assert '"baseUrl": "${OPENAI_BASE_URL}"' in mcp_setup
@@ -368,6 +404,18 @@ def test_pi_harness_writes_intercepted_model_and_mcp_config() -> None:
368404
assert f'"command": "{SANDBOX_PYTHON}"' in mcp_setup
369405

370406

407+
def test_pi_harness_preserves_scoped_npm_package_versions() -> None:
408+
harness = Pi(
409+
config=PiConfig(
410+
program=PiProgramConfig(package="@anthropic-ai/claude-code@1.2.3")
411+
)
412+
)
413+
program = cast(dict[str, object], harness.config.program.data())
414+
setup = cast(str, program["setup"])
415+
416+
assert "npm install -g --ignore-scripts @anthropic-ai/claude-code@1.2.3" in setup
417+
418+
371419
def test_terminus_2_harness_builds_sandbox_program() -> None:
372420
harness = Terminus2(
373421
config=Terminus2Config(

tests/test_v1_mini_swe_agent.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,38 @@ def test_mini_swe_agent_builds_sandbox_program():
8080
assert "model.model_kwargs.parallel_tool_calls=true" in script
8181
assert "apt-get -o Acquire::Retries=3 update" in cast(str, program["setup"])
8282
assert "apt-get -o Acquire::Retries=3 install" in cast(str, program["setup"])
83+
assert "mini-swe-agent==2.2.8" in cast(str, program["setup"])
8384
assert "/mini-swe-agent/prompt.txt" in cast(dict[str, object], program["files"])
8485
assert "/mini-swe-agent/system.txt" in cast(dict[str, object], program["files"])
8586
assert "mini_swe_agent_log" in cast(dict[str, object], program["artifacts"])
8687

8788

89+
@pytest.mark.parametrize("package", ["mini-swe-agent@latest", " mini-swe-agent "])
90+
def test_mini_swe_agent_latest_package_uses_unpinned_pip_requirement(package: str):
91+
harness = MiniSWEAgent(
92+
config=MiniSWEAgentConfig(program=MiniSWEAgentProgramConfig(package=package))
93+
)
94+
program = cast(dict[str, Any], harness.config.program.data())
95+
setup = cast(str, program["setup"])
96+
97+
assert (
98+
"vf_python_install --target /opt/mini-swe-agent/prefix/site-packages mini-swe-agent"
99+
in setup
100+
)
101+
102+
103+
def test_mini_swe_agent_pinned_package_uses_pip_requirement():
104+
harness = MiniSWEAgent(
105+
config=MiniSWEAgentConfig(
106+
program=MiniSWEAgentProgramConfig(package="mini-swe-agent@2.2.7")
107+
)
108+
)
109+
program = cast(dict[str, Any], harness.config.program.data())
110+
setup = cast(str, program["setup"])
111+
112+
assert "mini-swe-agent==2.2.7" in setup
113+
114+
88115
def test_mini_swe_agent_composes_with_harbor_taskset(
89116
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
90117
):

verifiers/envs/experimental/composable/harnesses/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
)
1010
from verifiers.envs.experimental.composable.harnesses.opencode import (
1111
DEFAULT_DISABLED_TOOLS,
12-
DEFAULT_RELEASE_SHA256,
12+
DEFAULT_RELEASE,
1313
DEFAULT_SYSTEM_PROMPT,
1414
OPENCODE_INSTALL_SCRIPT,
1515
build_install_script as build_opencode_install_script,
@@ -39,7 +39,7 @@
3939
"build_opencode_run_command",
4040
"OPENCODE_INSTALL_SCRIPT",
4141
"DEFAULT_DISABLED_TOOLS",
42-
"DEFAULT_RELEASE_SHA256",
42+
"DEFAULT_RELEASE",
4343
"DEFAULT_SYSTEM_PROMPT",
4444
"mini_swe_agent_harness",
4545
"build_mini_swe_agent_install_script",

0 commit comments

Comments
 (0)