Skip to content

Commit 2d8a1ae

Browse files
authored
Merge pull request #26 from spacedock-dev/fix/swe-bench-pro-default-run
fix(swe-bench-pro): make the default `rk run` work out of the box
2 parents e0f4017 + 6a4b891 commit 2d8a1ae

5 files changed

Lines changed: 120 additions & 5 deletions

File tree

examples/specs/swe-bench-pro-spacedock-codex.yaml

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,12 @@ agent:
1414
top_p: null
1515
seed: null
1616
solver_workflow: ./examples/solver_workflows/codex-benchmark-solver
17-
# SWE-tuned budget — above the 1200s codex default: large repos + long
18-
# test suites need more turns and a longer per-attempt/overall timeout.
19-
max_turns: 400
17+
# SWE-tuned budget. The codex runtime does NOT honor a custom max_turns —
18+
# it accepts only the default (200); any other value raises
19+
# SpacedockSolverAgentError. So keep max_turns at 200 and buy the
20+
# "large repo + long test suite" headroom via the TIMEOUTS below, which
21+
# govern wall-clock (harbor trial config), not codex turn limits.
22+
max_turns: 200
2023
override_timeout_sec: 5400
2124
max_timeout_sec: 7200
2225
reasoning_effort: xhigh

src/razorback/agents/_runtime/codex.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -209,9 +209,16 @@ def _codex_kwargs(harbor_agent_kwargs: dict[str, Any]) -> dict[str, Any]:
209209
if _is_empty_noop(name, value):
210210
continue
211211
if name not in _CODEX_SUPPORTED_KWARGS:
212+
hint = ""
213+
if name == "max_turns":
214+
hint = (
215+
" The codex runtime accepts only the default max_turns (200); "
216+
"keep it at 200 and budget wall-clock via "
217+
"override_timeout_sec / max_timeout_sec instead."
218+
)
212219
raise SpacedockSolverAgentError(
213220
"codex runtime adapter cannot honor unsupported harbor_agent_kwargs "
214-
f"field {name!r}; refusing to silently drop it."
221+
f"field {name!r}; refusing to silently drop it.{hint}"
215222
)
216223
kw[name] = value
217224
return kw

src/razorback/harbor_tasks/materialize.py

Lines changed: 30 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -113,12 +113,41 @@ def _reflect_allowed_files(
113113
target.mkdir(parents=True, exist_ok=True)
114114
continue
115115
target.parent.mkdir(parents=True, exist_ok=True)
116-
if view_mode == "copy":
116+
# The `environment/` subtree is the Docker build context (current Harbor
117+
# convention): `docker compose build` runs with it as the project
118+
# directory, and BuildKit cannot read a Dockerfile (or any build input)
119+
# that symlinks outside the context ("failed to read dockerfile: no such
120+
# file or directory"). So always materialize the build context as real
121+
# files, even in link mode — mirroring how `_patch_task_toml` keeps
122+
# task.toml view-owned. Bulk task files still symlink in link mode (the
123+
# whole point of bind: no eager duplication).
124+
copy_real = view_mode == "copy" or _is_build_context_path(rel)
125+
if path.is_symlink():
126+
# The name filter above only saw the LINK's own path. A symlink with
127+
# an innocuous name can still point at a denied answer artifact (e.g.
128+
# `environment/leak.patch -> ../gold.patch`); copying follows the link
129+
# and embeds the target's bytes under an allowed view path, and a view
130+
# symlink chases through to it too — bypassing the leakage deny
131+
# boundary. Resolve the target and re-apply the deny check (and
132+
# source containment) so a symlink cannot smuggle denied or
133+
# out-of-tree content into the view.
134+
try:
135+
resolved_rel = path.resolve().relative_to(source).as_posix()
136+
except ValueError:
137+
continue # target escapes the source tree — refuse it
138+
if matches_denied_path(resolved_rel, exclude_globs):
139+
continue
140+
if copy_real:
117141
shutil.copy2(path, target)
118142
else:
119143
os.symlink(path, target)
120144

121145

146+
def _is_build_context_path(rel: Path) -> bool:
147+
"""True for files under the `environment/` Docker build context."""
148+
return rel.parts[:1] == ("environment",)
149+
150+
122151
def _patch_task_toml(
123152
task_toml: Path,
124153
*,

tests/unit/test_harbor_task_view_materializer.py

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,65 @@ def test_link_mode_symlinks_files_but_never_mutates_source_task_toml(tmp_path):
109109
assert "RAZORBACK_BENCHMARK_TASK_ID" not in (source / "task.toml").read_text()
110110

111111

112+
def test_link_mode_copies_environment_build_context_as_real_files(tmp_path):
113+
"""`view_mode="link"` must keep the `environment/` Docker build context as
114+
REAL files, not symlinks. `docker compose build` runs with the view's
115+
`environment/` dir as the build context; BuildKit cannot read a Dockerfile
116+
that symlinks outside the context (`failed to read dockerfile: no such file
117+
or directory`), so a symlinked build context breaks every build-from-source
118+
benchmark (e.g. swe-bench-pro) under the default bind/link mode. Bulk task
119+
files still symlink — only the build context is forced real.
120+
"""
121+
source = _write_source_task(tmp_path)
122+
dockerfile_text = (source / "environment" / "Dockerfile").read_text()
123+
124+
view = materialize_harbor_task_view(
125+
source_task_dir=source,
126+
view_root=tmp_path / "views",
127+
benchmark_kind="fixture-bench",
128+
benchmark_task_id="task-001",
129+
transform_name="fixture-transform",
130+
view_mode="link",
131+
)
132+
133+
# the Docker build context is a real, view-owned file (readable by BuildKit)
134+
assert (view / "environment" / "Dockerfile").is_file()
135+
assert not (view / "environment" / "Dockerfile").is_symlink()
136+
assert (view / "environment" / "Dockerfile").read_text() == dockerfile_text
137+
# bulk files outside environment/ still symlink — the bind/link contract holds
138+
assert (view / "instruction.md").is_symlink()
139+
assert (view / "data" / "input.csv").is_symlink()
140+
141+
142+
def test_environment_symlink_to_denied_target_is_not_smuggled(tmp_path):
143+
"""A symlink under `environment/` with an innocuous name must not smuggle a
144+
DENIED target's bytes into the view. Copying the build context follows
145+
symlinks (shutil.copy2), and the name-based deny filter only sees the link's
146+
own path — so `environment/leak.patch -> ../gold_patch.diff` would otherwise
147+
embed gold-patch content under an allowed view path. The materializer must
148+
resolve the target and re-apply the deny check, dropping it.
149+
"""
150+
source = _write_source_task(tmp_path)
151+
(source / "gold_patch.diff").write_text("--- GOLD ANSWER PATCH ---\n")
152+
(source / "environment" / "leak.patch").symlink_to(source / "gold_patch.diff")
153+
154+
view = materialize_harbor_task_view(
155+
source_task_dir=source,
156+
view_root=tmp_path / "views",
157+
benchmark_kind="fixture-bench",
158+
benchmark_task_id="task-001",
159+
transform_name="fixture-transform",
160+
exclude_globs=("gold_patch*", "gold.patch"),
161+
view_mode="link",
162+
)
163+
164+
# the legit build context is still materialized as a real file …
165+
assert (view / "environment" / "Dockerfile").is_file()
166+
assert not (view / "environment" / "Dockerfile").is_symlink()
167+
# … but the disguised symlink to a denied target is dropped, not embedded
168+
assert not (view / "environment" / "leak.patch").exists()
169+
170+
112171
def test_materialized_view_is_harbor_taskconfig_path_ready(tmp_path):
113172
source = _write_source_task(tmp_path)
114173

tests/unit/test_runtime_adapters.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -501,6 +501,23 @@ def test_codex_rejects_unsupported_contract_kwargs(tmp_path, kwarg):
501501
)
502502

503503

504+
def test_codex_max_turns_rejection_is_actionable(tmp_path):
505+
"""A custom max_turns is the most common codex trip (claude honors it, codex
506+
does not). The error must tell the user the fix: keep the default (200) and
507+
budget wall-clock via timeouts — not leave them to reverse-engineer it.
508+
"""
509+
with pytest.raises(SpacedockSolverAgentError) as excinfo:
510+
codex_adapter.build_inner_agent(
511+
logs_dir=tmp_path,
512+
model="gpt-5.1-codex",
513+
harbor_agent_kwargs={"max_turns": 400},
514+
extra_env={"OPENAI_API_KEY": "sk-fake"},
515+
)
516+
message = str(excinfo.value)
517+
assert "200" in message
518+
assert "timeout" in message.lower()
519+
520+
504521
def test_pi_raises_not_implemented(tmp_path):
505522
with pytest.raises(NotImplementedError, match="pi"):
506523
pi_adapter.build_inner_agent(

0 commit comments

Comments
 (0)