Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions examples/specs/swe-bench-pro-spacedock-codex.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@ agent:
top_p: null
seed: null
solver_workflow: ./examples/solver_workflows/codex-benchmark-solver
# SWE-tuned budget — above the 1200s codex default: large repos + long
# test suites need more turns and a longer per-attempt/overall timeout.
max_turns: 400
# SWE-tuned budget. The codex runtime does NOT honor a custom max_turns —
# it accepts only the default (200); any other value raises
# SpacedockSolverAgentError. So keep max_turns at 200 and buy the
# "large repo + long test suite" headroom via the TIMEOUTS below, which
# govern wall-clock (harbor trial config), not codex turn limits.
max_turns: 200
override_timeout_sec: 5400
max_timeout_sec: 7200
reasoning_effort: xhigh
Expand Down
9 changes: 8 additions & 1 deletion src/razorback/agents/_runtime/codex.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,16 @@ def _codex_kwargs(harbor_agent_kwargs: dict[str, Any]) -> dict[str, Any]:
if _is_empty_noop(name, value):
continue
if name not in _CODEX_SUPPORTED_KWARGS:
hint = ""
if name == "max_turns":
hint = (
" The codex runtime accepts only the default max_turns (200); "
"keep it at 200 and budget wall-clock via "
"override_timeout_sec / max_timeout_sec instead."
)
raise SpacedockSolverAgentError(
"codex runtime adapter cannot honor unsupported harbor_agent_kwargs "
f"field {name!r}; refusing to silently drop it."
f"field {name!r}; refusing to silently drop it.{hint}"
)
kw[name] = value
return kw
Expand Down
31 changes: 30 additions & 1 deletion src/razorback/harbor_tasks/materialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,41 @@ def _reflect_allowed_files(
target.mkdir(parents=True, exist_ok=True)
continue
target.parent.mkdir(parents=True, exist_ok=True)
if view_mode == "copy":
# The `environment/` subtree is the Docker build context (current Harbor
# convention): `docker compose build` runs with it as the project
# directory, and BuildKit cannot read a Dockerfile (or any build input)
# that symlinks outside the context ("failed to read dockerfile: no such
# file or directory"). So always materialize the build context as real
# files, even in link mode — mirroring how `_patch_task_toml` keeps
# task.toml view-owned. Bulk task files still symlink in link mode (the
# whole point of bind: no eager duplication).
copy_real = view_mode == "copy" or _is_build_context_path(rel)
if path.is_symlink():
# The name filter above only saw the LINK's own path. A symlink with
# an innocuous name can still point at a denied answer artifact (e.g.
# `environment/leak.patch -> ../gold.patch`); copying follows the link
# and embeds the target's bytes under an allowed view path, and a view
# symlink chases through to it too — bypassing the leakage deny
# boundary. Resolve the target and re-apply the deny check (and
# source containment) so a symlink cannot smuggle denied or
# out-of-tree content into the view.
try:
resolved_rel = path.resolve().relative_to(source).as_posix()
except ValueError:
continue # target escapes the source tree — refuse it
if matches_denied_path(resolved_rel, exclude_globs):
continue
if copy_real:
shutil.copy2(path, target)
else:
os.symlink(path, target)


def _is_build_context_path(rel: Path) -> bool:
"""True for files under the `environment/` Docker build context."""
return rel.parts[:1] == ("environment",)


def _patch_task_toml(
task_toml: Path,
*,
Expand Down
59 changes: 59 additions & 0 deletions tests/unit/test_harbor_task_view_materializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,65 @@ def test_link_mode_symlinks_files_but_never_mutates_source_task_toml(tmp_path):
assert "RAZORBACK_BENCHMARK_TASK_ID" not in (source / "task.toml").read_text()


def test_link_mode_copies_environment_build_context_as_real_files(tmp_path):
    """Under `view_mode="link"` the `environment/` build context stays REAL.

    `docker compose build` uses the view's `environment/` directory as its
    build context, and BuildKit cannot read a Dockerfile that symlinks outside
    that context (`failed to read dockerfile: no such file or directory`). A
    symlinked build context would therefore break build-from-source benchmarks
    (e.g. swe-bench-pro) in the default bind/link mode. Everything outside the
    build context must still be symlinked.
    """
    source = _write_source_task(tmp_path)
    expected_dockerfile = (source / "environment" / "Dockerfile").read_text()

    view = materialize_harbor_task_view(
        source_task_dir=source,
        view_root=tmp_path / "views",
        benchmark_kind="fixture-bench",
        benchmark_task_id="task-001",
        transform_name="fixture-transform",
        view_mode="link",
    )

    # The Dockerfile must be a view-owned regular file with the source's bytes.
    view_dockerfile = view / "environment" / "Dockerfile"
    assert view_dockerfile.is_file()
    assert not view_dockerfile.is_symlink()
    assert view_dockerfile.read_text() == expected_dockerfile

    # Bulk task files keep the link-mode contract: they remain symlinks.
    for bulk_file in (view / "instruction.md", view / "data" / "input.csv"):
        assert bulk_file.is_symlink()


def test_environment_symlink_to_denied_target_is_not_smuggled(tmp_path):
    """An innocuously named `environment/` symlink must not leak a DENIED file.

    Materializing the build context copies with `shutil.copy2`, which follows
    symlinks, while the name-based deny filter only inspects the link's own
    path. Without a check on the resolved target,
    `environment/leak.patch -> ../gold_patch.diff` would embed gold-patch bytes
    under an allowed view path. The materializer is expected to resolve the
    target, re-apply the deny check, and drop the entry.
    """
    source = _write_source_task(tmp_path)
    denied_target = source / "gold_patch.diff"
    denied_target.write_text("--- GOLD ANSWER PATCH ---\n")
    # Disguise the denied artifact behind a name the deny globs do not match.
    (source / "environment" / "leak.patch").symlink_to(denied_target)

    view = materialize_harbor_task_view(
        source_task_dir=source,
        view_root=tmp_path / "views",
        benchmark_kind="fixture-bench",
        benchmark_task_id="task-001",
        transform_name="fixture-transform",
        exclude_globs=("gold_patch*", "gold.patch"),
        view_mode="link",
    )

    # The legitimate build context is still materialized as a real file ...
    view_dockerfile = view / "environment" / "Dockerfile"
    assert view_dockerfile.is_file()
    assert not view_dockerfile.is_symlink()
    # ... while the disguised link to the denied target is absent from the view.
    assert not (view / "environment" / "leak.patch").exists()


def test_materialized_view_is_harbor_taskconfig_path_ready(tmp_path):
source = _write_source_task(tmp_path)

Expand Down
17 changes: 17 additions & 0 deletions tests/unit/test_runtime_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,23 @@ def test_codex_rejects_unsupported_contract_kwargs(tmp_path, kwarg):
)


def test_codex_max_turns_rejection_is_actionable(tmp_path):
    """Rejecting a custom `max_turns` must tell the user how to fix it.

    A non-default `max_turns` is described as the most common codex trip
    (claude honors it, codex does not). The raised error has to name the
    default (200) and point at timeouts as the way to budget wall-clock time,
    so users are not left to reverse-engineer the remedy.
    """
    with pytest.raises(SpacedockSolverAgentError) as raised:
        codex_adapter.build_inner_agent(
            logs_dir=tmp_path,
            model="gpt-5.1-codex",
            harbor_agent_kwargs={"max_turns": 400},
            extra_env={"OPENAI_API_KEY": "sk-fake"},
        )

    error_text = str(raised.value)
    # The hint must mention the accepted default ...
    assert "200" in error_text
    # ... and steer the user towards the timeout settings.
    assert "timeout" in error_text.lower()


def test_pi_raises_not_implemented(tmp_path):
with pytest.raises(NotImplementedError, match="pi"):
pi_adapter.build_inner_agent(
Expand Down