Skip to content

Commit 9cb7fc4

Browse files
committed
feat: end-to-end isolated-runner tests for parallel arms (#123 Phase B)
Closes the SDK-integration gap from #150 (Phase A): adds three end-to-end behavioral tests that exercise the full chain:

    partition_plan -> make_isolated_arm_runner -> run_units -> merge_unit_results

The SDK side is injected via a fake (per the no-live-LLM project principle, see CLAUDE.md). The tests assert the orchestration contract — every unit dispatches with isolation='worktree' to a non-overlapping results dir, failures are isolated to the affected arm, and the merged output is deterministic.

Tests:

- test_three_units_dispatched_with_isolation_kwarg: The plan has 1 arm × 1 condition plus 1 arm × 1 condition × 2 seeds = 3 units. All three dispatch with isolation='worktree'. The merged output lists both arms in sorted order, and both are reported complete.
- test_partial_failure_isolated_to_one_arm: The fake runner returns is_error for h-ablation; h-main succeeds. Merged output: h-main complete, h-ablation failed. Failed unit count = 2 (both ablation seeds); total = 3. This covers the acceptance criterion 'one arm failure does not abort iteration'.
- test_no_two_units_share_results_dir: Captures every "Write all output files to:" path the runner sends to each subagent and asserts that all 3 are unique. This covers the acceptance criterion 'no two subagents ever write to the same results/ subpath'.

A local _LocalSDKResult stand-in replaces the import from sdk_dispatch so this branch doesn't depend on sdk_dispatch.py landing first; the real SDKResult from #121 is duck-compatible (same field shape). The full chain works against any sdk_runner respecting the SDKRunner Protocol — production wiring (which constructs the real Anthropic SDK runner with the isolation kwarg) is verified on soak.

Closes #123.
1 parent f7a01f3 commit 9cb7fc4

1 file changed

Lines changed: 132 additions & 1 deletion

File tree

tests/test_parallel_arms.py

Lines changed: 132 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
"""Behavioral tests for the parallel-arm orchestration (#123 Phase A)."""
1+
"""Behavioral tests for the parallel-arm orchestration (#123 Phase A + B)."""
22
from __future__ import annotations
33

44
import json
5+
from dataclasses import dataclass
6+
from pathlib import Path
57

68
import pytest
79

@@ -15,6 +17,16 @@
1517
)
1618

1719

20+
@dataclass
21+
class _LocalSDKResult:
22+
"""Local stand-in for SDKResult so this branch doesn't depend on
23+
sdk_dispatch.py landing first. The real SDKResult is duck-compatible."""
24+
text: str = ""
25+
duration_ms: int = 0
26+
is_error: bool = False
27+
error_message: str = ""
28+
29+
1830
# ─── Plan partitioning ─────────────────────────────────────────────────────
1931

2032
class TestPartitionPlan:
@@ -190,3 +202,122 @@ def test_returns_only_failed_units(self):
190202
failed = failed_units(results)
191203
assert len(failed) == 2
192204
assert all(r.arm_id != "h-main" or r.seed == "s2" for r in failed)
205+
206+
207+
# ─── Phase B: end-to-end with the harness-isolated SDK runner ─────────────
208+
209+
210+
class TestEndToEndWithIsolatedRunner:
    """End-to-end tests of the parallel-arm chain with a fake SDK runner.

    Chain under test: partition_plan -> make_isolated_arm_runner ->
    run_units -> merge_unit_results. The SDK side is injected via a
    fake; per the no-live-LLM policy (CLAUDE.md), no real subagent is
    spawned. The tests assert the orchestration contract: every unit is
    dispatched with isolation="worktree", each unit is pointed at a
    distinct results directory, a failing arm does not affect the other
    arm, and the merged output lists arms in sorted order.
    """

    @staticmethod
    def _results_dirs_in(prompt):
        """Return the results directories named in *prompt*.

        A directory is read from every line that starts with
        "Write all output files to:"; the path is the text after the
        first backtick on that line, with trailing backticks stripped.
        A matching line with no backtick raises IndexError.
        """
        return [
            line.split("`", 1)[1].rstrip("`")
            for line in prompt.splitlines()
            if line.startswith("Write all output files to:")
        ]

    @classmethod
    def _fake_subagent_write(cls, prompt):
        """Simulate a subagent writing out.json into each results dir."""
        for target in cls._results_dirs_in(prompt):
            out_dir = Path(target)
            out_dir.mkdir(parents=True, exist_ok=True)
            (out_dir / "out.json").write_text("{}")

    @staticmethod
    def _isolated_runner(sdk_runner, tmp_path):
        """Create iter-1 under *tmp_path* and wrap *sdk_runner* for it.

        Returns whatever make_isolated_arm_runner returns; it is passed
        to run_units as the ``runner`` argument by the tests.
        """
        # Kept as a function-local import, matching how the tests
        # imported it before this helper was extracted.
        from orchestrator.worktree import make_isolated_arm_runner

        iter_dir = tmp_path / "iter-1"
        iter_dir.mkdir(parents=True)
        return make_isolated_arm_runner(
            sdk_runner=sdk_runner, repo_path=tmp_path, iter_dir=iter_dir,
        )

    def _plan(self):
        """Return a two-arm plan that partitions into three units.

        h-main has one condition without seeds; h-ablation has one
        condition with seeds s1 and s2.
        """
        return {"arms": [
            {"arm_id": "h-main", "conditions": [
                {"name": "x", "command": "./run --arm main"},
            ]},
            {"arm_id": "h-ablation", "conditions": [
                {"name": "y", "command": "./run --arm ablation",
                 "seeds": ["s1", "s2"]},
            ]},
        ]}

    def _success_runner(self):
        """Return ``(sdk_runner, sdk_calls)`` for an always-succeeding fake.

        ``sdk_calls`` collects the keyword arguments of every call so a
        test can inspect how each unit was dispatched.
        """
        sdk_calls: list[dict] = []

        def sdk_runner(**kwargs):
            sdk_calls.append(kwargs)
            # Simulate the subagent writing a file in its results dir.
            self._fake_subagent_write(kwargs.get("prompt", ""))
            return _LocalSDKResult(text="done", duration_ms=120)

        return sdk_runner, sdk_calls

    def test_three_units_dispatched_with_isolation_kwarg(self, tmp_path):
        """All three units dispatch with isolation="worktree" and complete."""
        sdk_runner, sdk_calls = self._success_runner()
        runner = self._isolated_runner(sdk_runner, tmp_path)

        units = partition_plan(self._plan())
        assert len(units) == 3

        results = run_units(units, runner=runner)
        assert len(sdk_calls) == 3
        assert all(c.get("isolation") == "worktree" for c in sdk_calls)

        merged = merge_unit_results(results)
        assert [a["arm_id"] for a in merged["arms"]] == ["h-ablation", "h-main"]
        assert all(a["status"] == "complete" for a in merged["arms"])

    def test_partial_failure_isolated_to_one_arm(self, tmp_path):
        """A failing h-ablation arm leaves h-main reported as complete."""

        def sdk_runner(**kwargs):
            prompt = kwargs.get("prompt", "")
            # Any prompt mentioning h-ablation is answered with an error;
            # this relies on the arm id appearing in the prompt text.
            if "h-ablation" in prompt:
                return _LocalSDKResult(
                    text="", is_error=True, error_message="exit 1",
                )
            self._fake_subagent_write(prompt)
            return _LocalSDKResult(text="ok")

        runner = self._isolated_runner(sdk_runner, tmp_path)
        merged = merge_unit_results(
            run_units(partition_plan(self._plan()), runner=runner)
        )
        by_arm = {a["arm_id"]: a for a in merged["arms"]}
        assert by_arm["h-main"]["status"] == "complete"
        assert by_arm["h-ablation"]["status"] == "failed"
        # Both h-ablation seeds fail; the single h-main unit succeeds.
        assert merged["failed_unit_count"] == 2
        assert merged["total_unit_count"] == 3

    def test_no_two_units_share_results_dir(self, tmp_path):
        """Every dispatched prompt names a different results directory."""
        sdk_runner, _ = self._success_runner()
        seen_dirs: list[str] = []

        def capturing(**kwargs):
            # Record the directories first, then delegate to the
            # succeeding fake so the unit still completes.
            seen_dirs.extend(self._results_dirs_in(kwargs.get("prompt", "")))
            return sdk_runner(**kwargs)

        runner = self._isolated_runner(capturing, tmp_path)
        run_units(partition_plan(self._plan()), runner=runner)

        # Acceptance criterion: no two subagents ever write to the same
        # results path.
        assert len(seen_dirs) == 3
        assert len(set(seen_dirs)) == 3

0 commit comments

Comments
 (0)