Skip to content

Commit 9c40d02

Browse files
committed
feat(scripts): add deterministic core4 lane CLI wrapper
1 parent 9937a9f commit 9c40d02

2 files changed

Lines changed: 273 additions & 0 deletions

File tree

docs/misc/core4_lane_cli.md

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Core4 Lane CLI
2+
3+
`scripts/core4_lane.py` wraps the recurring 4-task WAA eval lane with deterministic arguments.
4+
5+
It supports:
6+
7+
- `pack`: generate a reusable shell script ("resume pack") with fully resolved commands.
8+
- `run`: execute trial loops directly without manually pasting long command chains.
9+
10+
## Defaults
11+
12+
- tasks: `04d9aeaf,0bf05a7d,0e763496,70745df8`
13+
- demo dir: `annotated_demos_core4`
14+
- output root: `benchmark_results`
15+
- lane name: `repeat_core4`
16+
- agent: `api-openai`
17+
- max steps: `15`
18+
19+
## Examples
20+
21+
Generate a deterministic command pack:
22+
23+
```bash
24+
uv run python scripts/core4_lane.py pack --trials 3 --run-stamp 20260305_1534
25+
```
26+
27+
Print the commands that would run, without executing them (dry run):
28+
29+
```bash
30+
uv run python scripts/core4_lane.py run --trials 3 --dry-run
31+
```
32+
33+
Run three trials now:
34+
35+
```bash
36+
uv run python scripts/core4_lane.py run --trials 3 --fail-fast
37+
```
38+
39+
Run with clean desktop parity flags:
40+
41+
```bash
42+
uv run python scripts/core4_lane.py run \
43+
--trials 2 \
44+
--clean-desktop \
45+
--force-tray-icons \
46+
--waa-image-version win11-24h2-2026-03-04
47+
```
48+

scripts/core4_lane.py

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
#!/usr/bin/env python3
2+
"""Deterministic CLI wrapper for the recurring 4-task WAA eval lane.
3+
4+
This script avoids ad-hoc copy/paste command chains by generating a stable
5+
"command pack" and/or executing repeated trials programmatically.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import argparse
11+
import shlex
12+
import subprocess
13+
import sys
14+
from dataclasses import dataclass
15+
from datetime import datetime, timezone
16+
from pathlib import Path
17+
18+
19+
# Repository root: two directory levels above this file's resolved location.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Eval script that every trial command invokes.
RUN_DC_EVAL = REPO_ROOT.joinpath("scripts", "run_dc_eval.py")
# Comma-separated task IDs/prefixes used when --tasks is not given.
DEFAULT_TASKS = "04d9aeaf,0bf05a7d,0e763496,70745df8"
22+
23+
24+
@dataclass(frozen=True)
class TrialConfig:
    """Immutable description of a single trial within a lane run.

    Attributes:
        trial_num: Index of this trial.
        run_stamp: Stamp embedded in the trial's run directory name.
        output_root: Directory under which the trial's run directory lives.
        lane_name: Lane name used as the run directory prefix.
    """

    trial_num: int
    run_stamp: str
    output_root: Path
    lane_name: str

    def run_dir_name(self) -> str:
        """Return the run directory name, ``<lane>_trial<N>_<stamp>``."""
        return "{}_trial{}_{}".format(self.lane_name, self.trial_num, self.run_stamp)

    def output_arg(self) -> str:
        """Return the trial's output directory as a string path."""
        trial_dir = self.output_root.joinpath(self.run_dir_name())
        return str(trial_dir)
36+
37+
38+
def _bool_flag(enabled: bool, flag: str) -> list[str]:
39+
return [flag] if enabled else []
40+
41+
42+
def _build_eval_cmd(args: argparse.Namespace, trial: TrialConfig) -> list[str]:
    """Assemble the argv list that runs the eval script for one trial.

    The command starts with the current interpreter and ``RUN_DC_EVAL``,
    followed by the always-present options. ``--vm-ip``, the controller
    option group, ``--clean-desktop``, ``--force-tray-icons`` and
    ``--waa-image-version`` are appended only when set on *args*.

    Args:
        args: Parsed CLI arguments.
        trial: Trial whose output directory is passed as ``--output``.

    Returns:
        The command as a list of strings.
    """
    always_present = (
        ("--agent", args.agent),
        ("--tasks", args.tasks),
        ("--demo-dir", str(args.demo_dir)),
        ("--max-steps", str(args.max_steps)),
        ("--output", trial.output_arg()),
        ("--server", args.server),
        ("--evaluate-url", args.evaluate_url),
        ("--vm-user", args.vm_user),
        ("--transport-error-threshold", str(args.transport_error_threshold)),
    )

    argv = [sys.executable, str(RUN_DC_EVAL)]
    for option, value in always_present:
        argv += [option, value]

    if args.vm_ip:
        argv += ["--vm-ip", args.vm_ip]

    if args.controller:
        # Retry/replan limits are only passed along in controller mode.
        argv.append("--controller")
        argv += ["--max-retries", str(args.max_retries)]
        argv += ["--max-replans", str(args.max_replans)]

    argv += _bool_flag(args.clean_desktop, "--clean-desktop")
    argv += _bool_flag(args.force_tray_icons, "--force-tray-icons")

    if args.waa_image_version:
        argv += ["--waa-image-version", args.waa_image_version]

    return argv
82+
83+
84+
def _build_trials(args: argparse.Namespace) -> list[TrialConfig]:
    """Expand the parsed arguments into one ``TrialConfig`` per trial.

    Trial numbers are consecutive, starting at ``args.start_trial`` and
    covering ``args.trials`` entries. Every trial shares the same run
    stamp, output root and lane name.
    """
    first = args.start_trial
    configs: list[TrialConfig] = []
    for number in range(first, first + args.trials):
        configs.append(
            TrialConfig(
                trial_num=number,
                run_stamp=args.run_stamp,
                output_root=args.output_root,
                lane_name=args.lane_name,
            )
        )
    return configs
94+
95+
96+
def _render_pack(args: argparse.Namespace, trials: list[TrialConfig]) -> str:
    """Render a bash "resume pack" script that runs every trial in order.

    The script changes into the repository root, announces the lane, then
    emits a banner and one fully resolved eval command per trial.

    Args:
        args: Parsed CLI arguments; supplies the lane name, the run stamp
            and everything needed to build each eval command.
        trials: Trials to include, in execution order.

    Returns:
        The script text, terminated by a newline.
    """
    header = f"Running {len(trials)} trial(s) for lane: {args.lane_name}"
    lines = [
        "#!/usr/bin/env bash",
        "set -euo pipefail",
        "",
        f"cd {shlex.quote(str(REPO_ROOT))}",
        "",
        # Lane name and run stamp come from the command line; quote the whole
        # message so the shell never expands or splits it.
        f"echo {shlex.quote(header)}",
        "",
    ]
    for trial in trials:
        banner = f"=== Trial {trial.trial_num} / stamp {args.run_stamp} ==="
        # bash's builtin echo prints "\n" literally unless -e is given, so the
        # blank separator line is produced by a bare echo instead.
        lines.append("echo")
        lines.append(f"echo {shlex.quote(banner)}")
        lines.append(shlex.join(_build_eval_cmd(args, trial)))
        lines.append("")
    return "\n".join(lines)
112+
113+
114+
def cmd_pack(args: argparse.Namespace) -> int:
    """Write the resume pack script for the requested trials and echo it.

    The pack is written to
    ``<output_root>/<lane_name>_resume_pack_<run_stamp>.sh``; the output
    root is created if it does not exist.

    Args:
        args: Parsed CLI arguments for the ``pack`` subcommand.

    Returns:
        Always 0.
    """
    trials = _build_trials(args)
    args.output_root.mkdir(parents=True, exist_ok=True)
    pack_text = _render_pack(args, trials)
    pack_path = args.output_root / f"{args.lane_name}_resume_pack_{args.run_stamp}.sh"
    pack_path.write_text(pack_text, encoding="utf-8")
    # The pack carries a shebang, so make it directly runnable: grant execute
    # permission to every class that already has read permission.
    mode = pack_path.stat().st_mode
    pack_path.chmod(mode | ((mode & 0o444) >> 2))
    print(pack_text)
    print(f"\nPack written: {pack_path}")
    return 0
123+
124+
125+
def cmd_run(args: argparse.Namespace) -> int:
    """Run each trial's eval command in sequence, or only print it.

    With ``--dry-run`` the commands are printed and nothing is executed or
    created on disk. Otherwise the output root is created and each command
    is run; non-zero exit codes are collected, and ``--fail-fast`` stops
    after the first failure.

    Args:
        args: Parsed CLI arguments for the ``run`` subcommand.

    Returns:
        1 if any executed trial exited non-zero, otherwise 0.
    """
    trials = _build_trials(args)
    if not args.dry_run:
        # A dry run only prints commands; it must not touch the filesystem.
        args.output_root.mkdir(parents=True, exist_ok=True)
    failures: list[tuple[int, int]] = []

    print(f"Repo root: {REPO_ROOT}")
    print(f"Trials: {len(trials)}")
    print(f"Tasks: {args.tasks}")
    print(f"Demo dir: {args.demo_dir}")
    print(f"Output root: {args.output_root}")

    for trial in trials:
        cmd = _build_eval_cmd(args, trial)
        print(f"\n=== Trial {trial.trial_num} ===")
        # Flush before the child starts: it writes straight to the inherited
        # stdout, so buffered banner text would otherwise land after its
        # output when stdout is a pipe or file.
        print(shlex.join(cmd), flush=True)
        if args.dry_run:
            continue

        result = subprocess.run(cmd, check=False)
        if result.returncode != 0:
            failures.append((trial.trial_num, result.returncode))
            if args.fail_fast:
                break

    if failures:
        print("\nFailures:")
        for trial_num, rc in failures:
            print(f"  trial {trial_num}: rc={rc}")
        return 1

    if args.dry_run:
        print("\nDry run complete; no commands were executed.")
        return 0

    print("\nAll requested trials completed.")
    return 0
157+
158+
159+
def _common_args(parser: argparse.ArgumentParser) -> None:
    """Register the options shared by the ``pack`` and ``run`` subcommands.

    Covers lane identity (tasks, lane name, run stamp, trial range), paths
    (demo dir, output root) and the options forwarded to the eval script.
    """
    # Default stamp: current UTC time at parser construction.
    default_stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    add = parser.add_argument

    # Lane selection and output layout.
    add("--tasks", default=DEFAULT_TASKS, help="Comma-separated task IDs/prefixes")
    add("--demo-dir", type=Path, default=REPO_ROOT / "annotated_demos_core4", help="Directory containing demo files")
    add("--output-root", type=Path, default=REPO_ROOT / "benchmark_results", help="Root directory for per-trial outputs")
    add("--lane-name", default="repeat_core4", help="Logical lane name")
    add("--run-stamp", default=default_stamp, help="Stable run stamp used in output/pack names")
    add("--trials", type=int, default=1, help="Number of sequential trials to run")
    add("--start-trial", type=int, default=1, help="First trial index")

    # Options passed through to the eval script.
    add("--agent", default="api-openai", help="Agent passed to run_dc_eval")
    add("--max-steps", type=int, default=15, help="Max steps per task")
    add("--server", default="http://localhost:5001", help="WAA server URL")
    add("--evaluate-url", default="http://localhost:5050", help="Evaluate server URL")
    add("--vm-ip", default=None, help="VM IP (optional)")
    add("--vm-user", default="azureuser", help="VM SSH user")
    add(
        "--transport-error-threshold",
        type=int,
        default=8,
        help="Hard-recovery threshold passed through to run_dc_eval",
    )

    # Controller mode and its limits.
    add("--controller", action="store_true", help="Enable controller mode")
    add("--max-retries", type=int, default=2, help="Controller retries")
    add("--max-replans", type=int, default=2, help="Controller replans")

    # Desktop parity and image metadata.
    add("--clean-desktop", action="store_true", help="Enable clean desktop parity mode")
    add("--force-tray-icons", action="store_true", help="Force tray icon setup")
    add("--waa-image-version", default=None, help="Pinned WAA image version metadata")
199+
200+
201+
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level parser with the ``pack`` and ``run`` subcommands.

    Both subcommands take the shared options; ``run`` additionally accepts
    ``--dry-run`` and ``--fail-fast``. Each subparser stores its handler in
    ``func``.
    """
    root = argparse.ArgumentParser(description=__doc__)
    subcommands = root.add_subparsers(dest="command", required=True)

    pack_parser = subcommands.add_parser("pack", help="Generate a deterministic post-eval resume pack")
    _common_args(pack_parser)
    pack_parser.set_defaults(func=cmd_pack)

    run_parser = subcommands.add_parser("run", help="Run repeated trials programmatically")
    _common_args(run_parser)
    run_only_flags = (
        ("--dry-run", "Print commands only"),
        ("--fail-fast", "Stop at first failed trial"),
    )
    for flag, description in run_only_flags:
        run_parser.add_argument(flag, action="store_true", help=description)
    run_parser.set_defaults(func=cmd_run)

    return root
216+
217+
218+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the selected subcommand.

    Args:
        argv: Argument list to parse, without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, which keeps
            the no-argument call working while allowing programmatic use.

    Returns:
        The subcommand handler's result as an integer exit code.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    return int(args.func(args))
222+
223+
224+
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)