Skip to content

Commit 8bd1b43

Browse files
abrichr and claude authored
fix: use WAA server for /evaluate instead of fragile socat proxy (#115)
The evaluate endpoint (/evaluate) is already available on the WAA Flask server (port 5000), which is accessed via a single reliable SSH tunnel (local:5001 → VM:5000). The separate evaluate chain (local:5050 → VM:5051 → socat → docker exec → container:5050) was fragile and caused infrastructure failures when socat died mid-trial. Changes: - Default --evaluate-url to None (falls back to --server URL) - Remove socat proxy setup (_setup_eval_proxy) from run_dc_eval.py - Remove port 5050 from SSH tunnel forwarding - Make done-gate non-fatal when evaluate returns infrastructure error - All scripts pass --evaluate-url only when explicitly set Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 174e9bf commit 8bd1b43

7 files changed

Lines changed: 46 additions & 69 deletions

File tree

.beads/issues.jsonl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,5 @@
1313
{"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
1414
{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"}
1515
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
16-
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-06T15:42:14.015601-05:00"}
16+
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-07T01:44:43.380289-05:00"}
1717
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}

openadapt_evals/benchmarks/cli.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2252,7 +2252,7 @@ def cmd_eval_suite(args: argparse.Namespace) -> int:
22522252
tunnel_mgr = SSHTunnelManager()
22532253
tunnel_mgr.start_tunnels_for_vm(vm_ip=worker_ip)
22542254
server_url = "http://localhost:5001"
2255-
evaluate_url = "http://localhost:5050"
2255+
evaluate_url = None # use server_url for /evaluate
22562256
# Give tunnels a moment to establish
22572257
import time
22582258
time.sleep(3)
@@ -2389,8 +2389,8 @@ def main() -> int:
23892389
)
23902390
run_parser.add_argument("--server", type=str, default="http://localhost:5001",
23912391
help="WAA server URL (default: localhost:5001 for SSH tunnel)")
2392-
run_parser.add_argument("--evaluate-url", type=str, default="http://localhost:5050",
2393-
help="Evaluate server URL (default: localhost:5050)")
2392+
run_parser.add_argument("--evaluate-url", type=str, default=None,
2393+
help="Evaluate server URL (default: same as --server)")
23942394
run_parser.add_argument("--agent", type=str, default="api-openai",
23952395
help="Agent type: noop, mock, api-claude, api-openai, api-claude-cu, qwen3vl, smol")
23962396
run_parser.add_argument("--task", type=str,
@@ -2437,8 +2437,8 @@ def main() -> int:
24372437
live_parser = subparsers.add_parser("live", help="Run live evaluation against WAA server (full control)")
24382438
live_parser.add_argument("--server", type=str, default="http://localhost:5001",
24392439
help="WAA server URL (default: localhost:5001 for SSH tunnel)")
2440-
live_parser.add_argument("--evaluate-url", type=str, default="http://localhost:5050",
2441-
help="Evaluate server URL (default: localhost:5050)")
2440+
live_parser.add_argument("--evaluate-url", type=str, default=None,
2441+
help="Evaluate server URL (default: same as --server)")
24422442
live_parser.add_argument("--agent", type=str, default="mock",
24432443
help="Agent type: mock, noop, api-claude, api-openai, api-claude-cu, qwen3vl, smol, retrieval-claude, retrieval-openai")
24442444
live_parser.add_argument("--demo", type=str, help="Demo trajectory file for ApiAgent")
@@ -2791,8 +2791,8 @@ def main() -> int:
27912791
help="WAA server URL (used with --no-pool-create)",
27922792
)
27932793
suite_parser.add_argument(
2794-
"--evaluate-url", type=str, default="http://localhost:5050",
2795-
help="Evaluate server URL (used with --no-pool-create)",
2794+
"--evaluate-url", type=str, default=None,
2795+
help="Evaluate server URL (default: same as --server)",
27962796
)
27972797

27982798
args = parser.parse_args()

openadapt_evals/benchmarks/runner.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,17 @@ def _run_single_task(
400400
done = True
401401
break
402402

403+
# If evaluate endpoint is unreachable, accept "done"
404+
# rather than forcing the agent to continue pointlessly
405+
if gate_result.error_type == "infrastructure":
406+
logger.warning(
407+
f"Step {steps}: Done-gate skipped — evaluate "
408+
f"returned infrastructure error: {gate_result.reason}. "
409+
"Accepting 'done'."
410+
)
411+
done = True
412+
break
413+
403414
if gate_score >= config.done_gate_threshold:
404415
logger.info(
405416
f"Step {steps}: Done-gate PASSED "

scripts/core4_eval.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,13 @@ def cmd_run(args: argparse.Namespace) -> int:
7979
str(out),
8080
"--server",
8181
args.server,
82-
"--evaluate-url",
83-
args.evaluate_url,
8482
"--agent",
8583
args.agent,
8684
"--vm-user",
8785
args.vm_user,
8886
]
87+
if args.evaluate_url:
88+
cmd.extend(["--evaluate-url", args.evaluate_url])
8989
if args.vm_ip:
9090
cmd.extend(["--vm-ip", args.vm_ip])
9191
if args.start_from > 0:
@@ -181,7 +181,8 @@ def build_parser() -> argparse.ArgumentParser:
181181
run.add_argument("--max-steps", type=int, default=15)
182182
run.add_argument("--output-root", default="benchmark_results")
183183
run.add_argument("--server", default="http://localhost:5001")
184-
run.add_argument("--evaluate-url", default="http://localhost:5050")
184+
run.add_argument("--evaluate-url", default=None,
185+
help="Evaluate server URL (default: same as --server)")
185186
run.add_argument("--agent", default="api-claude-cu")
186187
run.add_argument("--vm-ip", default=None)
187188
run.add_argument("--vm-user", default="azureuser")

scripts/core4_lane.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,13 @@ def _build_eval_cmd(args: argparse.Namespace, trial: TrialConfig) -> list[str]:
5555
trial.output_arg(),
5656
"--server",
5757
args.server,
58-
"--evaluate-url",
59-
args.evaluate_url,
6058
"--vm-user",
6159
args.vm_user,
6260
"--transport-error-threshold",
6361
str(args.transport_error_threshold),
6462
]
63+
if args.evaluate_url:
64+
cmd.extend(["--evaluate-url", args.evaluate_url])
6565
if args.vm_ip:
6666
cmd.extend(["--vm-ip", args.vm_ip])
6767
if args.controller:
@@ -181,7 +181,7 @@ def _common_args(parser: argparse.ArgumentParser) -> None:
181181
parser.add_argument("--agent", default="api-openai", help="Agent passed to run_dc_eval")
182182
parser.add_argument("--max-steps", type=int, default=15, help="Max steps per task")
183183
parser.add_argument("--server", default="http://localhost:5001", help="WAA server URL")
184-
parser.add_argument("--evaluate-url", default="http://localhost:5050", help="Evaluate server URL")
184+
parser.add_argument("--evaluate-url", default=None, help="Evaluate server URL (default: same as --server)")
185185
parser.add_argument("--vm-ip", default=None, help="VM IP (optional)")
186186
parser.add_argument("--vm-user", default="azureuser", help="VM SSH user")
187187
parser.add_argument(

scripts/run_dc_eval.py

Lines changed: 7 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ def _start_tunnel(vm_user: str, vm_ip: str) -> bool:
5353
"-o", "TCPKeepAlive=yes",
5454
"-o", "ExitOnForwardFailure=yes",
5555
"-L", "5001:localhost:5000",
56-
"-L", "5050:localhost:5051",
5756
"-L", "8006:localhost:8006",
5857
f"{vm_user}@{vm_ip}",
5958
]
@@ -69,43 +68,6 @@ def _probe(server: str, timeout: int = 10) -> bool:
6968
return False
7069

7170

72-
def _setup_eval_proxy(vm_user: str, vm_ip: str) -> bool:
73-
"""(Re-)establish socat proxy for the evaluate server on the VM.
74-
75-
Docker port forwarding for port 5050 is broken due to QEMU's custom
76-
bridge networking (--cap-add NET_ADMIN). Work around it by restarting
77-
the socat-waa-evaluate systemd service on the VM host. The service is
78-
installed during pool creation (see DOCKER_SETUP_SCRIPT in pool.py).
79-
The SSH tunnel maps local 5050 -> VM 5051.
80-
81-
Falls back to the legacy nohup socat approach if the systemd service
82-
is not installed (e.g. on older VMs provisioned before this change).
83-
"""
84-
# Try systemd service first (preferred: auto-restarts on failure)
85-
script = (
86-
"if systemctl list-unit-files socat-waa-evaluate.service "
87-
"| grep -q socat-waa-evaluate; then "
88-
" sudo systemctl restart socat-waa-evaluate.service; "
89-
"else "
90-
" killall socat 2>/dev/null || true; sleep 1; "
91-
" which socat >/dev/null 2>&1 "
92-
" || sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat; "
93-
" nohup socat TCP-LISTEN:5051,fork,reuseaddr "
94-
" 'EXEC:docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050' "
95-
" </dev/null >/dev/null 2>&1 & "
96-
"fi"
97-
)
98-
result = subprocess.run(
99-
["ssh", "-o", "StrictHostKeyChecking=no", f"{vm_user}@{vm_ip}", script],
100-
capture_output=True, timeout=30,
101-
)
102-
if result.returncode != 0:
103-
print(f" socat proxy setup failed: {result.stderr.decode()}")
104-
return False
105-
print(" socat proxy for evaluate server established (VM:5051 -> container:5050)")
106-
return True
107-
108-
10971
def _restart_container(vm_user: str, vm_ip: str) -> bool:
11072
"""Restart Windows via QEMU monitor reset, falling back to docker restart.
11173
@@ -122,8 +84,7 @@ def _restart_container(vm_user: str, vm_ip: str) -> bool:
12284
if mgr.is_qemu_monitor_reachable():
12385
print(" Resetting Windows via QEMU monitor (system_reset)...")
12486
if mgr.reset_windows():
125-
print(" QEMU reset sent, re-establishing evaluate proxy...")
126-
_setup_eval_proxy(vm_user, vm_ip)
87+
print(" QEMU reset sent.")
12788
return True
12889
print(" QEMU reset command failed, falling back to docker restart...")
12990
else:
@@ -139,8 +100,7 @@ def _restart_container(vm_user: str, vm_ip: str) -> bool:
139100
if result.returncode != 0:
140101
print(f" Container restart failed: {result.stderr.decode()}")
141102
return False
142-
print(" Container restarted, re-establishing evaluate proxy...")
143-
_setup_eval_proxy(vm_user, vm_ip)
103+
print(" Container restarted.")
144104
return True
145105

146106

@@ -164,11 +124,10 @@ def ensure_waa_ready(
164124
if _probe(server) and (evaluate_url is None or _probe(evaluate_url)):
165125
return True
166126

167-
# Step 2: Reconnect tunnel + ensure socat proxy
127+
# Step 2: Reconnect tunnel
168128
print(" WAA unreachable, reconnecting tunnel...")
169129
_kill_tunnels()
170130
time.sleep(1)
171-
_setup_eval_proxy(vm_user, vm_ip)
172131
if _start_tunnel(vm_user, vm_ip):
173132
time.sleep(3)
174133
if _probe(server) and (evaluate_url is None or _probe(evaluate_url)):
@@ -210,7 +169,8 @@ def main() -> int:
210169
parser.add_argument("--agent", default="api-claude-cu", help="Agent type")
211170
parser.add_argument("--demo-dir", default="annotated_demos", help="Demo directory")
212171
parser.add_argument("--server", default="http://localhost:5001")
213-
parser.add_argument("--evaluate-url", default="http://localhost:5050")
172+
parser.add_argument("--evaluate-url", default=None,
173+
help="Evaluate server URL (default: same as --server)")
214174
parser.add_argument("--max-steps", type=int, default=15)
215175
parser.add_argument("--output", default="benchmark_results")
216176
parser.add_argument(
@@ -357,11 +317,12 @@ def main() -> int:
357317
"--agent", args.agent,
358318
"--tasks", tid,
359319
"--server", args.server,
360-
"--evaluate-url", args.evaluate_url,
361320
"--max-steps", str(args.max_steps),
362321
"--output", str(output_dir),
363322
"--run-name", run_name,
364323
]
324+
if args.evaluate_url:
325+
cmd.extend(["--evaluate-url", args.evaluate_url])
365326
if demo_path:
366327
cmd.extend(["--demo", str(demo_path.resolve())])
367328
if args.controller and demo_path:

scripts/run_eval_pipeline.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ def _setup_connectivity(
359359

360360
def _wait_waa_ready(
361361
server: str = "http://localhost:5001",
362-
evaluate_url: str = "http://localhost:5050",
362+
evaluate_url: str | None = None,
363363
timeout: int = 1200,
364364
) -> bool:
365365
"""Wait for WAA server and evaluate server to respond."""
@@ -381,13 +381,15 @@ def _wait_waa_ready(
381381
except Exception:
382382
waa_ok = False
383383

384-
try:
385-
eval_ok = requests.get(f"{evaluate_url}/probe", timeout=10).ok
386-
except Exception:
387-
eval_ok = False
384+
eval_ok = True # default to True when no separate evaluate URL
385+
if evaluate_url:
386+
try:
387+
eval_ok = requests.get(f"{evaluate_url}/probe", timeout=10).ok
388+
except Exception:
389+
eval_ok = False
388390

389391
if waa_ok and eval_ok:
390-
print(f"[waa] WAA + evaluate server ready after {elapsed}s")
392+
print(f"[waa] WAA server ready after {elapsed}s")
391393
return True
392394
if waa_ok and not eval_ok:
393395
# WAA is up but evaluate isn't — acceptable for ZS-only runs
@@ -434,7 +436,7 @@ def _run_eval(
434436
conditions: list[tuple[str, str, Path | None]],
435437
agent: str,
436438
server: str,
437-
evaluate_url: str,
439+
evaluate_url: str | None,
438440
max_steps: int,
439441
output_dir: Path,
440442
vm_ip: str,
@@ -481,11 +483,12 @@ def _run_eval(
481483
"--agent", agent,
482484
"--tasks", tid,
483485
"--server", server,
484-
"--evaluate-url", evaluate_url,
485486
"--max-steps", str(max_steps),
486487
"--output", str(output_dir),
487488
"--run-name", run_name,
488489
]
490+
if evaluate_url:
491+
cmd.extend(["--evaluate-url", evaluate_url])
489492
if clean_desktop:
490493
cmd.append("--clean-desktop")
491494
if force_tray_icons:
@@ -606,7 +609,8 @@ def build_parser() -> argparse.ArgumentParser:
606609
help="Pinned WAA image version label to record in run metadata",
607610
)
608611
parser.add_argument("--server", default="http://localhost:5001")
609-
parser.add_argument("--evaluate-url", default="http://localhost:5050")
612+
parser.add_argument("--evaluate-url", default=None,
613+
help="Evaluate server URL (default: same as --server)")
610614
parser.add_argument(
611615
"--vm-name", default=DEFAULT_VM_NAME, help="VM name",
612616
)

0 commit comments

Comments (0)