Skip to content

Commit c383d1d

Browse files
abrichr and claude committed
feat: add 4-layer WAA probe for per-layer diagnostics
Add a multi-layer probe that tests the screenshot (PNG capture), accessibility (a11y tree), action (pyautogui pipeline), and score (evaluate endpoint) layers individually using existing WAA endpoints. No server-side changes.

- New probe.py module with ProbeLayerResult/MultiLayerProbeResult dataclasses
- CLI: --detailed, --json, --layers, --evaluate-url args on probe command
- VMMonitor: check_waa_detailed() method and waa_detailed_probe field
- 41 tests covering all layers, orchestrator, and helpers

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 855dba1 commit c383d1d

5 files changed

Lines changed: 992 additions & 0 deletions

File tree

openadapt_evals/benchmarks/cli.py

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -890,7 +890,42 @@ def cmd_probe(args: argparse.Namespace) -> int:
890890
return 1
891891

892892
server_url = args.server
893+
detailed = args.detailed or args.json_output
894+
layers = args.layers.split(",") if args.layers else None
895+
if layers:
896+
detailed = True
897+
898+
if detailed:
899+
from openadapt_evals.infrastructure.probe import (
900+
multi_layer_probe,
901+
print_probe_results,
902+
)
903+
904+
max_attempts = args.wait_attempts if args.wait else 1
905+
attempt = 0
906+
907+
while attempt < max_attempts:
908+
attempt += 1
909+
result = multi_layer_probe(
910+
server_url,
911+
layers=layers,
912+
evaluate_url=args.evaluate_url,
913+
)
914+
if args.json_output:
915+
print(result.to_json())
916+
else:
917+
print_probe_results(result)
918+
919+
if result.overall_ready:
920+
return 0
921+
922+
if args.wait and attempt < max_attempts:
923+
print(f"Attempt {attempt}/{max_attempts}: not ready, retrying in {args.wait_interval}s...")
924+
time.sleep(args.wait_interval)
925+
926+
return 0 if result.overall_ready else 1
893927

928+
# Default binary probe (unchanged)
894929
print(f"Probing WAA server at {server_url}...")
895930

896931
max_attempts = args.wait_attempts if args.wait else 1
@@ -2264,6 +2299,14 @@ def main() -> int:
22642299
help="Max attempts when waiting")
22652300
probe_parser.add_argument("--wait-interval", type=int, default=5,
22662301
help="Seconds between attempts")
2302+
probe_parser.add_argument("--detailed", action="store_true",
2303+
help="Run 4-layer probe (screenshot, a11y, action, score)")
2304+
probe_parser.add_argument("--json", dest="json_output", action="store_true",
2305+
help="Output JSON (implies --detailed)")
2306+
probe_parser.add_argument("--layers", type=str, default=None,
2307+
help="Comma-separated layer subset (e.g. screenshot,a11y)")
2308+
probe_parser.add_argument("--evaluate-url", type=str, default=None,
2309+
help="Separate URL for score layer (e.g. http://localhost:5051)")
22672310

22682311
# Generate viewer
22692312
view_parser = subparsers.add_parser("view", help="Generate HTML viewer for results")

openadapt_evals/infrastructure/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,12 @@
4747
wait_for_stable_screen,
4848
)
4949
from openadapt_evals.infrastructure.ssh_tunnel import SSHTunnelManager, get_tunnel_manager
50+
from openadapt_evals.infrastructure.probe import (
51+
MultiLayerProbeResult,
52+
ProbeLayerResult,
53+
multi_layer_probe,
54+
print_probe_results,
55+
)
5056
from openadapt_evals.infrastructure.vm_ip import resolve_vm_ip
5157
from openadapt_evals.infrastructure.vm_monitor import VMMonitor, VMConfig
5258
from openadapt_evals.infrastructure.vm_provider import VMProvider
@@ -60,15 +66,19 @@
6066
"AWSVMManager",
6167
"AzureOpsTracker",
6268
"AzureVMManager",
69+
"MultiLayerProbeResult",
6370
"PoolManager",
6471
"PoolRunResult",
72+
"ProbeLayerResult",
6573
"QEMUResetManager",
6674
"VMMonitor",
6775
"VMConfig",
6876
"VMProvider",
6977
"SSHTunnelManager",
7078
"compare_screenshots",
7179
"get_tunnel_manager",
80+
"multi_layer_probe",
81+
"print_probe_results",
7282
"resolve_vm_ip",
7383
"wait_for_stable_screen",
7484
]

0 commit comments

Comments
 (0)