diff --git a/README.md b/README.md index 3da3717..1e96973 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ OpenAdapt Evals is a unified framework for evaluating GUI automation agents agai - **Agent interfaces** including `ApiAgent` (Claude / GPT), `ClaudeComputerUseAgent` (with coordinate clamping and fail-safe recovery), `RetrievalAugmentedAgent`, `RandomAgent`, and `PolicyAgent` - **Multi-cloud VM infrastructure** with `AzureVMManager`, `AWSVMManager`, `PoolManager`, `SSHTunnelManager`, and `VMMonitor` for running evaluations at scale on Azure or AWS - **End-to-end eval pipeline** (`scripts/run_eval_pipeline.py`) -- orchestrates demo generation, VM lifecycle, SSH tunnels, and ZS/DC evaluation in a single command +- **Deterministic desktop parity mode** -- `--clean-desktop` suppresses OneDrive/toast/popover noise, `--force-tray-icons` keeps network/audio tray controls visible, and run metadata records requested/observed environment flags - **RL training environment** -- `RLEnvironment` wrapper provides a Gymnasium-style `reset`/`step`/`evaluate` interface for online RL (GRPO, PPO) with outcome-based rewards from WAA scores - **Annotation pipeline** -- VLM-based screenshot annotation (`annotation.py`, `vlm.py`) migrated from openadapt-ml so the full record-annotate-evaluate workflow runs within this repo - **4-layer WAA probe** -- `probe --detailed` checks screenshot capture, accessibility tree, action pipeline, and scoring independently; supports `--json` and `--layers` filtering @@ -155,6 +156,13 @@ python scripts/run_eval_pipeline.py --tasks 04d9aeaf --dry-run # AWS instead of Azure python scripts/run_eval_pipeline.py --cloud aws --vm-name waa-pool-00 + +# Deterministic desktop parity + pinned image version metadata +python scripts/run_eval_pipeline.py \ + --tasks 04d9aeaf \ + --clean-desktop \ + --force-tray-icons \ + --waa-image-version win11-24h2-2026-03-04 ``` ### Parallel evaluation diff --git a/openadapt_evals/adapters/waa/live.py b/openadapt_evals/adapters/waa/live.py index 25136fd..050eeea 100644 --- a/openadapt_evals/adapters/waa/live.py +++ b/openadapt_evals/adapters/waa/live.py @@ -25,6 +25,7 @@ from __future__ import annotations import base64 +import json import logging import re import time @@ -263,6 +264,14 @@ class WAALiveConfig: waa_examples_path: Path to WAA evaluation_examples_windows directory for loading task configs with evaluator specs. If not set, tasks are loaded from server or created as minimal placeholders. + clean_desktop: If True, applies deterministic desktop policy before + task setup (suppresses OneDrive/toast/taskbar noise). + force_tray_icons: If True, enforces network/audio tray icons visible + via policy keys before task setup. + reapply_clean_desktop_each_reset: Re-apply clean desktop policy on + every reset instead of once per adapter lifecycle. + waa_image_version: Optional pinned WAA image version identifier to + record in environment metadata. """ server_url: str = "http://localhost:5000" @@ -274,6 +283,10 @@ class WAALiveConfig: action_delay: float = 0.5 timeout: float = 90.0 waa_examples_path: str | None = None + clean_desktop: bool = False + force_tray_icons: bool = False + reapply_clean_desktop_each_reset: bool = False + waa_image_version: str | None = None class WAALiveAdapter(BenchmarkAdapter): @@ -298,6 +311,8 @@ def __init__(self, config: WAALiveConfig | None = None): self._current_screenshot: bytes | None = None self._actions: list[BenchmarkAction] = [] self._actual_screen_size: tuple[int, int] | None = None + self._clean_desktop_applied = False + self._environment_profile: dict[str, Any] = {} @property def name(self) -> str: @@ -314,6 +329,10 @@ def supports_parallel(self) -> bool: """Whether parallel execution is supported.""" return False # Single VM for now + def get_environment_profile(self) -> dict[str, Any]: + """Return the last captured environment profile for this adapter.""" + return dict(self._environment_profile) + def check_connection(self) -> bool: """Check if WAA server is reachable. @@ -524,8 +543,12 @@ def reset(self, task: BenchmarkTask) -> BenchmarkObservation: except Exception as e: logger.warning(f"Failed to close windows: {e}") - # Dismiss system notifications (OneDrive, etc.) that persist through close_all - self._dismiss_notifications(requests) + # Optionally apply deterministic desktop policy before task setup. + if self.config.clean_desktop or self.config.force_tray_icons: + self._apply_clean_desktop_policy(requests) + else: + # Best-effort cleanup even when full clean policy is disabled. + self._dismiss_notifications(requests) # If task has setup commands in raw_config, execute them if task.raw_config: @@ -1357,30 +1380,189 @@ def _dismiss_notifications(self, requests_module) -> None: are not closeable via close_all (they're system toasts, not windows). Kill the notification processes and dismiss via keyboard. """ - evaluate_base = self.config.evaluate_url or self.config.server_url - # Kill OneDrive and related notification processes commands = [ "taskkill /F /IM OneDrive.exe /T", "taskkill /F /IM OneDriveStandaloneUpdater.exe /T", - # Dismiss any remaining toast notifications via Action Center + "taskkill /F /IM ApplicationFrameHost.exe /T", + # Open/close action center to dismiss any currently surfaced toast. ( - "powershell -Command \"" - "Get-Process -Name 'ShellExperienceHost' -ErrorAction SilentlyContinue | " - "ForEach-Object { $_.CloseMainWindow() }\"" + "python -c \"" + "import pyautogui; " + "pyautogui.hotkey('win','a'); " + "pyautogui.press('esc')" + "\"" ), ] - for cmd in commands: - try: - requests_module.post( - f"{evaluate_base}/setup", - json={"config": [ - {"type": "execute", "parameters": {"command": cmd, "shell": True}}, - ]}, - timeout=10.0, + self._run_setup_execute_commands( + requests_module, + commands, + label="notification cleanup", + timeout=15.0, + ) + + def _run_setup_execute_commands( + self, + requests_module, + commands: list[str], + *, + label: str, + timeout: float = 20.0, + ) -> None: + """Run a batch of `execute` setup commands on the evaluate server.""" + if not commands: + return + evaluate_base = self.config.evaluate_url or self.config.server_url + payload = { + "config": [ + {"type": "execute", "parameters": {"command": cmd, "shell": True}} + for cmd in commands + ] + } + try: + resp = requests_module.post( + f"{evaluate_base}/setup", + json=payload, + timeout=timeout, + ) + if resp.status_code == 200: + logger.info("%s applied (%d command(s))", label, len(commands)) + else: + logger.warning( + "%s failed: HTTP %s %s", + label, + resp.status_code, + resp.text[:200], ) - except Exception: - pass # Best-effort; don't fail reset if notification kill fails - logger.debug("Dismissed system notifications") + except Exception as e: + logger.warning("%s request failed: %s", label, e) + + def _apply_clean_desktop_policy(self, requests_module) -> None: + """Apply deterministic desktop policy for train/eval UI parity.""" + if self._clean_desktop_applied and not self.config.reapply_clean_desktop_each_reset: + # Keep notifications suppressed each reset even if policy is one-time. + self._dismiss_notifications(requests_module) + return + + commands: list[str] = [] + if self.config.clean_desktop: + commands.extend([ + # Suppress OneDrive first-run backup prompts. + ( + 'reg add "HKLM\\SOFTWARE\\Policies\\Microsoft\\Windows\\OneDrive" ' + '/v DisableFileSyncNGSC /t REG_DWORD /d 1 /f' + ), + # Prevent OneDrive auto-start prompts on login. + 'reg delete "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Run" /v OneDrive /f', + # Suppress notification toasts/popovers. + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\PushNotifications" ' + '/v ToastEnabled /t REG_DWORD /d 0 /f' + ), + # Disable common Windows suggestion surfaces/popups. + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\ContentDeliveryManager" ' + '/v SubscribedContent-338389Enabled /t REG_DWORD /d 0 /f' + ), + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\ContentDeliveryManager" ' + '/v SubscribedContent-338388Enabled /t REG_DWORD /d 0 /f' + ), + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\ContentDeliveryManager" ' + '/v SubscribedContent-353694Enabled /t REG_DWORD /d 0 /f' + ), + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Advanced" ' + '/v ShowCopilotButton /t REG_DWORD /d 0 /f' + ), + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Advanced" ' + '/v TaskbarDa /t REG_DWORD /d 0 /f' + ), + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Advanced" ' + '/v TaskbarMn /t REG_DWORD /d 0 /f' + ), + ]) + + if self.config.clean_desktop or self.config.force_tray_icons: + commands.extend([ + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Policies\\Explorer" ' + '/v HideSCANetwork /t REG_DWORD /d 0 /f' + ), + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Policies\\Explorer" ' + '/v HideSCAVolume /t REG_DWORD /d 0 /f' + ), + ( + 'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer" ' + '/v EnableAutoTray /t REG_DWORD /d 0 /f' + ), + ]) + + if commands: + self._run_setup_execute_commands( + requests_module, + commands, + label="clean desktop policy", + timeout=25.0, + ) + + # Keep this as a final pass to clear already displayed popups. + self._dismiss_notifications(requests_module) + self._environment_profile = self._collect_environment_profile() + if self._environment_profile: + logger.info("Desktop environment profile: %s", self._environment_profile) + self._clean_desktop_applied = True + + def _collect_environment_profile(self) -> dict[str, Any]: + """Capture key environment flags for reproducibility metadata.""" + script = r""" +$ErrorActionPreference='SilentlyContinue' +$os = Get-ItemProperty 'HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion' +$one = (Get-ItemProperty 'HKLM:\SOFTWARE\Policies\Microsoft\Windows\OneDrive' -Name DisableFileSyncNGSC).DisableFileSyncNGSC +$toast = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\PushNotifications' -Name ToastEnabled).ToastEnabled +$hideNet = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Policies\Explorer' -Name HideSCANetwork).HideSCANetwork +$hideVol = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Policies\Explorer' -Name HideSCAVolume).HideSCAVolume +$autoTray = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Explorer' -Name EnableAutoTray).EnableAutoTray +$widgets = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Explorer\Advanced' -Name TaskbarDa).TaskbarDa +$copilot = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Explorer\Advanced' -Name ShowCopilotButton).ShowCopilotButton +$obj = [ordered]@{ + image_version = $env:WAA_IMAGE_VERSION + os_product = $os.ProductName + os_release = $os.DisplayVersion + os_build = $os.CurrentBuildNumber + one_drive_disable_sync_ngsc = $one + toast_enabled = $toast + hide_network_icon = $hideNet + hide_volume_icon = $hideVol + enable_auto_tray = $autoTray + taskbar_widgets = $widgets + show_copilot_button = $copilot +} +$obj | ConvertTo-Json -Compress +""" + try: + output = self.run_powershell(script).strip() + if not output: + return {} + profile: dict[str, Any] | None = None + for line in reversed(output.splitlines()): + line = line.strip() + if line.startswith("{") and line.endswith("}"): + profile = json.loads(line) + break + if profile is None: + return {} + profile["clean_desktop_requested"] = self.config.clean_desktop + profile["force_tray_icons_requested"] = self.config.force_tray_icons + if self.config.waa_image_version: + profile["configured_image_version"] = self.config.waa_image_version + return profile + except Exception as e: + logger.debug("Failed to collect environment profile: %s", e) + return {} # --- App-name to window-title mapping for post-setup focus --- diff --git a/openadapt_evals/benchmarks/cli.py b/openadapt_evals/benchmarks/cli.py index 6a6eb30..0b473d0 100644 --- a/openadapt_evals/benchmarks/cli.py +++ b/openadapt_evals/benchmarks/cli.py @@ -142,6 +142,50 @@ def _resolve_vm_context(args: argparse.Namespace) -> tuple[str, str] | None: return name, rg +def _requested_environment_flags(args: argparse.Namespace) -> dict[str, str | bool | None]: + """Collect requested desktop-parity flags from CLI args.""" + return { + "clean_desktop": bool(getattr(args, "clean_desktop", False)), + "force_tray_icons": bool(getattr(args, "force_tray_icons", False)), + "waa_image_version": getattr(args, "waa_image_version", None), + } + + +def _write_run_environment_metadata( + benchmark_dir: Path, + *, + requested: dict[str, str | bool | None], + adapter, + server_url: str, + evaluate_url: str | None, +) -> None: + """Persist run environment flags/profile into benchmark metadata.json.""" + metadata_path = benchmark_dir / "metadata.json" + metadata: dict = {} + if metadata_path.exists(): + try: + metadata = json.loads(metadata_path.read_text()) + except Exception as e: + logger.warning("Could not parse metadata.json for environment patch: %s", e) + metadata = {} + + observed = {} + if hasattr(adapter, "get_environment_profile"): + try: + observed = adapter.get_environment_profile() or {} + except Exception as e: + logger.debug("Could not read adapter environment profile: %s", e) + + metadata["environment"] = { + "requested": requested, + "observed": observed, + "server_url": server_url, + "evaluate_url": evaluate_url, + } + benchmark_dir.mkdir(parents=True, exist_ok=True) + metadata_path.write_text(json.dumps(metadata, indent=2)) + + def cmd_mock(args: argparse.Namespace) -> int: """Run mock evaluation (no Windows VM required).""" from openadapt_evals.benchmarks import ( @@ -289,6 +333,9 @@ def cmd_run(args: argparse.Namespace) -> int: evaluate_url=evaluate_url, max_steps=args.max_steps, waa_examples_path=waa_examples_path, + clean_desktop=getattr(args, "clean_desktop", False), + force_tray_icons=getattr(args, "force_tray_icons", False), + waa_image_version=getattr(args, "waa_image_version", None), ) adapter = WAALiveAdapter(config) @@ -437,6 +484,13 @@ def cmd_run(args: argparse.Namespace) -> int: print(f"Avg score: {metrics['avg_score']:.3f}") print(f"Avg steps: {metrics['avg_steps']:.1f}") benchmark_dir = Path(eval_config.output_dir) / eval_config.run_name + _write_run_environment_metadata( + benchmark_dir, + requested=_requested_environment_flags(args), + adapter=adapter, + server_url=server_url, + evaluate_url=evaluate_url, + ) print(f"\nResults saved to: {benchmark_dir}") no_open = getattr(args, "no_open", False) @@ -479,6 +533,9 @@ def cmd_live(args: argparse.Namespace) -> int: evaluate_url=evaluate_url, max_steps=args.max_steps, waa_examples_path=waa_examples_path, + clean_desktop=getattr(args, "clean_desktop", False), + force_tray_icons=getattr(args, "force_tray_icons", False), + waa_image_version=getattr(args, "waa_image_version", None), ) adapter = WAALiveAdapter(config) @@ -633,6 +690,14 @@ def cmd_live(args: argparse.Namespace) -> int: print(f"Avg steps: {metrics['avg_steps']:.1f}") if eval_config: + benchmark_dir = Path(eval_config.output_dir) / eval_config.run_name + _write_run_environment_metadata( + benchmark_dir, + requested=_requested_environment_flags(args), + adapter=adapter, + server_url=args.server, + evaluate_url=evaluate_url, + ) print(f"\nResults saved to: {eval_config.output_dir}/{eval_config.run_name}") return 0 @@ -865,7 +930,15 @@ def patch_evaluate_endpoint() -> bool: print("[6/6] Running single-task live evaluation...") agent = ScriptedAgent([BenchmarkAction(type="done")]) - adapter = WAALiveAdapter(WAALiveConfig(server_url=server_url, max_steps=args.max_steps)) + adapter = WAALiveAdapter( + WAALiveConfig( + server_url=server_url, + max_steps=args.max_steps, + clean_desktop=getattr(args, "clean_desktop", False), + force_tray_icons=getattr(args, "force_tray_icons", False), + waa_image_version=getattr(args, "waa_image_version", None), + ) + ) eval_config = EvaluationConfig( max_steps=args.max_steps, @@ -891,6 +964,15 @@ def patch_evaluate_endpoint() -> bool: print(f"Success rate: {metrics['success_rate']:.1%}") print(f"Avg score: {metrics['avg_score']:.3f}") print(f"Avg steps: {metrics['avg_steps']:.1f}") + benchmark_dir = Path(eval_config.output_dir) / eval_config.run_name + _write_run_environment_metadata( + benchmark_dir, + requested=_requested_environment_flags(args), + adapter=adapter, + server_url=server_url, + evaluate_url=None, + ) + print(f"\nResults saved to: {benchmark_dir}") return 0 @@ -2299,6 +2381,12 @@ def main() -> int: help="Max retries per step when using --controller (default: 2)") run_parser.add_argument("--max-replans", type=int, default=2, help="Max replans when using --controller (default: 2)") + run_parser.add_argument("--clean-desktop", action="store_true", + help="Apply deterministic clean-desktop policy (disable OneDrive/toast/taskbar noise)") + run_parser.add_argument("--force-tray-icons", action="store_true", + help="Force network/audio tray icons visible for stable click-coordinate tasks") + run_parser.add_argument("--waa-image-version", type=str, default=None, + help="Pinned WAA image version label to record in run metadata") # Live evaluation (full control) live_parser = subparsers.add_parser("live", help="Run live evaluation against WAA server (full control)") @@ -2321,6 +2409,12 @@ def main() -> int: help="Path to WAA evaluation_examples_windows directory for task configs") live_parser.add_argument("--output", type=str, help="Output directory for traces") live_parser.add_argument("--run-name", type=str, help="Name for this evaluation run") + live_parser.add_argument("--clean-desktop", action="store_true", + help="Apply deterministic clean-desktop policy (disable OneDrive/toast/taskbar noise)") + live_parser.add_argument("--force-tray-icons", action="store_true", + help="Force network/audio tray icons visible for stable click-coordinate tasks") + live_parser.add_argument("--waa-image-version", type=str, default=None, + help="Pinned WAA image version label to record in run metadata") # Probe server probe_parser = subparsers.add_parser("probe", help="Check if WAA server is reachable") @@ -2487,6 +2581,12 @@ def main() -> int: help="Save execution traces (viewer artifacts)") smoke_live_parser.add_argument("--no-stop-vm", dest="stop_vm", action="store_false", help="Do not deallocate VM after smoke test") + smoke_live_parser.add_argument("--clean-desktop", action="store_true", + help="Apply deterministic clean-desktop policy before smoke task") + smoke_live_parser.add_argument("--force-tray-icons", action="store_true", + help="Force network/audio tray icons visible during smoke task") + smoke_live_parser.add_argument("--waa-image-version", type=str, default=None, + help="Pinned WAA image version label to record in run metadata") smoke_live_parser.set_defaults(stop_vm=True) dashboard_parser = subparsers.add_parser("dashboard", help="Generate VM usage dashboard") diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py index 3379f65..21ca8fa 100644 --- a/scripts/run_dc_eval.py +++ b/scripts/run_dc_eval.py @@ -14,6 +14,10 @@ # All 12 tasks python scripts/run_dc_eval.py --agent api-claude-cu + + # Deterministic desktop parity + pinned image version metadata + python scripts/run_dc_eval.py --agent api-claude-cu --clean-desktop \\ + --force-tray-icons --waa-image-version win11-24h2-2026-03-04 """ from __future__ import annotations @@ -209,6 +213,21 @@ def main() -> int: parser.add_argument("--evaluate-url", default="http://localhost:5050") parser.add_argument("--max-steps", type=int, default=15) parser.add_argument("--output", default="benchmark_results") + parser.add_argument( + "--clean-desktop", + action="store_true", + help="Apply deterministic clean-desktop policy (OneDrive/toast suppression + tray parity)", + ) + parser.add_argument( + "--force-tray-icons", + action="store_true", + help="Force network/audio tray icons visible before each run", + ) + parser.add_argument( + "--waa-image-version", + default=None, + help="Pinned WAA image version label to record in run metadata", + ) parser.add_argument("--tasks", help="Comma-separated task IDs or prefixes (default: all 12)") parser.add_argument("--start-from", type=int, default=0, help="Task index to start from") parser.add_argument("--vm-ip", default=None, help="VM IP (auto-detected if omitted)") @@ -334,6 +353,12 @@ def main() -> int: "--max-retries", str(args.max_retries), "--max-replans", str(args.max_replans), ]) + if args.clean_desktop: + cmd.append("--clean-desktop") + if args.force_tray_icons: + cmd.append("--force-tray-icons") + if args.waa_image_version: + cmd.extend(["--waa-image-version", args.waa_image_version]) result = subprocess.run(cmd) elapsed = time.time() - task_start diff --git a/scripts/run_eval_pipeline.py b/scripts/run_eval_pipeline.py index 02bca0a..aa56021 100644 --- a/scripts/run_eval_pipeline.py +++ b/scripts/run_eval_pipeline.py @@ -28,6 +28,10 @@ # Use AWS instead of Azure python scripts/run_eval_pipeline.py --cloud aws --vm-name waa-pool-00 + + # Train/eval UI parity mode (suppresses OneDrive/toast noise, pins image tag) + python scripts/run_eval_pipeline.py --tasks 04d9aeaf --clean-desktop \\ + --force-tray-icons --waa-image-version win11-24h2-2026-03-04 """ from __future__ import annotations @@ -436,6 +440,9 @@ def _run_eval( vm_ip: str, vm_user: str, tunnel_manager: SSHTunnelManager, + clean_desktop: bool = False, + force_tray_icons: bool = False, + waa_image_version: str | None = None, ) -> dict[str, dict]: """Run all eval conditions sequentially with health checks.""" results = {} @@ -479,6 +486,12 @@ def _run_eval( "--output", str(output_dir), "--run-name", run_name, ] + if clean_desktop: + cmd.append("--clean-desktop") + if force_tray_icons: + cmd.append("--force-tray-icons") + if waa_image_version: + cmd.extend(["--waa-image-version", waa_image_version]) if demo_path: cmd.extend(["--demo", str(demo_path.resolve())]) @@ -577,6 +590,21 @@ def build_parser() -> argparse.ArgumentParser: parser.add_argument("--agent", default="api-claude-cu", help="Agent type") parser.add_argument("--max-steps", type=int, default=15) parser.add_argument("--output", default=str(DEFAULT_OUTPUT)) + parser.add_argument( + "--clean-desktop", + action="store_true", + help="Apply deterministic clean-desktop policy (OneDrive/toast suppression + tray parity)", + ) + parser.add_argument( + "--force-tray-icons", + action="store_true", + help="Force network/audio tray icons visible before each run", + ) + parser.add_argument( + "--waa-image-version", + default=None, + help="Pinned WAA image version label to record in run metadata", + ) parser.add_argument("--server", default="http://localhost:5001") parser.add_argument("--evaluate-url", default="http://localhost:5050") parser.add_argument( @@ -823,6 +851,9 @@ def main() -> int: vm_ip=vm_ip or "", vm_user=vm_user, tunnel_manager=tunnel_manager, + clean_desktop=args.clean_desktop, + force_tray_icons=args.force_tray_icons, + waa_image_version=args.waa_image_version, ) # ── Phase 4: Summary ──────────────────────────────────────────────