Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ OpenAdapt Evals is a unified framework for evaluating GUI automation agents agai
- **Agent interfaces** including `ApiAgent` (Claude / GPT), `ClaudeComputerUseAgent` (with coordinate clamping and fail-safe recovery), `RetrievalAugmentedAgent`, `RandomAgent`, and `PolicyAgent`
- **Multi-cloud VM infrastructure** with `AzureVMManager`, `AWSVMManager`, `PoolManager`, `SSHTunnelManager`, and `VMMonitor` for running evaluations at scale on Azure or AWS
- **End-to-end eval pipeline** (`scripts/run_eval_pipeline.py`) -- orchestrates demo generation, VM lifecycle, SSH tunnels, and ZS/DC evaluation in a single command
- **Deterministic desktop parity mode** -- `--clean-desktop` suppresses OneDrive/toast/popover noise, `--force-tray-icons` keeps network/audio tray controls visible, and run metadata records requested/observed environment flags
- **RL training environment** -- `RLEnvironment` wrapper provides a Gymnasium-style `reset`/`step`/`evaluate` interface for online RL (GRPO, PPO) with outcome-based rewards from WAA scores
- **Annotation pipeline** -- VLM-based screenshot annotation (`annotation.py`, `vlm.py`) migrated from openadapt-ml so the full record-annotate-evaluate workflow runs within this repo
- **4-layer WAA probe** -- `probe --detailed` checks screenshot capture, accessibility tree, action pipeline, and scoring independently; supports `--json` and `--layers` filtering
Expand Down Expand Up @@ -155,6 +156,13 @@ python scripts/run_eval_pipeline.py --tasks 04d9aeaf --dry-run

# AWS instead of Azure
python scripts/run_eval_pipeline.py --cloud aws --vm-name waa-pool-00

# Deterministic desktop parity + pinned image version metadata
python scripts/run_eval_pipeline.py \
--tasks 04d9aeaf \
--clean-desktop \
--force-tray-icons \
--waa-image-version win11-24h2-2026-03-04
```

### Parallel evaluation
Expand Down
220 changes: 201 additions & 19 deletions openadapt_evals/adapters/waa/live.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from __future__ import annotations

import base64
import json
import logging
import re
import time
Expand Down Expand Up @@ -263,6 +264,14 @@ class WAALiveConfig:
waa_examples_path: Path to WAA evaluation_examples_windows directory
for loading task configs with evaluator specs. If not set, tasks
are loaded from server or created as minimal placeholders.
clean_desktop: If True, applies deterministic desktop policy before
task setup (suppresses OneDrive/toast/taskbar noise).
force_tray_icons: If True, enforces network/audio tray icons visible
via policy keys before task setup.
reapply_clean_desktop_each_reset: Re-apply clean desktop policy on
every reset instead of once per adapter lifecycle.
waa_image_version: Optional pinned WAA image version identifier to
record in environment metadata.
"""

server_url: str = "http://localhost:5000"
Expand All @@ -274,6 +283,10 @@ class WAALiveConfig:
action_delay: float = 0.5
timeout: float = 90.0
waa_examples_path: str | None = None
clean_desktop: bool = False
force_tray_icons: bool = False
reapply_clean_desktop_each_reset: bool = False
waa_image_version: str | None = None


class WAALiveAdapter(BenchmarkAdapter):
Expand All @@ -298,6 +311,8 @@ def __init__(self, config: WAALiveConfig | None = None):
self._current_screenshot: bytes | None = None
self._actions: list[BenchmarkAction] = []
self._actual_screen_size: tuple[int, int] | None = None
self._clean_desktop_applied = False
self._environment_profile: dict[str, Any] = {}

@property
def name(self) -> str:
Expand All @@ -314,6 +329,10 @@ def supports_parallel(self) -> bool:
"""Whether parallel execution is supported."""
return False # Single VM for now

def get_environment_profile(self) -> dict[str, Any]:
    """Provide a snapshot of the most recently captured environment profile.

    Returns:
        A new top-level dict filled from ``self._environment_profile``.
        Adding or removing keys on the returned dict does not affect the
        profile stored on the adapter. The copy is shallow.
    """
    snapshot: dict[str, Any] = {}
    snapshot.update(self._environment_profile)
    return snapshot

def check_connection(self) -> bool:
"""Check if WAA server is reachable.

Expand Down Expand Up @@ -524,8 +543,12 @@ def reset(self, task: BenchmarkTask) -> BenchmarkObservation:
except Exception as e:
logger.warning(f"Failed to close windows: {e}")

# Dismiss system notifications (OneDrive, etc.) that persist through close_all
self._dismiss_notifications(requests)
# Optionally apply deterministic desktop policy before task setup.
if self.config.clean_desktop or self.config.force_tray_icons:
self._apply_clean_desktop_policy(requests)
else:
# Best-effort cleanup even when full clean policy is disabled.
self._dismiss_notifications(requests)

# If task has setup commands in raw_config, execute them
if task.raw_config:
Expand Down Expand Up @@ -1357,30 +1380,189 @@ def _dismiss_notifications(self, requests_module) -> None:
are not closeable via close_all (they're system toasts, not windows).
Kill the notification processes and dismiss via keyboard.
"""
evaluate_base = self.config.evaluate_url or self.config.server_url
# Kill OneDrive and related notification processes
commands = [
"taskkill /F /IM OneDrive.exe /T",
"taskkill /F /IM OneDriveStandaloneUpdater.exe /T",
# Dismiss any remaining toast notifications via Action Center
"taskkill /F /IM ApplicationFrameHost.exe /T",
# Open/close action center to dismiss any currently surfaced toast.
(
"powershell -Command \""
"Get-Process -Name 'ShellExperienceHost' -ErrorAction SilentlyContinue | "
"ForEach-Object { $_.CloseMainWindow() }\""
"python -c \""
"import pyautogui; "
"pyautogui.hotkey('win','a'); "
"pyautogui.press('esc')"
"\""
),
]
for cmd in commands:
try:
requests_module.post(
f"{evaluate_base}/setup",
json={"config": [
{"type": "execute", "parameters": {"command": cmd, "shell": True}},
]},
timeout=10.0,
self._run_setup_execute_commands(
requests_module,
commands,
label="notification cleanup",
timeout=15.0,
)

def _run_setup_execute_commands(
    self,
    requests_module,
    commands: list[str],
    *,
    label: str,
    timeout: float = 20.0,
) -> None:
    """Run a batch of `execute` setup commands on the evaluate server.

    All commands are sent in a single POST to ``<base>/setup``, where
    ``<base>`` is ``self.config.evaluate_url`` when set and
    ``self.config.server_url`` otherwise. Each command becomes one
    ``{"type": "execute", ...}`` entry with ``shell`` enabled.

    The call is best-effort: a non-200 response or a raised exception is
    logged as a warning and never propagated, so a failed batch does not
    abort the caller.

    Args:
        requests_module: Object exposing ``post(url, json=..., timeout=...)``.
        commands: Shell command strings to execute, in order. An empty list
            is a no-op and sends no request.
        label: Human-readable name of the batch, used only in log messages.
        timeout: Timeout in seconds passed to the POST request.
    """
    if not commands:
        return

    evaluate_base = self.config.evaluate_url or self.config.server_url
    payload = {
        "config": [
            {"type": "execute", "parameters": {"command": cmd, "shell": True}}
            for cmd in commands
        ]
    }
    try:
        resp = requests_module.post(
            f"{evaluate_base}/setup",
            json=payload,
            timeout=timeout,
        )
        if resp.status_code == 200:
            logger.info("%s applied (%d command(s))", label, len(commands))
        else:
            logger.warning(
                "%s failed: HTTP %s %s",
                label,
                resp.status_code,
                resp.text[:200],  # cap the logged response body
            )
    except Exception as e:
        # Deliberately broad: this is a best-effort step, but the failure is
        # always surfaced in the log instead of being silently discarded.
        logger.warning("%s request failed: %s", label, e)

def _apply_clean_desktop_policy(self, requests_module) -> None:
    """Bring the desktop into the deterministic state used for train/eval parity.

    Once the policy has been applied, later calls only dismiss notifications,
    unless ``config.reapply_clean_desktop_each_reset`` is set. Otherwise the
    method builds the registry commands selected by ``config.clean_desktop``
    and ``config.force_tray_icons``, sends them as one setup batch, dismisses
    popups that are already showing, stores a fresh environment profile on
    ``self._environment_profile`` and marks the policy as applied.

    Args:
        requests_module: HTTP client module/object forwarded to the helpers
            that talk to the evaluate server.
    """
    settings = self.config

    if self._clean_desktop_applied and not settings.reapply_clean_desktop_each_reset:
        # The policy is one-time, but notifications are still swept on
        # every reset.
        self._dismiss_notifications(requests_module)
        return

    def dword(key: str, value_name: str, data: int) -> str:
        # Build a `reg add` command that force-writes one REG_DWORD value.
        return f'reg add "{key}" /v {value_name} /t REG_DWORD /d {data} /f'

    current_version = "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion"
    explorer_key = current_version + "\\Explorer"
    advanced_key = explorer_key + "\\Advanced"
    explorer_policy_key = current_version + "\\Policies\\Explorer"
    content_delivery_key = current_version + "\\ContentDeliveryManager"

    policy_commands: list[str] = []

    if settings.clean_desktop:
        # Suppress OneDrive first-run backup prompts.
        policy_commands.append(
            dword(
                "HKLM\\SOFTWARE\\Policies\\Microsoft\\Windows\\OneDrive",
                "DisableFileSyncNGSC",
                1,
            )
        )
        # Prevent OneDrive auto-start prompts on login.
        policy_commands.append(
            'reg delete "' + current_version + '\\Run" /v OneDrive /f'
        )
        # Suppress notification toasts/popovers.
        policy_commands.append(
            dword(current_version + "\\PushNotifications", "ToastEnabled", 0)
        )
        # Disable common Windows suggestion surfaces/popups.
        policy_commands.extend(
            dword(content_delivery_key, f"SubscribedContent-{content_id}Enabled", 0)
            for content_id in ("338389", "338388", "353694")
        )
        # Turn off the taskbar extras stored under Explorer\Advanced.
        policy_commands.extend(
            dword(advanced_key, value_name, 0)
            for value_name in ("ShowCopilotButton", "TaskbarDa", "TaskbarMn")
        )

    if settings.clean_desktop or settings.force_tray_icons:
        # Tray visibility: clear the hide-network/hide-volume policy values
        # and EnableAutoTray.
        policy_commands.append(dword(explorer_policy_key, "HideSCANetwork", 0))
        policy_commands.append(dword(explorer_policy_key, "HideSCAVolume", 0))
        policy_commands.append(dword(explorer_key, "EnableAutoTray", 0))

    if policy_commands:
        self._run_setup_execute_commands(
            requests_module,
            policy_commands,
            label="clean desktop policy",
            timeout=25.0,
        )

    # Final pass to clear popups that were already on screen.
    self._dismiss_notifications(requests_module)

    captured = self._collect_environment_profile()
    self._environment_profile = captured
    if captured:
        logger.info("Desktop environment profile: %s", captured)
    self._clean_desktop_applied = True

def _collect_environment_profile(self) -> dict[str, Any]:
    """Capture key environment flags for reproducibility metadata.

    Runs a PowerShell script through ``self.run_powershell``. The script
    reads OS version fields and the desktop-policy registry values, then
    prints them as one compact JSON object. The output is scanned from the
    last line backwards for a line that parses as a JSON object. Lines that
    merely look brace-delimited but are not valid JSON are skipped, so
    trailing noise does not discard an otherwise good profile.

    Returns:
        The parsed profile with ``clean_desktop_requested`` and
        ``force_tray_icons_requested`` added from the adapter config, plus
        ``configured_image_version`` when ``config.waa_image_version`` is
        set. Returns an empty dict when the script fails, prints nothing,
        or prints no parsable JSON object.
    """
    script = r"""
$ErrorActionPreference='SilentlyContinue'
$os = Get-ItemProperty 'HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion'
$one = (Get-ItemProperty 'HKLM:\SOFTWARE\Policies\Microsoft\Windows\OneDrive' -Name DisableFileSyncNGSC).DisableFileSyncNGSC
$toast = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\PushNotifications' -Name ToastEnabled).ToastEnabled
$hideNet = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Policies\Explorer' -Name HideSCANetwork).HideSCANetwork
$hideVol = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Policies\Explorer' -Name HideSCAVolume).HideSCAVolume
$autoTray = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Explorer' -Name EnableAutoTray).EnableAutoTray
$widgets = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Explorer\Advanced' -Name TaskbarDa).TaskbarDa
$copilot = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Explorer\Advanced' -Name ShowCopilotButton).ShowCopilotButton
$obj = [ordered]@{
image_version = $env:WAA_IMAGE_VERSION
os_product = $os.ProductName
os_release = $os.DisplayVersion
os_build = $os.CurrentBuildNumber
one_drive_disable_sync_ngsc = $one
toast_enabled = $toast
hide_network_icon = $hideNet
hide_volume_icon = $hideVol
enable_auto_tray = $autoTray
taskbar_widgets = $widgets
show_copilot_button = $copilot
}
$obj | ConvertTo-Json -Compress
"""
    try:
        output = self.run_powershell(script).strip()
        if not output:
            return {}

        profile: dict[str, Any] | None = None
        # The JSON object is printed last, so search from the end.
        for line in reversed(output.splitlines()):
            candidate = line.strip()
            if not (candidate.startswith("{") and candidate.endswith("}")):
                continue
            try:
                parsed = json.loads(candidate)
            except ValueError:
                # Brace-delimited but not JSON: keep scanning earlier lines
                # instead of giving up on the whole profile.
                continue
            if isinstance(parsed, dict):
                profile = parsed
                break
        if profile is None:
            return {}

        # Record what was requested next to what was observed on the machine.
        profile["clean_desktop_requested"] = self.config.clean_desktop
        profile["force_tray_icons_requested"] = self.config.force_tray_icons
        if self.config.waa_image_version:
            profile["configured_image_version"] = self.config.waa_image_version
        return profile
    except Exception as e:
        # Profile capture is metadata only; never let it break the caller.
        logger.debug("Failed to collect environment profile: %s", e)
        return {}

# --- App-name to window-title mapping for post-setup focus ---

Expand Down
Loading