diff --git a/CLAUDE.md b/CLAUDE.md index 8a1053b..4e1d2f7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -63,7 +63,7 @@ When merging with `gh pr merge --squash`, GitHub uses the PR title as the commit ## Overview -Evaluation infrastructure for GUI agent benchmarks. Provides benchmark adapters, agent interfaces, Azure VM management, and result visualization for running agents against WAA (Windows Agent Arena) and other benchmarks. +Governed desktop agent evaluation and training infrastructure. Provides benchmark adapters, agent interfaces (including dual-model planner-grounder), VM management (Azure + AWS), RL training integration (TRL GRPO, AReaL), workflow extraction from recordings, PII scrubbing middleware, correction capture, and result visualization. Primary benchmark target is WAA (Windows Agent Arena). ## Quick Start @@ -216,35 +216,212 @@ Run `oa-vm --help` for the full list of 50+ commands. ``` openadapt_evals/ -+-- agents/ # Agent implementations -| +-- base.py # BenchmarkAgent ABC -| +-- api_agent.py # ApiAgent (Claude, GPT) -| +-- retrieval_agent.py# RetrievalAugmentedAgent -| +-- policy_agent.py # PolicyAgent (trained models) -+-- adapters/ # Benchmark adapters -| +-- base.py # BenchmarkAdapter ABC + data classes -| +-- waa/ # WAA live, mock, and local adapters -+-- infrastructure/ # Azure VM and pool management -| +-- azure_vm.py # AzureVMManager (SDK + az CLI) -| +-- pool.py # PoolManager (multi-VM orchestration) -| +-- ssh_tunnel.py # SSHTunnelManager -| +-- vm_monitor.py # VMMonitor dashboard -| +-- resource_tracker.py# Cost tracking -+-- benchmarks/ # Evaluation runner, CLI, viewers -| +-- runner.py # evaluate_agent_on_benchmark() -| +-- cli.py # Benchmark CLI (run, mock, live, view) -| +-- vm_cli.py # VM/Pool CLI (oa-vm, 50+ commands) -| +-- viewer.py # HTML results viewer -| +-- pool_viewer.py # Pool results viewer -| +-- trace_export.py # Training data export -+-- waa_deploy/ # Docker agent deployment -+-- server/ # WAA server extensions (/evaluate 
endpoint) -+-- config.py # Settings (pydantic-settings, .env) ++-- agents/ # Agent implementations +| +-- base.py # BenchmarkAgent ABC +| +-- api_agent.py # ApiAgent (Claude, GPT) with demo persistence +| +-- planner_grounder_agent.py # PlannerGrounderAgent (dual-model) +| +-- retrieval_agent.py # RetrievalAugmentedAgent +| +-- policy_agent.py # PolicyAgent (trained models) +| +-- claude_computer_use_agent.py # Claude CU native agent ++-- adapters/ # Benchmark adapters +| +-- base.py # BenchmarkAdapter ABC + data classes +| +-- waa/ # WAA live + mock adapters +| +-- local/ # LocalAdapter (native desktop, no VM) +| +-- rl_env.py # RLEnvironment (Gymnasium-style wrapper) +| +-- scrub_middleware.py # ScrubMiddleware (PII removal) +| +-- verl_env.py # verl-compatible environment wrapper ++-- openenv/ # OpenEnv-compatible environment +| +-- environment.py # WAAOpenEnvEnvironment +| +-- models.py # WAAAction, WAAObservation, WAAState +| +-- server.py # HTTP+WebSocket server ++-- training/ # RL training infrastructure +| +-- trl_rollout.py # TRL GRPOTrainer rollout_func +| +-- areal_workflow.py # AReaL AgentWorkflow wrapper +| +-- trajectory_logger.py # PlannerTrajectoryLogger (SFT data) +| +-- planner_cache.py # PlannerCache (pHash-based dedup) ++-- workflow/ # Workflow extraction pipeline +| +-- models.py # Pydantic models (Recording, Transcript, Workflow) +| +-- pipeline/ # 4-pass pipeline +| | +-- scrub.py # Pass 0: PII scrubbing +| | +-- transcript.py # Pass 1: VLM transcript generation +| | +-- extract.py # Pass 2: Structured workflow extraction +| | +-- match.py # Pass 3: Cosine similarity matching +| +-- adapters/ # Recording source adapters +| +-- waa.py # WAA VNC recording adapter ++-- evaluation/ # Evaluation framework +| +-- builtin_verifiers.py # Built-in task verifiers +| +-- verifier_registry.py # Verifier discovery + dispatch +| +-- client.py # Evaluation client ++-- infrastructure/ # Azure/AWS VM and pool management +| +-- azure_vm.py # AzureVMManager 
(SDK + az CLI) +| +-- pool.py # PoolManager (multi-VM orchestration) +| +-- ssh_tunnel.py # SSHTunnelManager +| +-- vm_monitor.py # VMMonitor dashboard +| +-- resource_tracker.py # Cost tracking ++-- benchmarks/ # Evaluation runner, CLI, viewers +| +-- runner.py # evaluate_agent_on_benchmark() +| +-- cli.py # Benchmark CLI (run, mock, live, view) +| +-- vm_cli.py # VM/Pool CLI (oa-vm, 50+ commands) +| +-- viewer.py # HTML results viewer +| +-- pool_viewer.py # Pool results viewer +| +-- trace_export.py # Training data export (openadapt-ml + lightweight) ++-- task_config.py # YAML/JSON custom task definitions ++-- correction_capture.py # Human correction capture (flywheel) ++-- correction_store.py # Correction library (JSON-file-based) ++-- correction_parser.py # VLM-based correction parsing ++-- waa_deploy/ # Docker agent deployment ++-- server/ # WAA server extensions (/evaluate endpoint) ++-- config.py # Settings (pydantic-settings, .env) +-- __init__.py ``` --- +## PlannerGrounderAgent + +Dual-model architecture separating "what to do" (planner) from "where to click" (grounder). The planner sees the screenshot + accessibility tree and outputs structured JSON instructions. The grounder translates those into precise pixel coordinates. + +Key features: +- **Structured output**: Planner returns `{decision, action_type, action_value, target_description, reasoning}` as JSON +- **Action queue**: Multi-step plans can be queued and executed sequentially +- **Anti-loop detection**: Detects repeated identical actions and triggers recovery (PR #148) +- **Double-click support**: Native `double_click` action type +- **Pluggable models**: Planner and grounder can be different providers (e.g. 
Claude planner + GPT grounder, or local model via HTTP) +- **Training hooks**: Accepts `PlannerTrajectoryLogger` and `PlannerCache` for SFT data collection and cost reduction + +```python +from openadapt_evals.agents import PlannerGrounderAgent + +agent = PlannerGrounderAgent( + planner="claude-sonnet-4-20250514", + grounder="gpt-4.1-mini", + planner_provider="anthropic", + grounder_provider="openai", +) +``` + +--- + +## TaskConfig (Custom Tasks) + +Define tasks in YAML or native WAA JSON without forking WAA. Supports setup commands, milestone-based dense rewards, and multiple evaluation check types. + +```yaml +# tasks/change-font.yaml +id: change-font-arial +instruction: "Change the default font to Arial in WordPad" +setup: + - type: open_app + app: wordpad +checks: + - check: screenshot + description: "Font is set to Arial" +milestones: + - description: "WordPad is open" + reward: 0.25 + - description: "Font dropdown is open" + reward: 0.25 + - description: "Arial is selected" + reward: 0.5 +``` + +```python +from openadapt_evals.task_config import TaskConfig + +tasks = TaskConfig.from_dir("tasks/") # YAML + JSON auto-detected +task = TaskConfig.from_waa_json("examples/writer/abc123.json") # WAA native format +``` + +Task setup commands are dispatched via `/execute_windows` on the WAA server. All 13+ WAA config entry types are handled (PR #153, #157): `open_app`, `download_file`, `add_bookmark`, `update_browse_history`, `copy_file`, etc. + +**Strict mode** (PR #154): Pass `--strict` to prevent silent fallback degradation during benchmarking. Raises errors instead of silently skipping unsupported features. 
+ +--- + +## Workflow Extraction Pipeline + +4-pass pipeline for extracting structured workflows from desktop recordings: + +| Pass | Module | Input | Output | +|------|--------|-------|--------| +| 0 | `workflow/pipeline/scrub.py` | Raw recording | Scrubbed recording (PII removed) | +| 1 | `workflow/pipeline/transcript.py` | Scrubbed recording | `EpisodeTranscript` (VLM-narrated) | +| 2 | `workflow/pipeline/extract.py` | Transcript | `Workflow` (structured steps) | +| 3 | `workflow/pipeline/match.py` | Workflow | Matched `CanonicalWorkflow` (cosine similarity) | + +Recording sources: `native_capture` (openadapt-capture), `waa_vnc`, `screen_recording`, `imported`. Models are defined in `workflow/models.py` (Pydantic). + +--- + +## RL Training Infrastructure + +### RLEnvironment + +Gymnasium-style wrapper (`reset`/`step`/`observe`/`evaluate`) around any `BenchmarkAdapter`. Supports both sparse (outcome-only) and dense (milestone-based) rewards. + +```python +from openadapt_evals.adapters.rl_env import RLEnvironment + +env = RLEnvironment(adapter, default_task_id="", evaluate_every_step=True) +obs = env.reset() +step = env.step(action) +print(step.info["evaluation_score"]) +``` + +### TRL GRPO Rollout + +`trl_rollout.py` implements `make_waa_rollout_func()` for TRL's `GRPOTrainer`. Runs multi-step episodes, collects action tokens/logprobs, and computes dense rewards via milestones. + +```python +from openadapt_evals.training.trl_rollout import make_waa_rollout_func + +rollout_func = make_waa_rollout_func(adapter=adapter, task_configs=tasks, max_steps=15) +trainer = GRPOTrainer(model=model, args=config, rollout_func=rollout_func, ...) +``` + +### AReaL Workflow + +`areal_workflow.py` wraps `WAADesktopEnv` into AReaL's `AgentWorkflow` pattern for distributed RL training. Uses an AsyncOpenAI client pointed at AReaL's proxy for automatic logprob tracking. 
+ +### OpenEnv Environment + +`openenv/environment.py` provides an OpenEnv-compatible environment (`WAAOpenEnvEnvironment`) that can be served as an HTTP+WebSocket server via `create_app()`. + +### Training Utilities + +- **PlannerTrajectoryLogger** (`training/trajectory_logger.py`): Saves planner inputs/outputs as JSONL + screenshot PNGs for SFT data collection. Auto-deletes failed episodes. +- **PlannerCache** (`training/planner_cache.py`): Perceptual hash (pHash) based caching of planner API responses. Reduces API costs during GRPO training rollouts. + +--- + +## LocalAdapter + ScrubMiddleware + +**LocalAdapter** (`adapters/local/adapter.py`): Runs on the local machine using `mss` for screenshots and `pynput` for input. No VM required. Handles macOS Retina coordinate scaling automatically. + +**ScrubMiddleware** (`adapters/scrub_middleware.py`): Wraps any adapter with PII scrubbing (via `openadapt-privacy` / Presidio). Every screenshot is scrubbed before the agent sees it. Original screenshots are stored for audit. + +```python +from openadapt_evals.adapters.local import LocalAdapter +from openadapt_evals.adapters.scrub_middleware import ScrubMiddleware + +adapter = ScrubMiddleware(LocalAdapter(action_delay=0.5)) +obs = adapter.observe() # PII scrubbed +``` + +--- + +## Correction Flywheel + +Captures human corrections when an agent fails and stores them for retrieval during future episodes: + +- `correction_capture.py`: Records corrections via openadapt-capture (or PIL fallback) +- `correction_store.py`: JSON-file-based library with fuzzy retrieval by task_id + step description +- `correction_parser.py`: VLM-based parsing of correction recordings + +CLI flags: `--correction-library ./corrections --enable-correction-capture` + +--- + ## Demo Persistence (ApiAgent) The `ApiAgent` includes the demo at EVERY step, not just step 1. This fixes the "100% first-action success / 0% episode success" problem. 
@@ -323,16 +500,31 @@ openadapt-evals mock --tasks 5 ## Key Files -| File | Description | -|-------------------------------|--------------------------------------| -| `agents/api_agent.py` | ApiAgent with demo persistence | -| `agents/retrieval_agent.py` | Auto demo selection | -| `adapters/waa/` | WAA live, mock, local adapters | -| `benchmarks/cli.py` | Benchmark CLI entry point | -| `benchmarks/vm_cli.py` | VM/Pool CLI (oa-vm, 50+ commands) | -| `infrastructure/azure_vm.py` | AzureVMManager | -| `infrastructure/pool.py` | PoolManager for parallel evaluation | -| `config.py` | Settings (pydantic-settings, .env) | +| File | Description | +|-------------------------------------|------------------------------------------------| +| `agents/planner_grounder_agent.py` | PlannerGrounderAgent (dual-model, structured) | +| `agents/api_agent.py` | ApiAgent with demo persistence | +| `agents/retrieval_agent.py` | Auto demo selection | +| `adapters/waa/` | WAA live + mock adapters | +| `adapters/local/adapter.py` | LocalAdapter (native desktop, no VM) | +| `adapters/rl_env.py` | RLEnvironment (Gymnasium-style RL wrapper) | +| `adapters/scrub_middleware.py` | ScrubMiddleware (PII removal via Presidio) | +| `openenv/environment.py` | WAAOpenEnvEnvironment (OpenEnv-compatible) | +| `training/trl_rollout.py` | TRL GRPO rollout_func | +| `training/areal_workflow.py` | AReaL AgentWorkflow wrapper | +| `training/trajectory_logger.py` | SFT data collection from planner calls | +| `training/planner_cache.py` | pHash-based planner response cache | +| `workflow/pipeline/` | 4-pass workflow extraction (scrub/transcript/extract/match) | +| `workflow/models.py` | Pydantic models for recordings + workflows | +| `task_config.py` | YAML/JSON custom task definitions | +| `correction_capture.py` | Human correction capture | +| `correction_store.py` | Correction library with fuzzy retrieval | +| `benchmarks/cli.py` | Benchmark CLI entry point | +| `benchmarks/vm_cli.py` | VM/Pool CLI (oa-vm, 
50+ commands) | +| `benchmarks/trace_export.py` | Training data export (openadapt-ml + lightweight) | +| `infrastructure/azure_vm.py` | AzureVMManager | +| `infrastructure/pool.py` | PoolManager for parallel evaluation | +| `config.py` | Settings (pydantic-settings, .env) | ## PyPI Publishing