diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index b00cb8a..791f438 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -13,5 +13,5 @@ {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"} {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"} {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"} -{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 28: 6 design docs created (code health, marketing, CLI DX, testing, infra, docs). Marketing materials drafted and polished. Prioritization documented in STATUS.md. Tier 1 blockers identified: version fix, PyAutoGUI fail-safe recovery, socat systemd service, auto-open viewer. Next: implement Tier 1 items to unblock reliable eval runs.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-28T11:25:44.494548-05:00"} +{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-01: GPU grant applications reviewed and rewritten (11 files). Writing done, blocked on eval results (DC signal on harder tasks). Detailed status tracked in openadapt-internal (private repo).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-01T23:35:11.042286-05:00"} {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"} diff --git a/README.md b/README.md index ea1e2d5..c92dc4e 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ The Dockerfile also pre-downloads LibreOffice at build time with dynamic version When a task config includes `related_apps`, the live adapter automatically prepends a `verify_apps` step before the task's setup config. The `--verify` flag on `record_waa_demos.py` provides a pre-flight check across all tasks before starting a recording session. -![LibreOffice Calc installed on Windows 11 VM](https://github.com/OpenAdaptAI/openadapt-evals/releases/download/untagged-42f27f1e47214aae8358/waa_libreoffice.png) +![LibreOffice Calc running inside Windows 11 QEMU VM via noVNC in Chrome](screenshots/waa_libreoffice_desktop.png) ## CLI Reference diff --git a/demo_prompts/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt b/demo_prompts/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt new file mode 100644 index 0000000..bfd2f76 --- /dev/null +++ b/demo_prompts/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt @@ -0,0 +1,67 @@ +DEMONSTRATION: +Task: In a new sheet with 4 headers "Year", "CA changes", "FA changes", and "OA changes", calculate the annual changes for the Current Assets, Fixed Assets, and Other Assets columns. Set the results as percentage type. + +Step 1: + Action: Right-click on the "Sheet1" tab at the bottom and select "Insert Sheet" or "New Sheet" + +Step 2: + Action: Click cell A1 and type "Year" + +Step 3: + Action: Press Tab and type "CA changes" + +Step 4: + Action: Press Tab and type "FA changes" + +Step 5: + Action: Press Tab and type "OA changes" + +Step 6: + Action: Click cell A2 and type "2015" + +Step 7: + Action: Press Enter and type "2016" + +Step 8: + Action: Press Enter and type "2017" + +Step 9: + Action: Press Enter and type "2018" + +Step 10: + Action: Press Enter and type "2019" + +Step 11: + Action: Click cell B2 and type "=(Sheet1.B3-Sheet1.B2)/Sheet1.B2" + +Step 12: + Action: Press Enter + +Step 13: + Action: Click cell B2, then drag the fill handle down to B6 + +Step 14: + Action: Click cell C2 and type "=(Sheet1.C3-Sheet1.C2)/Sheet1.C2" + +Step 15: + Action: Press Enter + +Step 16: + Action: Click cell C2, then drag the fill handle down to C6 + +Step 17: + Action: Click cell D2 and type "=(Sheet1.D3-Sheet1.D2)/Sheet1.D2" + +Step 18: + Action: Press Enter + +Step 19: + Action: Click cell D2, then drag the fill handle down to D6 + +Step 20: + Action: Click and drag to select cells B2:D6 + +Step 21: + Action: Click the % button in the toolbar (or press Ctrl+Shift+5) + +--- diff --git a/demo_prompts_vlm/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt b/demo_prompts_vlm/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt new file mode 100644 index 0000000..0dc3c7c --- /dev/null +++ b/demo_prompts_vlm/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt @@ -0,0 +1,130 @@ +DEMONSTRATION: +Task: In a new sheet with 4 headers "Year", "CA changes", "FA changes", and "OA changes", calculate the annual changes for the Current Assets, Fixed Assets, and Other Assets columns. Set the results as percentage type. + +Step 1: + Observation: The spreadsheet is open to "Sheet1," which contains financial data with columns for years, assets, and liabilities. + Intent: To create a new sheet for calculating and displaying annual changes in asset categories. + Action: Right-click on the "Sheet1" tab at the bottom and select "Insert Sheet" or "New Sheet". + Result: A new, blank sheet named "Sheet2" is added to the workbook and displayed. + +Step 2: + Observation: The new sheet is blank, with cell A1 selected and ready for input. + Intent: To create a header row for organizing annual asset changes. + Action: Click cell A1 and type "Year" + Result: The text "Year" now appears in cell A1. + +Step 3: + Observation: The new sheet contains only the header "Year" in cell A1, with the cursor positioned in cell B1. + Intent: To add the next header, "CA changes," as part of setting up the required columns for annual asset changes. + Action: Press Tab and type "CA changes" + Result: The header "CA changes" is entered in cell B1, and the sheet now displays two headers: "Year" and "CA changes." + +Step 4: + Observation: The new sheet contains two headers, "Year" in cell A1 and "CA changes" in cell B1, with the cursor positioned in cell C1. + Intent: To continue setting up the required headers for calculating annual changes in asset categories. + Action: Press Tab and type "FA changes". + Result: The header "FA changes" is added to cell C1. + +Step 5: + Observation: The new sheet contains three headers: "Year", "CA changes", and "FA changes" in the first row. + Intent: To complete the set of required headers by adding "OA changes" as the fourth column. + Action: Press Tab and type "OA changes". + Result: The "OA changes" header is added in cell D1, completing the four required headers in the first row. + +Step 6: + Observation: The new sheet contains four headers: "Year", "CA changes", "FA changes", and "OA changes", with all cells below the headers empty. + Intent: To begin entering annual data by specifying the first year in the "Year" column. + Action: Click cell A2 and type "2015". + Result: The value "2015" appears in cell A2 under the "Year" header. + +Step 7: + Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes" in row 1, with cell A2 currently empty. + Intent: To begin entering the years for which asset changes will be calculated. + Action: Press Enter and type "2016" in cell A2. + Result: The value "2016" appears in cell A2, starting the list of years under the "Year" header. + +Step 8: + Observation: The new sheet contains the headers "Year", "CA changes", "FA changes", and "OA changes" in row 1, and "2016" is entered in cell A2. + Intent: To begin listing the years for which annual changes will be calculated. + Action: Press Enter and type "2017". + Result: "2017" is entered into cell A3, directly below "2016" in the "Year" column. + +Step 9: + Observation: The new sheet contains the headers "Year", "CA changes", "FA changes", and "OA changes", with the years 2016 and 2017 already listed in the "Year" column. + Intent: To continue populating the "Year" column with the next chronological year for annual change calculations. + Action: Press Enter and type "2018". + Result: The value "2018" appears in the next row of the "Year" column. + +Step 10: + Observation: The new sheet contains four headers ("Year", "CA changes", "FA changes", "OA changes") and a column of years from 2015 to 2018 under "Year". + Intent: To add the next year in the sequence to the "Year" column in preparation for calculating annual changes. + Action: Press Enter and type "2019". + Result: The value "2019" appears in the next row under the "Year" column. + +Step 11: + Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes", with years listed in column A, but no formulas or data in the other columns. + Intent: To calculate the annual percentage change for Current Assets using data from Sheet1. + Action: Click cell B2 and type "=(Sheet1.B3-Sheet1.B2)/Sheet1.B2". + Result: Cell B2 now contains a formula to compute the percentage change in Current Assets between two years from Sheet1. + +Step 12: + Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes", and the first row under "Year" is filled with "2016", while the "CA changes" cell (B2) contains a formula referencing another sheet. + Intent: To confirm and apply the formula entered in cell B2 for calculating annual changes in Current Assets. + Action: Press Enter + Result: The formula in cell B2 is executed, displaying the calculated percentage change for "CA changes" in 2016. + +Step 13: + Observation: The sheet displays headers "Year", "CA changes", "FA changes", and "OA changes", with cell B2 containing a formula and the rest of column B empty. + Intent: To copy the formula in B2 down the column so that annual changes for "CA changes" are calculated for each year. + Action: Click cell B2, then drag the fill handle down to B6. + Result: The formula from B2 is filled down to B6, calculating "CA changes" for all listed years. + +Step 14: + Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes", and cell C2 is empty. + Intent: To calculate the annual change for Fixed Assets as a percentage using data from Sheet1. + Action: Click cell C2 and type "=(Sheet1.C3-Sheet1.C2)/Sheet1.C2" + Result: Cell C2 now contains the formula to compute the percentage change in Fixed Assets between two years from Sheet1. + +Step 15: + Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes" in row 1, and the first row of data is being entered in row 2. + Intent: To confirm or complete the entry of a formula or value in the spreadsheet. + Action: Press Enter. + Result: The cell selection moves from the current cell (D2) to the next cell below (D3), confirming the entry in D2. + +Step 16: + Observation: The sheet displays headers "Year", "CA changes", "FA changes", and "OA changes" with a formula present in cell C2, but the rest of column C is empty. + Intent: To apply the formula in C2 to the rest of the rows in the "CA changes" column for consistent annual change calculations. + Action: Click cell C2, then drag the fill handle down to C6. + Result: The formula from C2 is copied down to C6, populating the "CA changes" column with calculated values for each year. + +Step 17: + Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes" in row 1, with all other cells empty. + Intent: To calculate the annual percentage change for the "OA changes" column using data from Sheet1. + Action: Click cell D2 and type "=(Sheet1.D3-Sheet1.D2)/Sheet1.D2" + Result: Cell D2 is populated with the formula to compute the percentage change for "OA changes" between two years. + +Step 18: + Observation: The formula `=(C3-C2)/C2` is being entered in cell D3 to calculate the percentage change for "OA changes". + Intent: To compute the annual percentage change for Other Assets in the new sheet. + Action: Press Enter + Result: The formula is executed in cell D3, displaying the calculated percentage change for Other Assets. + +Step 19: + Observation: The sheet displays headers "Year", "CA changes", "FA changes", and "OA changes" with a formula present only in cell D2. + Intent: To copy the formula in D2 down the "OA changes" column for all relevant rows. + Action: Click cell D2, then drag the fill handle down to D6. + Result: The formula from D2 is filled down through D6, calculating values for each row in the "OA changes" column. + +Step 20: + Observation: The new sheet contains four headers ("Year", "CA changes", "FA changes", "OA changes") and calculated annual changes for each asset type in columns B, C, and D. + Intent: To select the range of calculated annual changes and their corresponding years for further formatting or analysis. + Action: Click and drag to select cells B2:D6. + Result: Cells B2:D6 are highlighted, indicating they are selected for the next operation. + +Step 21: + Observation: The annual changes for "CA changes", "FA changes", and "OA changes" are displayed as decimal values in columns B, C, and D. + Intent: To format the annual change values as percentages for better readability. + Action: Click the % button in the toolbar (or press Ctrl+Shift+5). + Result: The values in columns B, C, and D are now displayed as percentages. + +--- diff --git a/openadapt_evals/infrastructure/__init__.py b/openadapt_evals/infrastructure/__init__.py index 7b6d725..a557e31 100644 --- a/openadapt_evals/infrastructure/__init__.py +++ b/openadapt_evals/infrastructure/__init__.py @@ -22,8 +22,12 @@ # Restart Windows inside QEMU from openadapt_evals.infrastructure import QEMUResetManager - mgr = QEMUResetManager(vm_ip="172.173.66.131") + mgr = QEMUResetManager(vm_ip="10.0.0.1") success, msg = mgr.restart_windows() + + # Auto-detect VM IP + from openadapt_evals.infrastructure import resolve_vm_ip + ip = resolve_vm_ip() # pool registry → Azure CLI ``` """ @@ -31,7 +35,12 @@ from openadapt_evals.infrastructure.azure_vm import AzureVMManager from openadapt_evals.infrastructure.pool import PoolManager, PoolRunResult from openadapt_evals.infrastructure.qemu_reset import QEMUResetManager +from openadapt_evals.infrastructure.screen_stability import ( + compare_screenshots, + wait_for_stable_screen, +) from openadapt_evals.infrastructure.ssh_tunnel import SSHTunnelManager, get_tunnel_manager +from openadapt_evals.infrastructure.vm_ip import resolve_vm_ip from openadapt_evals.infrastructure.vm_monitor import VMMonitor, VMConfig __all__ = [ @@ -43,5 +52,8 @@ "VMMonitor", "VMConfig", "SSHTunnelManager", + "compare_screenshots", "get_tunnel_manager", + "resolve_vm_ip", + "wait_for_stable_screen", ] diff --git a/openadapt_evals/infrastructure/qemu_reset.py b/openadapt_evals/infrastructure/qemu_reset.py index 5ee9f1d..cdf0029 100644 --- a/openadapt_evals/infrastructure/qemu_reset.py +++ b/openadapt_evals/infrastructure/qemu_reset.py @@ -24,7 +24,7 @@ from openadapt_evals.infrastructure.qemu_reset import QEMUResetManager - mgr = QEMUResetManager(vm_ip="172.173.66.131") + mgr = QEMUResetManager(vm_ip="10.0.0.1") # Full restart: send reset + wait for WAA server success, message = mgr.restart_windows() diff --git a/openadapt_evals/infrastructure/screen_stability.py b/openadapt_evals/infrastructure/screen_stability.py new file mode 100644 index 0000000..088f35e --- /dev/null +++ b/openadapt_evals/infrastructure/screen_stability.py @@ -0,0 +1,128 @@ +"""Screen stability detection for WAA VM screenshots. + +Provides utilities for comparing screenshots and waiting until the VM +screen stabilises before proceeding. Extracted from +``scripts/record_waa_demos.py`` so that tests and other callers can +import the functions directly without ``importlib`` hacks. + +Example: + ```python + from openadapt_evals.infrastructure.screen_stability import ( + compare_screenshots, + wait_for_stable_screen, + ) + + similarity = compare_screenshots(png_a, png_b) + stable_png = wait_for_stable_screen("http://localhost:5001") + ``` +""" + +from __future__ import annotations + +import io +import time +from typing import Callable + + +def _take_screenshot(server: str) -> bytes: + """Take a screenshot from the WAA server, raising on failure.""" + import requests + + resp = requests.get(f"{server}/screenshot", timeout=30) + resp.raise_for_status() + return resp.content + + +def compare_screenshots(png_a: bytes, png_b: bytes) -> float: + """Compare two PNG screenshots and return pixel similarity (0.0-1.0). + + Uses raw RGB pixel comparison. Returns 1.0 for identical images. + The 99.5% threshold used by ``wait_for_stable_screen`` tolerates + minor differences like the taskbar clock updating (~0.14% of pixels) + or cursor blink. + """ + from PIL import Image + + img_a = Image.open(io.BytesIO(png_a)).convert("RGB") + img_b = Image.open(io.BytesIO(png_b)).convert("RGB") + + if img_a.size != img_b.size: + return 0.0 + + bytes_a = img_a.tobytes() + bytes_b = img_b.tobytes() + + if bytes_a == bytes_b: + return 1.0 # fast path: identical raw bytes + + # Vectorized comparison via numpy (already a transitive dep via open-clip-torch) + import numpy as np + + arr_a = np.frombuffer(bytes_a, dtype=np.uint8).reshape(-1, 3) + arr_b = np.frombuffer(bytes_b, dtype=np.uint8).reshape(-1, 3) + matching = int(np.all(arr_a == arr_b, axis=1).sum()) + return matching / arr_a.shape[0] + + +def wait_for_stable_screen( + server: str, + poll_interval: float = 2.0, + stability_timeout: float = 30.0, + similarity_threshold: float = 0.995, + required_stable_checks: int = 3, + screenshot_fn: Callable[[str], bytes] | None = None, +) -> bytes: + """Wait for the VM screen to stabilize, then return the screenshot. + + Polls the QEMU framebuffer (free -- local HTTP call) until + ``required_stable_checks`` consecutive screenshot pairs exceed + ``similarity_threshold``. With the defaults (3 checks at 2s + intervals), the screen must be stable for 6 seconds before + proceeding. + + Args: + server: WAA server URL (``/screenshot`` endpoint). + poll_interval: Seconds between screenshots. + stability_timeout: Maximum seconds to wait. If exceeded, the + last screenshot is returned with a warning. + similarity_threshold: Pixel-match fraction (0.0-1.0). 0.995 + tolerates taskbar clock and cursor blink. + required_stable_checks: Consecutive stable pairs required. + With poll_interval=2 and required_stable_checks=3, the + screen must be unchanged for 6 seconds. + screenshot_fn: Optional callable ``(server) -> bytes`` used to + capture a screenshot. Defaults to an internal helper that + calls ``GET {server}/screenshot``. + + Returns: + PNG screenshot bytes of the stable screen. + """ + take = screenshot_fn or _take_screenshot + + prev_png = take(server) + stable_count = 0 + deadline = time.time() + stability_timeout + + while time.time() < deadline: + time.sleep(poll_interval) + curr_png = take(server) + + similarity = compare_screenshots(prev_png, curr_png) + + if similarity >= similarity_threshold: + stable_count += 1 + if stable_count >= required_stable_checks: + elapsed = stability_timeout - (deadline - time.time()) + print(f" Screen stable after {elapsed:.0f}s " + f"({stable_count} checks, {similarity:.4f} similarity)") + return curr_png + else: + if stable_count > 0: + print(f" Screen changed ({similarity:.3f}), resetting stability count") + stable_count = 0 + + prev_png = curr_png + + print(f" WARNING: Screen did not stabilize within {stability_timeout:.0f}s. " + "Using last screenshot.") + return prev_png diff --git a/openadapt_evals/infrastructure/vm_ip.py b/openadapt_evals/infrastructure/vm_ip.py new file mode 100644 index 0000000..559a36e --- /dev/null +++ b/openadapt_evals/infrastructure/vm_ip.py @@ -0,0 +1,121 @@ +"""VM IP resolution with automatic detection. + +Resolves the VM IP address using a layered approach: + +1. Explicit IP (from ``--vm-ip`` argument) — instant +2. Pool registry (local JSON file) — instant, no Azure auth +3. Azure CLI query (``az vm list-ip-addresses``) — always accurate, ~3s + +Usage:: + + from openadapt_evals.infrastructure.vm_ip import resolve_vm_ip + + # Auto-detect (pool registry → Azure CLI) + ip = resolve_vm_ip() + + # Explicit override + ip = resolve_vm_ip(explicit_ip="10.0.0.1") +""" + +from __future__ import annotations + +import logging +from pathlib import Path + +logger = logging.getLogger(__name__) + + +def resolve_vm_ip(explicit_ip: str | None = None) -> str: + """Resolve the VM IP address. + + Resolution order: + + 1. ``explicit_ip`` — returned immediately if provided + 2. Pool registry — reads ``benchmark_results/vm_pool_registry.json`` + for the first active worker's IP + 3. Azure CLI query — calls ``AzureVMManager.get_vm_ip()`` (always accurate, ~3s) + + Args: + explicit_ip: If provided, returned as-is (from ``--vm-ip`` argument). + + Returns: + The resolved IP address string. + + Raises: + RuntimeError: If no running VM can be found by any method. + """ + if explicit_ip: + return explicit_ip + + # Try pool registry (fast, local file) + ip = _ip_from_pool_registry() + if ip: + logger.info("VM IP resolved from pool registry: %s", ip) + return ip + + # Fall back to Azure query (always accurate, ~3s) + ip = _ip_from_azure_query() + if ip: + logger.info("VM IP resolved from Azure query: %s", ip) + return ip + + raise RuntimeError( + "No running VM found. Either:\n" + " 1. Create one: oa-vm pool-create --workers 1\n" + " 2. Start it: oa-vm vm start\n" + " 3. Pass explicitly: --vm-ip " + ) + + +def _ip_from_pool_registry() -> str | None: + """Try to read VM IP from the pool registry file. + + Returns the IP of the first non-deleted, non-failed worker, or None. + """ + registry_path = Path("benchmark_results/vm_pool_registry.json") + if not registry_path.exists(): + return None + + try: + import json + + with open(registry_path) as f: + data = json.load(f) + + for worker in data.get("workers", []): + ip = worker.get("ip") + status = worker.get("status", "") + if ip and status not in ("deleted", "failed"): + return ip + except Exception as e: + logger.debug("Could not read pool registry: %s", e) + + return None + + +def _ip_from_azure_query() -> str | None: + """Query Azure for the VM IP address. + + Tries pool-style names (``waa-pool-00``) first, then falls back to + the legacy name (``waa-eval-vm``). + """ + try: + from openadapt_evals.config import settings + from openadapt_evals.infrastructure.azure_vm import AzureVMManager + + mgr = AzureVMManager(resource_group=settings.azure_resource_group) + + # Try pool-style name first (most common) + ip = mgr.get_vm_ip("waa-pool-00") + if ip: + return ip + + # Fallback to legacy name + ip = mgr.get_vm_ip("waa-eval-vm") + if ip: + return ip + + except Exception as e: + logger.debug("Azure VM IP query failed: %s", e) + + return None diff --git a/screenshots/waa_libreoffice_desktop.png b/screenshots/waa_libreoffice_desktop.png new file mode 100644 index 0000000..906161d Binary files /dev/null and b/screenshots/waa_libreoffice_desktop.png differ diff --git a/scripts/convert_recording_to_demo.py b/scripts/convert_recording_to_demo.py new file mode 100644 index 0000000..8931be5 --- /dev/null +++ b/scripts/convert_recording_to_demo.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +"""Convert WAA recordings (meta.json + screenshots) to demo text files. + +Produces demo .txt files in the format expected by eval-suite's --demo-dir. + +Two modes: + --mode text : Fast, free. Converts step descriptions from meta.json directly. + --mode vlm : Richer. Sends screenshots to a VLM for Observation/Intent/Action/Result. + +Usage: + # Text-only (instant, no API calls) + python scripts/convert_recording_to_demo.py \ + --recordings waa_recordings \ + --output demo_prompts \ + --mode text + + # VLM-enriched (sends screenshots to OpenAI/Anthropic) + python scripts/convert_recording_to_demo.py \ + --recordings waa_recordings \ + --output demo_prompts \ + --mode vlm \ + --provider openai + + # Override model (e.g. use cheaper model) + python scripts/convert_recording_to_demo.py \ + --recordings waa_recordings \ + --output demo_prompts \ + --mode vlm \ + --provider openai \ + --model gpt-4.1-mini +""" +from __future__ import annotations + +import base64 +import json +import re +import sys +from pathlib import Path + +from dotenv import load_dotenv + +load_dotenv() + +# Regex patterns for extracting key references from step descriptions +_CELL_REF_RE = re.compile(r'\b([A-Z]+\d+)\b') +_CELL_RANGE_RE = re.compile(r'\b([A-Z]+\d+)\s*[:\-]\s*([A-Z]+\d+)\b') +_FORMULA_RE = re.compile(r'"(=[^"]+)"') +_QUOTED_TEXT_RE = re.compile(r'"([^"]+)"') + + +def _extract_references(text: str) -> dict: + """Extract key references (cell refs, formulas, quoted text) from a step description. + + Returns a dict with: + cells: set of cell references like {'D2', 'B6'} + ranges: list of (start, end) tuples like [('B2', 'D6')] + formulas: list of formula strings + quoted: list of quoted text values + """ + formulas = _FORMULA_RE.findall(text) + # Remove formulas from text before extracting cell refs to avoid double-counting + text_no_formulas = _FORMULA_RE.sub('', text) + ranges = _CELL_RANGE_RE.findall(text_no_formulas) + # Remove ranges before extracting individual cells + text_no_ranges = _CELL_RANGE_RE.sub('', text_no_formulas) + cells = set(_CELL_REF_RE.findall(text_no_ranges)) + quoted = [q for q in _QUOTED_TEXT_RE.findall(text) if not q.startswith('=')] + return { + 'cells': cells, + 'ranges': ranges, + 'formulas': formulas, + 'quoted': quoted, + } + + +def _check_action_mismatch(ground_truth: str, vlm_action: str) -> str | None: + """Check if the VLM's Action line contradicts the ground-truth step. + + Returns a description of the mismatch, or None if consistent. + """ + gt_refs = _extract_references(ground_truth) + vlm_refs = _extract_references(vlm_action) + + mismatches = [] + + # Check cell references: VLM should not introduce cells that conflict with GT + if gt_refs['cells'] and vlm_refs['cells']: + # VLM cells that are NOT in the ground truth (potential hallucinations) + extra_cells = vlm_refs['cells'] - gt_refs['cells'] + # Only flag if VLM has cells that differ in row/col from GT cells + # e.g., GT says D2 but VLM says D3 + for gt_cell in gt_refs['cells']: + gt_col = re.match(r'([A-Z]+)', gt_cell).group(1) + for vlm_cell in extra_cells: + vlm_col = re.match(r'([A-Z]+)', vlm_cell).group(1) + if vlm_col == gt_col and vlm_cell != gt_cell: + mismatches.append( + f"cell ref {vlm_cell} in VLM vs {gt_cell} in ground truth" + ) + + # Check formulas + if gt_refs['formulas'] and vlm_refs['formulas']: + for gt_f, vlm_f in zip(gt_refs['formulas'], vlm_refs['formulas']): + if gt_f != vlm_f: + mismatches.append( + f"formula '{vlm_f}' in VLM vs '{gt_f}' in ground truth" + ) + + # Check quoted text values + if gt_refs['quoted'] and vlm_refs['quoted']: + gt_set = {q.lower() for q in gt_refs['quoted']} + for vlm_q in vlm_refs['quoted']: + if vlm_q.lower() not in gt_set: + # Check if it's a close but wrong value + mismatches.append( + f"quoted text '{vlm_q}' in VLM not in ground truth" + ) + + return '; '.join(mismatches) if mismatches else None + + +def _extract_annotation_field(annotation: str, field: str) -> str | None: + """Extract a specific field value from a VLM annotation string.""" + pattern = re.compile(rf'^{field}:\s*(.+)$', re.MULTILINE | re.IGNORECASE) + match = pattern.search(annotation) + return match.group(1).strip() if match else None + + +def _replace_annotation_field(annotation: str, field: str, new_value: str) -> str: + """Replace a specific field value in a VLM annotation string.""" + pattern = re.compile(rf'^({field}:\s*)(.+)$', re.MULTILINE | re.IGNORECASE) + return pattern.sub(rf'\g<1>{new_value}', annotation) + + +def convert_text(meta: dict) -> str: + """Convert recording meta.json to demo text using step descriptions only.""" + instruction = meta["instruction"] + steps = meta.get("steps", []) + + lines = [ + "DEMONSTRATION:", + f"Task: {instruction}", + "", + ] + + for i, step in enumerate(steps): + desc = step.get("suggested_step", f"(step {i + 1})") + lines.append(f"Step {i + 1}:") + lines.append(f" Action: {desc}") + lines.append("") + + lines.append("---") + return "\n".join(lines) + "\n" + + +def convert_vlm( + meta: dict, + task_dir: Path, + provider: str = "openai", + model: str | None = None, +) -> str: + """Convert recording to demo text using VLM to describe screenshots. + + The VLM is instructed to treat the recorded action from meta.json as + ground truth. After each VLM call, the Action field is validated against + the ground-truth step description. If key references (cell refs, formulas, + text values) are contradicted, the Action field is replaced with the + ground-truth description while preserving the VLM's Observation, Intent, + and Result fields. + """ + instruction = meta["instruction"] + steps = meta.get("steps", []) + num_steps = meta.get("num_steps", len(steps)) + + lines = [ + "DEMONSTRATION:", + f"Task: {instruction}", + "", + ] + + for i in range(num_steps): + step_desc = steps[i].get("suggested_step", "") if i < len(steps) else "" + before_path = task_dir / f"step_{i:02d}_before.png" + after_path = task_dir / f"step_{i:02d}_after.png" + + if not before_path.exists(): + # Fallback to text-only for this step + lines.append(f"Step {i + 1}:") + lines.append(f" Action: {step_desc}") + lines.append("") + continue + + print(f" Step {i + 1}/{num_steps}...", end=" ", flush=True) + + # Build VLM prompt with strong ground-truth constraint + images = [_encode_image(before_path)] + has_after = after_path.exists() + if has_after: + images.append(_encode_image(after_path)) + + prompt = ( + f"You are annotating step {i + 1} of {num_steps} in a desktop task demonstration.\n" + f"Task: {instruction}\n\n" + f"GROUND-TRUTH RECORDED ACTION (definitive — do NOT contradict this):\n" + f" {step_desc}\n\n" + f"The recorded action above is the exact action that was performed. It is the\n" + f"authoritative source for cell references (e.g. D2, B6), text values, formulas,\n" + f"and action types (CLICK, TYPE, DRAG, KEY). Your Action field MUST match these\n" + f"details exactly. Do not substitute different cell references, row numbers, or\n" + f"values based on your visual interpretation — the recording is ground truth.\n\n" + f"{'The first image is BEFORE the action, the second is AFTER.' if has_after else 'This image shows the screen BEFORE the action.'}\n\n" + f"Provide a concise annotation with exactly these fields:\n" + f"Observation: (what the screen shows before the action, ~1 sentence)\n" + f"Intent: (why this action is being taken, ~1 sentence)\n" + f"Action: (restate the recorded action above — you may add brief visual context\n" + f" but must preserve all cell refs, values, and formulas exactly)\n" + f"Result: (what changed after the action, ~1 sentence)\n\n" + f"IMPORTANT: The Action field must be faithful to the ground-truth recorded action.\n" + f"If the recorded action says cell D2, you must say D2 — not D3 or any other cell." + ) + + try: + annotation = _vlm_call(prompt, images, provider, model) + + # Post-hoc validation: check VLM Action against ground truth + vlm_action = _extract_annotation_field(annotation, "Action") + if vlm_action and step_desc: + mismatch = _check_action_mismatch(step_desc, vlm_action) + if mismatch: + print(f"MISMATCH ({mismatch}), replacing Action...", end=" ") + # Replace the Action field with the ground-truth description, + # preserving the VLM's Observation/Intent/Result + annotation = _replace_annotation_field( + annotation, "Action", step_desc + ) + + lines.append(f"Step {i + 1}:") + # Parse the annotation — it should have Observation/Intent/Action/Result lines + for line in annotation.strip().split("\n"): + line = line.strip() + if line: + lines.append(f" {line}") + lines.append("") + print("done") + except Exception as e: + print(f"error: {e}") + # Fallback to text-only + lines.append(f"Step {i + 1}:") + lines.append(f" Action: {step_desc}") + lines.append("") + + lines.append("---") + return "\n".join(lines) + "\n" + + +def _encode_image(path: Path) -> str: + """Read an image file and return base64-encoded string.""" + return base64.b64encode(path.read_bytes()).decode("utf-8") + + +def _vlm_call( + prompt: str, + images_b64: list[str], + provider: str = "openai", + model: str | None = None, +) -> str: + """Send a prompt with images to a VLM and return the response text.""" + if provider == "openai": + return _vlm_call_openai(prompt, images_b64, model or "gpt-4.1") + elif provider in ("anthropic", "claude"): + return _vlm_call_anthropic(prompt, images_b64, model or "claude-sonnet-4-20250514") + else: + raise ValueError(f"Unsupported provider: {provider}") + + +def _vlm_call_openai(prompt: str, images_b64: list[str], model: str) -> str: + import openai + + content: list[dict] = [{"type": "text", "text": prompt}] + for img in images_b64: + content.append({ + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{img}", "detail": "low"}, + }) + + client = openai.OpenAI() + resp = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": content}], + max_tokens=400, + temperature=0.0, + ) + return resp.choices[0].message.content + + +def _vlm_call_anthropic(prompt: str, images_b64: list[str], model: str) -> str: + import anthropic + + content: list[dict] = [{"type": "text", "text": prompt}] + for img in images_b64: + content.append({ + "type": "image", + "source": {"type": "base64", "media_type": "image/png", "data": img}, + }) + + client = anthropic.Anthropic() + resp = client.messages.create( + model=model, + messages=[{"role": "user", "content": content}], + max_tokens=400, + temperature=0.0, + ) + return resp.content[0].text + + +def main( + recordings: str = "waa_recordings", + output: str = "demo_prompts", + mode: str = "text", + provider: str = "openai", + model: str | None = None, + task: str | None = None, +) -> None: + """Convert WAA recordings to demo text files. + + Args: + recordings: Directory containing recording subdirectories. + output: Output directory for demo .txt files. + mode: "text" (free, instant) or "vlm" (richer, uses API). + provider: VLM provider for vlm mode ("openai" or "anthropic"). + model: Model override (default: gpt-4.1 for openai, claude-sonnet-4 for anthropic). + task: Specific task ID prefix to convert (default: all). + """ + recordings_dir = Path(recordings) + output_dir = Path(output) + output_dir.mkdir(parents=True, exist_ok=True) + + # Find task directories with meta.json + task_dirs = sorted( + d for d in recordings_dir.iterdir() + if d.is_dir() and (d / "meta.json").exists() + ) + + if task: + task_dirs = [d for d in task_dirs if d.name.startswith(task)] + + if not task_dirs: + print(f"No recordings found in {recordings_dir}") + sys.exit(1) + + effective_model = model + if not effective_model: + effective_model = ( + "gpt-4.1" if provider == "openai" else "claude-sonnet-4-20250514" + ) + print(f"Converting {len(task_dirs)} recording(s) to demo text (mode={mode})") + if mode == "vlm": + print(f" Provider: {provider}, Model: {effective_model}") + print() + + for task_dir in task_dirs: + meta = json.loads((task_dir / "meta.json").read_text(encoding="utf-8")) + task_id = meta["task_id"] + num_steps = meta.get("num_steps", len(meta.get("steps", []))) + + print(f"{'=' * 60}") + print(f"Task: {task_id[:40]}... ({num_steps} steps)") + print(f"{'=' * 60}") + + if mode == "vlm": + demo_text = convert_vlm(meta, task_dir, provider, model) + else: + demo_text = convert_text(meta) + + txt_path = output_dir / f"{task_id}.txt" + txt_path.write_text(demo_text, encoding="utf-8") + print(f" -> {txt_path}") + print() + + print("Done. Use with eval-suite:") + print(f" openadapt-evals eval-suite --demo-dir {output_dir} --tasks ...") + + +if __name__ == "__main__": + import fire + + fire.Fire(main) diff --git a/scripts/record_waa_demos.py b/scripts/record_waa_demos.py index 07dcd67..e0b5eb0 100644 --- a/scripts/record_waa_demos.py +++ b/scripts/record_waa_demos.py @@ -31,6 +31,7 @@ from __future__ import annotations import json +import socket import subprocess import sys import time @@ -470,6 +471,63 @@ def _take_screenshot(server: str) -> bytes: return resp.content +def _wait_for_stable_screen( + server: str, + poll_interval: float = 2.0, + stability_timeout: float = 30.0, + similarity_threshold: float = 0.995, + required_stable_checks: int = 3, +) -> bytes: + """Wait for the VM screen to stabilize, then return the screenshot. + + Delegates to :func:`openadapt_evals.infrastructure.screen_stability.wait_for_stable_screen`. + """ + from openadapt_evals.infrastructure.screen_stability import wait_for_stable_screen + + return wait_for_stable_screen( + server, + poll_interval=poll_interval, + stability_timeout=stability_timeout, + similarity_threshold=similarity_threshold, + required_stable_checks=required_stable_checks, + ) + + +def _build_setup_desc(task_config: dict) -> str: + """Build a human-readable description of the task setup actions.""" + parts = [] + for entry in task_config.get("config", []): + t = entry.get("type", "") + p = entry.get("parameters", {}) + if t == "download": + for f in p.get("files", []): + parts.append(f" - File downloaded: {f.get('path', '?')}") + elif t == "open": + parts.append(f" - File opened: {p.get('path', '?')}") + elif t == "launch": + parts.append(f" - App launched: {p.get('command', '?')}") + return "\n".join(parts) if parts else " (none)" + + +def _vlm_call( + messages: list[dict], + api_key: str, + model: str = "gpt-4o", + max_tokens: int = 800, +) -> str: + """Send a chat completion request to OpenAI. Returns the text response.""" + import requests as _req + + resp = _req.post( + "https://api.openai.com/v1/chat/completions", + headers={"Authorization": f"Bearer {api_key}"}, + json={"model": model, "max_tokens": max_tokens, "messages": messages}, + timeout=60, + ) + resp.raise_for_status() + return resp.json()["choices"][0]["message"]["content"] + + def _generate_steps( screenshot_png: bytes, instruction: str, @@ -486,28 +544,15 @@ def _generate_steps( if not api_key: return "(No OPENAI_API_KEY set — skipping AI step generation)" - import requests as _req - b64 = base64.b64encode(screenshot_png).decode() - - setup_desc = "" - for entry in task_config.get("config", []): - t = entry.get("type", "") - p = entry.get("parameters", {}) - if t == "download": - for f in p.get("files", []): - setup_desc += f" - File downloaded: {f['path']}\n" - elif t == "open": - setup_desc += f" - File opened: {p.get('path', '?')}\n" - elif t == "launch": - setup_desc += f" - App launched: {p.get('command', '?')}\n" + setup_desc = _build_setup_desc(task_config) prompt = f"""You are helping a human perform a task on a Windows desktop via VNC. TASK: {instruction} ENVIRONMENT SETUP (already done automatically): -{setup_desc if setup_desc else " (none)"} +{setup_desc} Look at the screenshot and give step-by-step instructions to complete the task. Be specific about what to click, what to type, and what menus to use. @@ -515,45 +560,908 @@ def _generate_steps( Keep each step to one action. Use plain text, numbered list. Be concise — the user will read this on a phone screen.""" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{b64}", + "detail": "high", + }, + }, + ], + } + ] + try: - resp = _req.post( - "https://api.openai.com/v1/chat/completions", - headers={"Authorization": f"Bearer {api_key}"}, - json={ - "model": "gpt-4o", - "max_tokens": 800, - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": prompt}, - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{b64}", - "detail": "high", - }, - }, - ], - } - ], - }, - timeout=30, - ) - resp.raise_for_status() - return resp.json()["choices"][0]["message"]["content"] + return _vlm_call(messages, api_key) except Exception as e: return f"(AI step generation failed: {e})" +def _refine_steps( + screenshot_png: bytes, + instruction: str, + task_config: dict, + current_steps: str, + feedback: str, +) -> str: + """Refine suggested steps based on user feedback. + + Sends the original screenshot, instruction, current steps, and the + user's correction back to the VLM for a revised set of steps. + """ + import base64 + import os + + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + return current_steps + + b64 = base64.b64encode(screenshot_png).decode() + setup_desc = _build_setup_desc(task_config) + + prompt = f"""You are helping a human perform a task on a Windows desktop via VNC. + +TASK: {instruction} + +ENVIRONMENT SETUP (already done automatically): +{setup_desc} + +You previously suggested these steps: + +{current_steps} + +The user says this is wrong and provides this feedback: + +"{feedback}" + +Look at the screenshot again and produce CORRECTED step-by-step instructions +that address the user's feedback. Keep the same format: plain text, numbered list, +one action per step, concise.""" + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{b64}", + "detail": "high", + }, + }, + ], + } + ] + + try: + return _vlm_call(messages, api_key) + except Exception as e: + print(f" (Refinement failed: {e})") + return current_steps + + +def _parse_step_list(steps_text: str) -> list[str]: + """Parse numbered VLM output into a list of step strings. + + Handles formats like "1. Do X", "1) Do X", "1: Do X". + Falls back to non-empty lines if no numbered items are found. + """ + import re + + lines = steps_text.strip().splitlines() + steps: list[str] = [] + current_step: list[str] = [] + + for line in lines: + # Check if this line starts a new numbered step + m = re.match(r"^\s*\d+[\.\)\:]\s+(.*)", line) + if m: + if current_step: + steps.append(" ".join(current_step)) + current_step = [m.group(1).strip()] + elif current_step: + # Continuation line for the current step + stripped = line.strip() + if stripped: + current_step.append(stripped) + + if current_step: + steps.append(" ".join(current_step)) + + # Fallback: if no numbered items found, split on non-empty lines + if not steps: + steps = [line.strip() for line in lines if line.strip()] + + return steps + + +def _format_step_list(steps: list[str], start_num: int = 1) -> str: + """Format a list of step strings back into numbered text.""" + return "\n".join( + f"{i}. {step}" for i, step in enumerate(steps, start=start_num) + ) + + +def _display_steps(steps_text: str) -> None: + """Pretty-print suggested steps in a box.""" + print() + print(" ┌─ SUGGESTED STEPS ──────────────────────────────") + for line in steps_text.splitlines(): + print(f" │ {line}") + print(" └────────────────────────────────────────────────") + print() + + +def _display_current_step(step_num: int, total: int, step_text: str) -> None: + """Display the current step in a box before the action prompt.""" + header = f" Step {step_num} of {total} " + width = max(len(header) + 4, len(step_text) + 6, 50) + bar = "─" * (width - len(header) - 2) + print(f"\n ──{header}{bar}") + print(f" │ {step_text}") + print(f" └{'─' * (width)}") + + +def _interactive_step_review( + screenshot_png: bytes, + instruction: str, + task_config: dict, + initial_steps: str, +) -> str: + """Let the user review and iteratively correct the suggested steps. + + The user can press Enter to accept, or type feedback to refine. + Returns the final accepted steps text. + """ + current = initial_steps + while True: + correction = input( + " Press Enter to accept steps, or type correction: " + ).strip() + if not correction: + return current + print(" Refining steps...") + current = _refine_steps( + screenshot_png, instruction, task_config, current, correction, + ) + _display_steps(current) + + +def _refine_remaining_steps( + screenshot_png: bytes, + instruction: str, + task_config: dict, + completed_steps: list[str], + remaining_steps: list[str], + feedback: str, +) -> str: + """Refine remaining steps based on user feedback mid-recording. + + Takes a FRESH screenshot (current screen state after completed steps) + and sends it along with the context of what's been done and what remains. + Returns the raw VLM text for remaining steps. + """ + import base64 + import os + + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + return _format_step_list(remaining_steps) + + b64 = base64.b64encode(screenshot_png).decode() + setup_desc = _build_setup_desc(task_config) + + completed_text = ( + _format_step_list(completed_steps) + if completed_steps + else "(none)" + ) + remaining_text = _format_step_list(remaining_steps) + current_step_num = len(completed_steps) + 1 + + prompt = f"""You are helping a human perform a task on a Windows desktop via VNC. + +TASK: {instruction} + +ENVIRONMENT SETUP (already done automatically): +{setup_desc} + +The user has already completed these steps: +{completed_text} + +You previously suggested these REMAINING steps (not yet performed): +{remaining_text} + +The user is currently on step {current_step_num} and says: +"{feedback}" + +Look at the CURRENT screenshot (taken just now, after the completed steps) +and produce CORRECTED remaining steps that address the user's feedback. +Only output the remaining steps (do not repeat completed steps). +Keep the same format: plain text, numbered list starting from 1, +one action per step, concise.""" + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{b64}", + "detail": "high", + }, + }, + ], + } + ] + + try: + return _vlm_call(messages, api_key) + except Exception as e: + print(f" (Refinement failed: {e})") + return _format_step_list(remaining_steps) + + +def _interactive_remaining_review( + server: str, + instruction: str, + task_config: dict, + completed_steps: list[str], + remaining_steps: list[str], + initial_feedback: str, +) -> list[str]: + """Mini review loop for mid-recording step refinement. + + Takes a fresh screenshot for each refinement round. + Returns the accepted list of remaining steps. + """ + # First refinement using the initial feedback + fresh_png = _take_screenshot(server) + refined_text = _refine_remaining_steps( + fresh_png, instruction, task_config, + completed_steps, remaining_steps, initial_feedback, + ) + new_steps = _parse_step_list(refined_text) + + if not new_steps: + print(" WARNING: VLM returned no steps. Keeping previous plan.") + return remaining_steps + + while True: + # Display corrected remaining steps + print() + print(" ┌─ CORRECTED REMAINING STEPS ─────────────────────") + for line in _format_step_list(new_steps).splitlines(): + print(f" │ {line}") + print(" └─────────────────────────────────────────────────") + print() + + correction = input( + " Press Enter to accept, or type another correction: " + ).strip() + if not correction: + return new_steps + + print(" Taking fresh screenshot and refining...") + fresh_png = _take_screenshot(server) + refined_text = _refine_remaining_steps( + fresh_png, instruction, task_config, + completed_steps, new_steps, correction, + ) + parsed = _parse_step_list(refined_text) + if parsed: + new_steps = parsed + else: + print(" WARNING: VLM returned no steps. Keeping previous plan.") + + +def _record_single_task( + task_id: str, + task_num: int, + total_tasks: int, + output_dir: Path, + server: str, + evaluate_url: str, + vnc_url: str, + vm_ip: str | None, +) -> str | None: + """Record a single task interactively via WAA API. + + Returns the task_id on success, or None on failure. + """ + import requests + + from openadapt_evals.infrastructure.qemu_reset import QEMUResetManager + + print_header(f"Task {task_num}/{total_tasks}: {task_id[:12]}...") + + task_dir = output_dir / task_id + task_dir.mkdir(parents=True, exist_ok=True) + + # Load task config from evaluate server + instruction = task_id # fallback + task_config: dict = {} + try: + task_resp = requests.get( + f"{evaluate_url}/task/{task_id}", timeout=10 + ) + if task_resp.ok: + task_config = task_resp.json() + instruction = task_config.get( + "instruction", task_config.get("task", task_id) + ) + except Exception as e: + print(f" Warning: could not load task config: {e}") + + def _setup_task_env() -> None: + """Run task setup config (download files, open apps, etc.).""" + setup_config = task_config.get("config", []) + related_apps = task_config.get("related_apps", []) + if related_apps: + setup_config = [ + {"type": "verify_apps", "parameters": {"apps": related_apps}} + ] + setup_config + if setup_config: + resp = requests.post( + f"{evaluate_url}/setup", + json={"config": setup_config}, + timeout=120, + ) + resp.raise_for_status() + results = resp.json().get("results", []) + for r in results: + status = r.get("status", "?") + stype = r.get("type", "?") + print(f" setup {stype}: {status}") + + def _soft_reset_task_env() -> bytes: + """Soft reset: close_all + re-run setup + wait for stable screen.""" + print(" Resetting environment (soft)...") + try: + resp = requests.post(f"{server}/setup/close_all", timeout=30) + print(f" close_all: {resp.status_code}") + time.sleep(2) + _setup_task_env() + except Exception as e: + print(f" WARNING: environment setup failed: {e}") + print(f" The task app may not be open. Check VNC.") + print(" Waiting for screen to stabilize...") + return _wait_for_stable_screen(server) + + def _hard_reset_task_env() -> bytes: + """Hard reset: QEMU system_reset + wait for boot + clear recovery + setup.""" + print(" Restarting Windows (QEMU hard reset)...") + mgr = QEMUResetManager(vm_ip=vm_ip, timeout_seconds=300) + success, msg = mgr.restart_windows(server_url=server) + if not success: + print(f" WARNING: QEMU reset failed: {msg}") + print(" Falling back to soft reset...") + return _soft_reset_task_env() + print(f" {msg}") + _clear_recovery_data(server) + print(" Running task setup...") + try: + _setup_task_env() + except Exception as e: + print(f" WARNING: environment setup failed: {e}") + print(f" The task app may not be open. Check VNC.") + print(" Waiting for screen to stabilize...") + return _wait_for_stable_screen(server) + + before_png = _soft_reset_task_env() + + print(f"\n VNC: {vnc_url}") + print(f" Task: {instruction}\n") + + # Generate AI step-by-step guidance from screenshot + print(" Generating suggested steps...") + suggested = _generate_steps(before_png, instruction, task_config) + _display_steps(suggested) + suggested = _interactive_step_review( + before_png, instruction, task_config, suggested, + ) + + # Parse accepted steps into a structured list + completed_steps: list[str] = [] + remaining_steps = _parse_step_list(suggested) + step_plans = [{ + "at_step_idx": 0, + "trigger": "initial", + "steps": list(remaining_steps), + }] + refined_indices: set[int] = set() + steps_meta: list[dict] = [] + step_idx = 0 + + print() + print(" Perform each action in VNC. You can provide feedback") + print(" at any point to correct the remaining steps.\n") + + while remaining_steps: + # Save before screenshot + (task_dir / f"step_{step_idx:02d}_before.png").write_bytes( + before_png + ) + + # Display current step + total = len(completed_steps) + len(remaining_steps) + step_num = len(completed_steps) + 1 + _display_current_step(step_num, total, remaining_steps[0]) + + user_input = input( + " [Enter]=done [d]=task done [r]=redo [R]=restart\n" + " Or type feedback to correct remaining steps: " + ).strip() + + if user_input == "": + # ADVANCE: action done, move to next step + after_png = _take_screenshot(server) + (task_dir / f"step_{step_idx:02d}_after.png").write_bytes( + after_png + ) + done_step = remaining_steps.pop(0) + completed_steps.append(done_step) + steps_meta.append({ + "action_hint": None, + "suggested_step": done_step, + "step_was_refined": step_idx in refined_indices, + }) + before_png = after_png + step_idx += 1 + print(f" Step {step_num} recorded.") + + if not remaining_steps: + print(f"\n All {total} steps completed. Finishing recording.") + + elif user_input.lower() == "d": + # DONE: task finished (possibly before all steps) + after_png = _take_screenshot(server) + (task_dir / f"step_{step_idx:02d}_after.png").write_bytes( + after_png + ) + steps_meta.append({ + "action_hint": "d", + "suggested_step": remaining_steps[0], + "step_was_refined": step_idx in refined_indices, + }) + step_idx += 1 + total = len(completed_steps) + len(remaining_steps) + print(f"\n Task marked done at step {step_num} of {total}. Finishing recording.") + break + + elif user_input.lower() == "r": + # REDO: go back one step + if not completed_steps: + print(" Nothing to redo (already at step 1).") + continue + step_idx -= 1 + remaining_steps.insert(0, completed_steps.pop()) + steps_meta.pop() + before_png = _take_screenshot(server) + print(f" Redoing step {len(completed_steps) + 1}...") + + elif user_input == "R": + # RESTART: QEMU hard reset + re-generate everything + print(" Restarting task from scratch (QEMU hard reset)...") + for f in task_dir.glob("step_*.png"): + f.unlink() + before_png = _hard_reset_task_env() + print(f"\n VNC: {vnc_url}") + print(f" Task: {instruction}\n") + + print(" Generating suggested steps...") + new_suggested = _generate_steps(before_png, instruction, task_config) + _display_steps(new_suggested) + new_suggested = _interactive_step_review( + before_png, instruction, task_config, new_suggested, + ) + + completed_steps = [] + remaining_steps = _parse_step_list(new_suggested) + step_plans.append({ + "at_step_idx": 0, + "trigger": "restart", + "steps": list(remaining_steps), + }) + refined_indices = set() + steps_meta = [] + step_idx = 0 + print() + print(" Task restarted. Continue recording.\n") + + else: + # FEEDBACK: mid-recording step correction + print(" Taking fresh screenshot and refining remaining steps...") + remaining_steps = _interactive_remaining_review( + server, instruction, task_config, + completed_steps, remaining_steps, user_input, + ) + step_plans.append({ + "at_step_idx": step_idx, + "trigger": f"feedback: {user_input}", + "steps": list(remaining_steps), + }) + for i in range(step_idx, step_idx + len(remaining_steps)): + refined_indices.add(i) + # No action taken — loop re-displays the (possibly new) current step + + # Save metadata + meta = { + "task_id": task_id, + "instruction": instruction, + "num_steps": len(steps_meta), + "steps": steps_meta, + "step_plans": step_plans, + "server_url": server, + "recorded_at": datetime.now(timezone.utc).isoformat(), + } + (task_dir / "meta.json").write_text( + json.dumps(meta, indent=2), encoding="utf-8" + ) + + print(f"\n Saved {len(steps_meta)} step(s) to {task_dir}") + return task_id + + +# --------------------------------------------------------------------------- +# Auto-infrastructure helpers +# --------------------------------------------------------------------------- + +_AUTO_VM_NAME = "waa-pool-00" +_AUTO_RESOURCE_GROUP = "openadapt-agents" +_AUTO_SSH_USER = "azureuser" + +# Track whether this script started the VM (so we can offer to deallocate on exit) +_vm_started_by_script = False +_cleanup_registered = False +_cleanup_done = False + +# Port mappings: local_port -> remote_port (on VM host) +_TUNNEL_PORTS = { + 5001: 5000, # WAA server + 5050: 5051, # evaluate server (via socat) + 8006: 8006, # VNC (noVNC) +} + + +def _is_local_port_open(port: int) -> bool: + """Check whether a local TCP port is accepting connections.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(2) + return sock.connect_ex(("localhost", port)) == 0 + + +def _get_vm_power_state() -> str | None: + """Return the Azure VM power state, e.g. 'running', 'deallocated', or None on error.""" + try: + result = subprocess.run( + [ + "az", "vm", "get-instance-view", + "-g", _AUTO_RESOURCE_GROUP, + "-n", _AUTO_VM_NAME, + "--query", "instanceView.statuses[?starts_with(code,'PowerState/')].displayStatus | [0]", + "-o", "tsv", + ], + capture_output=True, text=True, timeout=30, + ) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip().replace("VM ", "").lower() + except Exception: + pass + return None + + +def _get_vm_public_ip() -> str | None: + """Get the public IP of the VM.""" + try: + result = subprocess.run( + [ + "az", "vm", "show", + "-g", _AUTO_RESOURCE_GROUP, + "-n", _AUTO_VM_NAME, + "--show-details", + "--query", "publicIps", + "-o", "tsv", + ], + capture_output=True, text=True, timeout=30, + ) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except Exception: + pass + return None + + +def _auto_start_vm() -> bool: + """Start the Azure VM. Returns True on success.""" + global _vm_started_by_script + print(f" Starting VM '{_AUTO_VM_NAME}'...") + result = subprocess.run( + ["az", "vm", "start", "-g", _AUTO_RESOURCE_GROUP, "-n", _AUTO_VM_NAME], + capture_output=True, text=True, timeout=300, + ) + if result.returncode != 0: + print(f" ERROR: az vm start failed: {result.stderr.strip()}") + return False + print(f" VM '{_AUTO_VM_NAME}' started.") + _vm_started_by_script = True + _register_vm_cleanup() + return True + + +def _auto_establish_tunnels(vm_ip: str) -> bool: + """Establish SSH tunnels to the VM. Prefers autossh if available.""" + import shutil + + use_autossh = shutil.which("autossh") is not None + tool = "autossh" if use_autossh else "ssh" + print(f" Establishing SSH tunnels to {vm_ip} (using {tool})...") + + tunnel_args = [] + for local_port, remote_port in _TUNNEL_PORTS.items(): + tunnel_args.extend(["-L", f"{local_port}:localhost:{remote_port}"]) + + ssh_opts = [ + "-o", "ConnectTimeout=10", + "-o", "ServerAliveInterval=30", + "-o", "ServerAliveCountMax=3", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "ExitOnForwardFailure=yes", + "-N", + ] + + if use_autossh: + cmd = ["autossh", "-M", "0", "-f"] + ssh_opts + tunnel_args + [ + f"{_AUTO_SSH_USER}@{vm_ip}" + ] + else: + cmd = ["ssh", "-f"] + ssh_opts + tunnel_args + [ + f"{_AUTO_SSH_USER}@{vm_ip}" + ] + print(" (install autossh for automatic tunnel reconnection)") + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + if result.returncode != 0: + print(f" ERROR: {tool} tunnel failed: {result.stderr.strip()}") + return False + + time.sleep(2) + for local_port in _TUNNEL_PORTS: + status = "OK" if _is_local_port_open(local_port) else "NOT YET" + print(f" Tunnel localhost:{local_port} -> VM:{_TUNNEL_PORTS[local_port]}: {status}") + + return True + + +def _auto_start_container(vm_ip: str) -> bool: + """Start the winarena Docker container on the VM.""" + print(f" Starting Docker container 'winarena' on {vm_ip}...") + result = subprocess.run( + ["ssh", + "-o", "ConnectTimeout=10", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + f"{_AUTO_SSH_USER}@{vm_ip}", + "docker start winarena"], + capture_output=True, text=True, timeout=60, + ) + if result.returncode != 0: + print(f" ERROR: docker start failed: {result.stderr.strip()}") + return False + print(" Container started.") + return True + + +def _auto_start_socat(vm_ip: str) -> bool: + """Start socat proxy on the VM for port 5050 forwarding.""" + print(f" Starting socat proxy on {vm_ip} (VM:5051 -> container:5050)...") + socat_cmd = ( + 'nohup socat TCP-LISTEN:5051,fork,reuseaddr ' + 'EXEC:"docker exec -i winarena socat - TCP\\:localhost\\:5050" ' + '&>/dev/null &' + ) + result = subprocess.run( + ["ssh", + "-o", "ConnectTimeout=10", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + f"{_AUTO_SSH_USER}@{vm_ip}", + socat_cmd], + capture_output=True, text=True, timeout=30, + ) + if result.returncode != 0: + print(f" WARNING: socat setup returned non-zero: {result.stderr.strip()}") + else: + print(" Socat proxy started.") + return True + + +def _wait_for_waa_ready(server: str, timeout: int = 600, interval: int = 5) -> bool: + """Poll WAA /probe endpoint until it returns HTTP 200.""" + import requests + + print(f" Waiting for WAA to become ready at {server}/probe (timeout {timeout}s)...") + start = time.monotonic() + attempts = 0 + while time.monotonic() - start < timeout: + attempts += 1 + elapsed = int(time.monotonic() - start) + try: + resp = requests.get(f"{server}/probe", timeout=5) + if resp.status_code == 200: + print(f" WAA is ready (after {elapsed}s, {attempts} attempts).") + return True + print(f" [{elapsed}s] probe returned {resp.status_code}, retrying...") + except requests.ConnectionError: + print(f" [{elapsed}s] connection refused, retrying...") + except Exception as e: + print(f" [{elapsed}s] {e}, retrying...") + time.sleep(interval) + + print(f" TIMEOUT: WAA did not become ready within {timeout}s.") + return False + + +def _attempt_auto_recovery( + server: str, + auto_vm: bool, + auto_tunnel: bool, + auto_container: bool, + wait_ready: bool, +) -> bool: + """Attempt to automatically bring up WAA infrastructure. + + Returns True if WAA is reachable after recovery. + """ + print() + print(" Auto-recovery: diagnosing infrastructure state...") + + # Step 1: Check/start VM + vm_ip = _get_vm_public_ip() + power_state = _get_vm_power_state() + print(f" VM power state: {power_state or 'unknown'}") + + if power_state != "running": + if auto_vm: + if not _auto_start_vm(): + return False + time.sleep(5) + vm_ip = _get_vm_public_ip() + if not vm_ip: + print(" FAILED: VM started but could not get public IP.") + return False + print(f" VM IP: {vm_ip}") + else: + print(" VM is not running. Use --auto-vm to start it automatically.") + return False + else: + if not vm_ip: + vm_ip = _get_vm_public_ip() + print(f" VM IP: {vm_ip}") + + if not vm_ip: + print(" FAILED: Could not determine VM IP.") + return False + + # Step 2: Check/start Docker container + if auto_container: + _auto_start_container(vm_ip) + _auto_start_socat(vm_ip) + + # Step 3: Check/establish SSH tunnels + if auto_tunnel and not _is_local_port_open(5001): + if not _auto_establish_tunnels(vm_ip): + print(" FAILED: Could not establish SSH tunnels.") + return False + + # Step 4: Wait for WAA to boot + if wait_ready: + return _wait_for_waa_ready(server) + + return _is_local_port_open(5001) + + +def _deallocate_vm() -> bool: + """Deallocate the Azure VM (async). Returns True on success.""" + print(f"\n Deallocating VM '{_AUTO_VM_NAME}'...") + try: + result = subprocess.run( + [ + "az", "vm", "deallocate", + "-g", _AUTO_RESOURCE_GROUP, + "-n", _AUTO_VM_NAME, + "--no-wait", + ], + capture_output=True, text=True, timeout=30, + ) + if result.returncode == 0: + print(f" VM '{_AUTO_VM_NAME}' deallocate initiated (billing will stop shortly).") + return True + else: + print(f" WARNING: deallocate failed: {result.stderr.strip()}") + return False + except Exception as e: + print(f" WARNING: deallocate failed: {e}") + return False + + +def _cleanup_on_exit(signal_received: bool = False) -> None: + """Offer to deallocate the VM if this script started it.""" + global _cleanup_done + if _cleanup_done or not _vm_started_by_script: + return + _cleanup_done = True + + if signal_received: + print("\n\n Script interrupted. Deallocating VM to stop billing...") + _deallocate_vm() + else: + try: + answer = input( + f"\n This script started VM '{_AUTO_VM_NAME}'. " + "Deallocate to stop billing? [Y/n] " + ).strip().lower() + if answer in ("", "y", "yes"): + _deallocate_vm() + else: + print(f" VM '{_AUTO_VM_NAME}' left running. " + f"Deallocate manually: az vm deallocate -g {_AUTO_RESOURCE_GROUP} " + f"-n {_AUTO_VM_NAME} --no-wait") + except (EOFError, KeyboardInterrupt): + print() + _deallocate_vm() + + +def _register_vm_cleanup() -> None: + """Register atexit and signal handlers for VM cleanup. Idempotent.""" + global _cleanup_registered + if _cleanup_registered: + return + _cleanup_registered = True + + import atexit + import signal + + atexit.register(_cleanup_on_exit, signal_received=False) + + _original_sigint = signal.getsignal(signal.SIGINT) + _original_sigterm = signal.getsignal(signal.SIGTERM) + + def _signal_handler(signum, frame): + _cleanup_on_exit(signal_received=True) + if signum == signal.SIGINT and callable(_original_sigint): + _original_sigint(signum, frame) + elif signum == signal.SIGTERM and callable(_original_sigterm): + _original_sigterm(signum, frame) + else: + sys.exit(1) + + signal.signal(signal.SIGINT, _signal_handler) + signal.signal(signal.SIGTERM, _signal_handler) + + def cmd_record_waa( tasks: str = ",".join(HARDER_TASK_IDS), server: str = "http://localhost:5001", evaluate_url: str = "http://localhost:5050", output: str = "waa_recordings", vnc_url: str = "http://localhost:8006", - vm_ip: str = "172.173.66.131", + vm_ip: str | None = None, verify: bool = True, + auto: bool = False, + auto_vm: bool = False, + auto_tunnel: bool = False, + auto_container: bool = False, + wait_ready: bool = True, ) -> None: """Record demos interactively via WAA API while user performs actions on VNC. @@ -564,10 +1472,14 @@ def cmd_record_waa( output: Output directory for recordings. vnc_url: VNC URL for the user to open in a browser. vm_ip: Azure VM IP for QEMU reset on task restart. + Auto-detected from pool registry or Azure if omitted. verify: Pre-flight check that all required apps are installed (default True). + auto: Automatically start all infrastructure (VM, tunnels, container). + auto_vm: Start Azure VM if it is deallocated (incurs charges). + auto_tunnel: Establish SSH tunnels if not connected. + auto_container: Start Docker container and socat proxy if not running. + wait_ready: Wait for WAA server to boot after recovery (default True). """ - import requests - # Guard: Fire may pass True if --tasks is used without a value if not isinstance(tasks, str): print(f"ERROR: --tasks must be a string of comma-separated task IDs.") @@ -575,6 +1487,28 @@ def cmd_record_waa( print(f" Hint: use --tasks=\"id1,id2,...\" (with = and no space)") return + # --auto is a convenience flag that enables all sub-flags + if auto: + auto_vm = True + auto_tunnel = True + auto_container = True + + any_auto = auto_vm or auto_tunnel or auto_container + + from openadapt_evals.infrastructure.vm_ip import resolve_vm_ip + + # VM IP resolution may fail if VM is deallocated — that's OK if we have + # auto-recovery flags, since we'll start the VM first. + try: + vm_ip = resolve_vm_ip(vm_ip) + except RuntimeError: + if any_auto: + vm_ip = None # Will be resolved after VM start + else: + raise + + import requests + output_dir = Path(output) output_dir.mkdir(parents=True, exist_ok=True) @@ -599,13 +1533,63 @@ def cmd_record_waa( # Verify connection print(f"Connecting to WAA server at {server}...") + connected = False try: resp = requests.get(f"{server}/probe", timeout=5) resp.raise_for_status() print(f" Connected ({resp.status_code})") + connected = True except Exception as e: print(f" Failed to connect: {e}") - print(" Make sure the WAA server is running and SSH tunnels are up.") + + if any_auto: + # Confirm with user if VM start is involved (cost warning) + if auto_vm: + power_state = _get_vm_power_state() + if power_state != "running": + print() + print( + " WARNING: --auto will start Azure VM resources " + "which incur charges." + ) + answer = input(" Continue? [y/N] ").strip().lower() + if answer not in ("y", "yes"): + print(" Aborted.") + return + + recovered = _attempt_auto_recovery( + server=server, + auto_vm=auto_vm, + auto_tunnel=auto_tunnel, + auto_container=auto_container, + wait_ready=wait_ready, + ) + if recovered: + print() + print(" Auto-recovery succeeded. WAA is ready.") + connected = True + # Re-resolve VM IP now that infrastructure is up + if vm_ip is None: + try: + vm_ip = resolve_vm_ip(vm_ip) + except RuntimeError: + vm_ip = _get_vm_public_ip() + else: + print() + print(" Auto-recovery FAILED. Cannot proceed.") + return + else: + print() + print(" The WAA server is not reachable. To auto-deploy all infrastructure, re-run with --auto:") + print(f" python {__file__} record-waa --auto --tasks={tasks}") + print() + print(" Or use granular flags:") + print(" --auto-vm Start Azure VM if deallocated (incurs charges)") + print(" --auto-tunnel Establish SSH tunnels") + print(" --auto-container Start Docker container + socat proxy") + return + + if not connected: return # Pre-flight: verify all required apps are installed @@ -669,171 +1653,18 @@ def cmd_record_waa( recorded = [] for task_num, task_id in enumerate(task_ids, 1): - print_header(f"Task {task_num}/{len(task_ids)}: {task_id[:12]}...") - - task_dir = output_dir / task_id - task_dir.mkdir(parents=True, exist_ok=True) - - # Load task config from evaluate server - instruction = task_id # fallback - task_config = {} - try: - task_resp = requests.get( - f"{evaluate_url}/task/{task_id}", timeout=10 - ) - if task_resp.ok: - task_config = task_resp.json() - instruction = task_config.get( - "instruction", task_config.get("task", task_id) - ) - except Exception as e: - print(f" Warning: could not load task config: {e}") - - def _setup_task_env() -> None: - """Run task setup config (download files, open apps, etc.).""" - setup_config = task_config.get("config", []) - related_apps = task_config.get("related_apps", []) - if related_apps: - setup_config = [ - {"type": "verify_apps", "parameters": {"apps": related_apps}} - ] + setup_config - if setup_config: - resp = requests.post( - f"{evaluate_url}/setup", - json={"config": setup_config}, - timeout=120, - ) - resp.raise_for_status() - results = resp.json().get("results", []) - for r in results: - status = r.get("status", "?") - stype = r.get("type", "?") - print(f" setup {stype}: {status}") - time.sleep(3) - - def _soft_reset_task_env() -> bytes: - """Soft reset: close_all + re-run setup + screenshot.""" - print(" Resetting environment (soft)...") - try: - resp = requests.post(f"{server}/setup/close_all", timeout=30) - print(f" close_all: {resp.status_code}") - time.sleep(2) - _setup_task_env() - except Exception as e: - print(f" WARNING: environment setup failed: {e}") - print(f" The task app may not be open. Check VNC.") - print(" Taking initial screenshot...") - return _take_screenshot(server) - - def _hard_reset_task_env() -> bytes: - """Hard reset: QEMU system_reset + wait for boot + clear recovery + setup + screenshot.""" - print(" Restarting Windows (QEMU hard reset)...") - mgr = QEMUResetManager(vm_ip=vm_ip, timeout_seconds=300) - success, msg = mgr.restart_windows(server_url=server) - if not success: - print(f" WARNING: QEMU reset failed: {msg}") - print(" Falling back to soft reset...") - return _soft_reset_task_env() - print(f" {msg}") - _clear_recovery_data(server) - print(" Running task setup...") - try: - _setup_task_env() - except Exception as e: - print(f" WARNING: environment setup failed: {e}") - print(f" The task app may not be open. Check VNC.") - print(" Taking initial screenshot...") - return _take_screenshot(server) - - before_png = _soft_reset_task_env() - - print(f"\n VNC: {vnc_url}") - print(f" Task: {instruction}\n") - - # Generate AI step-by-step guidance from screenshot - print(" Generating suggested steps...") - suggested = _generate_steps(before_png, instruction, task_config) - print() - print(" ┌─ SUGGESTED STEPS ──────────────────────────────") - for line in suggested.splitlines(): - print(f" │ {line}") - print(" └────────────────────────────────────────────────") - print() - print(" Perform each action in VNC, then press Enter here.") - print(" Press 'd' when done, 'r' to redo last step,") - print(" 'R' to restart task from scratch.\n") - - steps = [] - step_idx = 0 - while True: - # Save before screenshot - (task_dir / f"step_{step_idx:02d}_before.png").write_bytes( - before_png - ) - - action_desc = input( - f" Step {step_idx + 1}: Press Enter after action " - "(or 'd'=done, 'r'=redo last, 'R'=restart task): " - ).strip() - - if action_desc == "R": - # Restart: QEMU hard reset + re-run setup, clear all steps - print(" Restarting task from scratch (QEMU hard reset)...") - # Clean recorded step files - for f in task_dir.glob("step_*.png"): - f.unlink() - before_png = _hard_reset_task_env() - steps = [] - step_idx = 0 - print(f"\n VNC: {vnc_url}") - print(f" Task: {instruction}\n") - print(" Task restarted. Continue recording.\n") - continue - - if action_desc.lower() == "d": - # Save final screenshot as the last after - after_png = _take_screenshot(server) - (task_dir / f"step_{step_idx:02d}_after.png").write_bytes( - after_png - ) - steps.append({"action_hint": action_desc or None}) - step_idx += 1 - break - - if action_desc.lower() == "r" and step_idx > 0: - # Redo: go back one step - step_idx -= 1 - steps.pop() - # Re-take the before screenshot from current state - before_png = _take_screenshot(server) - print(f" Redoing step {step_idx + 1}...") - continue - - # Take after screenshot - after_png = _take_screenshot(server) - (task_dir / f"step_{step_idx:02d}_after.png").write_bytes( - after_png - ) - - steps.append({"action_hint": action_desc or None}) - before_png = after_png # next step's before = this step's after - step_idx += 1 - - # Save metadata - meta = { - "task_id": task_id, - "instruction": instruction, - "num_steps": len(steps), - "steps": steps, - "server_url": server, - "recorded_at": datetime.now(timezone.utc).isoformat(), - } - (task_dir / "meta.json").write_text( - json.dumps(meta, indent=2), encoding="utf-8" + result = _record_single_task( + task_id=task_id, + task_num=task_num, + total_tasks=len(task_ids), + output_dir=output_dir, + server=server, + evaluate_url=evaluate_url, + vnc_url=vnc_url, + vm_ip=vm_ip, ) - - recorded.append(task_id) - print(f"\n Saved {len(steps)} step(s) to {task_dir}") + if result is not None: + recorded.append(result) # Summary print_header("Recording Summary") diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py index a38dc2a..e938fef 100644 --- a/scripts/run_dc_eval.py +++ b/scripts/run_dc_eval.py @@ -211,7 +211,7 @@ def main() -> int: parser.add_argument("--output", default="benchmark_results") parser.add_argument("--tasks", help="Comma-separated task IDs or prefixes (default: all 12)") parser.add_argument("--start-from", type=int, default=0, help="Task index to start from") - parser.add_argument("--vm-ip", default="172.173.66.131", help="VM IP") + parser.add_argument("--vm-ip", default=None, help="VM IP (auto-detected if omitted)") parser.add_argument("--vm-user", default="azureuser", help="VM SSH user") parser.add_argument("--zs-only", action="store_true", help="Run zero-shot only (no demo)") parser.add_argument("--dc-only", action="store_true", help="Run demo-conditioned only") @@ -251,12 +251,16 @@ def main() -> int: continue conditions.append((tid, f"val_dc_{sid}", demo_path)) + from openadapt_evals.infrastructure.vm_ip import resolve_vm_ip + + vm_ip = resolve_vm_ip(args.vm_ip) + print(f"Eval: {len(conditions)} runs ({len(task_ids)} tasks) with {args.agent}") - print(f"VM: {args.vm_ip}") + print(f"VM: {vm_ip}") print() # Verify initial WAA health - if not ensure_waa_ready(args.server, args.vm_user, args.vm_ip, evaluate_url=args.evaluate_url): + if not ensure_waa_ready(args.server, args.vm_user, vm_ip, evaluate_url=args.evaluate_url): print("ERROR: Cannot reach WAA server or evaluate server") return 1 @@ -269,7 +273,7 @@ def main() -> int: continue # Health check before each run - if not ensure_waa_ready(args.server, args.vm_user, args.vm_ip, evaluate_url=args.evaluate_url): + if not ensure_waa_ready(args.server, args.vm_user, vm_ip, evaluate_url=args.evaluate_url): print(f" Skipping {run_name} - WAA unreachable after recovery") results[run_name] = {"status": "SKIP", "returncode": -1, "elapsed_s": 0} continue diff --git a/tests/test_screen_stability.py b/tests/test_screen_stability.py new file mode 100644 index 0000000..18a4d75 --- /dev/null +++ b/tests/test_screen_stability.py @@ -0,0 +1,204 @@ +"""Tests for screen stability detection (compare_screenshots, wait_for_stable_screen).""" + +import io +from unittest.mock import patch + +from PIL import Image + +from openadapt_evals.infrastructure.screen_stability import ( + compare_screenshots as _compare_screenshots, + wait_for_stable_screen as _wait_for_stable_screen, +) + + +def _make_png(width: int = 100, height: int = 100, color: tuple = (0, 0, 0)) -> bytes: + """Create a solid-color PNG image as bytes.""" + img = Image.new("RGB", (width, height), color) + buf = io.BytesIO() + img.save(buf, format="PNG") + return buf.getvalue() + + +def _make_png_with_diff( + width: int = 100, + height: int = 100, + base_color: tuple = (0, 0, 0), + diff_pixels: int = 0, + diff_color: tuple = (255, 255, 255), +) -> bytes: + """Create a PNG with some pixels changed from the base color.""" + img = Image.new("RGB", (width, height), base_color) + pixels = img.load() + changed = 0 + for y in range(height): + for x in range(width): + if changed >= diff_pixels: + break + pixels[x, y] = diff_color + changed += 1 + if changed >= diff_pixels: + break + buf = io.BytesIO() + img.save(buf, format="PNG") + return buf.getvalue() + + +class TestCompareScreenshots: + """Tests for _compare_screenshots().""" + + def test_identical_images_return_1(self): + """Identical images have similarity 1.0.""" + png = _make_png(color=(128, 128, 128)) + assert _compare_screenshots(png, png) == 1.0 + + def test_completely_different_images_return_0(self): + """Completely different images have similarity 0.0.""" + a = _make_png(color=(0, 0, 0)) + b = _make_png(color=(255, 255, 255)) + assert _compare_screenshots(a, b) == 0.0 + + def test_different_sizes_return_0(self): + """Images of different sizes return 0.0.""" + a = _make_png(width=100, height=100) + b = _make_png(width=200, height=200) + assert _compare_screenshots(a, b) == 0.0 + + def test_small_diff_high_similarity(self): + """A few changed pixels still yield high similarity.""" + total = 100 * 100 # 10000 pixels + diff_pixels = 10 # 0.1% different + a = _make_png(color=(0, 0, 0)) + b = _make_png_with_diff(diff_pixels=diff_pixels, diff_color=(255, 255, 255)) + similarity = _compare_screenshots(a, b) + expected = (total - diff_pixels) / total # 0.999 + assert abs(similarity - expected) < 0.001 + + def test_half_different(self): + """50% different pixels gives ~0.5 similarity.""" + total = 100 * 100 + a = _make_png(color=(0, 0, 0)) + b = _make_png_with_diff(diff_pixels=total // 2, diff_color=(255, 255, 255)) + similarity = _compare_screenshots(a, b) + assert 0.49 < similarity < 0.51 + + def test_clock_area_diff_above_threshold(self): + """Simulated clock-area change (0.3% of pixels) stays above 99.5%.""" + total = 1000 * 1000 # 1M pixels + clock_pixels = 3000 # ~0.3% + a = _make_png(width=1000, height=1000, color=(50, 50, 50)) + b = _make_png_with_diff( + width=1000, height=1000, + base_color=(50, 50, 50), + diff_pixels=clock_pixels, + diff_color=(200, 200, 200), + ) + similarity = _compare_screenshots(a, b) + assert similarity >= 0.995, f"Clock-area diff should be above threshold: {similarity}" + + +class TestWaitForStableScreen: + """Tests for _wait_for_stable_screen().""" + + def test_immediate_stability(self): + """Returns quickly when screen is immediately stable.""" + stable_png = _make_png(color=(100, 100, 100)) + + with patch( + "openadapt_evals.infrastructure.screen_stability._take_screenshot", + return_value=stable_png, + ), patch("time.sleep"): + result = _wait_for_stable_screen( + "http://fake", + poll_interval=0.01, + stability_timeout=5, + required_stable_checks=2, + ) + + assert result == stable_png + + def test_stabilizes_after_changes(self): + """Returns stable screenshot after initial instability.""" + changing = _make_png(color=(255, 0, 0)) + changing2 = _make_png(color=(0, 255, 0)) + stable = _make_png(color=(0, 0, 255)) + + screenshots = [changing, changing2, stable, stable, stable] + call_count = [0] + + def mock_screenshot(server): + idx = min(call_count[0], len(screenshots) - 1) + call_count[0] += 1 + return screenshots[idx] + + with patch( + "openadapt_evals.infrastructure.screen_stability._take_screenshot", + side_effect=mock_screenshot, + ), patch("time.sleep"): + result = _wait_for_stable_screen( + "http://fake", + poll_interval=0.01, + stability_timeout=30, + required_stable_checks=2, + ) + + assert result == stable + + def test_timeout_returns_last_screenshot(self): + """Returns last screenshot on timeout with warning.""" + call_count = [0] + + def mock_screenshot(server): + """Return a different image each time to prevent stability.""" + call_count[0] += 1 + return _make_png(color=(call_count[0] % 256, 0, 0)) + + with patch( + "openadapt_evals.infrastructure.screen_stability._take_screenshot", + side_effect=mock_screenshot, + ), patch("time.sleep"), patch("time.time") as mock_time: + # Simulate timeout: first call returns 0, subsequent calls exceed deadline + times = iter([0.0, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 100.0]) + mock_time.side_effect = lambda: next(times) + + result = _wait_for_stable_screen( + "http://fake", + poll_interval=0.01, + stability_timeout=2.0, + required_stable_checks=2, + ) + + assert result is not None + assert len(result) > 0 + + def test_minor_diff_treated_as_stable(self): + """Minor pixel differences (below threshold) count as stable.""" + base = _make_png(width=100, height=100, color=(50, 50, 50)) + # 2 pixels different out of 10000 = 0.02% diff → 99.98% similar + slight_diff = _make_png_with_diff( + width=100, height=100, + base_color=(50, 50, 50), + diff_pixels=2, + diff_color=(51, 51, 51), + ) + + screenshots = [base, slight_diff, slight_diff, slight_diff] + call_count = [0] + + def mock_screenshot(server): + idx = min(call_count[0], len(screenshots) - 1) + call_count[0] += 1 + return screenshots[idx] + + with patch( + "openadapt_evals.infrastructure.screen_stability._take_screenshot", + side_effect=mock_screenshot, + ), patch("time.sleep"): + result = _wait_for_stable_screen( + "http://fake", + poll_interval=0.01, + stability_timeout=10, + similarity_threshold=0.995, + required_stable_checks=2, + ) + + assert result == slight_diff diff --git a/tests/test_vm_ip.py b/tests/test_vm_ip.py new file mode 100644 index 0000000..07a1460 --- /dev/null +++ b/tests/test_vm_ip.py @@ -0,0 +1,179 @@ +"""Tests for VM IP auto-detection (resolve_vm_ip).""" + +import json +from unittest.mock import MagicMock, patch + +import pytest + +from openadapt_evals.infrastructure.vm_ip import ( + _ip_from_azure_query, + _ip_from_pool_registry, + resolve_vm_ip, +) + + +class TestResolveVmIp: + """Tests for resolve_vm_ip() resolution order.""" + + def test_explicit_ip_returned_immediately(self): + """Explicit IP is returned without checking registry or Azure.""" + assert resolve_vm_ip(explicit_ip="1.2.3.4") == "1.2.3.4" + + def test_pool_registry_used_when_no_explicit_ip(self): + """Pool registry IP is used when no explicit IP is given.""" + with patch( + "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry", + return_value="10.0.0.5", + ): + assert resolve_vm_ip() == "10.0.0.5" + + def test_azure_fallback_when_no_registry(self): + """Azure query is used when pool registry doesn't exist.""" + with patch( + "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry", + return_value=None, + ), patch( + "openadapt_evals.infrastructure.vm_ip._ip_from_azure_query", + return_value="10.0.0.99", + ): + assert resolve_vm_ip() == "10.0.0.99" + + def test_error_when_no_vm_found(self): + """RuntimeError raised when no VM can be found by any method.""" + with patch( + "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry", + return_value=None, + ), patch( + "openadapt_evals.infrastructure.vm_ip._ip_from_azure_query", + return_value=None, + ): + with pytest.raises(RuntimeError, match="No running VM found"): + resolve_vm_ip() + + def test_explicit_ip_skips_all_lookups(self): + """When explicit IP is given, registry and Azure are never called.""" + with patch( + "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry", + ) as mock_reg, patch( + "openadapt_evals.infrastructure.vm_ip._ip_from_azure_query", + ) as mock_az: + result = resolve_vm_ip(explicit_ip="5.5.5.5") + assert result == "5.5.5.5" + mock_reg.assert_not_called() + mock_az.assert_not_called() + + def test_registry_checked_before_azure(self): + """Pool registry is checked before Azure query (fast path first).""" + call_order = [] + + def mock_registry(): + call_order.append("registry") + return "10.0.0.1" + + def mock_azure(): + call_order.append("azure") + return "10.0.0.2" + + with patch( + "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry", + side_effect=mock_registry, + ), patch( + "openadapt_evals.infrastructure.vm_ip._ip_from_azure_query", + side_effect=mock_azure, + ): + result = resolve_vm_ip() + assert result == "10.0.0.1" + assert call_order == ["registry"] # Azure never called + + +class TestIpFromPoolRegistry: + """Tests for _ip_from_pool_registry().""" + + def test_reads_first_active_worker(self, tmp_path, monkeypatch): + """Returns IP of first non-deleted/non-failed worker.""" + registry = tmp_path / "benchmark_results" / "vm_pool_registry.json" + registry.parent.mkdir(parents=True) + registry.write_text(json.dumps({ + "workers": [ + {"name": "waa-pool-00", "ip": "10.0.0.1", "status": "ready"}, + {"name": "waa-pool-01", "ip": "10.0.0.2", "status": "ready"}, + ] + })) + monkeypatch.chdir(tmp_path) + assert _ip_from_pool_registry() == "10.0.0.1" + + def test_skips_deleted_workers(self, tmp_path, monkeypatch): + """Skips workers with 'deleted' status.""" + registry = tmp_path / "benchmark_results" / "vm_pool_registry.json" + registry.parent.mkdir(parents=True) + registry.write_text(json.dumps({ + "workers": [ + {"name": "waa-pool-00", "ip": "10.0.0.1", "status": "deleted"}, + {"name": "waa-pool-01", "ip": "10.0.0.2", "status": "ready"}, + ] + })) + monkeypatch.chdir(tmp_path) + assert _ip_from_pool_registry() == "10.0.0.2" + + def test_skips_failed_workers(self, tmp_path, monkeypatch): + """Skips workers with 'failed' status.""" + registry = tmp_path / "benchmark_results" / "vm_pool_registry.json" + registry.parent.mkdir(parents=True) + registry.write_text(json.dumps({ + "workers": [ + {"name": "waa-pool-00", "ip": "10.0.0.1", "status": "failed"}, + ] + })) + monkeypatch.chdir(tmp_path) + assert _ip_from_pool_registry() is None + + def test_returns_none_when_no_file(self, tmp_path, monkeypatch): + """Returns None when registry file doesn't exist.""" + monkeypatch.chdir(tmp_path) + assert _ip_from_pool_registry() is None + + def test_returns_none_on_malformed_json(self, tmp_path, monkeypatch): + """Returns None when registry file contains invalid JSON.""" + registry = tmp_path / "benchmark_results" / "vm_pool_registry.json" + registry.parent.mkdir(parents=True) + registry.write_text("not json") + monkeypatch.chdir(tmp_path) + assert _ip_from_pool_registry() is None + + +class TestIpFromAzureQuery: + """Tests for _ip_from_azure_query().""" + + def test_returns_pool_vm_ip(self): + """Returns IP from waa-pool-00 query.""" + mock_mgr = MagicMock() + mock_mgr.get_vm_ip.return_value = "10.0.0.50" + + with patch( + "openadapt_evals.infrastructure.azure_vm.AzureVMManager", + return_value=mock_mgr, + ): + assert _ip_from_azure_query() == "10.0.0.50" + mock_mgr.get_vm_ip.assert_called_once_with("waa-pool-00") + + def test_falls_back_to_legacy_name(self): + """Falls back to waa-eval-vm when waa-pool-00 returns None.""" + mock_mgr = MagicMock() + mock_mgr.get_vm_ip.side_effect = [None, "10.0.0.99"] + + with patch( + "openadapt_evals.infrastructure.azure_vm.AzureVMManager", + return_value=mock_mgr, + ): + assert _ip_from_azure_query() == "10.0.0.99" + calls = mock_mgr.get_vm_ip.call_args_list + assert calls[0][0] == ("waa-pool-00",) + assert calls[1][0] == ("waa-eval-vm",) + + def test_returns_none_on_azure_error(self): + """Returns None when Azure SDK/CLI is unavailable.""" + with patch( + "openadapt_evals.infrastructure.azure_vm.AzureVMManager", + side_effect=ImportError("azure not installed"), + ): + assert _ip_from_azure_query() is None