diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index b00cb8a..791f438 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -13,5 +13,5 @@
 {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
 {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"}
 {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
-{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 28: 6 design docs created (code health, marketing, CLI DX, testing, infra, docs). Marketing materials drafted and polished. Prioritization documented in STATUS.md. Tier 1 blockers identified: version fix, PyAutoGUI fail-safe recovery, socat systemd service, auto-open viewer. Next: implement Tier 1 items to unblock reliable eval runs.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-28T11:25:44.494548-05:00"}
+{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-01: GPU grant applications reviewed and rewritten (11 files). Writing done, blocked on eval results (DC signal on harder tasks). Detailed status tracked in openadapt-internal (private repo).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-01T23:35:11.042286-05:00"}
 {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
diff --git a/README.md b/README.md
index ea1e2d5..c92dc4e 100644
--- a/README.md
+++ b/README.md
@@ -220,7 +220,7 @@ The Dockerfile also pre-downloads LibreOffice at build time with dynamic version
 
 When a task config includes `related_apps`, the live adapter automatically prepends a `verify_apps` step before the task's setup config. The `--verify` flag on `record_waa_demos.py` provides a pre-flight check across all tasks before starting a recording session.
 
-![LibreOffice Calc installed on Windows 11 VM](https://github.com/OpenAdaptAI/openadapt-evals/releases/download/untagged-42f27f1e47214aae8358/waa_libreoffice.png)
+![LibreOffice Calc running inside Windows 11 QEMU VM via noVNC in Chrome](screenshots/waa_libreoffice_desktop.png)
 
 ## CLI Reference
 
diff --git a/demo_prompts/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt b/demo_prompts/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt
new file mode 100644
index 0000000..bfd2f76
--- /dev/null
+++ b/demo_prompts/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt
@@ -0,0 +1,67 @@
+DEMONSTRATION:
+Task: In a new sheet with 4 headers "Year", "CA changes", "FA changes", and "OA changes", calculate the annual changes for the Current Assets, Fixed Assets, and Other Assets columns. Set the results as percentage type.
+
+Step 1:
+  Action: Right-click on the "Sheet1" tab at the bottom and select "Insert Sheet" or "New Sheet"
+
+Step 2:
+  Action: Click cell A1 and type "Year"
+
+Step 3:
+  Action: Press Tab and type "CA changes"
+
+Step 4:
+  Action: Press Tab and type "FA changes"
+
+Step 5:
+  Action: Press Tab and type "OA changes"
+
+Step 6:
+  Action: Click cell A2 and type "2015"
+
+Step 7:
+  Action: Press Enter and type "2016"
+
+Step 8:
+  Action: Press Enter and type "2017"
+
+Step 9:
+  Action: Press Enter and type "2018"
+
+Step 10:
+  Action: Press Enter and type "2019"
+
+Step 11:
+  Action: Click cell B2 and type "=(Sheet1.B3-Sheet1.B2)/Sheet1.B2"
+
+Step 12:
+  Action: Press Enter
+
+Step 13:
+  Action: Click cell B2, then drag the fill handle down to B6
+
+Step 14:
+  Action: Click cell C2 and type "=(Sheet1.C3-Sheet1.C2)/Sheet1.C2"
+
+Step 15:
+  Action: Press Enter
+
+Step 16:
+  Action: Click cell C2, then drag the fill handle down to C6
+
+Step 17:
+  Action: Click cell D2 and type "=(Sheet1.D3-Sheet1.D2)/Sheet1.D2"
+
+Step 18:
+  Action: Press Enter
+
+Step 19:
+  Action: Click cell D2, then drag the fill handle down to D6
+
+Step 20:
+  Action: Click and drag to select cells B2:D6
+
+Step 21:
+  Action: Click the % button in the toolbar (or press Ctrl+Shift+5)
+
+---
diff --git a/demo_prompts_vlm/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt b/demo_prompts_vlm/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt
new file mode 100644
index 0000000..0dc3c7c
--- /dev/null
+++ b/demo_prompts_vlm/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS.txt
@@ -0,0 +1,130 @@
+DEMONSTRATION:
+Task: In a new sheet with 4 headers "Year", "CA changes", "FA changes", and "OA changes", calculate the annual changes for the Current Assets, Fixed Assets, and Other Assets columns. Set the results as percentage type.
+
+Step 1:
+  Observation: The spreadsheet is open to "Sheet1," which contains financial data with columns for years, assets, and liabilities.
+  Intent: To create a new sheet for calculating and displaying annual changes in asset categories.
+  Action: Right-click on the "Sheet1" tab at the bottom and select "Insert Sheet" or "New Sheet".
+  Result: A new, blank sheet named "Sheet2" is added to the workbook and displayed.
+
+Step 2:
+  Observation: The new sheet is blank, with cell A1 selected and ready for input.
+  Intent: To create a header row for organizing annual asset changes.
+  Action: Click cell A1 and type "Year"
+  Result: The text "Year" now appears in cell A1.
+
+Step 3:
+  Observation: The new sheet contains only the header "Year" in cell A1, with the cursor positioned in cell B1.
+  Intent: To add the next header, "CA changes," as part of setting up the required columns for annual asset changes.
+  Action: Press Tab and type "CA changes"
+  Result: The header "CA changes" is entered in cell B1, and the sheet now displays two headers: "Year" and "CA changes."
+
+Step 4:
+  Observation: The new sheet contains two headers, "Year" in cell A1 and "CA changes" in cell B1, with the cursor positioned in cell C1.
+  Intent: To continue setting up the required headers for calculating annual changes in asset categories.
+  Action: Press Tab and type "FA changes".
+  Result: The header "FA changes" is added to cell C1.
+
+Step 5:
+  Observation: The new sheet contains three headers: "Year", "CA changes", and "FA changes" in the first row.
+  Intent: To complete the set of required headers by adding "OA changes" as the fourth column.
+  Action: Press Tab and type "OA changes".
+  Result: The "OA changes" header is added in cell D1, completing the four required headers in the first row.
+
+Step 6:
+  Observation: The new sheet contains four headers: "Year", "CA changes", "FA changes", and "OA changes", with all cells below the headers empty.
+  Intent: To begin entering annual data by specifying the first year in the "Year" column.
+  Action: Click cell A2 and type "2015".
+  Result: The value "2015" appears in cell A2 under the "Year" header.
+
+Step 7:
+  Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes" in row 1, with cell A2 currently empty.
+  Intent: To begin entering the years for which asset changes will be calculated.
+  Action: Press Enter and type "2016" in cell A2.
+  Result: The value "2016" appears in cell A2, starting the list of years under the "Year" header.
+
+Step 8:
+  Observation: The new sheet contains the headers "Year", "CA changes", "FA changes", and "OA changes" in row 1, and "2016" is entered in cell A2.
+  Intent: To begin listing the years for which annual changes will be calculated.
+  Action: Press Enter and type "2017".
+  Result: "2017" is entered into cell A3, directly below "2016" in the "Year" column.
+
+Step 9:
+  Observation: The new sheet contains the headers "Year", "CA changes", "FA changes", and "OA changes", with the years 2016 and 2017 already listed in the "Year" column.
+  Intent: To continue populating the "Year" column with the next chronological year for annual change calculations.
+  Action: Press Enter and type "2018".
+  Result: The value "2018" appears in the next row of the "Year" column.
+
+Step 10:
+  Observation: The new sheet contains four headers ("Year", "CA changes", "FA changes", "OA changes") and a column of years from 2015 to 2018 under "Year".
+  Intent: To add the next year in the sequence to the "Year" column in preparation for calculating annual changes.
+  Action: Press Enter and type "2019".
+  Result: The value "2019" appears in the next row under the "Year" column.
+
+Step 11:
+  Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes", with years listed in column A, but no formulas or data in the other columns.
+  Intent: To calculate the annual percentage change for Current Assets using data from Sheet1.
+  Action: Click cell B2 and type "=(Sheet1.B3-Sheet1.B2)/Sheet1.B2".
+  Result: Cell B2 now contains a formula to compute the percentage change in Current Assets between two years from Sheet1.
+
+Step 12:
+  Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes", and the first row under "Year" is filled with "2016", while the "CA changes" cell (B2) contains a formula referencing another sheet.
+  Intent: To confirm and apply the formula entered in cell B2 for calculating annual changes in Current Assets.
+  Action: Press Enter
+  Result: The formula in cell B2 is executed, displaying the calculated percentage change for "CA changes" in 2016.
+
+Step 13:
+  Observation: The sheet displays headers "Year", "CA changes", "FA changes", and "OA changes", with cell B2 containing a formula and the rest of column B empty.
+  Intent: To copy the formula in B2 down the column so that annual changes for "CA changes" are calculated for each year.
+  Action: Click cell B2, then drag the fill handle down to B6.
+  Result: The formula from B2 is filled down to B6, calculating "CA changes" for all listed years.
+
+Step 14:
+  Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes", and cell C2 is empty.
+  Intent: To calculate the annual change for Fixed Assets as a percentage using data from Sheet1.
+  Action: Click cell C2 and type "=(Sheet1.C3-Sheet1.C2)/Sheet1.C2"
+  Result: Cell C2 now contains the formula to compute the percentage change in Fixed Assets between two years from Sheet1.
+
+Step 15:
+  Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes" in row 1, and the first row of data is being entered in row 2.
+  Intent: To confirm or complete the entry of a formula or value in the spreadsheet.
+  Action: Press Enter.
+  Result: The cell selection moves from the current cell (D2) to the next cell below (D3), confirming the entry in D2.
+
+Step 16:
+  Observation: The sheet displays headers "Year", "CA changes", "FA changes", and "OA changes" with a formula present in cell C2, but the rest of column C is empty.
+  Intent: To apply the formula in C2 to the rest of the rows in the "CA changes" column for consistent annual change calculations.
+  Action: Click cell C2, then drag the fill handle down to C6.
+  Result: The formula from C2 is copied down to C6, populating the "CA changes" column with calculated values for each year.
+
+Step 17:
+  Observation: The new sheet contains headers "Year", "CA changes", "FA changes", and "OA changes" in row 1, with all other cells empty.
+  Intent: To calculate the annual percentage change for the "OA changes" column using data from Sheet1.
+  Action: Click cell D2 and type "=(Sheet1.D3-Sheet1.D2)/Sheet1.D2"
+  Result: Cell D2 is populated with the formula to compute the percentage change for "OA changes" between two years.
+
+Step 18:
+  Observation: The formula `=(C3-C2)/C2` is being entered in cell D3 to calculate the percentage change for "OA changes".
+  Intent: To compute the annual percentage change for Other Assets in the new sheet.
+  Action: Press Enter
+  Result: The formula is executed in cell D3, displaying the calculated percentage change for Other Assets.
+
+Step 19:
+  Observation: The sheet displays headers "Year", "CA changes", "FA changes", and "OA changes" with a formula present only in cell D2.
+  Intent: To copy the formula in D2 down the "OA changes" column for all relevant rows.
+  Action: Click cell D2, then drag the fill handle down to D6.
+  Result: The formula from D2 is filled down through D6, calculating values for each row in the "OA changes" column.
+
+Step 20:
+  Observation: The new sheet contains four headers ("Year", "CA changes", "FA changes", "OA changes") and calculated annual changes for each asset type in columns B, C, and D.
+  Intent: To select the range of calculated annual changes and their corresponding years for further formatting or analysis.
+  Action: Click and drag to select cells B2:D6.
+  Result: Cells B2:D6 are highlighted, indicating they are selected for the next operation.
+
+Step 21:
+  Observation: The annual changes for "CA changes", "FA changes", and "OA changes" are displayed as decimal values in columns B, C, and D.
+  Intent: To format the annual change values as percentages for better readability.
+  Action: Click the % button in the toolbar (or press Ctrl+Shift+5).
+  Result: The values in columns B, C, and D are now displayed as percentages.
+
+---
diff --git a/openadapt_evals/infrastructure/__init__.py b/openadapt_evals/infrastructure/__init__.py
index 7b6d725..a557e31 100644
--- a/openadapt_evals/infrastructure/__init__.py
+++ b/openadapt_evals/infrastructure/__init__.py
@@ -22,8 +22,12 @@
 
     # Restart Windows inside QEMU
     from openadapt_evals.infrastructure import QEMUResetManager
-    mgr = QEMUResetManager(vm_ip="172.173.66.131")
+    mgr = QEMUResetManager(vm_ip="10.0.0.1")
     success, msg = mgr.restart_windows()
+
+    # Auto-detect VM IP
+    from openadapt_evals.infrastructure import resolve_vm_ip
+    ip = resolve_vm_ip()  # pool registry → Azure CLI
     ```
 """
 
@@ -31,7 +35,12 @@
 from openadapt_evals.infrastructure.azure_vm import AzureVMManager
 from openadapt_evals.infrastructure.pool import PoolManager, PoolRunResult
 from openadapt_evals.infrastructure.qemu_reset import QEMUResetManager
+from openadapt_evals.infrastructure.screen_stability import (
+    compare_screenshots,
+    wait_for_stable_screen,
+)
 from openadapt_evals.infrastructure.ssh_tunnel import SSHTunnelManager, get_tunnel_manager
+from openadapt_evals.infrastructure.vm_ip import resolve_vm_ip
 from openadapt_evals.infrastructure.vm_monitor import VMMonitor, VMConfig
 
 __all__ = [
@@ -43,5 +52,8 @@
     "VMMonitor",
     "VMConfig",
     "SSHTunnelManager",
+    "compare_screenshots",
     "get_tunnel_manager",
+    "resolve_vm_ip",
+    "wait_for_stable_screen",
 ]
diff --git a/openadapt_evals/infrastructure/qemu_reset.py b/openadapt_evals/infrastructure/qemu_reset.py
index 5ee9f1d..cdf0029 100644
--- a/openadapt_evals/infrastructure/qemu_reset.py
+++ b/openadapt_evals/infrastructure/qemu_reset.py
@@ -24,7 +24,7 @@
 
     from openadapt_evals.infrastructure.qemu_reset import QEMUResetManager
 
-    mgr = QEMUResetManager(vm_ip="172.173.66.131")
+    mgr = QEMUResetManager(vm_ip="10.0.0.1")
 
     # Full restart: send reset + wait for WAA server
     success, message = mgr.restart_windows()
diff --git a/openadapt_evals/infrastructure/screen_stability.py b/openadapt_evals/infrastructure/screen_stability.py
new file mode 100644
index 0000000..088f35e
--- /dev/null
+++ b/openadapt_evals/infrastructure/screen_stability.py
@@ -0,0 +1,128 @@
+"""Screen stability detection for WAA VM screenshots.
+
+Provides utilities for comparing screenshots and waiting until the VM
+screen stabilises before proceeding.  Extracted from
+``scripts/record_waa_demos.py`` so that tests and other callers can
+import the functions directly without ``importlib`` hacks.
+
+Example:
+    ```python
+    from openadapt_evals.infrastructure.screen_stability import (
+        compare_screenshots,
+        wait_for_stable_screen,
+    )
+
+    similarity = compare_screenshots(png_a, png_b)
+    stable_png = wait_for_stable_screen("http://localhost:5001")
+    ```
+"""
+
+from __future__ import annotations
+
+import io
+import time
+from typing import Callable
+
+
+def _take_screenshot(server: str) -> bytes:
+    """Take a screenshot from the WAA server, raising on failure."""
+    import requests
+
+    resp = requests.get(f"{server}/screenshot", timeout=30)
+    resp.raise_for_status()
+    return resp.content
+
+
+def compare_screenshots(png_a: bytes, png_b: bytes) -> float:
+    """Compare two PNG screenshots and return pixel similarity (0.0-1.0).
+
+    Uses raw RGB pixel comparison.  Returns 1.0 for identical images.
+    The 99.5% threshold used by ``wait_for_stable_screen`` tolerates
+    minor differences like the taskbar clock updating (~0.14% of pixels)
+    or cursor blink.
+    """
+    from PIL import Image
+
+    img_a = Image.open(io.BytesIO(png_a)).convert("RGB")
+    img_b = Image.open(io.BytesIO(png_b)).convert("RGB")
+
+    if img_a.size != img_b.size:
+        return 0.0
+
+    bytes_a = img_a.tobytes()
+    bytes_b = img_b.tobytes()
+
+    if bytes_a == bytes_b:
+        return 1.0  # fast path: identical raw bytes
+
+    # Vectorized comparison via numpy (already a transitive dep via open-clip-torch)
+    import numpy as np
+
+    arr_a = np.frombuffer(bytes_a, dtype=np.uint8).reshape(-1, 3)
+    arr_b = np.frombuffer(bytes_b, dtype=np.uint8).reshape(-1, 3)
+    matching = int(np.all(arr_a == arr_b, axis=1).sum())
+    return matching / arr_a.shape[0]
+
+
+def wait_for_stable_screen(
+    server: str,
+    poll_interval: float = 2.0,
+    stability_timeout: float = 30.0,
+    similarity_threshold: float = 0.995,
+    required_stable_checks: int = 3,
+    screenshot_fn: Callable[[str], bytes] | None = None,
+) -> bytes:
+    """Wait for the VM screen to stabilize, then return the screenshot.
+
+    Polls the QEMU framebuffer (free -- local HTTP call) until
+    ``required_stable_checks`` consecutive screenshot pairs exceed
+    ``similarity_threshold``.  With the defaults (3 checks at 2s
+    intervals), the screen must be stable for 6 seconds before
+    proceeding.
+
+    Args:
+        server: WAA server URL (``/screenshot`` endpoint).
+        poll_interval: Seconds between screenshots.
+        stability_timeout: Maximum seconds to wait.  If exceeded, the
+            last screenshot is returned with a warning.
+        similarity_threshold: Pixel-match fraction (0.0-1.0).  0.995
+            tolerates taskbar clock and cursor blink.
+        required_stable_checks: Consecutive stable pairs required.
+            With poll_interval=2 and required_stable_checks=3, the
+            screen must be unchanged for 6 seconds.
+        screenshot_fn: Optional callable ``(server) -> bytes`` used to
+            capture a screenshot.  Defaults to an internal helper that
+            calls ``GET {server}/screenshot``.
+
+    Returns:
+        PNG screenshot bytes of the stable screen.
+    """
+    take = screenshot_fn or _take_screenshot
+
+    prev_png = take(server)
+    stable_count = 0
+    deadline = time.time() + stability_timeout
+
+    while time.time() < deadline:
+        time.sleep(poll_interval)
+        curr_png = take(server)
+
+        similarity = compare_screenshots(prev_png, curr_png)
+
+        if similarity >= similarity_threshold:
+            stable_count += 1
+            if stable_count >= required_stable_checks:
+                elapsed = stability_timeout - (deadline - time.time())
+                print(f"    Screen stable after {elapsed:.0f}s "
+                      f"({stable_count} checks, {similarity:.4f} similarity)")
+                return curr_png
+        else:
+            if stable_count > 0:
+                print(f"    Screen changed ({similarity:.3f}), resetting stability count")
+            stable_count = 0
+
+        prev_png = curr_png
+
+    print(f"    WARNING: Screen did not stabilize within {stability_timeout:.0f}s. "
+          "Using last screenshot.")
+    return prev_png
diff --git a/openadapt_evals/infrastructure/vm_ip.py b/openadapt_evals/infrastructure/vm_ip.py
new file mode 100644
index 0000000..559a36e
--- /dev/null
+++ b/openadapt_evals/infrastructure/vm_ip.py
@@ -0,0 +1,121 @@
+"""VM IP resolution with automatic detection.
+
+Resolves the VM IP address using a layered approach:
+
+1. Explicit IP (from ``--vm-ip`` argument) — instant
+2. Pool registry (local JSON file) — instant, no Azure auth
+3. Azure CLI query (``az vm list-ip-addresses``) — always accurate, ~3s
+
+Usage::
+
+    from openadapt_evals.infrastructure.vm_ip import resolve_vm_ip
+
+    # Auto-detect (pool registry → Azure CLI)
+    ip = resolve_vm_ip()
+
+    # Explicit override
+    ip = resolve_vm_ip(explicit_ip="10.0.0.1")
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_vm_ip(explicit_ip: str | None = None) -> str:
+    """Resolve the VM IP address.
+
+    Resolution order:
+
+    1. ``explicit_ip`` — returned immediately if provided
+    2. Pool registry — reads ``benchmark_results/vm_pool_registry.json``
+       for the first active worker's IP
+    3. Azure CLI query — calls ``AzureVMManager.get_vm_ip()`` (always accurate, ~3s)
+
+    Args:
+        explicit_ip: If provided, returned as-is (from ``--vm-ip`` argument).
+
+    Returns:
+        The resolved IP address string.
+
+    Raises:
+        RuntimeError: If no running VM can be found by any method.
+    """
+    if explicit_ip:
+        return explicit_ip
+
+    # Try pool registry (fast, local file)
+    ip = _ip_from_pool_registry()
+    if ip:
+        logger.info("VM IP resolved from pool registry: %s", ip)
+        return ip
+
+    # Fall back to Azure query (always accurate, ~3s)
+    ip = _ip_from_azure_query()
+    if ip:
+        logger.info("VM IP resolved from Azure query: %s", ip)
+        return ip
+
+    raise RuntimeError(
+        "No running VM found. Either:\n"
+        "  1. Create one:  oa-vm pool-create --workers 1\n"
+        "  2. Start it:    oa-vm vm start\n"
+        "  3. Pass explicitly:  --vm-ip <IP>"
+    )
+
+
+def _ip_from_pool_registry() -> str | None:
+    """Try to read VM IP from the pool registry file.
+
+    Returns the IP of the first non-deleted, non-failed worker, or None.
+    """
+    registry_path = Path("benchmark_results/vm_pool_registry.json")
+    if not registry_path.exists():
+        return None
+
+    try:
+        import json
+
+        with open(registry_path) as f:
+            data = json.load(f)
+
+        for worker in data.get("workers", []):
+            ip = worker.get("ip")
+            status = worker.get("status", "")
+            if ip and status not in ("deleted", "failed"):
+                return ip
+    except Exception as e:
+        logger.debug("Could not read pool registry: %s", e)
+
+    return None
+
+
+def _ip_from_azure_query() -> str | None:
+    """Query Azure for the VM IP address.
+
+    Tries pool-style names (``waa-pool-00``) first, then falls back to
+    the legacy name (``waa-eval-vm``).
+    """
+    try:
+        from openadapt_evals.config import settings
+        from openadapt_evals.infrastructure.azure_vm import AzureVMManager
+
+        mgr = AzureVMManager(resource_group=settings.azure_resource_group)
+
+        # Try pool-style name first (most common)
+        ip = mgr.get_vm_ip("waa-pool-00")
+        if ip:
+            return ip
+
+        # Fallback to legacy name
+        ip = mgr.get_vm_ip("waa-eval-vm")
+        if ip:
+            return ip
+
+    except Exception as e:
+        logger.debug("Azure VM IP query failed: %s", e)
+
+    return None
diff --git a/screenshots/waa_libreoffice_desktop.png b/screenshots/waa_libreoffice_desktop.png
new file mode 100644
index 0000000..906161d
Binary files /dev/null and b/screenshots/waa_libreoffice_desktop.png differ
diff --git a/scripts/convert_recording_to_demo.py b/scripts/convert_recording_to_demo.py
new file mode 100644
index 0000000..8931be5
--- /dev/null
+++ b/scripts/convert_recording_to_demo.py
@@ -0,0 +1,387 @@
+#!/usr/bin/env python3
+"""Convert WAA recordings (meta.json + screenshots) to demo text files.
+
+Produces demo .txt files in the format expected by eval-suite's --demo-dir.
+
+Two modes:
+  --mode text   : Fast, free. Converts step descriptions from meta.json directly.
+  --mode vlm    : Richer. Sends screenshots to a VLM for Observation/Intent/Action/Result.
+
+Usage:
+    # Text-only (instant, no API calls)
+    python scripts/convert_recording_to_demo.py \
+        --recordings waa_recordings \
+        --output demo_prompts \
+        --mode text
+
+    # VLM-enriched (sends screenshots to OpenAI/Anthropic)
+    python scripts/convert_recording_to_demo.py \
+        --recordings waa_recordings \
+        --output demo_prompts \
+        --mode vlm \
+        --provider openai
+
+    # Override model (e.g. use cheaper model)
+    python scripts/convert_recording_to_demo.py \
+        --recordings waa_recordings \
+        --output demo_prompts \
+        --mode vlm \
+        --provider openai \
+        --model gpt-4.1-mini
+"""
+from __future__ import annotations
+
+import base64
+import json
+import re
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Regex patterns for extracting key references from step descriptions
+_CELL_REF_RE = re.compile(r'\b([A-Z]+\d+)\b')
+_CELL_RANGE_RE = re.compile(r'\b([A-Z]+\d+)\s*[:\-]\s*([A-Z]+\d+)\b')
+_FORMULA_RE = re.compile(r'"(=[^"]+)"')
+_QUOTED_TEXT_RE = re.compile(r'"([^"]+)"')
+
+
+def _extract_references(text: str) -> dict:
+    """Extract key references (cell refs, formulas, quoted text) from a step description.
+
+    Returns a dict with:
+        cells: set of cell references like {'D2', 'B6'}
+        ranges: list of (start, end) tuples like [('B2', 'D6')]
+        formulas: list of formula strings
+        quoted: list of quoted text values
+    """
+    formulas = _FORMULA_RE.findall(text)
+    # Remove formulas from text before extracting cell refs to avoid double-counting
+    text_no_formulas = _FORMULA_RE.sub('', text)
+    ranges = _CELL_RANGE_RE.findall(text_no_formulas)
+    # Remove ranges before extracting individual cells
+    text_no_ranges = _CELL_RANGE_RE.sub('', text_no_formulas)
+    cells = set(_CELL_REF_RE.findall(text_no_ranges))
+    quoted = [q for q in _QUOTED_TEXT_RE.findall(text) if not q.startswith('=')]
+    return {
+        'cells': cells,
+        'ranges': ranges,
+        'formulas': formulas,
+        'quoted': quoted,
+    }
+
+
+def _check_action_mismatch(ground_truth: str, vlm_action: str) -> str | None:
+    """Check if the VLM's Action line contradicts the ground-truth step.
+
+    Returns a description of the mismatch, or None if consistent.
+    """
+    gt_refs = _extract_references(ground_truth)
+    vlm_refs = _extract_references(vlm_action)
+
+    mismatches = []
+
+    # Check cell references: VLM should not introduce cells that conflict with GT
+    if gt_refs['cells'] and vlm_refs['cells']:
+        # VLM cells that are NOT in the ground truth (potential hallucinations)
+        extra_cells = vlm_refs['cells'] - gt_refs['cells']
+        # Only flag if VLM has cells that differ in row/col from GT cells
+        # e.g., GT says D2 but VLM says D3
+        for gt_cell in gt_refs['cells']:
+            gt_col = re.match(r'([A-Z]+)', gt_cell).group(1)
+            for vlm_cell in extra_cells:
+                vlm_col = re.match(r'([A-Z]+)', vlm_cell).group(1)
+                if vlm_col == gt_col and vlm_cell != gt_cell:
+                    mismatches.append(
+                        f"cell ref {vlm_cell} in VLM vs {gt_cell} in ground truth"
+                    )
+
+    # Check formulas
+    if gt_refs['formulas'] and vlm_refs['formulas']:
+        for gt_f, vlm_f in zip(gt_refs['formulas'], vlm_refs['formulas']):
+            if gt_f != vlm_f:
+                mismatches.append(
+                    f"formula '{vlm_f}' in VLM vs '{gt_f}' in ground truth"
+                )
+
+    # Check quoted text values
+    if gt_refs['quoted'] and vlm_refs['quoted']:
+        gt_set = {q.lower() for q in gt_refs['quoted']}
+        for vlm_q in vlm_refs['quoted']:
+            if vlm_q.lower() not in gt_set:
+                # Check if it's a close but wrong value
+                mismatches.append(
+                    f"quoted text '{vlm_q}' in VLM not in ground truth"
+                )
+
+    return '; '.join(mismatches) if mismatches else None
+
+
+def _extract_annotation_field(annotation: str, field: str) -> str | None:
+    """Extract a specific field value from a VLM annotation string."""
+    pattern = re.compile(rf'^{field}:\s*(.+)$', re.MULTILINE | re.IGNORECASE)
+    match = pattern.search(annotation)
+    return match.group(1).strip() if match else None
+
+
+def _replace_annotation_field(annotation: str, field: str, new_value: str) -> str:
+    """Replace a specific field value in a VLM annotation string."""
+    pattern = re.compile(rf'^({field}:\s*)(.+)$', re.MULTILINE | re.IGNORECASE)
+    return pattern.sub(rf'\g<1>{new_value}', annotation)
+
+
+def convert_text(meta: dict) -> str:
+    """Convert recording meta.json to demo text using step descriptions only."""
+    instruction = meta["instruction"]
+    steps = meta.get("steps", [])
+
+    lines = [
+        "DEMONSTRATION:",
+        f"Task: {instruction}",
+        "",
+    ]
+
+    for i, step in enumerate(steps):
+        desc = step.get("suggested_step", f"(step {i + 1})")
+        lines.append(f"Step {i + 1}:")
+        lines.append(f"  Action: {desc}")
+        lines.append("")
+
+    lines.append("---")
+    return "\n".join(lines) + "\n"
+
+
+def convert_vlm(
+    meta: dict,
+    task_dir: Path,
+    provider: str = "openai",
+    model: str | None = None,
+) -> str:
+    """Convert recording to demo text using VLM to describe screenshots.
+
+    The VLM is instructed to treat the recorded action from meta.json as
+    ground truth. After each VLM call, the Action field is validated against
+    the ground-truth step description. If key references (cell refs, formulas,
+    text values) are contradicted, the Action field is replaced with the
+    ground-truth description while preserving the VLM's Observation, Intent,
+    and Result fields.
+    """
+    instruction = meta["instruction"]
+    steps = meta.get("steps", [])
+    num_steps = meta.get("num_steps", len(steps))
+
+    lines = [
+        "DEMONSTRATION:",
+        f"Task: {instruction}",
+        "",
+    ]
+
+    for i in range(num_steps):
+        step_desc = steps[i].get("suggested_step", "") if i < len(steps) else ""
+        before_path = task_dir / f"step_{i:02d}_before.png"
+        after_path = task_dir / f"step_{i:02d}_after.png"
+
+        if not before_path.exists():
+            # Fallback to text-only for this step
+            lines.append(f"Step {i + 1}:")
+            lines.append(f"  Action: {step_desc}")
+            lines.append("")
+            continue
+
+        print(f"  Step {i + 1}/{num_steps}...", end=" ", flush=True)
+
+        # Build VLM prompt with strong ground-truth constraint
+        images = [_encode_image(before_path)]
+        has_after = after_path.exists()
+        if has_after:
+            images.append(_encode_image(after_path))
+
+        prompt = (
+            f"You are annotating step {i + 1} of {num_steps} in a desktop task demonstration.\n"
+            f"Task: {instruction}\n\n"
+            f"GROUND-TRUTH RECORDED ACTION (definitive — do NOT contradict this):\n"
+            f"  {step_desc}\n\n"
+            f"The recorded action above is the exact action that was performed. It is the\n"
+            f"authoritative source for cell references (e.g. D2, B6), text values, formulas,\n"
+            f"and action types (CLICK, TYPE, DRAG, KEY). Your Action field MUST match these\n"
+            f"details exactly. Do not substitute different cell references, row numbers, or\n"
+            f"values based on your visual interpretation — the recording is ground truth.\n\n"
+            f"{'The first image is BEFORE the action, the second is AFTER.' if has_after else 'This image shows the screen BEFORE the action.'}\n\n"
+            f"Provide a concise annotation with exactly these fields:\n"
+            f"Observation: (what the screen shows before the action, ~1 sentence)\n"
+            f"Intent: (why this action is being taken, ~1 sentence)\n"
+            f"Action: (restate the recorded action above — you may add brief visual context\n"
+            f"         but must preserve all cell refs, values, and formulas exactly)\n"
+            f"Result: (what changed after the action, ~1 sentence)\n\n"
+            f"IMPORTANT: The Action field must be faithful to the ground-truth recorded action.\n"
+            f"If the recorded action says cell D2, you must say D2 — not D3 or any other cell."
+        )
+
+        try:
+            annotation = _vlm_call(prompt, images, provider, model)
+
+            # Post-hoc validation: check VLM Action against ground truth
+            vlm_action = _extract_annotation_field(annotation, "Action")
+            if vlm_action and step_desc:
+                mismatch = _check_action_mismatch(step_desc, vlm_action)
+                if mismatch:
+                    print(f"MISMATCH ({mismatch}), replacing Action...", end=" ")
+                    # Replace the Action field with the ground-truth description,
+                    # preserving the VLM's Observation/Intent/Result
+                    annotation = _replace_annotation_field(
+                        annotation, "Action", step_desc
+                    )
+
+            lines.append(f"Step {i + 1}:")
+            # Parse the annotation — it should have Observation/Intent/Action/Result lines
+            for line in annotation.strip().split("\n"):
+                line = line.strip()
+                if line:
+                    lines.append(f"  {line}")
+            lines.append("")
+            print("done")
+        except Exception as e:
+            print(f"error: {e}")
+            # Fallback to text-only
+            lines.append(f"Step {i + 1}:")
+            lines.append(f"  Action: {step_desc}")
+            lines.append("")
+
+    lines.append("---")
+    return "\n".join(lines) + "\n"
+
+
+def _encode_image(path: Path) -> str:
+    """Read an image file and return base64-encoded string."""
+    return base64.b64encode(path.read_bytes()).decode("utf-8")
+
+
+def _vlm_call(
+    prompt: str,
+    images_b64: list[str],
+    provider: str = "openai",
+    model: str | None = None,
+) -> str:
+    """Send a prompt with images to a VLM and return the response text."""
+    if provider == "openai":
+        return _vlm_call_openai(prompt, images_b64, model or "gpt-4.1")
+    elif provider in ("anthropic", "claude"):
+        return _vlm_call_anthropic(prompt, images_b64, model or "claude-sonnet-4-20250514")
+    else:
+        raise ValueError(f"Unsupported provider: {provider}")
+
+
+def _vlm_call_openai(prompt: str, images_b64: list[str], model: str) -> str:
+    import openai
+
+    content: list[dict] = [{"type": "text", "text": prompt}]
+    for img in images_b64:
+        content.append({
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{img}", "detail": "low"},
+        })
+
+    client = openai.OpenAI()
+    resp = client.chat.completions.create(
+        model=model,
+        messages=[{"role": "user", "content": content}],
+        max_tokens=400,
+        temperature=0.0,
+    )
+    return resp.choices[0].message.content
+
+
+def _vlm_call_anthropic(prompt: str, images_b64: list[str], model: str) -> str:
+    import anthropic
+
+    content: list[dict] = [{"type": "text", "text": prompt}]
+    for img in images_b64:
+        content.append({
+            "type": "image",
+            "source": {"type": "base64", "media_type": "image/png", "data": img},
+        })
+
+    client = anthropic.Anthropic()
+    resp = client.messages.create(
+        model=model,
+        messages=[{"role": "user", "content": content}],
+        max_tokens=400,
+        temperature=0.0,
+    )
+    return resp.content[0].text
+
+
+def main(
+    recordings: str = "waa_recordings",
+    output: str = "demo_prompts",
+    mode: str = "text",
+    provider: str = "openai",
+    model: str | None = None,
+    task: str | None = None,
+) -> None:
+    """Convert WAA recordings to demo text files.
+
+    Args:
+        recordings: Directory containing recording subdirectories.
+        output: Output directory for demo .txt files.
+        mode: "text" (free, instant) or "vlm" (richer, uses API).
+        provider: VLM provider for vlm mode ("openai" or "anthropic").
+        model: Model override (default: gpt-4.1 for openai, claude-sonnet-4 for anthropic).
+        task: Specific task ID prefix to convert (default: all).
+    """
+    recordings_dir = Path(recordings)
+    output_dir = Path(output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Find task directories with meta.json
+    task_dirs = sorted(
+        d for d in recordings_dir.iterdir()
+        if d.is_dir() and (d / "meta.json").exists()
+    )
+
+    if task:
+        task_dirs = [d for d in task_dirs if d.name.startswith(task)]
+
+    if not task_dirs:
+        print(f"No recordings found in {recordings_dir}")
+        sys.exit(1)
+
+    effective_model = model
+    if not effective_model:
+        effective_model = (
+            "gpt-4.1" if provider == "openai" else "claude-sonnet-4-20250514"
+        )
+    print(f"Converting {len(task_dirs)} recording(s) to demo text (mode={mode})")
+    if mode == "vlm":
+        print(f"  Provider: {provider}, Model: {effective_model}")
+    print()
+
+    for task_dir in task_dirs:
+        meta = json.loads((task_dir / "meta.json").read_text(encoding="utf-8"))
+        task_id = meta["task_id"]
+        num_steps = meta.get("num_steps", len(meta.get("steps", [])))
+
+        print(f"{'=' * 60}")
+        print(f"Task: {task_id[:40]}... ({num_steps} steps)")
+        print(f"{'=' * 60}")
+
+        if mode == "vlm":
+            demo_text = convert_vlm(meta, task_dir, provider, model)
+        else:
+            demo_text = convert_text(meta)
+
+        txt_path = output_dir / f"{task_id}.txt"
+        txt_path.write_text(demo_text, encoding="utf-8")
+        print(f"  -> {txt_path}")
+        print()
+
+    print("Done. Use with eval-suite:")
+    print(f"  openadapt-evals eval-suite --demo-dir {output_dir} --tasks ...")
+
+
+if __name__ == "__main__":
+    import fire
+
+    fire.Fire(main)
diff --git a/scripts/record_waa_demos.py b/scripts/record_waa_demos.py
index 07dcd67..e0b5eb0 100644
--- a/scripts/record_waa_demos.py
+++ b/scripts/record_waa_demos.py
@@ -31,6 +31,7 @@
 from __future__ import annotations
 
 import json
+import socket
 import subprocess
 import sys
 import time
@@ -470,6 +471,63 @@ def _take_screenshot(server: str) -> bytes:
     return resp.content
 
 
+def _wait_for_stable_screen(
+    server: str,
+    poll_interval: float = 2.0,
+    stability_timeout: float = 30.0,
+    similarity_threshold: float = 0.995,
+    required_stable_checks: int = 3,
+) -> bytes:
+    """Wait for the VM screen to stabilize, then return the screenshot.
+
+    Delegates to :func:`openadapt_evals.infrastructure.screen_stability.wait_for_stable_screen`.
+    """
+    from openadapt_evals.infrastructure.screen_stability import wait_for_stable_screen
+
+    return wait_for_stable_screen(
+        server,
+        poll_interval=poll_interval,
+        stability_timeout=stability_timeout,
+        similarity_threshold=similarity_threshold,
+        required_stable_checks=required_stable_checks,
+    )
+
+
+def _build_setup_desc(task_config: dict) -> str:
+    """Build a human-readable description of the task setup actions."""
+    parts = []
+    for entry in task_config.get("config", []):
+        t = entry.get("type", "")
+        p = entry.get("parameters", {})
+        if t == "download":
+            for f in p.get("files", []):
+                parts.append(f"  - File downloaded: {f.get('path', '?')}")
+        elif t == "open":
+            parts.append(f"  - File opened: {p.get('path', '?')}")
+        elif t == "launch":
+            parts.append(f"  - App launched: {p.get('command', '?')}")
+    return "\n".join(parts) if parts else "  (none)"
+
+
+def _vlm_call(
+    messages: list[dict],
+    api_key: str,
+    model: str = "gpt-4o",
+    max_tokens: int = 800,
+) -> str:
+    """Send a chat completion request to OpenAI. Returns the text response."""
+    import requests as _req
+
+    resp = _req.post(
+        "https://api.openai.com/v1/chat/completions",
+        headers={"Authorization": f"Bearer {api_key}"},
+        json={"model": model, "max_tokens": max_tokens, "messages": messages},
+        timeout=60,
+    )
+    resp.raise_for_status()
+    return resp.json()["choices"][0]["message"]["content"]
+
+
 def _generate_steps(
     screenshot_png: bytes,
     instruction: str,
@@ -486,28 +544,15 @@ def _generate_steps(
     if not api_key:
         return "(No OPENAI_API_KEY set — skipping AI step generation)"
 
-    import requests as _req
-
     b64 = base64.b64encode(screenshot_png).decode()
-
-    setup_desc = ""
-    for entry in task_config.get("config", []):
-        t = entry.get("type", "")
-        p = entry.get("parameters", {})
-        if t == "download":
-            for f in p.get("files", []):
-                setup_desc += f"  - File downloaded: {f['path']}\n"
-        elif t == "open":
-            setup_desc += f"  - File opened: {p.get('path', '?')}\n"
-        elif t == "launch":
-            setup_desc += f"  - App launched: {p.get('command', '?')}\n"
+    setup_desc = _build_setup_desc(task_config)
 
     prompt = f"""You are helping a human perform a task on a Windows desktop via VNC.
 
 TASK: {instruction}
 
 ENVIRONMENT SETUP (already done automatically):
-{setup_desc if setup_desc else "  (none)"}
+{setup_desc}
 
 Look at the screenshot and give step-by-step instructions to complete the task.
 Be specific about what to click, what to type, and what menus to use.
@@ -515,45 +560,908 @@ def _generate_steps(
 Keep each step to one action. Use plain text, numbered list.
 Be concise — the user will read this on a phone screen."""
 
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{b64}",
+                        "detail": "high",
+                    },
+                },
+            ],
+        }
+    ]
+
     try:
-        resp = _req.post(
-            "https://api.openai.com/v1/chat/completions",
-            headers={"Authorization": f"Bearer {api_key}"},
-            json={
-                "model": "gpt-4o",
-                "max_tokens": 800,
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "text", "text": prompt},
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:image/png;base64,{b64}",
-                                    "detail": "high",
-                                },
-                            },
-                        ],
-                    }
-                ],
-            },
-            timeout=30,
-        )
-        resp.raise_for_status()
-        return resp.json()["choices"][0]["message"]["content"]
+        return _vlm_call(messages, api_key)
     except Exception as e:
         return f"(AI step generation failed: {e})"
 
 
+def _refine_steps(
+    screenshot_png: bytes,
+    instruction: str,
+    task_config: dict,
+    current_steps: str,
+    feedback: str,
+) -> str:
+    """Refine suggested steps based on user feedback.
+
+    Sends the original screenshot, instruction, current steps, and the
+    user's correction back to the VLM for a revised set of steps.
+    """
+    import base64
+    import os
+
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        return current_steps
+
+    b64 = base64.b64encode(screenshot_png).decode()
+    setup_desc = _build_setup_desc(task_config)
+
+    prompt = f"""You are helping a human perform a task on a Windows desktop via VNC.
+
+TASK: {instruction}
+
+ENVIRONMENT SETUP (already done automatically):
+{setup_desc}
+
+You previously suggested these steps:
+
+{current_steps}
+
+The user says this is wrong and provides this feedback:
+
+"{feedback}"
+
+Look at the screenshot again and produce CORRECTED step-by-step instructions
+that address the user's feedback. Keep the same format: plain text, numbered list,
+one action per step, concise."""
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{b64}",
+                        "detail": "high",
+                    },
+                },
+            ],
+        }
+    ]
+
+    try:
+        return _vlm_call(messages, api_key)
+    except Exception as e:
+        print(f"  (Refinement failed: {e})")
+        return current_steps
+
+
+def _parse_step_list(steps_text: str) -> list[str]:
+    """Parse numbered VLM output into a list of step strings.
+
+    Handles formats like "1. Do X", "1) Do X", "1: Do X".
+    Falls back to non-empty lines if no numbered items are found.
+    """
+    import re
+
+    lines = steps_text.strip().splitlines()
+    steps: list[str] = []
+    current_step: list[str] = []
+
+    for line in lines:
+        # Check if this line starts a new numbered step
+        m = re.match(r"^\s*\d+[\.\)\:]\s+(.*)", line)
+        if m:
+            if current_step:
+                steps.append(" ".join(current_step))
+            current_step = [m.group(1).strip()]
+        elif current_step:
+            # Continuation line for the current step
+            stripped = line.strip()
+            if stripped:
+                current_step.append(stripped)
+
+    if current_step:
+        steps.append(" ".join(current_step))
+
+    # Fallback: if no numbered items found, split on non-empty lines
+    if not steps:
+        steps = [line.strip() for line in lines if line.strip()]
+
+    return steps
+
+
+def _format_step_list(steps: list[str], start_num: int = 1) -> str:
+    """Format a list of step strings back into numbered text."""
+    return "\n".join(
+        f"{i}. {step}" for i, step in enumerate(steps, start=start_num)
+    )
+
+
+def _display_steps(steps_text: str) -> None:
+    """Pretty-print suggested steps in a box."""
+    print()
+    print("  ┌─ SUGGESTED STEPS ──────────────────────────────")
+    for line in steps_text.splitlines():
+        print(f"  │ {line}")
+    print("  └────────────────────────────────────────────────")
+    print()
+
+
+def _display_current_step(step_num: int, total: int, step_text: str) -> None:
+    """Display the current step in a box before the action prompt."""
+    header = f" Step {step_num} of {total} "
+    width = max(len(header) + 4, len(step_text) + 6, 50)
+    bar = "─" * (width - len(header) - 2)
+    print(f"\n  ──{header}{bar}")
+    print(f"  │ {step_text}")
+    print(f"  └{'─' * (width)}")
+
+
+def _interactive_step_review(
+    screenshot_png: bytes,
+    instruction: str,
+    task_config: dict,
+    initial_steps: str,
+) -> str:
+    """Let the user review and iteratively correct the suggested steps.
+
+    The user can press Enter to accept, or type feedback to refine.
+    Returns the final accepted steps text.
+    """
+    current = initial_steps
+    while True:
+        correction = input(
+            "  Press Enter to accept steps, or type correction: "
+        ).strip()
+        if not correction:
+            return current
+        print("  Refining steps...")
+        current = _refine_steps(
+            screenshot_png, instruction, task_config, current, correction,
+        )
+        _display_steps(current)
+
+
+def _refine_remaining_steps(
+    screenshot_png: bytes,
+    instruction: str,
+    task_config: dict,
+    completed_steps: list[str],
+    remaining_steps: list[str],
+    feedback: str,
+) -> str:
+    """Refine remaining steps based on user feedback mid-recording.
+
+    Takes a FRESH screenshot (current screen state after completed steps)
+    and sends it along with the context of what's been done and what remains.
+    Returns the raw VLM text for remaining steps.
+    """
+    import base64
+    import os
+
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        return _format_step_list(remaining_steps)
+
+    b64 = base64.b64encode(screenshot_png).decode()
+    setup_desc = _build_setup_desc(task_config)
+
+    completed_text = (
+        _format_step_list(completed_steps)
+        if completed_steps
+        else "(none)"
+    )
+    remaining_text = _format_step_list(remaining_steps)
+    current_step_num = len(completed_steps) + 1
+
+    prompt = f"""You are helping a human perform a task on a Windows desktop via VNC.
+
+TASK: {instruction}
+
+ENVIRONMENT SETUP (already done automatically):
+{setup_desc}
+
+The user has already completed these steps:
+{completed_text}
+
+You previously suggested these REMAINING steps (not yet performed):
+{remaining_text}
+
+The user is currently on step {current_step_num} and says:
+"{feedback}"
+
+Look at the CURRENT screenshot (taken just now, after the completed steps)
+and produce CORRECTED remaining steps that address the user's feedback.
+Only output the remaining steps (do not repeat completed steps).
+Keep the same format: plain text, numbered list starting from 1,
+one action per step, concise."""
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/png;base64,{b64}",
+                        "detail": "high",
+                    },
+                },
+            ],
+        }
+    ]
+
+    try:
+        return _vlm_call(messages, api_key)
+    except Exception as e:
+        print(f"  (Refinement failed: {e})")
+        return _format_step_list(remaining_steps)
+
+
+def _interactive_remaining_review(
+    server: str,
+    instruction: str,
+    task_config: dict,
+    completed_steps: list[str],
+    remaining_steps: list[str],
+    initial_feedback: str,
+) -> list[str]:
+    """Mini review loop for mid-recording step refinement.
+
+    Takes a fresh screenshot for each refinement round.
+    Returns the accepted list of remaining steps.
+    """
+    # First refinement using the initial feedback
+    fresh_png = _take_screenshot(server)
+    refined_text = _refine_remaining_steps(
+        fresh_png, instruction, task_config,
+        completed_steps, remaining_steps, initial_feedback,
+    )
+    new_steps = _parse_step_list(refined_text)
+
+    if not new_steps:
+        print("  WARNING: VLM returned no steps. Keeping previous plan.")
+        return remaining_steps
+
+    while True:
+        # Display corrected remaining steps
+        print()
+        print("  ┌─ CORRECTED REMAINING STEPS ─────────────────────")
+        for line in _format_step_list(new_steps).splitlines():
+            print(f"  │ {line}")
+        print("  └─────────────────────────────────────────────────")
+        print()
+
+        correction = input(
+            "  Press Enter to accept, or type another correction: "
+        ).strip()
+        if not correction:
+            return new_steps
+
+        print("  Taking fresh screenshot and refining...")
+        fresh_png = _take_screenshot(server)
+        refined_text = _refine_remaining_steps(
+            fresh_png, instruction, task_config,
+            completed_steps, new_steps, correction,
+        )
+        parsed = _parse_step_list(refined_text)
+        if parsed:
+            new_steps = parsed
+        else:
+            print("  WARNING: VLM returned no steps. Keeping previous plan.")
+
+
+def _record_single_task(
+    task_id: str,
+    task_num: int,
+    total_tasks: int,
+    output_dir: Path,
+    server: str,
+    evaluate_url: str,
+    vnc_url: str,
+    vm_ip: str | None,
+) -> str | None:
+    """Record a single task interactively via WAA API.
+
+    Returns the task_id on success, or None on failure.
+    """
+    import requests
+
+    from openadapt_evals.infrastructure.qemu_reset import QEMUResetManager
+
+    print_header(f"Task {task_num}/{total_tasks}: {task_id[:12]}...")
+
+    task_dir = output_dir / task_id
+    task_dir.mkdir(parents=True, exist_ok=True)
+
+    # Load task config from evaluate server
+    instruction = task_id  # fallback
+    task_config: dict = {}
+    try:
+        task_resp = requests.get(
+            f"{evaluate_url}/task/{task_id}", timeout=10
+        )
+        if task_resp.ok:
+            task_config = task_resp.json()
+            instruction = task_config.get(
+                "instruction", task_config.get("task", task_id)
+            )
+    except Exception as e:
+        print(f"  Warning: could not load task config: {e}")
+
+    def _setup_task_env() -> None:
+        """Run task setup config (download files, open apps, etc.)."""
+        setup_config = task_config.get("config", [])
+        related_apps = task_config.get("related_apps", [])
+        if related_apps:
+            setup_config = [
+                {"type": "verify_apps", "parameters": {"apps": related_apps}}
+            ] + setup_config
+        if setup_config:
+            resp = requests.post(
+                f"{evaluate_url}/setup",
+                json={"config": setup_config},
+                timeout=120,
+            )
+            resp.raise_for_status()
+            results = resp.json().get("results", [])
+            for r in results:
+                status = r.get("status", "?")
+                stype = r.get("type", "?")
+                print(f"    setup {stype}: {status}")
+
+    def _soft_reset_task_env() -> bytes:
+        """Soft reset: close_all + re-run setup + wait for stable screen."""
+        print("  Resetting environment (soft)...")
+        try:
+            resp = requests.post(f"{server}/setup/close_all", timeout=30)
+            print(f"    close_all: {resp.status_code}")
+            time.sleep(2)
+            _setup_task_env()
+        except Exception as e:
+            print(f"  WARNING: environment setup failed: {e}")
+            print(f"  The task app may not be open. Check VNC.")
+        print("  Waiting for screen to stabilize...")
+        return _wait_for_stable_screen(server)
+
+    def _hard_reset_task_env() -> bytes:
+        """Hard reset: QEMU system_reset + wait for boot + clear recovery + setup."""
+        print("  Restarting Windows (QEMU hard reset)...")
+        mgr = QEMUResetManager(vm_ip=vm_ip, timeout_seconds=300)
+        success, msg = mgr.restart_windows(server_url=server)
+        if not success:
+            print(f"  WARNING: QEMU reset failed: {msg}")
+            print("  Falling back to soft reset...")
+            return _soft_reset_task_env()
+        print(f"    {msg}")
+        _clear_recovery_data(server)
+        print("  Running task setup...")
+        try:
+            _setup_task_env()
+        except Exception as e:
+            print(f"  WARNING: environment setup failed: {e}")
+            print(f"  The task app may not be open. Check VNC.")
+        print("  Waiting for screen to stabilize...")
+        return _wait_for_stable_screen(server)
+
+    before_png = _soft_reset_task_env()
+
+    print(f"\n  VNC: {vnc_url}")
+    print(f"  Task: {instruction}\n")
+
+    # Generate AI step-by-step guidance from screenshot
+    print("  Generating suggested steps...")
+    suggested = _generate_steps(before_png, instruction, task_config)
+    _display_steps(suggested)
+    suggested = _interactive_step_review(
+        before_png, instruction, task_config, suggested,
+    )
+
+    # Parse accepted steps into a structured list
+    completed_steps: list[str] = []
+    remaining_steps = _parse_step_list(suggested)
+    step_plans = [{
+        "at_step_idx": 0,
+        "trigger": "initial",
+        "steps": list(remaining_steps),
+    }]
+    refined_indices: set[int] = set()
+    steps_meta: list[dict] = []
+    step_idx = 0
+
+    print()
+    print("  Perform each action in VNC. You can provide feedback")
+    print("  at any point to correct the remaining steps.\n")
+
+    while remaining_steps:
+        # Save before screenshot
+        (task_dir / f"step_{step_idx:02d}_before.png").write_bytes(
+            before_png
+        )
+
+        # Display current step
+        total = len(completed_steps) + len(remaining_steps)
+        step_num = len(completed_steps) + 1
+        _display_current_step(step_num, total, remaining_steps[0])
+
+        user_input = input(
+            "  [Enter]=done  [d]=task done  [r]=redo  [R]=restart\n"
+            "  Or type feedback to correct remaining steps: "
+        ).strip()
+
+        if user_input == "":
+            # ADVANCE: action done, move to next step
+            after_png = _take_screenshot(server)
+            (task_dir / f"step_{step_idx:02d}_after.png").write_bytes(
+                after_png
+            )
+            done_step = remaining_steps.pop(0)
+            completed_steps.append(done_step)
+            steps_meta.append({
+                "action_hint": None,
+                "suggested_step": done_step,
+                "step_was_refined": step_idx in refined_indices,
+            })
+            before_png = after_png
+            step_idx += 1
+            print(f"  Step {step_num} recorded.")
+
+            if not remaining_steps:
+                print(f"\n  All {total} steps completed. Finishing recording.")
+
+        elif user_input.lower() == "d":
+            # DONE: task finished (possibly before all steps)
+            after_png = _take_screenshot(server)
+            (task_dir / f"step_{step_idx:02d}_after.png").write_bytes(
+                after_png
+            )
+            steps_meta.append({
+                "action_hint": "d",
+                "suggested_step": remaining_steps[0],
+                "step_was_refined": step_idx in refined_indices,
+            })
+            step_idx += 1
+            total = len(completed_steps) + len(remaining_steps)
+            print(f"\n  Task marked done at step {step_num} of {total}. Finishing recording.")
+            break
+
+        elif user_input.lower() == "r":
+            # REDO: go back one step
+            if not completed_steps:
+                print("  Nothing to redo (already at step 1).")
+                continue
+            step_idx -= 1
+            remaining_steps.insert(0, completed_steps.pop())
+            steps_meta.pop()
+            before_png = _take_screenshot(server)
+            print(f"  Redoing step {len(completed_steps) + 1}...")
+
+        elif user_input == "R":
+            # RESTART: QEMU hard reset + re-generate everything
+            print("  Restarting task from scratch (QEMU hard reset)...")
+            for f in task_dir.glob("step_*.png"):
+                f.unlink()
+            before_png = _hard_reset_task_env()
+            print(f"\n  VNC: {vnc_url}")
+            print(f"  Task: {instruction}\n")
+
+            print("  Generating suggested steps...")
+            new_suggested = _generate_steps(before_png, instruction, task_config)
+            _display_steps(new_suggested)
+            new_suggested = _interactive_step_review(
+                before_png, instruction, task_config, new_suggested,
+            )
+
+            completed_steps = []
+            remaining_steps = _parse_step_list(new_suggested)
+            step_plans.append({
+                "at_step_idx": 0,
+                "trigger": "restart",
+                "steps": list(remaining_steps),
+            })
+            refined_indices = set()
+            steps_meta = []
+            step_idx = 0
+            print()
+            print("  Task restarted. Continue recording.\n")
+
+        else:
+            # FEEDBACK: mid-recording step correction
+            print("  Taking fresh screenshot and refining remaining steps...")
+            remaining_steps = _interactive_remaining_review(
+                server, instruction, task_config,
+                completed_steps, remaining_steps, user_input,
+            )
+            step_plans.append({
+                "at_step_idx": step_idx,
+                "trigger": f"feedback: {user_input}",
+                "steps": list(remaining_steps),
+            })
+            for i in range(step_idx, step_idx + len(remaining_steps)):
+                refined_indices.add(i)
+            # No action taken — loop re-displays the (possibly new) current step
+
+    # Save metadata
+    meta = {
+        "task_id": task_id,
+        "instruction": instruction,
+        "num_steps": len(steps_meta),
+        "steps": steps_meta,
+        "step_plans": step_plans,
+        "server_url": server,
+        "recorded_at": datetime.now(timezone.utc).isoformat(),
+    }
+    (task_dir / "meta.json").write_text(
+        json.dumps(meta, indent=2), encoding="utf-8"
+    )
+
+    print(f"\n  Saved {len(steps_meta)} step(s) to {task_dir}")
+    return task_id
+
+
+# ---------------------------------------------------------------------------
+# Auto-infrastructure helpers
+# ---------------------------------------------------------------------------
+
+_AUTO_VM_NAME = "waa-pool-00"
+_AUTO_RESOURCE_GROUP = "openadapt-agents"
+_AUTO_SSH_USER = "azureuser"
+
+# Track whether this script started the VM (so we can offer to deallocate on exit)
+_vm_started_by_script = False
+_cleanup_registered = False
+_cleanup_done = False
+
+# Port mappings: local_port -> remote_port (on VM host)
+_TUNNEL_PORTS = {
+    5001: 5000,   # WAA server
+    5050: 5051,   # evaluate server (via socat)
+    8006: 8006,   # VNC (noVNC)
+}
+
+
+def _is_local_port_open(port: int) -> bool:
+    """Check whether a local TCP port is accepting connections."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        sock.settimeout(2)
+        return sock.connect_ex(("localhost", port)) == 0
+
+
+def _get_vm_power_state() -> str | None:
+    """Return the Azure VM power state, e.g. 'running', 'deallocated', or None on error."""
+    try:
+        result = subprocess.run(
+            [
+                "az", "vm", "get-instance-view",
+                "-g", _AUTO_RESOURCE_GROUP,
+                "-n", _AUTO_VM_NAME,
+                "--query", "instanceView.statuses[?starts_with(code,'PowerState/')].displayStatus | [0]",
+                "-o", "tsv",
+            ],
+            capture_output=True, text=True, timeout=30,
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            return result.stdout.strip().replace("VM ", "").lower()
+    except Exception:
+        pass
+    return None
+
+
+def _get_vm_public_ip() -> str | None:
+    """Get the public IP of the VM."""
+    try:
+        result = subprocess.run(
+            [
+                "az", "vm", "show",
+                "-g", _AUTO_RESOURCE_GROUP,
+                "-n", _AUTO_VM_NAME,
+                "--show-details",
+                "--query", "publicIps",
+                "-o", "tsv",
+            ],
+            capture_output=True, text=True, timeout=30,
+        )
+        if result.returncode == 0 and result.stdout.strip():
+            return result.stdout.strip()
+    except Exception:
+        pass
+    return None
+
+
+def _auto_start_vm() -> bool:
+    """Start the Azure VM. Returns True on success."""
+    global _vm_started_by_script
+    print(f"  Starting VM '{_AUTO_VM_NAME}'...")
+    result = subprocess.run(
+        ["az", "vm", "start", "-g", _AUTO_RESOURCE_GROUP, "-n", _AUTO_VM_NAME],
+        capture_output=True, text=True, timeout=300,
+    )
+    if result.returncode != 0:
+        print(f"  ERROR: az vm start failed: {result.stderr.strip()}")
+        return False
+    print(f"  VM '{_AUTO_VM_NAME}' started.")
+    _vm_started_by_script = True
+    _register_vm_cleanup()
+    return True
+
+
+def _auto_establish_tunnels(vm_ip: str) -> bool:
+    """Establish SSH tunnels to the VM. Prefers autossh if available."""
+    import shutil
+
+    use_autossh = shutil.which("autossh") is not None
+    tool = "autossh" if use_autossh else "ssh"
+    print(f"  Establishing SSH tunnels to {vm_ip} (using {tool})...")
+
+    tunnel_args = []
+    for local_port, remote_port in _TUNNEL_PORTS.items():
+        tunnel_args.extend(["-L", f"{local_port}:localhost:{remote_port}"])
+
+    ssh_opts = [
+        "-o", "ConnectTimeout=10",
+        "-o", "ServerAliveInterval=30",
+        "-o", "ServerAliveCountMax=3",
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "UserKnownHostsFile=/dev/null",
+        "-o", "ExitOnForwardFailure=yes",
+        "-N",
+    ]
+
+    if use_autossh:
+        cmd = ["autossh", "-M", "0", "-f"] + ssh_opts + tunnel_args + [
+            f"{_AUTO_SSH_USER}@{vm_ip}"
+        ]
+    else:
+        cmd = ["ssh", "-f"] + ssh_opts + tunnel_args + [
+            f"{_AUTO_SSH_USER}@{vm_ip}"
+        ]
+        print("  (install autossh for automatic tunnel reconnection)")
+
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+    if result.returncode != 0:
+        print(f"  ERROR: {tool} tunnel failed: {result.stderr.strip()}")
+        return False
+
+    time.sleep(2)
+    for local_port in _TUNNEL_PORTS:
+        status = "OK" if _is_local_port_open(local_port) else "NOT YET"
+        print(f"    Tunnel localhost:{local_port} -> VM:{_TUNNEL_PORTS[local_port]}: {status}")
+
+    return True
+
+
+def _auto_start_container(vm_ip: str) -> bool:
+    """Start the winarena Docker container on the VM."""
+    print(f"  Starting Docker container 'winarena' on {vm_ip}...")
+    result = subprocess.run(
+        ["ssh",
+         "-o", "ConnectTimeout=10",
+         "-o", "StrictHostKeyChecking=no",
+         "-o", "UserKnownHostsFile=/dev/null",
+         f"{_AUTO_SSH_USER}@{vm_ip}",
+         "docker start winarena"],
+        capture_output=True, text=True, timeout=60,
+    )
+    if result.returncode != 0:
+        print(f"  ERROR: docker start failed: {result.stderr.strip()}")
+        return False
+    print("  Container started.")
+    return True
+
+
+def _auto_start_socat(vm_ip: str) -> bool:
+    """Start socat proxy on the VM for port 5050 forwarding."""
+    print(f"  Starting socat proxy on {vm_ip} (VM:5051 -> container:5050)...")
+    socat_cmd = (
+        'nohup socat TCP-LISTEN:5051,fork,reuseaddr '
+        'EXEC:"docker exec -i winarena socat - TCP\\:localhost\\:5050" '
+        '&>/dev/null &'
+    )
+    result = subprocess.run(
+        ["ssh",
+         "-o", "ConnectTimeout=10",
+         "-o", "StrictHostKeyChecking=no",
+         "-o", "UserKnownHostsFile=/dev/null",
+         f"{_AUTO_SSH_USER}@{vm_ip}",
+         socat_cmd],
+        capture_output=True, text=True, timeout=30,
+    )
+    if result.returncode != 0:
+        print(f"  WARNING: socat setup returned non-zero: {result.stderr.strip()}")
+    else:
+        print("  Socat proxy started.")
+    return True
+
+
+def _wait_for_waa_ready(server: str, timeout: int = 600, interval: int = 5) -> bool:
+    """Poll WAA /probe endpoint until it returns HTTP 200."""
+    import requests
+
+    print(f"  Waiting for WAA to become ready at {server}/probe (timeout {timeout}s)...")
+    start = time.monotonic()
+    attempts = 0
+    while time.monotonic() - start < timeout:
+        attempts += 1
+        elapsed = int(time.monotonic() - start)
+        try:
+            resp = requests.get(f"{server}/probe", timeout=5)
+            if resp.status_code == 200:
+                print(f"  WAA is ready (after {elapsed}s, {attempts} attempts).")
+                return True
+            print(f"    [{elapsed}s] probe returned {resp.status_code}, retrying...")
+        except requests.ConnectionError:
+            print(f"    [{elapsed}s] connection refused, retrying...")
+        except Exception as e:
+            print(f"    [{elapsed}s] {e}, retrying...")
+        time.sleep(interval)
+
+    print(f"  TIMEOUT: WAA did not become ready within {timeout}s.")
+    return False
+
+
+def _attempt_auto_recovery(
+    server: str,
+    auto_vm: bool,
+    auto_tunnel: bool,
+    auto_container: bool,
+    wait_ready: bool,
+) -> bool:
+    """Attempt to automatically bring up WAA infrastructure.
+
+    Returns True if WAA is reachable after recovery.
+    """
+    print()
+    print("  Auto-recovery: diagnosing infrastructure state...")
+
+    # Step 1: Check/start VM
+    vm_ip = _get_vm_public_ip()
+    power_state = _get_vm_power_state()
+    print(f"  VM power state: {power_state or 'unknown'}")
+
+    if power_state != "running":
+        if auto_vm:
+            if not _auto_start_vm():
+                return False
+            time.sleep(5)
+            vm_ip = _get_vm_public_ip()
+            if not vm_ip:
+                print("  FAILED: VM started but could not get public IP.")
+                return False
+            print(f"  VM IP: {vm_ip}")
+        else:
+            print("  VM is not running. Use --auto-vm to start it automatically.")
+            return False
+    else:
+        if not vm_ip:
+            vm_ip = _get_vm_public_ip()
+        print(f"  VM IP: {vm_ip}")
+
+    if not vm_ip:
+        print("  FAILED: Could not determine VM IP.")
+        return False
+
+    # Step 2: Check/start Docker container
+    if auto_container:
+        _auto_start_container(vm_ip)
+        _auto_start_socat(vm_ip)
+
+    # Step 3: Check/establish SSH tunnels
+    if auto_tunnel and not _is_local_port_open(5001):
+        if not _auto_establish_tunnels(vm_ip):
+            print("  FAILED: Could not establish SSH tunnels.")
+            return False
+
+    # Step 4: Wait for WAA to boot
+    if wait_ready:
+        return _wait_for_waa_ready(server)
+
+    return _is_local_port_open(5001)
+
+
+def _deallocate_vm() -> bool:
+    """Deallocate the Azure VM (async). Returns True on success."""
+    print(f"\n  Deallocating VM '{_AUTO_VM_NAME}'...")
+    try:
+        result = subprocess.run(
+            [
+                "az", "vm", "deallocate",
+                "-g", _AUTO_RESOURCE_GROUP,
+                "-n", _AUTO_VM_NAME,
+                "--no-wait",
+            ],
+            capture_output=True, text=True, timeout=30,
+        )
+        if result.returncode == 0:
+            print(f"  VM '{_AUTO_VM_NAME}' deallocate initiated (billing will stop shortly).")
+            return True
+        else:
+            print(f"  WARNING: deallocate failed: {result.stderr.strip()}")
+            return False
+    except Exception as e:
+        print(f"  WARNING: deallocate failed: {e}")
+        return False
+
+
+def _cleanup_on_exit(signal_received: bool = False) -> None:
+    """Offer to deallocate the VM if this script started it."""
+    global _cleanup_done
+    if _cleanup_done or not _vm_started_by_script:
+        return
+    _cleanup_done = True
+
+    if signal_received:
+        print("\n\n  Script interrupted. Deallocating VM to stop billing...")
+        _deallocate_vm()
+    else:
+        try:
+            answer = input(
+                f"\n  This script started VM '{_AUTO_VM_NAME}'. "
+                "Deallocate to stop billing? [Y/n] "
+            ).strip().lower()
+            if answer in ("", "y", "yes"):
+                _deallocate_vm()
+            else:
+                print(f"  VM '{_AUTO_VM_NAME}' left running. "
+                      f"Deallocate manually: az vm deallocate -g {_AUTO_RESOURCE_GROUP} "
+                      f"-n {_AUTO_VM_NAME} --no-wait")
+        except (EOFError, KeyboardInterrupt):
+            print()
+            _deallocate_vm()
+
+
+def _register_vm_cleanup() -> None:
+    """Register atexit and signal handlers for VM cleanup. Idempotent."""
+    global _cleanup_registered
+    if _cleanup_registered:
+        return
+    _cleanup_registered = True
+
+    import atexit
+    import signal
+
+    atexit.register(_cleanup_on_exit, signal_received=False)
+
+    _original_sigint = signal.getsignal(signal.SIGINT)
+    _original_sigterm = signal.getsignal(signal.SIGTERM)
+
+    def _signal_handler(signum, frame):
+        _cleanup_on_exit(signal_received=True)
+        if signum == signal.SIGINT and callable(_original_sigint):
+            _original_sigint(signum, frame)
+        elif signum == signal.SIGTERM and callable(_original_sigterm):
+            _original_sigterm(signum, frame)
+        else:
+            sys.exit(1)
+
+    signal.signal(signal.SIGINT, _signal_handler)
+    signal.signal(signal.SIGTERM, _signal_handler)
+
+
 def cmd_record_waa(
     tasks: str = ",".join(HARDER_TASK_IDS),
     server: str = "http://localhost:5001",
     evaluate_url: str = "http://localhost:5050",
     output: str = "waa_recordings",
     vnc_url: str = "http://localhost:8006",
-    vm_ip: str = "172.173.66.131",
+    vm_ip: str | None = None,
     verify: bool = True,
+    auto: bool = False,
+    auto_vm: bool = False,
+    auto_tunnel: bool = False,
+    auto_container: bool = False,
+    wait_ready: bool = True,
 ) -> None:
     """Record demos interactively via WAA API while user performs actions on VNC.
 
@@ -564,10 +1472,14 @@ def cmd_record_waa(
         output: Output directory for recordings.
         vnc_url: VNC URL for the user to open in a browser.
         vm_ip: Azure VM IP for QEMU reset on task restart.
+            Auto-detected from pool registry or Azure if omitted.
         verify: Pre-flight check that all required apps are installed (default True).
+        auto: Automatically start all infrastructure (VM, tunnels, container).
+        auto_vm: Start Azure VM if it is deallocated (incurs charges).
+        auto_tunnel: Establish SSH tunnels if not connected.
+        auto_container: Start Docker container and socat proxy if not running.
+        wait_ready: Wait for WAA server to boot after recovery (default True).
     """
-    import requests
-
     # Guard: Fire may pass True if --tasks is used without a value
     if not isinstance(tasks, str):
         print(f"ERROR: --tasks must be a string of comma-separated task IDs.")
@@ -575,6 +1487,28 @@ def cmd_record_waa(
         print(f"  Hint: use --tasks=\"id1,id2,...\" (with = and no space)")
         return
 
+    # --auto is a convenience flag that enables all sub-flags
+    if auto:
+        auto_vm = True
+        auto_tunnel = True
+        auto_container = True
+
+    any_auto = auto_vm or auto_tunnel or auto_container
+
+    from openadapt_evals.infrastructure.vm_ip import resolve_vm_ip
+
+    # VM IP resolution may fail if VM is deallocated — that's OK if we have
+    # auto-recovery flags, since we'll start the VM first.
+    try:
+        vm_ip = resolve_vm_ip(vm_ip)
+    except RuntimeError:
+        if any_auto:
+            vm_ip = None  # Will be resolved after VM start
+        else:
+            raise
+
+    import requests
+
     output_dir = Path(output)
     output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -599,13 +1533,63 @@ def cmd_record_waa(
 
     # Verify connection
     print(f"Connecting to WAA server at {server}...")
+    connected = False
     try:
         resp = requests.get(f"{server}/probe", timeout=5)
         resp.raise_for_status()
         print(f"  Connected ({resp.status_code})")
+        connected = True
     except Exception as e:
         print(f"  Failed to connect: {e}")
-        print("  Make sure the WAA server is running and SSH tunnels are up.")
+
+        if any_auto:
+            # Confirm with user if VM start is involved (cost warning)
+            if auto_vm:
+                power_state = _get_vm_power_state()
+                if power_state != "running":
+                    print()
+                    print(
+                        "  WARNING: --auto will start Azure VM resources "
+                        "which incur charges."
+                    )
+                    answer = input("  Continue? [y/N] ").strip().lower()
+                    if answer not in ("y", "yes"):
+                        print("  Aborted.")
+                        return
+
+            recovered = _attempt_auto_recovery(
+                server=server,
+                auto_vm=auto_vm,
+                auto_tunnel=auto_tunnel,
+                auto_container=auto_container,
+                wait_ready=wait_ready,
+            )
+            if recovered:
+                print()
+                print("  Auto-recovery succeeded. WAA is ready.")
+                connected = True
+                # Re-resolve VM IP now that infrastructure is up
+                if vm_ip is None:
+                    try:
+                        vm_ip = resolve_vm_ip(vm_ip)
+                    except RuntimeError:
+                        vm_ip = _get_vm_public_ip()
+            else:
+                print()
+                print("  Auto-recovery FAILED. Cannot proceed.")
+                return
+        else:
+            print()
+            print("  The WAA server is not reachable. To auto-deploy all infrastructure, re-run with --auto:")
+            print(f"    python {__file__} record-waa --auto --tasks={tasks}")
+            print()
+            print("  Or use granular flags:")
+            print("    --auto-vm        Start Azure VM if deallocated (incurs charges)")
+            print("    --auto-tunnel    Establish SSH tunnels")
+            print("    --auto-container Start Docker container + socat proxy")
+            return
+
+    if not connected:
         return
 
     # Pre-flight: verify all required apps are installed
@@ -669,171 +1653,18 @@ def cmd_record_waa(
 
     recorded = []
     for task_num, task_id in enumerate(task_ids, 1):
-        print_header(f"Task {task_num}/{len(task_ids)}: {task_id[:12]}...")
-
-        task_dir = output_dir / task_id
-        task_dir.mkdir(parents=True, exist_ok=True)
-
-        # Load task config from evaluate server
-        instruction = task_id  # fallback
-        task_config = {}
-        try:
-            task_resp = requests.get(
-                f"{evaluate_url}/task/{task_id}", timeout=10
-            )
-            if task_resp.ok:
-                task_config = task_resp.json()
-                instruction = task_config.get(
-                    "instruction", task_config.get("task", task_id)
-                )
-        except Exception as e:
-            print(f"  Warning: could not load task config: {e}")
-
-        def _setup_task_env() -> None:
-            """Run task setup config (download files, open apps, etc.)."""
-            setup_config = task_config.get("config", [])
-            related_apps = task_config.get("related_apps", [])
-            if related_apps:
-                setup_config = [
-                    {"type": "verify_apps", "parameters": {"apps": related_apps}}
-                ] + setup_config
-            if setup_config:
-                resp = requests.post(
-                    f"{evaluate_url}/setup",
-                    json={"config": setup_config},
-                    timeout=120,
-                )
-                resp.raise_for_status()
-                results = resp.json().get("results", [])
-                for r in results:
-                    status = r.get("status", "?")
-                    stype = r.get("type", "?")
-                    print(f"    setup {stype}: {status}")
-            time.sleep(3)
-
-        def _soft_reset_task_env() -> bytes:
-            """Soft reset: close_all + re-run setup + screenshot."""
-            print("  Resetting environment (soft)...")
-            try:
-                resp = requests.post(f"{server}/setup/close_all", timeout=30)
-                print(f"    close_all: {resp.status_code}")
-                time.sleep(2)
-                _setup_task_env()
-            except Exception as e:
-                print(f"  WARNING: environment setup failed: {e}")
-                print(f"  The task app may not be open. Check VNC.")
-            print("  Taking initial screenshot...")
-            return _take_screenshot(server)
-
-        def _hard_reset_task_env() -> bytes:
-            """Hard reset: QEMU system_reset + wait for boot + clear recovery + setup + screenshot."""
-            print("  Restarting Windows (QEMU hard reset)...")
-            mgr = QEMUResetManager(vm_ip=vm_ip, timeout_seconds=300)
-            success, msg = mgr.restart_windows(server_url=server)
-            if not success:
-                print(f"  WARNING: QEMU reset failed: {msg}")
-                print("  Falling back to soft reset...")
-                return _soft_reset_task_env()
-            print(f"    {msg}")
-            _clear_recovery_data(server)
-            print("  Running task setup...")
-            try:
-                _setup_task_env()
-            except Exception as e:
-                print(f"  WARNING: environment setup failed: {e}")
-                print(f"  The task app may not be open. Check VNC.")
-            print("  Taking initial screenshot...")
-            return _take_screenshot(server)
-
-        before_png = _soft_reset_task_env()
-
-        print(f"\n  VNC: {vnc_url}")
-        print(f"  Task: {instruction}\n")
-
-        # Generate AI step-by-step guidance from screenshot
-        print("  Generating suggested steps...")
-        suggested = _generate_steps(before_png, instruction, task_config)
-        print()
-        print("  ┌─ SUGGESTED STEPS ──────────────────────────────")
-        for line in suggested.splitlines():
-            print(f"  │ {line}")
-        print("  └────────────────────────────────────────────────")
-        print()
-        print("  Perform each action in VNC, then press Enter here.")
-        print("  Press 'd' when done, 'r' to redo last step,")
-        print("  'R' to restart task from scratch.\n")
-
-        steps = []
-        step_idx = 0
-        while True:
-            # Save before screenshot
-            (task_dir / f"step_{step_idx:02d}_before.png").write_bytes(
-                before_png
-            )
-
-            action_desc = input(
-                f"  Step {step_idx + 1}: Press Enter after action "
-                "(or 'd'=done, 'r'=redo last, 'R'=restart task): "
-            ).strip()
-
-            if action_desc == "R":
-                # Restart: QEMU hard reset + re-run setup, clear all steps
-                print("  Restarting task from scratch (QEMU hard reset)...")
-                # Clean recorded step files
-                for f in task_dir.glob("step_*.png"):
-                    f.unlink()
-                before_png = _hard_reset_task_env()
-                steps = []
-                step_idx = 0
-                print(f"\n  VNC: {vnc_url}")
-                print(f"  Task: {instruction}\n")
-                print("  Task restarted. Continue recording.\n")
-                continue
-
-            if action_desc.lower() == "d":
-                # Save final screenshot as the last after
-                after_png = _take_screenshot(server)
-                (task_dir / f"step_{step_idx:02d}_after.png").write_bytes(
-                    after_png
-                )
-                steps.append({"action_hint": action_desc or None})
-                step_idx += 1
-                break
-
-            if action_desc.lower() == "r" and step_idx > 0:
-                # Redo: go back one step
-                step_idx -= 1
-                steps.pop()
-                # Re-take the before screenshot from current state
-                before_png = _take_screenshot(server)
-                print(f"  Redoing step {step_idx + 1}...")
-                continue
-
-            # Take after screenshot
-            after_png = _take_screenshot(server)
-            (task_dir / f"step_{step_idx:02d}_after.png").write_bytes(
-                after_png
-            )
-
-            steps.append({"action_hint": action_desc or None})
-            before_png = after_png  # next step's before = this step's after
-            step_idx += 1
-
-        # Save metadata
-        meta = {
-            "task_id": task_id,
-            "instruction": instruction,
-            "num_steps": len(steps),
-            "steps": steps,
-            "server_url": server,
-            "recorded_at": datetime.now(timezone.utc).isoformat(),
-        }
-        (task_dir / "meta.json").write_text(
-            json.dumps(meta, indent=2), encoding="utf-8"
+        result = _record_single_task(
+            task_id=task_id,
+            task_num=task_num,
+            total_tasks=len(task_ids),
+            output_dir=output_dir,
+            server=server,
+            evaluate_url=evaluate_url,
+            vnc_url=vnc_url,
+            vm_ip=vm_ip,
         )
-
-        recorded.append(task_id)
-        print(f"\n  Saved {len(steps)} step(s) to {task_dir}")
+        if result is not None:
+            recorded.append(result)
 
     # Summary
     print_header("Recording Summary")
diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py
index a38dc2a..e938fef 100644
--- a/scripts/run_dc_eval.py
+++ b/scripts/run_dc_eval.py
@@ -211,7 +211,7 @@ def main() -> int:
     parser.add_argument("--output", default="benchmark_results")
     parser.add_argument("--tasks", help="Comma-separated task IDs or prefixes (default: all 12)")
     parser.add_argument("--start-from", type=int, default=0, help="Task index to start from")
-    parser.add_argument("--vm-ip", default="172.173.66.131", help="VM IP")
+    parser.add_argument("--vm-ip", default=None, help="VM IP (auto-detected if omitted)")
     parser.add_argument("--vm-user", default="azureuser", help="VM SSH user")
     parser.add_argument("--zs-only", action="store_true", help="Run zero-shot only (no demo)")
     parser.add_argument("--dc-only", action="store_true", help="Run demo-conditioned only")
@@ -251,12 +251,16 @@ def main() -> int:
                 continue
             conditions.append((tid, f"val_dc_{sid}", demo_path))
 
+    from openadapt_evals.infrastructure.vm_ip import resolve_vm_ip
+
+    vm_ip = resolve_vm_ip(args.vm_ip)
+
     print(f"Eval: {len(conditions)} runs ({len(task_ids)} tasks) with {args.agent}")
-    print(f"VM: {args.vm_ip}")
+    print(f"VM: {vm_ip}")
     print()
 
     # Verify initial WAA health
-    if not ensure_waa_ready(args.server, args.vm_user, args.vm_ip, evaluate_url=args.evaluate_url):
+    if not ensure_waa_ready(args.server, args.vm_user, vm_ip, evaluate_url=args.evaluate_url):
         print("ERROR: Cannot reach WAA server or evaluate server")
         return 1
 
@@ -269,7 +273,7 @@ def main() -> int:
             continue
 
         # Health check before each run
-        if not ensure_waa_ready(args.server, args.vm_user, args.vm_ip, evaluate_url=args.evaluate_url):
+        if not ensure_waa_ready(args.server, args.vm_user, vm_ip, evaluate_url=args.evaluate_url):
             print(f"  Skipping {run_name} - WAA unreachable after recovery")
             results[run_name] = {"status": "SKIP", "returncode": -1, "elapsed_s": 0}
             continue
diff --git a/tests/test_screen_stability.py b/tests/test_screen_stability.py
new file mode 100644
index 0000000..18a4d75
--- /dev/null
+++ b/tests/test_screen_stability.py
@@ -0,0 +1,204 @@
+"""Tests for screen stability detection (compare_screenshots, wait_for_stable_screen)."""
+
+import io
+from unittest.mock import patch
+
+from PIL import Image
+
+from openadapt_evals.infrastructure.screen_stability import (
+    compare_screenshots as _compare_screenshots,
+    wait_for_stable_screen as _wait_for_stable_screen,
+)
+
+
+def _make_png(width: int = 100, height: int = 100, color: tuple = (0, 0, 0)) -> bytes:
+    """Create a solid-color PNG image as bytes."""
+    img = Image.new("RGB", (width, height), color)
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return buf.getvalue()
+
+
+def _make_png_with_diff(
+    width: int = 100,
+    height: int = 100,
+    base_color: tuple = (0, 0, 0),
+    diff_pixels: int = 0,
+    diff_color: tuple = (255, 255, 255),
+) -> bytes:
+    """Create a PNG with some pixels changed from the base color."""
+    img = Image.new("RGB", (width, height), base_color)
+    pixels = img.load()
+    changed = 0
+    for y in range(height):
+        for x in range(width):
+            if changed >= diff_pixels:
+                break
+            pixels[x, y] = diff_color
+            changed += 1
+        if changed >= diff_pixels:
+            break
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return buf.getvalue()
+
+
+class TestCompareScreenshots:
+    """Tests for _compare_screenshots()."""
+
+    def test_identical_images_return_1(self):
+        """Identical images have similarity 1.0."""
+        png = _make_png(color=(128, 128, 128))
+        assert _compare_screenshots(png, png) == 1.0
+
+    def test_completely_different_images_return_0(self):
+        """Completely different images have similarity 0.0."""
+        a = _make_png(color=(0, 0, 0))
+        b = _make_png(color=(255, 255, 255))
+        assert _compare_screenshots(a, b) == 0.0
+
+    def test_different_sizes_return_0(self):
+        """Images of different sizes return 0.0."""
+        a = _make_png(width=100, height=100)
+        b = _make_png(width=200, height=200)
+        assert _compare_screenshots(a, b) == 0.0
+
+    def test_small_diff_high_similarity(self):
+        """A few changed pixels still yield high similarity."""
+        total = 100 * 100  # 10000 pixels
+        diff_pixels = 10  # 0.1% different
+        a = _make_png(color=(0, 0, 0))
+        b = _make_png_with_diff(diff_pixels=diff_pixels, diff_color=(255, 255, 255))
+        similarity = _compare_screenshots(a, b)
+        expected = (total - diff_pixels) / total  # 0.999
+        assert abs(similarity - expected) < 0.001
+
+    def test_half_different(self):
+        """50% different pixels gives ~0.5 similarity."""
+        total = 100 * 100
+        a = _make_png(color=(0, 0, 0))
+        b = _make_png_with_diff(diff_pixels=total // 2, diff_color=(255, 255, 255))
+        similarity = _compare_screenshots(a, b)
+        assert 0.49 < similarity < 0.51
+
+    def test_clock_area_diff_above_threshold(self):
+        """Simulated clock-area change (0.3% of pixels) stays above 99.5%."""
+        total = 1000 * 1000  # 1M pixels
+        clock_pixels = 3000  # ~0.3%
+        a = _make_png(width=1000, height=1000, color=(50, 50, 50))
+        b = _make_png_with_diff(
+            width=1000, height=1000,
+            base_color=(50, 50, 50),
+            diff_pixels=clock_pixels,
+            diff_color=(200, 200, 200),
+        )
+        similarity = _compare_screenshots(a, b)
+        assert similarity >= 0.995, f"Clock-area diff should be above threshold: {similarity}"
+
+
+class TestWaitForStableScreen:
+    """Tests for _wait_for_stable_screen()."""
+
+    def test_immediate_stability(self):
+        """Returns quickly when screen is immediately stable."""
+        stable_png = _make_png(color=(100, 100, 100))
+
+        with patch(
+            "openadapt_evals.infrastructure.screen_stability._take_screenshot",
+            return_value=stable_png,
+        ), patch("time.sleep"):
+            result = _wait_for_stable_screen(
+                "http://fake",
+                poll_interval=0.01,
+                stability_timeout=5,
+                required_stable_checks=2,
+            )
+
+        assert result == stable_png
+
+    def test_stabilizes_after_changes(self):
+        """Returns stable screenshot after initial instability."""
+        changing = _make_png(color=(255, 0, 0))
+        changing2 = _make_png(color=(0, 255, 0))
+        stable = _make_png(color=(0, 0, 255))
+
+        screenshots = [changing, changing2, stable, stable, stable]
+        call_count = [0]
+
+        def mock_screenshot(server):
+            idx = min(call_count[0], len(screenshots) - 1)
+            call_count[0] += 1
+            return screenshots[idx]
+
+        with patch(
+            "openadapt_evals.infrastructure.screen_stability._take_screenshot",
+            side_effect=mock_screenshot,
+        ), patch("time.sleep"):
+            result = _wait_for_stable_screen(
+                "http://fake",
+                poll_interval=0.01,
+                stability_timeout=30,
+                required_stable_checks=2,
+            )
+
+        assert result == stable
+
+    def test_timeout_returns_last_screenshot(self):
+        """Returns last screenshot on timeout with warning."""
+        call_count = [0]
+
+        def mock_screenshot(server):
+            """Return a different image each time to prevent stability."""
+            call_count[0] += 1
+            return _make_png(color=(call_count[0] % 256, 0, 0))
+
+        with patch(
+            "openadapt_evals.infrastructure.screen_stability._take_screenshot",
+            side_effect=mock_screenshot,
+        ), patch("time.sleep"), patch("time.time") as mock_time:
+            # Simulate timeout: first call returns 0, subsequent calls exceed deadline
+            times = iter([0.0, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 100.0])
+            mock_time.side_effect = lambda: next(times)
+
+            result = _wait_for_stable_screen(
+                "http://fake",
+                poll_interval=0.01,
+                stability_timeout=2.0,
+                required_stable_checks=2,
+            )
+
+        assert result is not None
+        assert len(result) > 0
+
+    def test_minor_diff_treated_as_stable(self):
+        """Minor pixel differences (below threshold) count as stable."""
+        base = _make_png(width=100, height=100, color=(50, 50, 50))
+        # 2 pixels different out of 10000 = 0.02% diff → 99.98% similar
+        slight_diff = _make_png_with_diff(
+            width=100, height=100,
+            base_color=(50, 50, 50),
+            diff_pixels=2,
+            diff_color=(51, 51, 51),
+        )
+
+        screenshots = [base, slight_diff, slight_diff, slight_diff]
+        call_count = [0]
+
+        def mock_screenshot(server):
+            idx = min(call_count[0], len(screenshots) - 1)
+            call_count[0] += 1
+            return screenshots[idx]
+
+        with patch(
+            "openadapt_evals.infrastructure.screen_stability._take_screenshot",
+            side_effect=mock_screenshot,
+        ), patch("time.sleep"):
+            result = _wait_for_stable_screen(
+                "http://fake",
+                poll_interval=0.01,
+                stability_timeout=10,
+                similarity_threshold=0.995,
+                required_stable_checks=2,
+            )
+
+        assert result == slight_diff
diff --git a/tests/test_vm_ip.py b/tests/test_vm_ip.py
new file mode 100644
index 0000000..07a1460
--- /dev/null
+++ b/tests/test_vm_ip.py
@@ -0,0 +1,179 @@
+"""Tests for VM IP auto-detection (resolve_vm_ip)."""
+
+import json
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from openadapt_evals.infrastructure.vm_ip import (
+    _ip_from_azure_query,
+    _ip_from_pool_registry,
+    resolve_vm_ip,
+)
+
+
+class TestResolveVmIp:
+    """Tests for resolve_vm_ip() resolution order."""
+
+    def test_explicit_ip_returned_immediately(self):
+        """Explicit IP is returned without checking registry or Azure."""
+        assert resolve_vm_ip(explicit_ip="1.2.3.4") == "1.2.3.4"
+
+    def test_pool_registry_used_when_no_explicit_ip(self):
+        """Pool registry IP is used when no explicit IP is given."""
+        with patch(
+            "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry",
+            return_value="10.0.0.5",
+        ):
+            assert resolve_vm_ip() == "10.0.0.5"
+
+    def test_azure_fallback_when_no_registry(self):
+        """Azure query is used when pool registry doesn't exist."""
+        with patch(
+            "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry",
+            return_value=None,
+        ), patch(
+            "openadapt_evals.infrastructure.vm_ip._ip_from_azure_query",
+            return_value="10.0.0.99",
+        ):
+            assert resolve_vm_ip() == "10.0.0.99"
+
+    def test_error_when_no_vm_found(self):
+        """RuntimeError raised when no VM can be found by any method."""
+        with patch(
+            "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry",
+            return_value=None,
+        ), patch(
+            "openadapt_evals.infrastructure.vm_ip._ip_from_azure_query",
+            return_value=None,
+        ):
+            with pytest.raises(RuntimeError, match="No running VM found"):
+                resolve_vm_ip()
+
+    def test_explicit_ip_skips_all_lookups(self):
+        """When explicit IP is given, registry and Azure are never called."""
+        with patch(
+            "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry",
+        ) as mock_reg, patch(
+            "openadapt_evals.infrastructure.vm_ip._ip_from_azure_query",
+        ) as mock_az:
+            result = resolve_vm_ip(explicit_ip="5.5.5.5")
+            assert result == "5.5.5.5"
+            mock_reg.assert_not_called()
+            mock_az.assert_not_called()
+
+    def test_registry_checked_before_azure(self):
+        """Pool registry is checked before Azure query (fast path first)."""
+        call_order = []
+
+        def mock_registry():
+            call_order.append("registry")
+            return "10.0.0.1"
+
+        def mock_azure():
+            call_order.append("azure")
+            return "10.0.0.2"
+
+        with patch(
+            "openadapt_evals.infrastructure.vm_ip._ip_from_pool_registry",
+            side_effect=mock_registry,
+        ), patch(
+            "openadapt_evals.infrastructure.vm_ip._ip_from_azure_query",
+            side_effect=mock_azure,
+        ):
+            result = resolve_vm_ip()
+            assert result == "10.0.0.1"
+            assert call_order == ["registry"]  # Azure never called
+
+
+class TestIpFromPoolRegistry:
+    """Tests for _ip_from_pool_registry()."""
+
+    def test_reads_first_active_worker(self, tmp_path, monkeypatch):
+        """Returns IP of first non-deleted/non-failed worker."""
+        registry = tmp_path / "benchmark_results" / "vm_pool_registry.json"
+        registry.parent.mkdir(parents=True)
+        registry.write_text(json.dumps({
+            "workers": [
+                {"name": "waa-pool-00", "ip": "10.0.0.1", "status": "ready"},
+                {"name": "waa-pool-01", "ip": "10.0.0.2", "status": "ready"},
+            ]
+        }))
+        monkeypatch.chdir(tmp_path)
+        assert _ip_from_pool_registry() == "10.0.0.1"
+
+    def test_skips_deleted_workers(self, tmp_path, monkeypatch):
+        """Skips workers with 'deleted' status."""
+        registry = tmp_path / "benchmark_results" / "vm_pool_registry.json"
+        registry.parent.mkdir(parents=True)
+        registry.write_text(json.dumps({
+            "workers": [
+                {"name": "waa-pool-00", "ip": "10.0.0.1", "status": "deleted"},
+                {"name": "waa-pool-01", "ip": "10.0.0.2", "status": "ready"},
+            ]
+        }))
+        monkeypatch.chdir(tmp_path)
+        assert _ip_from_pool_registry() == "10.0.0.2"
+
+    def test_skips_failed_workers(self, tmp_path, monkeypatch):
+        """Skips workers with 'failed' status."""
+        registry = tmp_path / "benchmark_results" / "vm_pool_registry.json"
+        registry.parent.mkdir(parents=True)
+        registry.write_text(json.dumps({
+            "workers": [
+                {"name": "waa-pool-00", "ip": "10.0.0.1", "status": "failed"},
+            ]
+        }))
+        monkeypatch.chdir(tmp_path)
+        assert _ip_from_pool_registry() is None
+
+    def test_returns_none_when_no_file(self, tmp_path, monkeypatch):
+        """Returns None when registry file doesn't exist."""
+        monkeypatch.chdir(tmp_path)
+        assert _ip_from_pool_registry() is None
+
+    def test_returns_none_on_malformed_json(self, tmp_path, monkeypatch):
+        """Returns None when registry file contains invalid JSON."""
+        registry = tmp_path / "benchmark_results" / "vm_pool_registry.json"
+        registry.parent.mkdir(parents=True)
+        registry.write_text("not json")
+        monkeypatch.chdir(tmp_path)
+        assert _ip_from_pool_registry() is None
+
+
+class TestIpFromAzureQuery:
+    """Tests for _ip_from_azure_query()."""
+
+    def test_returns_pool_vm_ip(self):
+        """Returns IP from waa-pool-00 query."""
+        mock_mgr = MagicMock()
+        mock_mgr.get_vm_ip.return_value = "10.0.0.50"
+
+        with patch(
+            "openadapt_evals.infrastructure.azure_vm.AzureVMManager",
+            return_value=mock_mgr,
+        ):
+            assert _ip_from_azure_query() == "10.0.0.50"
+            mock_mgr.get_vm_ip.assert_called_once_with("waa-pool-00")
+
+    def test_falls_back_to_legacy_name(self):
+        """Falls back to waa-eval-vm when waa-pool-00 returns None."""
+        mock_mgr = MagicMock()
+        mock_mgr.get_vm_ip.side_effect = [None, "10.0.0.99"]
+
+        with patch(
+            "openadapt_evals.infrastructure.azure_vm.AzureVMManager",
+            return_value=mock_mgr,
+        ):
+            assert _ip_from_azure_query() == "10.0.0.99"
+            calls = mock_mgr.get_vm_ip.call_args_list
+            assert calls[0][0] == ("waa-pool-00",)
+            assert calls[1][0] == ("waa-eval-vm",)
+
+    def test_returns_none_on_azure_error(self):
+        """Returns None when Azure SDK/CLI is unavailable."""
+        with patch(
+            "openadapt_evals.infrastructure.azure_vm.AzureVMManager",
+            side_effect=ImportError("azure not installed"),
+        ):
+            assert _ip_from_azure_query() is None