fix: align standalone GRPO with WAA API format and add retry logic (#193)

abrichr · claude · web-flow · commit 43cac1ca9708 · 2026-03-24T00:07:02.000-04:00
The standalone GRPO trainer produced zero rewards due to two API
format bugs in WAADirect:

1. screenshot() tried resp.json() expecting base64-encoded JSON, but
   WAA's /screenshot returns raw PNG bytes via Flask's send_file().
   Fixed to use resp.content (matching WAALiveAdapter).

2. execute_action() wrapped commands in `python -c "..."`, but WAA's
   /execute_windows uses exec() directly -- the wrapper caused
   SyntaxError inside the VM. Fixed to send bare Python statements
   (matching WAALiveAdapter._build_pixel_command).

Additional improvements:
- Add probe() method for structured health checking
- Add screenshot retry logic (3 attempts with 2s delay)
- Add double_click, right_click, scroll action types
- Fix type action to click target first then type (match WAALiveAdapter)
- Add pre-rollout health check in trainer._collect_group()
- Handle empty rollouts gracefully in training loop
- Fix train script to bypass openadapt_evals/__init__.py eager imports
  (open_clip -> numpy ABI crash in minimal training environments)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/openadapt_evals/training/standalone/trainer.py b/openadapt_evals/training/standalone/trainer.py
@@ -75,10 +75,13 @@ def _collect_rollout(self, task_id: str, instruction: str) -> Rollout:
         recent: list[bytes] = []
 
         for step_idx in range(self._config.max_steps_per_episode):
+            # screenshot() already has built-in retry (3 attempts by default)
             try:
                 screenshot = self._env.screenshot()
             except Exception as e:
-                logger.warning("Screenshot failed at step %d: %s", step_idx, e)
+                logger.warning(
+                    "Screenshot failed at step %d after retries: %s", step_idx, e,
+                )
                 break
             recent.append(screenshot)
             if self._env.is_stuck(recent, window=self._config.stuck_window):
@@ -124,6 +127,19 @@ def _collect_rollout(self, task_id: str, instruction: str) -> Rollout:
 
     def _collect_group(self, task_id: str) -> list[Rollout]:
         """Collect N rollouts for one GRPO gradient step."""
+        assert self._env is not None
+
+        # Pre-rollout health check: verify WAA is responsive before committing
+        # to a full group of rollouts (avoids wasting time on a dead server).
+        probe = self._env.probe()
+        if not probe.get("screenshot_ok"):
+            logger.error(
+                "Pre-rollout health check FAILED for task %s: %s — "
+                "skipping group (returning empty rollouts)",
+                task_id, probe,
+            )
+            return []
+
         tc = self._task_configs.get(task_id)
         instruction = getattr(tc, "name", "") or task_id if tc else task_id
         if tc and self._env:
@@ -242,6 +258,12 @@ def train(self) -> str:
             self._model.eval()
             rollouts = self._collect_group(task_id)
             self._model.train()
+            if not rollouts:
+                logger.warning(
+                    "Step %d/%d: no rollouts collected (server may be down), skipping.",
+                    step + 1, self._config.num_training_steps,
+                )
+                continue
             m = self._training_step(rollouts)
             m.update({"step": step, "task_id": task_id, "elapsed": time.time() - t0, "step_time": time.time() - ts})
             logger.info("Step %d/%d: reward=%.2f loss=%.4f time=%.1fs",
diff --git a/openadapt_evals/training/standalone/waa_direct.py b/openadapt_evals/training/standalone/waa_direct.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import base64
 import hashlib
 import logging
 import time
@@ -15,6 +14,10 @@
 
 logger = logging.getLogger(__name__)
 
+# Default retry parameters for screenshot
+SCREENSHOT_MAX_RETRIES = 3
+SCREENSHOT_RETRY_DELAY = 2.0  # seconds
+
 
 @dataclass
 class RolloutStep:
@@ -37,42 +40,97 @@ class Rollout:
 
 
 class WAADirect:
-    """Direct HTTP client for WAA Flask server. Screenshot/click/type/key."""
+    """Direct HTTP client for WAA Flask server. Screenshot/click/type/key.
+
+    WAA API contract (from WAA Flask server main.py):
+      GET  /screenshot       -> raw PNG bytes (Content-Type: image/png)
+      POST /execute_windows  -> exec(command, {'computer': computer, 'human': human})
+           Payload: {"command": "<python code>"}
+           The command is Python code executed via exec() with pyautogui available.
+           Do NOT wrap in ``python -c "..."`` -- send bare Python statements.
+    """
 
     def __init__(self, server_url: str = "http://localhost:5001",
                  screen_size: tuple[int, int] = (1920, 1080)) -> None:
         self.server_url = server_url.rstrip("/")
         self.screen_size = screen_size
         self._session = requests.Session()
 
-    def screenshot(self) -> bytes:
-        """Take a fresh screenshot. Returns PNG bytes."""
-        resp = self._session.get(f"{self.server_url}/screenshot", timeout=30)
-        if resp.status_code != 200:
-            raise RuntimeError(f"Screenshot failed: {resp.status_code}")
-        data = resp.json()
-        img_b64 = data.get("screenshot", data.get("image", ""))
-        if not img_b64:
-            raise RuntimeError("No screenshot data in response")
-        return base64.b64decode(img_b64)
+    def screenshot(self, max_retries: int = SCREENSHOT_MAX_RETRIES,
+                   retry_delay: float = SCREENSHOT_RETRY_DELAY) -> bytes:
+        """Take a fresh screenshot. Returns raw PNG bytes.
+
+        WAA's /screenshot endpoint returns raw PNG via Flask's send_file(),
+        NOT base64-encoded JSON. Read resp.content, not resp.json().
+        """
+        # Remember the most recent failure so the final error can chain it.
+        last_exc: Exception | None = None
+        # Attempts are numbered from 1 so the warning reads "attempt 1/3".
+        for attempt in range(1, max_retries + 1):
+            try:
+                resp = self._session.get(
+                    f"{self.server_url}/screenshot", timeout=30,
+                )
+                # A non-200 status is raised here so it takes the same retry
+                # path as any other failure (handled by the except below).
+                if resp.status_code != 200:
+                    raise RuntimeError(
+                        f"Screenshot HTTP {resp.status_code}: {resp.text[:200]}"
+                    )
+                # Raw response body; no JSON or base64 decoding is applied.
+                png_bytes = resp.content
+                # Size heuristic: a body under 100 bytes is rejected as not
+                # being a usable screenshot. The content is not otherwise
+                # validated as PNG.
+                if len(png_bytes) < 100:
+                    raise RuntimeError(
+                        f"Screenshot too small ({len(png_bytes)} bytes) -- "
+                        "server may not be ready"
+                    )
+                return png_bytes
+            except Exception as e:
+                # Broad catch: HTTP errors, short bodies and request-level
+                # failures are all logged and retried the same way.
+                last_exc = e
+                logger.warning(
+                    "Screenshot attempt %d/%d failed: %s",
+                    attempt, max_retries, e,
+                )
+                # Sleep only between attempts, never after the final one.
+                if attempt < max_retries:
+                    time.sleep(retry_delay)
+        # Reached only when no attempt returned. If max_retries < 1 the loop
+        # body never ran, so no request was made and last_exc is still None.
+        raise RuntimeError(
+            f"Screenshot failed after {max_retries} attempts"
+        ) from last_exc
 
     def execute_action(self, action: SimpleAction) -> dict[str, Any]:
-        """Execute action on VM via /execute_windows."""
+        """Execute action on VM via /execute_windows.
+
+        WAA's /execute_windows does ``exec(command, {'computer': ..., 'human': ...})``.
+        The command must be bare Python code -- NOT wrapped in ``python -c "..."``.
+        pyautogui is available via import inside the exec'd code.
+        """
         if action.type == "click":
             x, y = int(action.x or 0), int(action.y or 0)
-            cmd = f'python -c "import pyautogui; pyautogui.click({x}, {y})"'
+            cmd = f"import pyautogui; pyautogui.click({x}, {y})"
+        elif action.type == "double_click":
+            x, y = int(action.x or 0), int(action.y or 0)
+            cmd = f"import pyautogui; pyautogui.doubleClick({x}, {y})"
+        elif action.type == "right_click":
+            x, y = int(action.x or 0), int(action.y or 0)
+            cmd = f"import pyautogui; pyautogui.rightClick({x}, {y})"
         elif action.type == "type":
-            text = (action.text or "").replace('"', '\\"')
-            cmd = f'python -c "import pyautogui; pyautogui.typewrite(\'{text}\', interval=0.05)"'
+            text = (action.text or "").replace("\\", "\\\\").replace("'", "\\'")
+            x, y = int(action.x or 0), int(action.y or 0)
+            # Click target first, then type (matches WAALiveAdapter pattern)
+            cmd = (
+                f"import pyautogui; import time; "
+                f"pyautogui.click({x}, {y}); "
+                f"time.sleep(0.2); "
+                f"pyautogui.typewrite('{text}', interval=0.05)"
+            )
         elif action.type == "key":
-            cmd = f'python -c "import pyautogui; pyautogui.press(\'{action.key or "enter"}\')"'
+            key = action.key or "enter"
+            cmd = f"import pyautogui; pyautogui.press('{key}')"
+        elif action.type == "scroll":
+            x, y = int(action.x or 0), int(action.y or 0)
+            cmd = f"import pyautogui; pyautogui.scroll(-3, x={x}, y={y})"
         elif action.type == "wait":
             time.sleep(2)
             return {"status": "ok", "action": "wait"}
         elif action.type == "done":
             return {"status": "ok", "action": "done"}
         else:
-            return {"status": "error", "message": f"Unknown: {action.type}"}
+            return {"status": "error", "message": f"Unknown action type: {action.type}"}
 
         resp = self._session.post(
             f"{self.server_url}/execute_windows", json={"command": cmd}, timeout=30,
@@ -117,9 +175,25 @@ def is_stuck(self, recent: list[bytes], window: int = 3) -> bool:
         hashes = [hashlib.md5(s).hexdigest() for s in recent[-window:]]
         return len(set(hashes)) == 1
 
-    def health_check(self) -> bool:
-        """True if WAA server responds."""
+    def probe(self, timeout: float = 10.0) -> dict[str, Any]:
+        """Health-check the WAA server. Returns status dict.
+
+        Attempts a screenshot to verify the full pipeline (not just HTTP).
+        """
+        # Pessimistic defaults: both flags stay False unless proven otherwise.
+        # Optional keys added below: "status_code", "screenshot_bytes", "error".
+        result: dict[str, Any] = {"reachable": False, "screenshot_ok": False}
         try:
-            return self._session.get(f"{self.server_url}/screenshot", timeout=10).status_code == 200
-        except requests.RequestException:
-            return False
+            # Single request, no retries (unlike screenshot()).
+            resp = self._session.get(
+                f"{self.server_url}/screenshot", timeout=timeout,
+            )
+            # Any HTTP response, even a non-200 one, counts as "reachable".
+            result["reachable"] = True
+            result["status_code"] = resp.status_code
+            if resp.status_code == 200:
+                # NOTE(review): this requires MORE than 100 bytes, while
+                # screenshot() accepts exactly 100 (it rejects < 100) --
+                # confirm whether the two thresholds should match.
+                result["screenshot_ok"] = len(resp.content) > 100
+                result["screenshot_bytes"] = len(resp.content)
+        except requests.RequestException as e:
+            # Only requests-level failures are captured; anything else
+            # propagates to the caller.
+            result["error"] = str(e)
+        return result
+
+    def health_check(self) -> bool:
+        """True if WAA server responds with a valid screenshot."""
+        # Boolean wrapper over probe() using its default timeout; "valid"
+        # here means whatever probe() reports as "screenshot_ok".
+        return self.probe().get("screenshot_ok", False)
diff --git a/scripts/train_grpo_standalone.py b/scripts/train_grpo_standalone.py
@@ -11,9 +11,48 @@
 
 Or equivalently via module:
     python -m openadapt_evals.training.standalone.trainer --task-dir ...
+
+NOTE: We must avoid triggering openadapt_evals/__init__.py, which eagerly
+imports agents/adapters/demo_library/benchmarks. The demo_library import
+pulls in open_clip at module level, which can crash in minimal training
+environments (e.g., numpy ABI mismatch). We work around this by inserting
+a lightweight shim into sys.modules for the top-level package before any
+sub-imports run.
 """
 
-from openadapt_evals.training.standalone.trainer import main
+import importlib
+import sys
+import types
+from pathlib import Path
+
+
+def _ensure_lightweight_package(pkg_name: str, pkg_dir: Path) -> None:
+    """Register a package in sys.modules without executing its __init__.py.
+
+    This lets us ``import openadapt_evals.training.standalone.trainer``
+    without the top-level ``openadapt_evals/__init__.py`` running its
+    heavy re-exports (agents, adapters, demo_library, benchmarks).
+    """
+    # If the package is already registered (real import or an earlier shim),
+    # leave it untouched.
+    if pkg_name in sys.modules:
+        return
+    # Empty stand-in module: nothing from the real __init__.py is defined on it.
+    pkg = types.ModuleType(pkg_name)
+    # Setting __path__ marks the module as a package, so submodule imports
+    # are searched for under pkg_dir. pkg_dir is not checked for existence.
+    pkg.__path__ = [str(pkg_dir)]
+    pkg.__package__ = pkg_name
+    # Must be registered before any sub-import runs so the import system
+    # finds the shim instead of loading the real package.
+    sys.modules[pkg_name] = pkg
+
+
+def main() -> None:
+    # Entry point: install the package shim, then run the standalone trainer.
+    # Two levels up from this script file; presumably the repository root
+    # that holds both scripts/ and openadapt_evals/ -- verify if the layout
+    # changes.
+    root = Path(__file__).resolve().parent.parent
+    pkg_root = root / "openadapt_evals"
+
+    # Shim only the top-level package; sub-packages have lightweight __init__.py
+    _ensure_lightweight_package("openadapt_evals", pkg_root)
+
+    # Now the standalone trainer can be imported without pulling in the
+    # full agents/adapters/benchmarks dependency tree.
+    # Imported by name at call time (not at module top) so the shim above
+    # is already in sys.modules when the import machinery runs.
+    mod = importlib.import_module("openadapt_evals.training.standalone.trainer")
+    mod.main()
+
 
 if __name__ == "__main__":
     main()