feat: add desktop cleanup, manual demo tools, and fix trainer OOM bugs (#195)

abrichr · claude · web-flow · commit 57a80943c2a6 · 2026-03-26T17:06:30.000-04:00
- Add clean_desktop() to WAADirect to kill known distracting apps between
  episodes, preventing stale desktop state from leaking across phases
- Handle close_all config entry type in WAADirect.setup_task()
- Create manual notepad-hello demo (DemoLibrary-compatible, no screenshots)
- Add scripts/create_manual_demo.py CLI for authoring demos from text specs
- Fix vision tensor exclusion in GRPO loss computation (OOM on L40S)
- Add try/except for float parsing in parse_vlm_output_to_action
- Lower max_new_tokens default from 2048 to 512 (prevents OOM, sufficient
  for Thought+Action format)

Co-authored-by: Claude Opus 4.6 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/demos/custom-notepad-hello/manual/demo.json b/demos/custom-notepad-hello/manual/demo.json
@@ -0,0 +1,76 @@
+{
+  "task_id": "custom-notepad-hello",
+  "demo_id": "manual",
+  "description": "Open Notepad via Run dialog and type Hello World",
+  "created_at": "2026-03-26T00:00:00+00:00",
+  "metadata": {
+    "resolution": {
+      "width": 1280,
+      "height": 720
+    },
+    "source": "manual",
+    "note": "Manually authored demo (no screenshots). Descriptions provide step-by-step guidance for planners."
+  },
+  "steps": [
+    {
+      "step_index": 0,
+      "screenshot_path": "",
+      "action_type": "key",
+      "action_description": "KEY(win+r)",
+      "target_description": "Desktop / taskbar",
+      "action_value": "win+r",
+      "x": null,
+      "y": null,
+      "description": "Press Win+R to open the Run dialog",
+      "metadata": {}
+    },
+    {
+      "step_index": 1,
+      "screenshot_path": "",
+      "action_type": "type",
+      "action_description": "TYPE('notepad')",
+      "target_description": "Run dialog Open field",
+      "action_value": "notepad",
+      "x": null,
+      "y": null,
+      "description": "Type 'notepad' in the Run dialog's Open field",
+      "metadata": {}
+    },
+    {
+      "step_index": 2,
+      "screenshot_path": "",
+      "action_type": "key",
+      "action_description": "KEY(enter)",
+      "target_description": "Run dialog",
+      "action_value": "enter",
+      "x": null,
+      "y": null,
+      "description": "Press Enter to launch Notepad",
+      "metadata": {}
+    },
+    {
+      "step_index": 3,
+      "screenshot_path": "",
+      "action_type": "click",
+      "action_description": "CLICK(0.40, 0.40)",
+      "target_description": "Notepad text editing area",
+      "action_value": "",
+      "x": 0.4,
+      "y": 0.4,
+      "description": "Click in the Notepad text editing area to ensure focus",
+      "metadata": {}
+    },
+    {
+      "step_index": 4,
+      "screenshot_path": "",
+      "action_type": "type",
+      "action_description": "TYPE('Hello World')",
+      "target_description": "Notepad text editing area",
+      "action_value": "Hello World",
+      "x": null,
+      "y": null,
+      "description": "Type 'Hello World' in Notepad",
+      "metadata": {}
+    }
+  ]
+}
diff --git a/openadapt_evals/training/standalone/config.py b/openadapt_evals/training/standalone/config.py
@@ -17,7 +17,7 @@ class TrainingConfig:
     num_rollouts_per_step: int = 8
     max_steps_per_episode: int = 15
     temperature: float = 0.7
-    max_new_tokens: int = 2048  # 100 truncates reasoning -- keep high
+    max_new_tokens: int = 512  # 2048 OOMs on L40S; 512 sufficient for Thought+Action
     server_url: str = "http://localhost:5001"
     task_ids: list[str] = field(default_factory=list)
     task_dir: str | None = None
diff --git a/openadapt_evals/training/standalone/prompt.py b/openadapt_evals/training/standalone/prompt.py
@@ -109,9 +109,12 @@ def parse_vlm_output_to_action(
     # CLICK(x=..., y=...)
     m = re.search(r"CLICK\(x=(-?[\d.]+),\s*y=(-?[\d.]+)\)", text, re.IGNORECASE)
     if m:
-        xf = max(0.0, min(1.0, float(m.group(1))))
-        yf = max(0.0, min(1.0, float(m.group(2))))
-        return SimpleAction(type="click", x=int(xf * width), y=int(yf * height))
+        try:
+            xf = max(0.0, min(1.0, float(m.group(1))))
+            yf = max(0.0, min(1.0, float(m.group(2))))
+            return SimpleAction(type="click", x=int(xf * width), y=int(yf * height))
+        except (ValueError, OverflowError):
+            logger.warning("Malformed CLICK coords: x=%s y=%s", m.group(1), m.group(2))
 
     # TYPE(text="...")
     m = re.search(r"""TYPE\(text=["']([^"'\\]*(?:\\.[^"'\\]*)*)["']\)""", text, re.IGNORECASE)
diff --git a/openadapt_evals/training/standalone/trainer.py b/openadapt_evals/training/standalone/trainer.py
@@ -186,7 +186,14 @@ def _compute_rollout_loss(self, rollout: Rollout, advantage: float, scale: float
                 continue
 
             full_ids = torch.cat([prompt_inputs["input_ids"], action_ids.to(prompt_inputs["input_ids"].device)], dim=1)
-            full_inputs = dict(prompt_inputs)
+            # Exclude vision tensors from loss forward pass to avoid OOM.
+            # The vision encoder backward pass is expensive and unnecessary
+            # since we only compute loss on action tokens (past prompt_len).
+            # Proven fix from 7 training runs on L40S GPUs.
+            _VISION_KEYS = {"pixel_values", "pixel_values_videos",
+                            "image_grid_thw", "video_grid_thw"}
+            full_inputs = {k: v for k, v in prompt_inputs.items()
+                           if k not in _VISION_KEYS}
             full_inputs["input_ids"] = full_ids
             full_inputs["attention_mask"] = torch.ones_like(full_ids)
             full_inputs = {k: v.to(device) for k, v in full_inputs.items()}
@@ -288,7 +295,7 @@ def main() -> None:
     p.add_argument("--num-steps", type=int, default=10)
     p.add_argument("--num-rollouts", type=int, default=8)
     p.add_argument("--max-steps-per-episode", type=int, default=15)
-    p.add_argument("--max-new-tokens", type=int, default=2048)
+    p.add_argument("--max-new-tokens", type=int, default=512)
     p.add_argument("--output", default="checkpoints/grpo")
     p.add_argument("--no-4bit", action="store_true")
     p.add_argument("--eval-model", default="gpt-4.1-mini")
diff --git a/openadapt_evals/training/standalone/waa_direct.py b/openadapt_evals/training/standalone/waa_direct.py
@@ -147,6 +147,9 @@ def setup_task(self, task_config: dict[str, Any]) -> bool:
             params = entry.get("parameters", {})
             if etype == "sleep":
                 time.sleep(params.get("seconds", 5))
+            elif etype == "close_all":
+                # Kill common desktop apps for a clean state
+                self.clean_desktop()
             elif etype in ("execute", "command", "launch"):
                 cmd = params.get("command", "")
                 if cmd:
@@ -168,6 +171,50 @@ def setup_task(self, task_config: dict[str, Any]) -> bool:
         time.sleep(2)
         return True
 
+    def clean_desktop(
+        self,
+        kill_apps: list[str] | None = None,
+    ) -> bool:
+        """Kill known distracting apps and show desktop for a clean state.
+
+        Call between episodes (flywheel phases, GRPO rollouts) to prevent
+        stale desktop state from leaking into the next episode.
+
+        Args:
+            kill_apps: Process image names to kill. Defaults to common desktop
+                apps that interfere with task execution.
+
+        Returns:
+            True if cleanup commands executed (does not verify success).
+        """
+        if kill_apps is None:
+            kill_apps = [
+                "notepad.exe", "Code.exe", "msedge.exe", "chrome.exe",
+                "WINWORD.EXE", "EXCEL.EXE", "POWERPNT.EXE", "wordpad.exe",
+                "mspaint.exe", "calc.exe", "explorer.exe",
+            ]
+        # Build taskkill command for all apps in one call
+        kill_cmds = " ".join(
+            f"taskkill /F /IM {app}" for app in kill_apps
+        )
+        commands = [
+            # Kill listed apps (errors are OK -- app may not be running)
+            f"import subprocess; subprocess.run('{kill_cmds}', shell=True, capture_output=True)",
+            # Show desktop (Win+D) to minimize any remaining windows
+            "import pyautogui; import time; pyautogui.hotkey('win', 'd'); time.sleep(1)",
+        ]
+        for cmd in commands:
+            try:
+                self._session.post(
+                    f"{self.server_url}/execute_windows",
+                    json={"command": cmd}, timeout=30,
+                )
+            except requests.RequestException as e:
+                logger.warning("clean_desktop error: %s", e)
+        time.sleep(2)
+        logger.info("Desktop cleanup completed (killed %d app types)", len(kill_apps))
+        return True
+
     def is_stuck(self, recent: list[bytes], window: int = 3) -> bool:
         """True if last N screenshots are identical."""
         if len(recent) < window:
diff --git a/scripts/create_manual_demo.py b/scripts/create_manual_demo.py