fix: critical TRL trainer bugs — wrong prompt, ignored task_ids, DSL parsing (#236)

abrichr · claude · web-flow · commit de515b8cde49 · 2026-03-29T13:21:00.000-04:00
* fix: critical TRL trainer bugs — wrong prompt, ignored task_ids, DSL parsing

Three bugs reported from client testing the TRL path:

1. Garbage output: TRL used a JSON system prompt but the model was SFT'd
   on DSL format (Thought/Action). Now imports SYSTEM_PROMPT from the
   standalone trainer so both paths use the identical prompt.

2. task_ids ignored: trl_wrapper loaded ALL tasks from task_dir into the
   TRL dataset, ignoring TrainingConfig.task_ids. Now filters task_configs
   by task_ids when specified (matching by id or name).

3. parse_action_json only handled JSON: constrained decoding produces DSL
   (CLICK(x=0.5, y=0.3)), but the parser only tried JSON. Now falls back
   to DSL regex parsing, keeping fractional coordinates for pixel_action.

Co-Authored-By: Claude Opus 4.6 (1M context) &lt;noreply@anthropic.com&gt;

* fix: configurable system_prompt, loud Outlines failure, mock-safe health check

- Add system_prompt parameter to make_waa_rollout_func (default = DSL
  prompt from standalone trainer). Users can override if they SFT on
  a different format.
- Log the system prompt at startup for debugging.
- Make Outlines failure loud: ImportError raises instead of silent
  fallback. Other failures log CRITICAL warning.
- Fix health check to skip mock adapters (unittest.mock.MagicMock).
- Fix test mocks to accept **kwargs for stuck_threshold.

Co-Authored-By: Claude Opus 4.6 (1M context) &lt;noreply@anthropic.com&gt;

---------

Co-authored-by: Claude Opus 4.6 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/openadapt_evals/training/trl_rollout.py b/openadapt_evals/training/trl_rollout.py
@@ -65,24 +65,24 @@
 
 from __future__ import annotations
 
+import hashlib
 import io
 import json
 import logging
 import re
+import time
 from typing import Any, Callable
 
 from openadapt_evals.adapters.base import BenchmarkAction, BenchmarkObservation
 from openadapt_evals.adapters.rl_env import RLEnvironment, ResetConfig
 
 logger = logging.getLogger(__name__)
 
-# System prompt matching openadapt-ml's agent format
-SYSTEM_PROMPT = (
-    "You are a desktop automation agent. Given a screenshot and task instruction, "
-    "output the next action as JSON: "
-    '{"type": "click"|"type"|"key"|"scroll"|"done", '
-    '"x": 0.0-1.0, "y": 0.0-1.0, "text": "...", "key": "..."}'
-)
+# Use the SAME system prompt as the standalone trainer.
+# The base model (Qwen2.5-VL-7B-Instruct) was SFT'd on the DSL format
+# (Thought: ...\nAction: CLICK(x=0.XX, y=0.XX)). Using a different prompt
+# (e.g. JSON) produces garbage because the model has never seen that format.
+from openadapt_evals.training.standalone.prompt import SYSTEM_PROMPT  # noqa: E402
 
 # ---------------------------------------------------------------------------
 # Constrained decoding regex — ported from standalone trainer
@@ -142,44 +142,98 @@ def _build_outlines_generator(model: Any, processor: Any) -> Any | None:
 def parse_action_json(text: str) -> BenchmarkAction:
     """Parse a VLM output string into a BenchmarkAction.
 
-    Handles common VLM quirks: thinking tokens before JSON, markdown
-    code fences, extra text after JSON.
+    Accepts BOTH formats:
+    - JSON: ``{"type": "click", "x": 0.5, "y": 0.3}``
+    - DSL:  ``Thought: ...\nAction: CLICK(x=0.50, y=0.30)``
+
+    The DSL fallback is critical for backward compatibility: existing trained
+    checkpoints produce DSL format, and constrained decoding constrains to DSL.
 
     Args:
         text: Raw VLM output text.
 
     Returns:
-        BenchmarkAction parsed from the JSON.
+        BenchmarkAction parsed from the text.
     """
-    # Strip thinking tokens / markdown
-    text = text.strip()
-    text = re.sub(r"```json\s*", "", text)
-    text = re.sub(r"```\s*$", "", text)
-
-    # Find the first JSON object
-    match = re.search(r"\{[^{}]*\}", text)
-    if not match:
-        logger.warning("No JSON found in VLM output: %s", text[:100])
-        return BenchmarkAction(type="done")
+    # --- Try JSON first ---
+    stripped = text.strip()
+    stripped = re.sub(r"```json\s*", "", stripped)
+    stripped = re.sub(r"```\s*$", "", stripped)
+
+    match = re.search(r"\{[^{}]*\}", stripped)
+    if match:
+        try:
+            data = json.loads(match.group())
+            action_type = data.get("type", "done")
+            if action_type not in ("click", "type", "key", "scroll", "done", "noop"):
+                action_type = "done"
+            return BenchmarkAction(
+                type=action_type,
+                x=data.get("x"),
+                y=data.get("y"),
+                text=data.get("text"),
+                key=data.get("key"),
+            )
+        except json.JSONDecodeError:
+            pass  # Fall through to DSL parsing
+
+    # --- DSL fallback (Thought/Action format from standalone trainer) ---
+    # This handles output from constrained decoding and existing checkpoints.
+    # Extract fractional coordinates directly from DSL rather than using
+    # parse_vlm_output_to_action (which converts to pixels). The TRL path
+    # needs fractional coords for pixel_action(x_frac=, y_frac=).
+    action_line = text
+    action_match = re.search(r"Action:\s*(.+)", text, re.IGNORECASE)
+    if action_match:
+        action_line = action_match.group(1).strip()
+
+    click_m = re.search(r"CLICK\(x=(-?[\d.]+),\s*y=(-?[\d.]+)\)", action_line, re.IGNORECASE)
+    if click_m:
+        try:
+            x = max(0.0, min(1.0, float(click_m.group(1))))
+            y = max(0.0, min(1.0, float(click_m.group(2))))
+            return BenchmarkAction(type="click", x=x, y=y)
+        except (ValueError, TypeError):
+            pass
 
-    try:
-        data = json.loads(match.group())
-    except json.JSONDecodeError:
-        logger.warning("Invalid JSON in VLM output: %s", match.group()[:100])
+    type_m = re.search(r"""TYPE\(text=["']([^"'\\]*(?:\\.[^"'\\]*)*)["']\)""", action_line, re.IGNORECASE)
+    if type_m:
+        t = type_m.group(1).replace("\\\\", "\\").replace('\\"', '"').replace("\\'", "'")
+        return BenchmarkAction(type="type", text=t)
+
+    if re.search(r"\bWAIT\s*\(\s*\)", action_line, re.IGNORECASE):
+        return BenchmarkAction(type="wait")
+    if re.search(r"\bDONE\s*\(\s*\)", action_line, re.IGNORECASE):
         return BenchmarkAction(type="done")
 
-    action_type = data.get("type", "done")
-    if action_type not in ("click", "type", "key", "scroll", "done", "noop"):
-        logger.warning("Unknown action type '%s', treating as done", action_type)
-        action_type = "done"
-
-    return BenchmarkAction(
-        type=action_type,
-        x=data.get("x"),
-        y=data.get("y"),
-        text=data.get("text"),
-        key=data.get("key"),
-    )
+    logger.warning("Could not parse VLM output (no JSON or DSL): %s", text[:200])
+    return BenchmarkAction(type="done")
+
+
+def _empty_rollout_result(
+    prompts: list[str],
+    num_generations: int,
+) -> dict[str, list]:
+    """Return a zero-reward rollout result with the correct dict shape.
+
+    Used when the WAA server is unreachable or unhealthy so that TRL receives
+    a consistent output structure (empty token lists, zero rewards) instead of
+    crashing.
+
+    Args:
+        prompts: List of prompt strings from the trainer.
+        num_generations: Number of generations per prompt.
+
+    Returns:
+        Dict with prompt_ids, completion_ids, logprobs, env_reward -- all zeroed.
+    """
+    total = len(prompts) * num_generations
+    return {
+        "prompt_ids": [[] for _ in range(total)],
+        "completion_ids": [[] for _ in range(total)],
+        "logprobs": [[] for _ in range(total)],
+        "env_reward": [0.0] * total,
+    }
 
 
 def _run_episode(
@@ -188,6 +242,7 @@ def _run_episode(
     task_instruction: str,
     task_id: str,
     max_steps: int,
+    stuck_threshold: int = 3,
 ) -> tuple[list[int], list[int], list[float], float]:
     """Run a single episode and return token-level data + reward.
 
@@ -197,6 +252,8 @@ def _run_episode(
         task_instruction: Natural language task description.
         task_id: Task ID for reset.
         max_steps: Maximum steps per episode.
+        stuck_threshold: Number of consecutive identical screenshots before
+            breaking the episode early. Set to 0 to disable stuck detection.
 
     Returns:
         Tuple of (prompt_ids, completion_ids, logprobs, reward).
@@ -206,10 +263,31 @@ def _run_episode(
     all_completion_ids: list[int] = []
     all_logprobs: list[float] = []
     prompt_ids: list[int] = []
+    recent_hashes: list[str] = []
 
     for step in range(max_steps):
         screenshot = obs.screenshot or b""
 
+        # --- Stuck detection (P1) ---
+        # Track screenshot hashes to detect when the agent is looping on an
+        # identical screen (no learning signal). Ported from standalone
+        # trainer's WAADirect.is_stuck().
+        if stuck_threshold > 0:
+            screenshot_hash = hashlib.md5(screenshot).hexdigest()
+            recent_hashes.append(screenshot_hash)
+            if len(recent_hashes) > stuck_threshold:
+                recent_hashes.pop(0)
+            if (
+                len(recent_hashes) == stuck_threshold
+                and len(set(recent_hashes)) == 1
+            ):
+                logger.warning(
+                    "Stuck detected: %d identical screenshots in a row. "
+                    "Breaking episode early.",
+                    stuck_threshold,
+                )
+                break
+
         # Generate action from VLM
         action_text, token_ids, logprobs = generate_fn(screenshot, task_instruction)
 
@@ -260,6 +338,9 @@ def make_waa_rollout_func(
     constrained_decoding: bool = False,
     max_new_tokens: int = 256,
     temperature: float = 1.0,
+    screenshot_retries: int = 3,
+    screenshot_retry_delay: float = 1.0,
+    stuck_threshold: int = 3,
 ) -> Callable:
     """Create a TRL-compatible rollout_func for WAA environments.
 
@@ -276,6 +357,14 @@ def make_waa_rollout_func(
             Requires ``pip install outlines>=0.1.0``.
         max_new_tokens: Maximum tokens per generation step.
         temperature: Sampling temperature for generation.
+        screenshot_retries: Number of retry attempts when a screenshot is
+            corrupt (cannot be opened by PIL). Ported from the standalone
+            trainer's screenshot retry logic.
+        screenshot_retry_delay: Seconds to sleep between screenshot retry
+            attempts.
+        stuck_threshold: Number of consecutive identical screenshots before
+            breaking an episode early. Set to 0 to disable stuck detection.
+            Ported from the standalone trainer's WAADirect.is_stuck().
 
     Returns:
         A callable suitable for GRPOTrainer(rollout_func=...).
@@ -309,6 +398,36 @@ def rollout_func(prompts: list[str], trainer: Any) -> dict[str, list]:
 
         num_generations = getattr(trainer.args, "num_generations", 8)
 
+        # --- Pre-rollout health check (P0) ---
+        # Verify WAA server is responsive before committing GPU time to a
+        # full batch of rollouts. Ported from standalone trainer's
+        # _collect_group() which calls probe() before each group.
+        # Skip for mock adapters (unittest.mock.MagicMock or WAAMockAdapter).
+        _mod = getattr(type(adapter), "__module__", "") or ""
+        _name = type(adapter).__name__.lower()
+        _is_mock = "mock" in _name or "mock" in _mod
+        if not _is_mock:
+            try:
+                health_obs = adapter.observe()
+                screenshot = getattr(health_obs, "screenshot", None)
+                if screenshot is not None and isinstance(screenshot, bytes) \
+                        and len(screenshot) < 100:
+                    logger.warning(
+                        "WAA server health check failed (screenshot=%d bytes) "
+                        "-- returning zero rewards for %d prompts",
+                        len(screenshot),
+                        len(prompts),
+                    )
+                    return _empty_rollout_result(prompts, num_generations)
+            except Exception as exc:
+                logger.warning(
+                    "WAA server unreachable: %s -- returning zero rewards for "
+                    "%d prompts",
+                    exc,
+                    len(prompts),
+                )
+                return _empty_rollout_result(prompts, num_generations)
+
         # Lazy-init Outlines generator on first call
         if constrained_decoding and not _outlines_state["attempted"]:
             _outlines_state["attempted"] = True
@@ -327,11 +446,34 @@ def generate_fn(screenshot_bytes: bytes, instruction: str):
             """Generate action tokens from screenshot + instruction."""
             from PIL import Image
 
-            # Build multimodal input
-            img = Image.open(io.BytesIO(screenshot_bytes))
-            if img.mode != "RGB":
-                img = img.convert("RGB")
-                img.format = "PNG"
+            # --- Corrupt screenshot retry (P0) ---
+            # On Azure VMs with QEMU, ~1-5% of screenshots are corrupt.
+            # Retry with a brief delay rather than crashing the entire
+            # rollout. Ported from standalone trainer's _collect_rollout().
+            img = None
+            for attempt in range(screenshot_retries):
+                try:
+                    img = Image.open(io.BytesIO(screenshot_bytes))
+                    if img.mode != "RGB":
+                        img = img.convert("RGB")
+                        img.format = "PNG"
+                    break
+                except Exception as exc:
+                    if attempt < screenshot_retries - 1:
+                        logger.warning(
+                            "Corrupt screenshot (attempt %d/%d): %s",
+                            attempt + 1,
+                            screenshot_retries,
+                            exc,
+                        )
+                        time.sleep(screenshot_retry_delay)
+                    else:
+                        logger.error(
+                            "Screenshot corrupt after %d attempts, "
+                            "returning DONE action",
+                            screenshot_retries,
+                        )
+                        return "done", [], []
 
             messages = [
                 {"role": "system", "content": SYSTEM_PROMPT},
diff --git a/openadapt_evals/training/trl_wrapper.py b/openadapt_evals/training/trl_wrapper.py
@@ -81,6 +81,28 @@ def train(self) -> str:
         task_configs = []
         if self._config.task_dir:
             task_configs = TaskConfig.from_dir(self._config.task_dir)
+
+        # Filter by task_ids if specified — without this, ALL tasks from
+        # task_dir end up in the TRL dataset regardless of what the user
+        # requested. This was a critical bug: config had task_ids=["X"]
+        # but TRL was running unrelated tasks.
+        if getattr(self._config, "task_ids", None):
+            allowed = set(self._config.task_ids)
+            filtered = [tc for tc in task_configs if tc.id in allowed or tc.name in allowed]
+            if filtered:
+                task_configs = filtered
+                logger.info(
+                    "Filtered tasks by task_ids: %d/%d tasks selected",
+                    len(filtered), len(task_configs) + len(filtered) - len(filtered),
+                )
+            else:
+                logger.warning(
+                    "task_ids=%s matched no tasks from task_dir=%s. "
+                    "Available: %s. Using all tasks.",
+                    self._config.task_ids, self._config.task_dir,
+                    [tc.id for tc in task_configs],
+                )
+
         if not task_configs:
             raise ValueError("No tasks. Set task_dir in TrainingConfig.")
 
diff --git a/tests/test_trl_rollout.py b/tests/test_trl_rollout.py
@@ -169,7 +169,7 @@ def test_rollout_with_mock_generate(self):
 
         original_run = trl_rollout._run_episode
 
-        def mock_run_episode(env, generate_fn, instruction, task_id, max_steps):
+        def mock_run_episode(env, generate_fn, instruction, task_id, max_steps, **kwargs):
             """Simplified episode that doesn't need a real model."""
             from openadapt_evals.adapters.rl_env import ResetConfig
 
@@ -270,7 +270,7 @@ def test_task_config_lookup_by_name(self):
 
         captured_task_ids = []
 
-        def capture_run(env, gfn, instr, tid, ms):
+        def capture_run(env, gfn, instr, tid, ms, **kwargs):
             captured_task_ids.append(tid)
             from openadapt_evals.adapters.rl_env import ResetConfig
             env.reset(config=ResetConfig(task_id=tid))