fix: address round-2 review findings across pipeline and live adapter

abrichr · claude · abrichr · commit 17ae0ceff908 · 2026-03-02T19:33:19.000-05:00
Pipeline (run_eval_pipeline.py):
- Add timeout=3600 to eval subprocess to prevent indefinite hangs
- Guard _ensure_waa_ready against empty vm_ip (return False instead of
  attempting a tunnel reconnect)
- Capture demo generation output to prevent thread-interleaved stdout;
  on failure, print the last 3 lines of stderr
- Make eval_tasks a defensive copy instead of alias

Live adapter (live.py):
- Decouple _build_type_commands from callers: return body without
  import prefix, eliminating fragile removeprefix coupling
- Escape tab characters in _escape_for_pyautogui

Tests (test_waa.py):
- Add 17 tests for _escape_for_pyautogui and _build_type_commands
  covering edge cases: empty text, newlines, tabs, quotes, formulas

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/openadapt_evals/adapters/waa/live.py b/openadapt_evals/adapters/waa/live.py
@@ -221,25 +221,27 @@ def _escape_for_pyautogui(text: str) -> str:
         text
         .replace("\\", "\\\\")
         .replace("'", "\\'")
+        .replace("\t", "\\t")
         .replace("\r", "")
     )
 
 
 def _build_type_commands(text: str) -> str:
-    """Build pyautogui commands to type text, handling embedded newlines.
+    """Build pyautogui command body to type text, handling embedded newlines.
 
     ``pyautogui.write()`` cannot handle literal newline characters — the
     generated Python command string becomes an unterminated string literal
     when executed via ``exec()``.  This function splits the text on newlines
     and interleaves ``pyautogui.write()`` with ``pyautogui.press('enter')``.
 
     Returns:
-        A complete Python command string including ``import pyautogui;``.
+        A pyautogui command body string (without ``import pyautogui;`` prefix).
+        Callers must prepend the import themselves.
     """
     segments = text.split("\n")
     if len(segments) == 1:
         escaped = _escape_for_pyautogui(text)
-        return f"import pyautogui; pyautogui.write('{escaped}', interval=0.02)"
+        return f"pyautogui.write('{escaped}', interval=0.02)"
 
     commands: list[str] = []
     for i, seg in enumerate(segments):
@@ -250,8 +252,7 @@ def _build_type_commands(text: str) -> str:
                 commands.append(f"pyautogui.write('{escaped}', interval=0.02)")
             if i < len(segments) - 1:
                 commands.append("pyautogui.press('enter')")
-    body = "; ".join(commands) if commands else "pass"
-    return f"import pyautogui; {body}"
+    return "; ".join(commands) if commands else "pass"
 
 
 @dataclass
@@ -1137,7 +1138,7 @@ def _translate_action(self, action: BenchmarkAction) -> str | None:
 
         if action.type == "type":
             text = action.text or ""
-            type_cmds = _build_type_commands(text)
+            type_body = _build_type_commands(text)
             # If target_node_id is set (from type_element), click element first to focus it
             if action.target_node_id is not None:
                 elem_id = str(action.target_node_id)
@@ -1149,11 +1150,11 @@ def _translate_action(self, action: BenchmarkAction) -> str | None:
                         f"import pyautogui; import time; "
                         f"pyautogui.click({cx}, {cy}); "
                         f"time.sleep(0.2); "
-                        + type_cmds.removeprefix("import pyautogui; ")
+                        f"{type_body}"
                     )
                 else:
                     logger.warning(f"Element ID '{elem_id}' not found for type_element, typing without focus")
-            return type_cmds
+            return f"import pyautogui; {type_body}"
 
         if action.type == "key":
             return self._translate_key_action(action)
diff --git a/scripts/run_eval_pipeline.py b/scripts/run_eval_pipeline.py
@@ -126,12 +126,15 @@ def _generate_demos(
         if model:
             cmd.extend(["--model", model])
 
-        result = subprocess.run(cmd, timeout=600)
+        result = subprocess.run(cmd, timeout=600, capture_output=True, text=True)
         if result.returncode == 0:
             print(f"[demos]   -> done")
             generated.append(task_id)
         else:
             print(f"[demos]   ERROR: exit code {result.returncode}")
+            if result.stderr:
+                for line in result.stderr.strip().splitlines()[-3:]:
+                    print(f"[demos]   ! {line}")
 
     return generated
 
@@ -280,12 +283,16 @@ def _ensure_waa_ready(
 
     Recovery sequence:
     1. Probe -> OK: return True
-    2. Reconnect tunnel -> Probe -> OK: return True
+    2. Reconnect tunnel -> Probe -> OK: return True (empty vm_ip: return False, skipping 2-3)
     3. Wait for probe with timeout
     """
     if _probe(server) and (evaluate_url is None or _probe(evaluate_url)):
         return True
 
+    if not vm_ip:
+        print("  WAA unreachable and no VM IP available for tunnel reconnect")
+        return False
+
     print("  WAA unreachable, reconnecting tunnel...")
     tunnel_manager.stop_all_tunnels()
     time.sleep(1)
@@ -469,7 +476,7 @@ def _run_eval(
         if demo_path:
             cmd.extend(["--demo", str(demo_path.resolve())])
 
-        result = subprocess.run(cmd, capture_output=True, text=True)
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
         elapsed = time.time() - task_start
 
         # Log captured output to a file and print summary
@@ -645,7 +652,7 @@ def main() -> int:
     missing_demos = [t for t in recorded_tasks if t not in existing_demos]
 
     # Tasks eligible for eval = those with demos (or will have demos after generation)
-    eval_tasks = recorded_tasks  # all recorded tasks (demos will be generated if missing)
+    eval_tasks = list(recorded_tasks)  # copy; demos will be generated for missing ones
 
     print(f"Pipeline Configuration")
     print(f"  Recordings: {recordings_dir} ({len(recorded_tasks)} task(s))")
diff --git a/tests/test_waa.py b/tests/test_waa.py
@@ -458,3 +458,137 @@ def test_no_false_positive_on_generic_fail_safe_text(self):
         from openadapt_evals.adapters.waa.live import _is_failsafe_error
 
         assert not _is_failsafe_error("The application has a fail-safe mechanism")
+
+
+# ---------------------------------------------------------------------------
+# _escape_for_pyautogui
+# ---------------------------------------------------------------------------
+
+
+class TestEscapeForPyautogui:
+    """Tests for _escape_for_pyautogui text escaping."""
+
+    def test_plain_text(self):
+        from openadapt_evals.adapters.waa.live import _escape_for_pyautogui
+
+        assert _escape_for_pyautogui("hello world") == "hello world"
+
+    def test_backslash_escaped(self):
+        from openadapt_evals.adapters.waa.live import _escape_for_pyautogui
+
+        assert _escape_for_pyautogui("path\\to\\file") == "path\\\\to\\\\file"
+
+    def test_single_quote_escaped(self):
+        from openadapt_evals.adapters.waa.live import _escape_for_pyautogui
+
+        assert _escape_for_pyautogui("it's") == "it\\'s"
+
+    def test_tab_escaped(self):
+        from openadapt_evals.adapters.waa.live import _escape_for_pyautogui
+
+        assert _escape_for_pyautogui("col1\tcol2") == "col1\\tcol2"
+
+    def test_carriage_return_stripped(self):
+        from openadapt_evals.adapters.waa.live import _escape_for_pyautogui
+
+        assert _escape_for_pyautogui("line\r") == "line"
+
+    def test_empty_string(self):
+        from openadapt_evals.adapters.waa.live import _escape_for_pyautogui
+
+        assert _escape_for_pyautogui("") == ""
+
+    def test_combined_special_chars(self):
+        from openadapt_evals.adapters.waa.live import _escape_for_pyautogui
+
+        # \r is stripped, so "with\rtabs" → "withtabs"
+        result = _escape_for_pyautogui("it's a\\path\twith\rtabs")
+        assert result == "it\\'s a\\\\path\\twithtabs"
+
+
+# ---------------------------------------------------------------------------
+# _build_type_commands
+# ---------------------------------------------------------------------------
+
+
+class TestBuildTypeCommands:
+    """Tests for _build_type_commands which handles newlines in type actions."""
+
+    def test_simple_text(self):
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        result = _build_type_commands("hello")
+        assert result == "pyautogui.write('hello', interval=0.02)"
+
+    def test_text_with_newline(self):
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        result = _build_type_commands("abc\ndef")
+        assert "pyautogui.write('abc', interval=0.02)" in result
+        assert "pyautogui.press('enter')" in result
+        assert "pyautogui.write('def', interval=0.02)" in result
+
+    def test_trailing_newline(self):
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        result = _build_type_commands("abc\n")
+        assert "pyautogui.write('abc', interval=0.02)" in result
+        assert "pyautogui.press('enter')" in result
+        # Should NOT have a write('') after the enter
+        assert result.endswith("pyautogui.press('enter')")
+
+    def test_leading_newline(self):
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        result = _build_type_commands("\nabc")
+        assert result.startswith("pyautogui.press('enter')")
+        assert "pyautogui.write('abc', interval=0.02)" in result
+
+    def test_just_newline(self):
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        result = _build_type_commands("\n")
+        assert result == "pyautogui.press('enter')"
+
+    def test_empty_string(self):
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        result = _build_type_commands("")
+        assert "pyautogui.write('', interval=0.02)" in result
+
+    def test_multiple_newlines(self):
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        result = _build_type_commands("a\nb\nc")
+        parts = result.split("; ")
+        assert parts[0] == "pyautogui.write('a', interval=0.02)"
+        assert parts[1] == "pyautogui.press('enter')"
+        assert parts[2] == "pyautogui.write('b', interval=0.02)"
+        assert parts[3] == "pyautogui.press('enter')"
+        assert parts[4] == "pyautogui.write('c', interval=0.02)"
+
+    def test_special_chars_escaped(self):
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        result = _build_type_commands("it's a\\path")
+        assert "it\\'s a\\\\path" in result
+
+    def test_no_import_prefix(self):
+        """_build_type_commands should return body only, no import statement."""
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        result = _build_type_commands("hello")
+        assert not result.startswith("import ")
+
+    def test_formula_with_newlines(self):
+        """Reproduces the original LibreOffice Calc bug."""
+        from openadapt_evals.adapters.waa.live import _build_type_commands
+
+        text = "=(Sheet1.C4-Sheet1.C3)/Sheet1.C3\n=(Sheet1.C5-Sheet1.C4)/Sheet1.C4"
+        result = _build_type_commands(text)
+        # Should have two write commands separated by enter
+        assert "pyautogui.write('=(Sheet1.C4-Sheet1.C3)/Sheet1.C3', interval=0.02)" in result
+        assert "pyautogui.press('enter')" in result
+        assert "pyautogui.write('=(Sheet1.C5-Sheet1.C4)/Sheet1.C4', interval=0.02)" in result
+        # No literal newlines in the result
+        assert "\n" not in result