Skip to content

Commit ffcb41d

Browse files
abrichr and claude authored
fix(agent): replace manual string escaping with repr() and fix CU agent bugs (#83)
* fix(agent): replace manual string escaping with repr() and fix CU agent bugs

  Five reliability fixes for eval runs:

  1. Replace _escape_for_pyautogui() with repr() in _build_type_commands() - eliminates entire class of string-embedding bugs (newlines, tabs, quotes, unicode) using Python's own escaping mechanism
  2. Fix drag coordinate field names: startCoordinate/endCoordinate (camelCase) → start_coordinate/coordinate (snake_case) per Claude computer_use API
  3. Add _clamp_coord() to prevent (0,0) coordinates from triggering PyAutoGUI fail-safe, applied to click, drag, and mouse_move actions
  4. Re-inject demo text at every step in tool_result messages to prevent context drift in demo-conditioned evaluation
  5. Add command logging in WAALiveAdapter.step() for debugging

  Also adds docs/eval_analysis_2026_03_02.md documenting ZS vs DC eval results and literature review on demo-conditioning approaches.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* feat: add multi-level demo format transform and fix tests

  - Add scripts/transform_demo_format.py: transforms rigid {Observation, Intent, Action, Result} demos into adaptive {Think, Action, Expect} format with PLAN section (Option D from eval analysis)
    - LLM-assisted mode (default): uses vlm_call() for semantic transform
    - Rule-based mode (--no-llm): free, no API calls needed
    - Supports --dry-run for preview
  - Fix tests for repr() escaping and coordinate clamping:
    - Remove TestEscapeForPyautogui (tests deleted function)
    - Update TestBuildTypeCommands for repr() output format
    - Add test_all_special_chars_produce_valid_python invariant test
    - Fix drag test to use snake_case field names
    - Fix coordinate edge test to expect clamped (0.005, 0.005)
  - Regenerate uv.lock for consilium package name resolution

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* docs: add DC-multilevel eval results to analysis

  DC-multilevel (new {Think, Action, Expect} + PLAN format) showed clear improvement over DC-rigid: agent followed the plan, entered all headers and years, typed correct formula, used drag-fill. Still scored 0.0 due to premature task completion (finished 1/3 columns), but qualitatively the best behavior across all three conditions.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4896b65 commit ffcb41d

7 files changed

Lines changed: 2205 additions & 789 deletions

File tree

docs/eval_analysis_2026_03_02.md

Lines changed: 329 additions & 0 deletions
Large diffs are not rendered by default.

openadapt_evals/adapters/waa/live.py

Lines changed: 15 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -215,43 +215,33 @@ def _is_failsafe_error(text: str) -> bool:
215215
return "failsafeexception" in lower or "fail-safe triggered" in lower
216216

217217

218-
def _escape_for_pyautogui(text: str) -> str:
219-
"""Escape text for embedding in a single-quoted Python string literal."""
220-
return (
221-
text
222-
.replace("\\", "\\\\")
223-
.replace("'", "\\'")
224-
.replace("\t", "\\t")
225-
.replace("\r", "")
226-
)
227-
228-
229218
def _build_type_commands(text: str) -> str:
230219
"""Build pyautogui command body to type text, handling embedded newlines.
231220
232-
``pyautogui.write()`` cannot handle literal newline characters — the
233-
generated Python command string becomes an unterminated string literal
234-
when executed via ``exec()``. This function splits the text on newlines
235-
and interleaves ``pyautogui.write()`` with ``pyautogui.press('enter')``.
221+
Uses ``repr()`` for string escaping instead of manual character-by-character
222+
replacement. This eliminates the entire class of escaping bugs (newlines,
223+
tabs, quotes, unicode, null bytes, etc.) because ``repr()`` is Python's own
224+
mechanism for producing valid string literals from any string content —
225+
the same principle as parameterized SQL queries vs string concatenation.
226+
227+
Newlines are handled semantically: split into separate ``write()`` calls
228+
with ``press('enter')`` between them, since the agent intends "press Enter."
236229
237230
Returns:
238231
A pyautogui command body string (without ``import pyautogui;`` prefix).
239232
Callers must prepend the import themselves.
240233
"""
234+
text = text.replace("\r", "")
241235
segments = text.split("\n")
242236
if len(segments) == 1:
243-
escaped = _escape_for_pyautogui(text)
244-
return f"pyautogui.write('{escaped}', interval=0.02)"
237+
return f"pyautogui.write({repr(text)}, interval=0.02)"
245238

246239
commands: list[str] = []
247240
for i, seg in enumerate(segments):
248-
# Skip empty trailing segment from a trailing newline
249-
if seg or i < len(segments) - 1:
250-
escaped = _escape_for_pyautogui(seg)
251-
if escaped:
252-
commands.append(f"pyautogui.write('{escaped}', interval=0.02)")
253-
if i < len(segments) - 1:
254-
commands.append("pyautogui.press('enter')")
241+
if seg:
242+
commands.append(f"pyautogui.write({repr(seg)}, interval=0.02)")
243+
if i < len(segments) - 1:
244+
commands.append("pyautogui.press('enter')")
255245
return "; ".join(commands) if commands else "pass"
256246

257247

@@ -575,6 +565,7 @@ def step(
575565

576566
# Execute command via /execute_windows (has access to computer object)
577567
if command:
568+
logger.info("Sending command to WAA: %r", command)
578569
try:
579570
resp = requests.post(
580571
f"{self.config.server_url}/execute_windows",

openadapt_evals/agents/claude_computer_use_agent.py

Lines changed: 43 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,9 @@ class ClaudeComputerUseAgent(BenchmarkAgent):
7878

7979
DEFAULT_MODEL = "claude-sonnet-4-6"
8080

81+
# Minimum normalized coordinate to avoid PyAutoGUI fail-safe (top-left corner)
82+
_COORD_EPS = 0.005 # ~6px at 1280, ~4px at 720
83+
8184
def __init__(
8285
self,
8386
api_key: str | None = None,
@@ -123,6 +126,16 @@ def __init__(
123126
f"Demo provided ({len(self.demo)} chars) - persists across all steps"
124127
)
125128

129+
def _clamp_coord(self, x_norm: float, y_norm: float) -> tuple[float, float]:
130+
"""Clamp normalized coordinates away from (0,0) to avoid fail-safe."""
131+
if x_norm < self._COORD_EPS and y_norm < self._COORD_EPS:
132+
logger.warning(
133+
f"Clamping near-zero coordinates ({x_norm:.4f}, {y_norm:.4f}) "
134+
f"to ({self._COORD_EPS}, {self._COORD_EPS}) to avoid fail-safe"
135+
)
136+
return (self._COORD_EPS, self._COORD_EPS)
137+
return (x_norm, y_norm)
138+
126139
def reset(self) -> None:
127140
"""Reset agent state between episodes."""
128141
self._messages = []
@@ -178,7 +191,17 @@ def act(
178191
tool_result = self._build_tool_result(
179192
screenshot_b64, self._last_tool_use_id
180193
)
181-
self._messages.append({"role": "user", "content": [tool_result]})
194+
content: list[dict[str, Any]] = [tool_result]
195+
# Re-inject demo at every step so it doesn't drift out of context
196+
if self.demo:
197+
content.append({
198+
"type": "text",
199+
"text": (
200+
f"DEMONSTRATION (follow this pattern, you are at step "
201+
f"{self._step_count}):\n---\n{self.demo}\n---"
202+
),
203+
})
204+
self._messages.append({"role": "user", "content": content})
182205

183206
# Loop: call API, and if Claude requests a screenshot/wait, send the
184207
# screenshot back and call again (up to MAX_INTERNAL_RETRIES times)
@@ -374,8 +397,9 @@ def _map_action(
374397
"triple_click",
375398
):
376399
coord = tool_input.get("coordinate", [0, 0])
377-
x_norm = coord[0] / self.display_width
378-
y_norm = coord[1] / self.display_height
400+
x_norm, y_norm = self._clamp_coord(
401+
coord[0] / self.display_width, coord[1] / self.display_height
402+
)
379403
ba_type = "click"
380404
raw["click_variant"] = action_type
381405
return BenchmarkAction(
@@ -414,25 +438,33 @@ def _map_action(
414438

415439
# Drag action
416440
if action_type == "left_click_drag":
417-
start = tool_input.get("startCoordinate", [0, 0])
418-
end = tool_input.get("endCoordinate", [0, 0])
441+
# Claude's computer_use API uses snake_case field names:
442+
# start_coordinate: [x, y] (drag start)
443+
# coordinate: [x, y] (drag end)
444+
start = tool_input.get("start_coordinate", [0, 0])
445+
end = tool_input.get("coordinate", [0, 0])
446+
sx, sy = self._clamp_coord(
447+
start[0] / self.display_width, start[1] / self.display_height
448+
)
449+
ex, ey = self._clamp_coord(
450+
end[0] / self.display_width, end[1] / self.display_height
451+
)
419452
return BenchmarkAction(
420453
type="drag",
421-
x=start[0] / self.display_width,
422-
y=start[1] / self.display_height,
423-
end_x=end[0] / self.display_width,
424-
end_y=end[1] / self.display_height,
454+
x=sx, y=sy, end_x=ex, end_y=ey,
425455
raw_action=raw,
426456
)
427457

428458
# Mouse move — treat as a click with no effect for BenchmarkAction
429459
if action_type == "mouse_move":
430460
coord = tool_input.get("coordinate", [0, 0])
461+
x_norm, y_norm = self._clamp_coord(
462+
coord[0] / self.display_width, coord[1] / self.display_height
463+
)
431464
raw["is_mouse_move"] = True
432465
return BenchmarkAction(
433466
type="click",
434-
x=coord[0] / self.display_width,
435-
y=coord[1] / self.display_height,
467+
x=x_norm, y=y_norm,
436468
raw_action=raw,
437469
)
438470

0 commit comments

Comments
 (0)