Skip to content

Commit a5f3aea

Browse files
abrichr and claude
authored
fix: add direct pixel path for pixel_action bypassing element routing (#91)
Extract command-sending logic from step() into _send_command() helper. Rewrite pixel_action() to build pyautogui commands directly via _build_pixel_command() and send them through _send_command(), bypassing _translate_action/_translate_click_action entirely. This eliminates unnecessary element-resolution routing for actions that already have absolute pixel coordinates. The step() method continues to use the element-based _translate_action path for agent actions. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a641256 commit a5f3aea

2 files changed

Lines changed: 476 additions & 61 deletions

File tree

openadapt_evals/adapters/waa/live.py

Lines changed: 167 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,74 @@ def reset(self, task: BenchmarkTask) -> BenchmarkObservation:
536536

537537
return self._get_observation()
538538

539+
def _send_command(self, command: str) -> None:
    """Send a pyautogui command to WAA via /execute_windows.

    The command is POSTed as ``{"command": command}`` to
    ``{server_url}/execute_windows``. Every response (200 or not) is
    scanned for a PyAutoGUI fail-safe error. If one is found, recovery is
    delegated to ``self._recover_failsafe()`` and, when that succeeds, the
    command is retried exactly once.

    This method is best-effort: any exception raised while sending,
    decoding or recovering is logged and swallowed so that the caller can
    still proceed to observe the environment.

    Args:
        command: Python command string to execute on the WAA server.
    """
    import requests

    endpoint = f"{self.config.server_url}/execute_windows"

    def post():
        # Single place that knows how a command is shipped to the server,
        # shared by the first attempt and the post-recovery retry.
        return requests.post(
            endpoint,
            json={"command": command},
            timeout=self.config.timeout,
        )

    logger.info("Sending command to WAA: %r", command)
    try:
        resp = post()
        succeeded = resp.status_code == 200

        # Check ALL responses (200 and non-200) for fail-safe errors.
        # WAA returns 500 with a JSON "message" field for fail-safe,
        # and sometimes 200 with stderr containing the exception.
        if succeeded:
            result = resp.json()
            # The fields may be missing or JSON null; coerce to "" so the
            # concatenation below cannot raise and skip fail-safe detection.
            stderr = result.get("stderr") or ""
            stdout = result.get("stdout") or ""
            response_text = f"{stderr}{stdout}"
            if stderr:
                logger.warning("Command stderr: %s", stderr)
        else:
            response_text = resp.text
            logger.error(
                "Execute failed (%s): %s", resp.status_code, resp.text
            )

        if not _is_failsafe_error(response_text):
            if succeeded:
                logger.debug("Executed: %s", command)
            return

        # Detect PyAutoGUI fail-safe and attempt recovery (once per call)
        logger.warning(
            "PyAutoGUI fail-safe detected; attempting recovery..."
        )
        if not self._recover_failsafe():
            logger.error(
                "Fail-safe recovery failed; step will proceed "
                "with degraded state"
            )
            return

        logger.info("Fail-safe cleared; retrying command: %s", command)
        retry_resp = post()
        if retry_resp.status_code != 200:
            logger.error(
                "Retry failed (%s): %s",
                retry_resp.status_code,
                retry_resp.text,
            )
            return

        retry_stderr = retry_resp.json().get("stderr")
        if retry_stderr:
            logger.warning("Retry stderr: %s", retry_stderr)
    except Exception as e:
        # Deliberately broad: a failed command must not abort the episode.
        logger.error("Execute request failed: %s", e)
606+
539607
def step(
540608
self, action: BenchmarkAction
541609
) -> tuple[BenchmarkObservation, bool, dict[str, Any]]:
@@ -555,8 +623,6 @@ def step(
555623
Returns:
556624
Tuple of (observation, done, info).
557625
"""
558-
import requests
559-
560626
self._step_count += 1
561627
self._actions.append(action)
562628

@@ -565,61 +631,7 @@ def step(
565631

566632
# Execute command via /execute_windows (has access to computer object)
567633
if command:
568-
logger.info("Sending command to WAA: %r", command)
569-
try:
570-
resp = requests.post(
571-
f"{self.config.server_url}/execute_windows",
572-
json={"command": command},
573-
timeout=self.config.timeout
574-
)
575-
# Check ALL responses (200 and non-200) for fail-safe errors.
576-
# WAA returns 500 with a JSON "message" field for fail-safe,
577-
# and sometimes 200 with stderr containing the exception.
578-
response_text = resp.text
579-
if resp.status_code == 200:
580-
result = resp.json()
581-
stderr = result.get("stderr", "")
582-
stdout = result.get("stdout", "")
583-
response_text = stderr + stdout
584-
if stderr:
585-
logger.warning(f"Command stderr: {stderr}")
586-
else:
587-
logger.error(f"Execute failed ({resp.status_code}): {resp.text}")
588-
589-
# Detect PyAutoGUI fail-safe and attempt recovery (once per step)
590-
if _is_failsafe_error(response_text):
591-
logger.warning(
592-
"PyAutoGUI fail-safe detected; attempting recovery..."
593-
)
594-
if self._recover_failsafe():
595-
logger.info(
596-
"Fail-safe cleared; retrying command: %s", command
597-
)
598-
retry_resp = requests.post(
599-
f"{self.config.server_url}/execute_windows",
600-
json={"command": command},
601-
timeout=self.config.timeout,
602-
)
603-
if retry_resp.status_code == 200:
604-
retry_result = retry_resp.json()
605-
if retry_result.get("stderr"):
606-
logger.warning(
607-
f"Retry stderr: {retry_result['stderr']}"
608-
)
609-
else:
610-
logger.error(
611-
f"Retry failed ({retry_resp.status_code}): "
612-
f"{retry_resp.text}"
613-
)
614-
else:
615-
logger.error(
616-
"Fail-safe recovery failed; step will proceed "
617-
"with degraded state"
618-
)
619-
elif resp.status_code == 200:
620-
logger.debug(f"Executed: {command}")
621-
except Exception as e:
622-
logger.error(f"Execute request failed: {e}")
634+
self._send_command(command)
623635

624636
# Wait for UI to settle
625637
time.sleep(self.config.action_delay)
@@ -939,8 +951,9 @@ def pixel_action(
939951
"""Execute a pixel-coordinate action and return (obs, done, info).
940952
941953
Convenience method for RL agents that output raw pixel coordinates
942-
instead of element IDs. Supports both absolute pixels and normalized
943-
fractions.
954+
instead of element IDs. Builds pyautogui commands directly and sends
955+
them to WAA via /execute_windows, bypassing the element-based routing
956+
in _translate_action/_translate_click_action entirely.
944957
945958
Coordinates can be specified as:
946959
- Absolute pixels: pixel_action(x=885, y=22)
@@ -959,15 +972,108 @@ def pixel_action(
959972
y_frac: Y as fraction of screen height (0.0-1.0). Overrides y.
960973
961974
Returns:
962-
Tuple of (observation, done, info) same as step().
975+
Tuple of (observation, done, info) -- same as step().
963976
"""
964977
if x_frac is not None or y_frac is not None:
965978
w, h = self.screen_size
966979
x = int((x_frac or 0.0) * w)
967980
y = int((y_frac or 0.0) * h)
968981

982+
# Build the action for bookkeeping (step count, action history)
969983
action = BenchmarkAction(type=action_type, x=x, y=y, text=text, key=key)
970-
return self.step(action)
984+
self._step_count += 1
985+
self._actions.append(action)
986+
987+
# Build pyautogui command directly -- no element resolution needed
988+
command = self._build_pixel_command(
989+
action_type=action_type, x=x, y=y, text=text, key=key,
990+
)
991+
992+
if command:
993+
self._send_command(command)
994+
995+
# Wait for UI to settle
996+
time.sleep(self.config.action_delay)
997+
998+
# Check if done (error actions are also terminal)
999+
done = (
1000+
action_type in ("done", "error")
1001+
or self._step_count >= self.config.max_steps
1002+
)
1003+
1004+
obs = self._get_observation()
1005+
info = {
1006+
"step": self._step_count,
1007+
"command": command,
1008+
"pixel_direct": True,
1009+
}
1010+
1011+
return obs, done, info
1012+
1013+
def _build_pixel_command(
1014+
self,
1015+
action_type: str,
1016+
x: int | float | None = None,
1017+
y: int | float | None = None,
1018+
text: str | None = None,
1019+
key: str | None = None,
1020+
) -> str | None:
1021+
"""Build a pyautogui command string from pixel coordinates.
1022+
1023+
This is the direct-pixel path used by pixel_action(). It generates
1024+
pyautogui commands using absolute pixel coordinates without consulting
1025+
the accessibility tree or element rects.
1026+
1027+
Args:
1028+
action_type: One of "click", "double_click", "right_click", "type",
1029+
"key", "scroll", "done", "error", "wait".
1030+
x: X pixel coordinate (absolute).
1031+
y: Y pixel coordinate (absolute).
1032+
text: Text to type (for action_type="type").
1033+
key: Key name (for action_type="key").
1034+
1035+
Returns:
1036+
Python command string for pyautogui, or None for terminal actions.
1037+
"""
1038+
if action_type in ("done", "error"):
1039+
return None
1040+
1041+
if action_type == "wait":
1042+
return "import time; time.sleep(1)"
1043+
1044+
# Resolve pixel coordinates and clamp to safe margin
1045+
px = int(x) if x is not None else 0
1046+
py = int(y) if y is not None else 0
1047+
px, py = self._clamp_pixel_coords(px, py)
1048+
1049+
if action_type == "click":
1050+
return f"import pyautogui; pyautogui.click({px}, {py})"
1051+
1052+
if action_type == "double_click":
1053+
return f"import pyautogui; pyautogui.doubleClick({px}, {py})"
1054+
1055+
if action_type == "right_click":
1056+
return f"import pyautogui; pyautogui.rightClick({px}, {py})"
1057+
1058+
if action_type == "type":
1059+
type_body = _build_type_commands(text or "")
1060+
return (
1061+
f"import pyautogui; import time; "
1062+
f"pyautogui.click({px}, {py}); "
1063+
f"time.sleep(0.2); "
1064+
f"{type_body}"
1065+
)
1066+
1067+
if action_type == "key":
1068+
# Reuse the key translation logic
1069+
temp_action = BenchmarkAction(type="key", key=key)
1070+
return self._translate_key_action(temp_action)
1071+
1072+
if action_type == "scroll":
1073+
return f"import pyautogui; pyautogui.scroll(-3, x={px}, y={py})"
1074+
1075+
logger.warning(f"Unknown pixel action type: {action_type}")
1076+
return None
9711077

9721078
def _get_observation(self) -> BenchmarkObservation:
9731079
"""Fetch current observation from WAA server.

0 commit comments

Comments
 (0)