@@ -536,6 +536,74 @@ def reset(self, task: BenchmarkTask) -> BenchmarkObservation:
536536
537537 return self ._get_observation ()
538538
def _send_command(self, command: str) -> None:
    """Send a pyautogui command to WAA via /execute_windows.

    Handles fail-safe detection and recovery. If the response indicates
    that PyAutoGUI's fail-safe was triggered (as judged by
    ``_is_failsafe_error``), ``self._recover_failsafe()`` is invoked and,
    when it reports success, the command is re-sent exactly once. The
    retry response is only logged; it is not inspected for a second
    fail-safe.

    This is a best-effort call: exceptions raised while sending, parsing
    or recovering are logged rather than propagated, and nothing is
    returned to the caller.

    Args:
        command: Python command string to execute on the WAA server.
    """
    import requests

    endpoint = f"{self.config.server_url}/execute_windows"

    def post_once() -> "requests.Response":
        """POST the command to the execute endpoint a single time."""
        return requests.post(
            endpoint,
            json={"command": command},
            timeout=self.config.timeout,
        )

    def read_output(response: "requests.Response") -> tuple[object, str]:
        """Return ``(stderr, detection_text)`` for a 200 response.

        Falls back to the raw body when the payload is not a JSON object,
        so fail-safe detection still runs instead of being skipped by a
        parsing error. Null/missing fields are treated as empty.
        """
        try:
            payload = response.json()
        except ValueError:
            # requests raises a ValueError subclass for non-JSON bodies.
            logger.warning("Execute response was not valid JSON")
            return "", response.text
        if not isinstance(payload, dict):
            return "", response.text
        stderr = payload.get("stderr") or ""
        stdout = payload.get("stdout") or ""
        return stderr, f"{stderr}{stdout}"

    logger.info("Sending command to WAA: %r", command)
    try:
        resp = post_once()

        # Check ALL responses (200 and non-200) for fail-safe errors.
        # WAA returns 500 with a JSON "message" field for fail-safe,
        # and sometimes 200 with stderr containing the exception.
        succeeded = resp.status_code == 200
        if succeeded:
            stderr, response_text = read_output(resp)
            if stderr:
                logger.warning("Command stderr: %s", stderr)
        else:
            response_text = resp.text
            logger.error(
                "Execute failed (%s): %s", resp.status_code, resp.text
            )

        if not _is_failsafe_error(response_text):
            if succeeded:
                logger.debug("Executed: %s", command)
            return

        # Detect PyAutoGUI fail-safe and attempt recovery (once per step)
        logger.warning("PyAutoGUI fail-safe detected; attempting recovery...")
        if not self._recover_failsafe():
            logger.error(
                "Fail-safe recovery failed; step will proceed "
                "with degraded state"
            )
            return

        logger.info("Fail-safe cleared; retrying command: %s", command)
        retry_resp = post_once()
        if retry_resp.status_code == 200:
            retry_stderr, _ = read_output(retry_resp)
            if retry_stderr:
                logger.warning("Retry stderr: %s", retry_stderr)
        else:
            logger.error(
                "Retry failed (%s): %s",
                retry_resp.status_code,
                retry_resp.text,
            )
    except Exception as e:
        # Deliberate best-effort boundary: a failed command must not abort
        # the caller's step, so the error is logged and swallowed here.
        logger.error("Execute request failed: %s", e)
606+
539607 def step (
540608 self , action : BenchmarkAction
541609 ) -> tuple [BenchmarkObservation , bool , dict [str , Any ]]:
@@ -555,8 +623,6 @@ def step(
555623 Returns:
556624 Tuple of (observation, done, info).
557625 """
558- import requests
559-
560626 self ._step_count += 1
561627 self ._actions .append (action )
562628
@@ -565,61 +631,7 @@ def step(
565631
566632 # Execute command via /execute_windows (has access to computer object)
567633 if command :
568- logger .info ("Sending command to WAA: %r" , command )
569- try :
570- resp = requests .post (
571- f"{ self .config .server_url } /execute_windows" ,
572- json = {"command" : command },
573- timeout = self .config .timeout
574- )
575- # Check ALL responses (200 and non-200) for fail-safe errors.
576- # WAA returns 500 with a JSON "message" field for fail-safe,
577- # and sometimes 200 with stderr containing the exception.
578- response_text = resp .text
579- if resp .status_code == 200 :
580- result = resp .json ()
581- stderr = result .get ("stderr" , "" )
582- stdout = result .get ("stdout" , "" )
583- response_text = stderr + stdout
584- if stderr :
585- logger .warning (f"Command stderr: { stderr } " )
586- else :
587- logger .error (f"Execute failed ({ resp .status_code } ): { resp .text } " )
588-
589- # Detect PyAutoGUI fail-safe and attempt recovery (once per step)
590- if _is_failsafe_error (response_text ):
591- logger .warning (
592- "PyAutoGUI fail-safe detected; attempting recovery..."
593- )
594- if self ._recover_failsafe ():
595- logger .info (
596- "Fail-safe cleared; retrying command: %s" , command
597- )
598- retry_resp = requests .post (
599- f"{ self .config .server_url } /execute_windows" ,
600- json = {"command" : command },
601- timeout = self .config .timeout ,
602- )
603- if retry_resp .status_code == 200 :
604- retry_result = retry_resp .json ()
605- if retry_result .get ("stderr" ):
606- logger .warning (
607- f"Retry stderr: { retry_result ['stderr' ]} "
608- )
609- else :
610- logger .error (
611- f"Retry failed ({ retry_resp .status_code } ): "
612- f"{ retry_resp .text } "
613- )
614- else :
615- logger .error (
616- "Fail-safe recovery failed; step will proceed "
617- "with degraded state"
618- )
619- elif resp .status_code == 200 :
620- logger .debug (f"Executed: { command } " )
621- except Exception as e :
622- logger .error (f"Execute request failed: { e } " )
634+ self ._send_command (command )
623635
624636 # Wait for UI to settle
625637 time .sleep (self .config .action_delay )
@@ -939,8 +951,9 @@ def pixel_action(
939951 """Execute a pixel-coordinate action and return (obs, done, info).
940952
941953 Convenience method for RL agents that output raw pixel coordinates
942- instead of element IDs. Supports both absolute pixels and normalized
943- fractions.
954+ instead of element IDs. Builds pyautogui commands directly and sends
955+ them to WAA via /execute_windows, bypassing the element-based routing
956+ in _translate_action/_translate_click_action entirely.
944957
945958 Coordinates can be specified as:
946959 - Absolute pixels: pixel_action(x=885, y=22)
@@ -959,15 +972,108 @@ def pixel_action(
959972 y_frac: Y as fraction of screen height (0.0-1.0). Overrides y.
960973
961974 Returns:
962- Tuple of (observation, done, info) — same as step().
975+ Tuple of (observation, done, info) -- same as step().
963976 """
964977 if x_frac is not None or y_frac is not None :
965978 w , h = self .screen_size
966979 x = int ((x_frac or 0.0 ) * w )
967980 y = int ((y_frac or 0.0 ) * h )
968981
982+ # Build the action for bookkeeping (step count, action history)
969983 action = BenchmarkAction (type = action_type , x = x , y = y , text = text , key = key )
970- return self .step (action )
984+ self ._step_count += 1
985+ self ._actions .append (action )
986+
987+ # Build pyautogui command directly -- no element resolution needed
988+ command = self ._build_pixel_command (
989+ action_type = action_type , x = x , y = y , text = text , key = key ,
990+ )
991+
992+ if command :
993+ self ._send_command (command )
994+
995+ # Wait for UI to settle
996+ time .sleep (self .config .action_delay )
997+
998+ # Check if done (error actions are also terminal)
999+ done = (
1000+ action_type in ("done" , "error" )
1001+ or self ._step_count >= self .config .max_steps
1002+ )
1003+
1004+ obs = self ._get_observation ()
1005+ info = {
1006+ "step" : self ._step_count ,
1007+ "command" : command ,
1008+ "pixel_direct" : True ,
1009+ }
1010+
1011+ return obs , done , info
1012+
1013+ def _build_pixel_command (
1014+ self ,
1015+ action_type : str ,
1016+ x : int | float | None = None ,
1017+ y : int | float | None = None ,
1018+ text : str | None = None ,
1019+ key : str | None = None ,
1020+ ) -> str | None :
1021+ """Build a pyautogui command string from pixel coordinates.
1022+
1023+ This is the direct-pixel path used by pixel_action(). It generates
1024+ pyautogui commands using absolute pixel coordinates without consulting
1025+ the accessibility tree or element rects.
1026+
1027+ Args:
1028+ action_type: One of "click", "double_click", "right_click", "type",
1029+ "key", "scroll", "done", "error", "wait".
1030+ x: X pixel coordinate (absolute).
1031+ y: Y pixel coordinate (absolute).
1032+ text: Text to type (for action_type="type").
1033+ key: Key name (for action_type="key").
1034+
1035+ Returns:
1036+ Python command string for pyautogui, or None for terminal actions.
1037+ """
1038+ if action_type in ("done" , "error" ):
1039+ return None
1040+
1041+ if action_type == "wait" :
1042+ return "import time; time.sleep(1)"
1043+
1044+ # Resolve pixel coordinates and clamp to safe margin
1045+ px = int (x ) if x is not None else 0
1046+ py = int (y ) if y is not None else 0
1047+ px , py = self ._clamp_pixel_coords (px , py )
1048+
1049+ if action_type == "click" :
1050+ return f"import pyautogui; pyautogui.click({ px } , { py } )"
1051+
1052+ if action_type == "double_click" :
1053+ return f"import pyautogui; pyautogui.doubleClick({ px } , { py } )"
1054+
1055+ if action_type == "right_click" :
1056+ return f"import pyautogui; pyautogui.rightClick({ px } , { py } )"
1057+
1058+ if action_type == "type" :
1059+ type_body = _build_type_commands (text or "" )
1060+ return (
1061+ f"import pyautogui; import time; "
1062+ f"pyautogui.click({ px } , { py } ); "
1063+ f"time.sleep(0.2); "
1064+ f"{ type_body } "
1065+ )
1066+
1067+ if action_type == "key" :
1068+ # Reuse the key translation logic
1069+ temp_action = BenchmarkAction (type = "key" , key = key )
1070+ return self ._translate_key_action (temp_action )
1071+
1072+ if action_type == "scroll" :
1073+ return f"import pyautogui; pyautogui.scroll(-3, x={ px } , y={ py } )"
1074+
1075+ logger .warning (f"Unknown pixel action type: { action_type } " )
1076+ return None
9711077
9721078 def _get_observation (self ) -> BenchmarkObservation :
9731079 """Fetch current observation from WAA server.
0 commit comments