22
33from __future__ import annotations
44
5- import base64
65import hashlib
76import logging
87import time
1514
1615logger = logging .getLogger (__name__ )
1716
17+ # Default retry parameters for WAADirect.screenshot()
18+ SCREENSHOT_MAX_RETRIES = 3 # total attempts per call (the first try counts as one)
19+ SCREENSHOT_RETRY_DELAY = 2.0 # seconds slept between failed attempts
20+
1821
1922@dataclass
2023class RolloutStep :
@@ -37,42 +40,97 @@ class Rollout:
3740
3841
3942class WAADirect :
40- """Direct HTTP client for WAA Flask server. Screenshot/click/type/key."""
43+ """Direct HTTP client for WAA Flask server. Screenshot/click/type/key.
44+
45+ WAA API contract (from WAA Flask server main.py):
46+ GET /screenshot -> raw PNG bytes (Content-Type: image/png)
47+ POST /execute_windows -> exec(command, {'computer': computer, 'human': human})
48+ Payload: {"command": "<python code>"}
49+ The command is Python code executed via exec() with pyautogui available.
50+ Do NOT wrap in ``python -c "..."`` -- send bare Python statements.
51+ """
4152
def __init__(self, server_url: str = "http://localhost:5001",
             screen_size: tuple[int, int] = (1920, 1080)) -> None:
    """Remember the server address and screen size, and open an HTTP session.

    Args:
        server_url: Base URL of the WAA Flask server. Trailing ``/``
            characters are stripped so endpoint paths can be appended
            directly.
        screen_size: ``(width, height)`` pair stored as given.
    """
    base_url = server_url.rstrip("/")
    self.server_url, self.screen_size = base_url, screen_size
    # One shared Session so every request goes through the same object.
    self._session = requests.Session()
4758
def screenshot(self, max_retries: int = SCREENSHOT_MAX_RETRIES,
               retry_delay: float = SCREENSHOT_RETRY_DELAY) -> bytes:
    """Take a fresh screenshot and return the raw PNG bytes.

    WAA's /screenshot endpoint returns raw PNG via Flask's send_file(),
    NOT base64-encoded JSON, so the payload is read from ``resp.content``
    rather than ``resp.json()``.

    Args:
        max_retries: Total number of attempts, the first try included.
            Values below 1 are treated as 1 so the server is always
            contacted at least once.
        retry_delay: Seconds to sleep between failed attempts. Negative
            values are treated as 0.

    Returns:
        The response body of the first attempt that returns HTTP 200 with
        at least 100 bytes.

    Raises:
        RuntimeError: If every attempt fails. The last underlying error is
            chained as ``__cause__``.
    """
    # Clamp caller input: zero/negative attempts would skip the request
    # entirely, and a negative delay would make time.sleep() raise from
    # inside the except handler and hide the real failure.
    attempts = max(1, max_retries)
    delay = max(0.0, retry_delay)

    last_exc: Exception | None = None
    for attempt in range(1, attempts + 1):
        try:
            resp = self._session.get(
                f"{self.server_url}/screenshot", timeout=30,
            )
            if resp.status_code != 200:
                raise RuntimeError(
                    f"Screenshot HTTP {resp.status_code}: {resp.text[:200]}"
                )
            png_bytes = resp.content
            # A near-empty body is treated as "server not ready yet".
            if len(png_bytes) < 100:
                raise RuntimeError(
                    f"Screenshot too small ({len(png_bytes)} bytes) -- "
                    "server may not be ready"
                )
            return png_bytes
        except Exception as e:
            # Deliberately broad: any failure of this attempt is logged
            # and retried; the last one is re-raised (chained) below.
            last_exc = e
            logger.warning(
                "Screenshot attempt %d/%d failed: %s",
                attempt, attempts, e,
            )
            if attempt < attempts:
                time.sleep(delay)
    raise RuntimeError(
        f"Screenshot failed after {attempts} attempts"
    ) from last_exc
5894
5995 def execute_action (self , action : SimpleAction ) -> dict [str , Any ]:
60- """Execute action on VM via /execute_windows."""
96+ """Execute action on VM via /execute_windows.
97+
98+ WAA's /execute_windows does ``exec(command, {'computer': ..., 'human': ...})``.
99+ The command must be bare Python code -- NOT wrapped in ``python -c "..."``.
100+ pyautogui is available via import inside the exec'd code.
101+ """
61102 if action .type == "click" :
62103 x , y = int (action .x or 0 ), int (action .y or 0 )
63- cmd = f'python -c "import pyautogui; pyautogui.click({ x } , { y } )"'
104+ cmd = f"import pyautogui; pyautogui.click({ x } , { y } )"
105+ elif action .type == "double_click" :
106+ x , y = int (action .x or 0 ), int (action .y or 0 )
107+ cmd = f"import pyautogui; pyautogui.doubleClick({ x } , { y } )"
108+ elif action .type == "right_click" :
109+ x , y = int (action .x or 0 ), int (action .y or 0 )
110+ cmd = f"import pyautogui; pyautogui.rightClick({ x } , { y } )"
64111 elif action .type == "type" :
65- text = (action .text or "" ).replace ('"' , '\\ "' )
66- cmd = f'python -c "import pyautogui; pyautogui.typewrite(\' { text } \' , interval=0.05)"'
112+ text = (action .text or "" ).replace ("\\ " , "\\ \\ " ).replace ("'" , "\\ '" )
113+ x , y = int (action .x or 0 ), int (action .y or 0 )
114+ # Click target first, then type (matches WAALiveAdapter pattern)
115+ cmd = (
116+ f"import pyautogui; import time; "
117+ f"pyautogui.click({ x } , { y } ); "
118+ f"time.sleep(0.2); "
119+ f"pyautogui.typewrite('{ text } ', interval=0.05)"
120+ )
67121 elif action .type == "key" :
68- cmd = f'python -c "import pyautogui; pyautogui.press(\' { action .key or "enter" } \' )"'
122+ key = action .key or "enter"
123+ cmd = f"import pyautogui; pyautogui.press('{ key } ')"
124+ elif action .type == "scroll" :
125+ x , y = int (action .x or 0 ), int (action .y or 0 )
126+ cmd = f"import pyautogui; pyautogui.scroll(-3, x={ x } , y={ y } )"
69127 elif action .type == "wait" :
70128 time .sleep (2 )
71129 return {"status" : "ok" , "action" : "wait" }
72130 elif action .type == "done" :
73131 return {"status" : "ok" , "action" : "done" }
74132 else :
75- return {"status" : "error" , "message" : f"Unknown: { action .type } " }
133+ return {"status" : "error" , "message" : f"Unknown action type : { action .type } " }
76134
77135 resp = self ._session .post (
78136 f"{ self .server_url } /execute_windows" , json = {"command" : cmd }, timeout = 30 ,
@@ -117,9 +175,25 @@ def is_stuck(self, recent: list[bytes], window: int = 3) -> bool:
117175 hashes = [hashlib .md5 (s ).hexdigest () for s in recent [- window :]]
118176 return len (set (hashes )) == 1
119177
def probe(self, timeout: float = 10.0) -> dict[str, Any]:
    """Health-check the WAA server and return a status dict.

    Issues a single GET to /screenshot so the full screenshot pipeline is
    exercised, not just HTTP reachability.

    Args:
        timeout: Timeout in seconds passed to the HTTP request.

    Returns:
        A dict that always contains:
            ``reachable``: True once the server returned any HTTP response.
            ``screenshot_ok``: True if the response was HTTP 200 with at
                least 100 bytes of content (the same minimum size that
                ``screenshot()`` accepts).
        and, depending on the outcome:
            ``status_code``: HTTP status, present whenever ``reachable``.
            ``screenshot_bytes``: Body length, present only on HTTP 200.
            ``error``: ``str()`` of the ``requests.RequestException``
                raised by the request, if any.
    """
    result: dict[str, Any] = {"reachable": False, "screenshot_ok": False}
    try:
        resp = self._session.get(
            f"{self.server_url}/screenshot", timeout=timeout,
        )
        result["reachable"] = True
        result["status_code"] = resp.status_code
        if resp.status_code == 200:
            size = len(resp.content)
            # ">=" keeps this in step with screenshot(), which rejects
            # only bodies shorter than 100 bytes.
            result["screenshot_ok"] = size >= 100
            result["screenshot_bytes"] = size
    except requests.RequestException as e:
        result["error"] = str(e)
    return result


def health_check(self) -> bool:
    """Return True if the WAA server responds with a valid screenshot."""
    return bool(self.probe()["screenshot_ok"])
0 commit comments