4343import os
4444import re
4545from io import BytesIO
46- from typing import Any , Dict , List
46+ from typing import Dict , List
4747
4848from PIL import Image
4949
@@ -210,6 +210,7 @@ def __init__(
210210 )
211211 try :
212212 from anthropic import Anthropic
213+
213214 self ._client = Anthropic (api_key = self .api_key )
214215 except ImportError :
215216 raise RuntimeError (
@@ -225,6 +226,7 @@ def __init__(
225226 )
226227 try :
227228 from openai import OpenAI
229+
228230 self ._client = OpenAI (api_key = self .api_key )
229231 except ImportError :
230232 raise RuntimeError (
@@ -240,9 +242,13 @@ def __init__(
240242 self .memory_block_text = "# empty memory block"
241243 self .step_counter = 0
242244
243- logger .info (f"ApiAgent initialized with provider={ provider } , model={ self .model } " )
245+ logger .info (
246+ f"ApiAgent initialized with provider={ provider } , model={ self .model } "
247+ )
244248 if self .demo :
245- logger .info (f"Demo trajectory provided ({ len (self .demo )} chars) - will persist across all steps" )
249+ logger .info (
250+ f"Demo trajectory provided ({ len (self .demo )} chars) - will persist across all steps"
251+ )
246252
247253 def predict (self , instruction : str , obs : Dict ) -> tuple :
248254 """Predict the next action based on observation.
@@ -325,10 +331,9 @@ def predict(self, instruction: str, obs: Dict) -> tuple:
325331 # Add action history if enabled (enhanced: includes reasoning, not just raw actions)
326332 if self .use_history and self .history :
327333 # Use rich history with reasoning (like PC Agent-E)
328- history_entries = self .history [- self .history_cutoff :]
334+ history_entries = self .history [- self .history_cutoff :]
329335 history_str = "\n \n " .join (
330- f"[Step { i + 1 } ] { entry } "
331- for i , entry in enumerate (history_entries )
336+ f"[Step { i + 1 } ] { entry } " for i , entry in enumerate (history_entries )
332337 )
333338 content_parts .append (f"History of previous steps:\n { history_str } " )
334339 logs ["history_entries" ] = len (history_entries )
@@ -381,14 +386,18 @@ def predict(self, instruction: str, obs: Dict) -> tuple:
381386 actions = [code_text ]
382387 self .prev_actions .append (code_text )
383388 # Store rich history with reasoning (memory + action)
384- self ._add_to_history (f"Thought: { self .memory_block_text } \n Action: { code_text } " )
389+ self ._add_to_history (
390+ f"Thought: { self .memory_block_text } \n Action: { code_text } "
391+ )
385392 else :
386393 # Try to extract action from response text
387394 action = self ._parse_action_from_text (response_text , w , h )
388395 if action :
389396 actions = [action ]
390397 self .prev_actions .append (action )
391- self ._add_to_history (f"Thought: { self .memory_block_text } \n Action: { action } " )
398+ self ._add_to_history (
399+ f"Thought: { self .memory_block_text } \n Action: { action } "
400+ )
392401 else :
393402 logger .warning ("Could not extract action from response" )
394403 actions = ["# Could not parse action" ]
@@ -483,33 +492,25 @@ def _parse_action_from_text(self, text: str, width: int, height: int) -> str | N
483492 Python code string or None if parsing failed.
484493 """
485494 # Try to find click coordinates
486- click_match = re .search (
487- r"click.*?(\d+)\s*,\s*(\d+)" , text , re .IGNORECASE
488- )
495+ click_match = re .search (r"click.*?(\d+)\s*,\s*(\d+)" , text , re .IGNORECASE )
489496 if click_match :
490497 x , y = int (click_match .group (1 )), int (click_match .group (2 ))
491498 return f"computer.click({ x } , { y } )"
492499
493500 # Try to find type text
494- type_match = re .search (
495- r'type[:\s]+["\'](.+?)["\']' , text , re .IGNORECASE
496- )
501+ type_match = re .search (r'type[:\s]+["\'](.+?)["\']' , text , re .IGNORECASE )
497502 if type_match :
498503 text_to_type = type_match .group (1 )
499504 return f'computer.type("{ text_to_type } ")'
500505
501506 # Try to find key press
502- key_match = re .search (
503- r"press[:\s]+(\w+)" , text , re .IGNORECASE
504- )
507+ key_match = re .search (r"press[:\s]+(\w+)" , text , re .IGNORECASE )
505508 if key_match :
506509 key = key_match .group (1 ).lower ()
507510 return f'computer.press("{ key } ")'
508511
509512 # Try to find hotkey
510- hotkey_match = re .search (
511- r"hotkey[:\s]+(\w+)\s*\+\s*(\w+)" , text , re .IGNORECASE
512- )
513+ hotkey_match = re .search (r"hotkey[:\s]+(\w+)\s*\+\s*(\w+)" , text , re .IGNORECASE )
513514 if hotkey_match :
514515 key1 , key2 = hotkey_match .group (1 ).lower (), hotkey_match .group (2 ).lower ()
515516 return f'computer.hotkey("{ key1 } ", "{ key2 } ")'
0 commit comments