-
Notifications
You must be signed in to change notification settings - Fork 4.1k
Expand file tree
/
Copy pathcomputer_use.py
More file actions
244 lines (203 loc) · 7.67 KB
/
computer_use.py
File metadata and controls
244 lines (203 loc) · 7.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# How to run this example:
# uv run python -m playwright install chromium
# uv run -m examples.tools.computer_use
import asyncio
import base64
import sys
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import Any, Literal
from playwright.async_api import Browser, Page, Playwright, async_playwright
from agents import (
Agent,
AsyncComputer,
Button,
ComputerProvider,
ComputerTool,
RunContextWrapper,
Runner,
trace,
)
# Uncomment to see very verbose logs
# import logging
# logging.getLogger("openai.agents").setLevel(logging.DEBUG)
# logging.getLogger("openai.agents").addHandler(logging.StreamHandler())
# Lower-case key names mapped to the key names handed to Playwright's keyboard
# API. Lookups lower-case the incoming name first; names without an entry are
# passed through unchanged.
CUA_KEY_TO_PLAYWRIGHT_KEY = {
    # Modifier keys (several aliases resolve to the same Playwright key).
    "alt": "Alt",
    "option": "Alt",
    "ctrl": "Control",
    "shift": "Shift",
    "cmd": "Meta",
    "super": "Meta",
    "win": "Meta",
    "capslock": "CapsLock",
    # Cursor movement and paging.
    "arrowup": "ArrowUp",
    "arrowdown": "ArrowDown",
    "arrowleft": "ArrowLeft",
    "arrowright": "ArrowRight",
    "home": "Home",
    "end": "End",
    "pageup": "PageUp",
    "pagedown": "PageDown",
    # Editing and control keys.
    "backspace": "Backspace",
    "delete": "Delete",
    "insert": "Insert",
    "enter": "Enter",
    "esc": "Escape",
    "tab": "Tab",
    "space": " ",
    # Punctuation.
    # NOTE(review): "Divide" does not look like a documented Playwright key
    # name ("Slash" / "NumpadDivide" are) — TODO confirm before relying on it.
    "/": "Divide",
    "\\": "Backslash",
}
class LocalPlaywrightComputer(AsyncComputer):
    """A computer, implemented using a local Playwright browser.

    Use it either as an async context manager (``async with``) or through the
    explicit ``open()`` / ``close()`` pair. The Playwright driver, browser and
    page only exist between those two points; the ``playwright``, ``browser``
    and ``page`` properties assert that they have been created.
    """

    def __init__(self):
        # All three handles are populated by __aenter__ and cleared by __aexit__.
        self._playwright: Playwright | None = None
        self._browser: Browser | None = None
        self._page: Page | None = None

    async def _get_browser_and_page(self) -> tuple[Browser, Page]:
        """Launch a headed Chromium and open the start page.

        The window and the viewport are both sized from ``dimensions``.

        Returns:
            The launched browser and its first page, already navigated to
            https://www.bing.com.
        """
        width, height = self.dimensions
        launch_args = [f"--window-size={width},{height}"]
        browser = await self.playwright.chromium.launch(headless=False, args=launch_args)
        try:
            page = await browser.new_page()
            await page.set_viewport_size({"width": width, "height": height})
            await page.goto("https://www.bing.com")
        except BaseException:
            # The caller never receives this browser on failure, so nobody
            # else could close it; release it here before propagating.
            await browser.close()
            raise
        return browser, page

    async def __aenter__(self):
        """Start Playwright, then create the browser and page."""
        # Start Playwright and call the subclass hook for getting browser/page
        self._playwright = await async_playwright().start()
        try:
            self._browser, self._page = await self._get_browser_and_page()
        except BaseException:
            # ``async with`` does not call __aexit__ when __aenter__ raises,
            # so tear down whatever was started before re-raising.
            await self.__aexit__(None, None, None)
            raise
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the browser and stop Playwright; never suppresses exceptions."""
        browser, playwright = self._browser, self._playwright
        # Drop the references first so a repeated close() is a no-op instead
        # of closing already-released handles again.
        self._browser = None
        self._page = None
        self._playwright = None
        try:
            if browser is not None:
                await browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            if playwright is not None:
                await playwright.stop()
        return None

    async def open(self) -> "LocalPlaywrightComputer":
        """Open resources without using a context manager."""
        await self.__aenter__()
        return self

    async def close(self) -> None:
        """Close resources without using a context manager."""
        await self.__aexit__(None, None, None)

    @property
    def playwright(self) -> Playwright:
        """The started Playwright driver; asserts that it exists."""
        assert self._playwright is not None
        return self._playwright

    @property
    def browser(self) -> Browser:
        """The launched browser; asserts that it exists."""
        assert self._browser is not None
        return self._browser

    @property
    def page(self) -> Page:
        """The page every action operates on; asserts that it exists."""
        assert self._page is not None
        return self._page

    @property
    def dimensions(self) -> tuple[int, int]:
        """Fixed (width, height) used for both the window and the viewport."""
        return (1024, 768)

    async def screenshot(self) -> str:
        """Capture only the viewport (not full_page).

        Returns:
            The screenshot bytes, base64-encoded and decoded to ``str``.
        """
        png_bytes = await self.page.screenshot(full_page=False)
        return base64.b64encode(png_bytes).decode("utf-8")

    def _normalize_keys(self, keys: list[str] | None) -> list[str]:
        """Translate key names through ``CUA_KEY_TO_PLAYWRIGHT_KEY``.

        The lookup is case-insensitive; names without a mapping are returned
        unchanged. ``None`` or an empty list yields an empty list.
        """
        if not keys:
            return []
        return [CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) for key in keys]

    @asynccontextmanager
    async def _hold_keys(self, keys: list[str] | None) -> AsyncIterator[None]:
        """Hold ``keys`` down for the duration of the ``async with`` body.

        Keys are pressed in the given order and released in reverse order,
        also when the body (or a later key press) raises.
        """
        mapped_keys = self._normalize_keys(keys)
        # Track what was actually pressed so that a failure halfway through
        # only releases keys that are really down.
        pressed: list[str] = []
        try:
            for key in mapped_keys:
                await self.page.keyboard.down(key)
                pressed.append(key)
            yield
        finally:
            for key in reversed(pressed):
                await self.page.keyboard.up(key)

    async def click(
        self, x: int, y: int, button: Button = "left", *, keys: list[str] | None = None
    ) -> None:
        """Click at (x, y) while holding ``keys``.

        Any ``button`` other than "left", "right" or "middle" is performed as
        a left click.
        """
        playwright_button: Literal["left", "middle", "right"] = "left"
        # Playwright only supports left, middle, right buttons
        if button in ("left", "right", "middle"):
            playwright_button = button  # type: ignore
        async with self._hold_keys(keys):
            await self.page.mouse.click(x, y, button=playwright_button)

    async def double_click(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
        """Double-click at (x, y) while holding ``keys``."""
        async with self._hold_keys(keys):
            await self.page.mouse.dblclick(x, y)

    async def scroll(
        self,
        x: int,
        y: int,
        scroll_x: int,
        scroll_y: int,
        *,
        keys: list[str] | None = None,
    ) -> None:
        """Move the pointer to (x, y), then scroll the window.

        The scroll itself is done with ``window.scrollBy(scroll_x, scroll_y)``
        evaluated in the page, while ``keys`` are held.
        """
        async with self._hold_keys(keys):
            await self.page.mouse.move(x, y)
            await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")

    async def type(self, text: str) -> None:
        """Type ``text`` through the page keyboard."""
        await self.page.keyboard.type(text)

    async def wait(self) -> None:
        """Pause for one second."""
        await asyncio.sleep(1)

    async def move(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
        """Move the pointer to (x, y) while holding ``keys``."""
        async with self._hold_keys(keys):
            await self.page.mouse.move(x, y)

    async def keypress(self, keys: list[str]) -> None:
        """Press ``keys`` as a chord: all down in order, then up in reverse."""
        # Entering and immediately leaving the hold performs exactly that
        # down/up sequence and also releases keys if a press fails midway.
        async with self._hold_keys(keys):
            pass

    async def drag(self, path: list[tuple[int, int]], *, keys: list[str] | None = None) -> None:
        """Drag with the mouse button held along ``path`` while holding ``keys``.

        The button goes down at the first point and up after the last one.
        An empty ``path`` does nothing.
        """
        if not path:
            return
        async with self._hold_keys(keys):
            await self.page.mouse.move(path[0][0], path[0][1])
            await self.page.mouse.down()
            try:
                for px, py in path[1:]:
                    await self.page.mouse.move(px, py)
            finally:
                # Never leave the button held if a move fails mid-drag.
                await self.page.mouse.up()
async def run_agent(
    computer_config: ComputerProvider[LocalPlaywrightComputer] | AsyncComputer,
) -> None:
    """Run the "Browser user" agent once and print its final output.

    Args:
        computer_config: An ``AsyncComputer`` instance or a ``ComputerProvider``
            for one; it is passed unchanged to ``ComputerTool``.
    """
    question = "What is the weather in Tokyo right now?"
    with trace("Computer use example"):
        computer_tool = ComputerTool(computer=computer_config)
        browser_agent = Agent(
            name="Browser user",
            instructions="You are a helpful agent. Find the current weather in Tokyo.",
            tools=[computer_tool],
            # GPT-5.4 uses the built-in Responses API computer tool.
            model="gpt-5.4",
        )
        outcome = await Runner.run(browser_agent, question)
        print(outcome.final_output)
async def singleton_computer() -> None:
    """Run the agent against one computer that stays open for the whole run."""
    # Use a shared computer when you do not expect to run multiple agents concurrently.
    shared_computer = LocalPlaywrightComputer()
    async with shared_computer:
        await run_agent(shared_computer)
async def computer_per_request() -> None:
    """Run the agent with a provider that opens and closes its own computer."""
    # Initialize a new computer per request to avoid sharing state between runs.

    async def _open_computer(*, run_context: RunContextWrapper[Any]) -> LocalPlaywrightComputer:
        """Create and open a fresh computer for the provider."""
        print(f"Creating computer for run context: {run_context}")
        fresh_computer = LocalPlaywrightComputer()
        return await fresh_computer.open()

    async def _close_computer(
        *,
        run_context: RunContextWrapper[Any],
        computer: LocalPlaywrightComputer,
    ) -> None:
        """Close a computer that the provider hands back."""
        print(f"Disposing computer for run context: {run_context}")
        await computer.close()

    provider = ComputerProvider[LocalPlaywrightComputer](
        create=_open_computer,
        dispose=_close_computer,
    )
    await run_agent(provider)
if __name__ == "__main__":
    # The optional first CLI argument selects the mode (case-insensitive);
    # anything other than "singleton" runs with a computer per request.
    cli_args = sys.argv[1:]
    mode = cli_args[0].lower() if cli_args else ""
    entry_point = singleton_computer if mode == "singleton" else computer_per_request
    asyncio.run(entry_point())