Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions docs/tools.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ When a [`ComputerTool`][agents.tool.ComputerTool] is present, `tool_choice="comp

This distinction matters when `ComputerTool` is backed by a [`ComputerProvider`][agents.tool.ComputerProvider] factory. The GA `computer` payload does not need `environment` or dimensions at serialization time, so unresolved factories are fine. Preview-compatible serialization still needs a resolved `Computer` or `AsyncComputer` instance so the SDK can send `environment`, `display_width`, and `display_height`.

At runtime, both paths still use the same local harness. Preview responses emit `computer_call` items with a single `action`; `gpt-5.4` can emit batched `actions[]`, and the SDK executes them in order before producing a `computer_call_output` screenshot item. See `examples/tools/computer_use.py` for a runnable Playwright-based harness.
At runtime, both paths still use the same local harness. Preview responses emit `computer_call` items with a single `action`; `gpt-5.4` can emit batched `actions[]`, and the SDK executes them in order before producing a `computer_call_output` screenshot item. Mouse actions can also carry an optional `keys` list for held modifiers such as `Shift` or `Ctrl`. See `examples/tools/computer_use.py` for a runnable Playwright-based harness.

```python
from agents import Agent, ApplyPatchTool, ShellTool
Expand All @@ -211,14 +211,14 @@ class NoopComputer(AsyncComputer):
environment = "browser"
dimensions = (1024, 768)
async def screenshot(self): return ""
async def click(self, x, y, button): ...
async def double_click(self, x, y): ...
async def scroll(self, x, y, scroll_x, scroll_y): ...
async def click(self, x, y, button, *, keys=None): ...
async def double_click(self, x, y, *, keys=None): ...
async def scroll(self, x, y, scroll_x, scroll_y, *, keys=None): ...
async def type(self, text): ...
async def wait(self): ...
async def move(self, x, y): ...
async def move(self, x, y, *, keys=None): ...
async def keypress(self, keys): ...
async def drag(self, path): ...
async def drag(self, path, *, keys=None): ...


class NoopEditor(ApplyPatchEditor):
Expand Down
65 changes: 49 additions & 16 deletions examples/tools/computer_use.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import asyncio
import base64
import sys
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import Any, Literal, Union

from playwright.async_api import Browser, Page, Playwright, async_playwright
Expand Down Expand Up @@ -118,46 +120,77 @@ async def screenshot(self) -> str:
png_bytes = await self.page.screenshot(full_page=False)
return base64.b64encode(png_bytes).decode("utf-8")

async def click(self, x: int, y: int, button: Button = "left") -> None:
def _normalize_keys(self, keys: list[str] | None) -> list[str]:
if not keys:
return []
return [CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) for key in keys]

@asynccontextmanager
async def _hold_keys(self, keys: list[str] | None) -> AsyncIterator[None]:
mapped_keys = self._normalize_keys(keys)
try:
for key in mapped_keys:
await self.page.keyboard.down(key)
yield
finally:
for key in reversed(mapped_keys):
await self.page.keyboard.up(key)

async def click(
self, x: int, y: int, button: Button = "left", *, keys: list[str] | None = None
) -> None:
playwright_button: Literal["left", "middle", "right"] = "left"

# Playwright only supports left, middle, right buttons
if button in ("left", "right", "middle"):
playwright_button = button # type: ignore

await self.page.mouse.click(x, y, button=playwright_button)
async with self._hold_keys(keys):
await self.page.mouse.click(x, y, button=playwright_button)

async def double_click(self, x: int, y: int) -> None:
await self.page.mouse.dblclick(x, y)
async def double_click(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
async with self._hold_keys(keys):
await self.page.mouse.dblclick(x, y)

async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
await self.page.mouse.move(x, y)
await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")
async def scroll(
self,
x: int,
y: int,
scroll_x: int,
scroll_y: int,
*,
keys: list[str] | None = None,
) -> None:
async with self._hold_keys(keys):
await self.page.mouse.move(x, y)
await self.page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})")

async def type(self, text: str) -> None:
await self.page.keyboard.type(text)

async def wait(self) -> None:
await asyncio.sleep(1)

async def move(self, x: int, y: int) -> None:
await self.page.mouse.move(x, y)
async def move(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
async with self._hold_keys(keys):
await self.page.mouse.move(x, y)

async def keypress(self, keys: list[str]) -> None:
mapped_keys = [CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) for key in keys]
mapped_keys = self._normalize_keys(keys)
for key in mapped_keys:
await self.page.keyboard.down(key)
for key in reversed(mapped_keys):
await self.page.keyboard.up(key)

async def drag(self, path: list[tuple[int, int]]) -> None:
async def drag(self, path: list[tuple[int, int]], *, keys: list[str] | None = None) -> None:
if not path:
return
await self.page.mouse.move(path[0][0], path[0][1])
await self.page.mouse.down()
for px, py in path[1:]:
await self.page.mouse.move(px, py)
await self.page.mouse.up()
async with self._hold_keys(keys):
await self.page.mouse.move(path[0][0], path[0][1])
await self.page.mouse.down()
for px, py in path[1:]:
await self.page.mouse.move(px, py)
await self.page.mouse.up()


async def run_agent(
Expand Down
36 changes: 26 additions & 10 deletions src/agents/computer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,23 @@ def screenshot(self) -> str:
pass

@abc.abstractmethod
def click(self, x: int, y: int, button: Button) -> None:
def click(self, x: int, y: int, button: Button, *, keys: list[str] | None = None) -> None:
pass

@abc.abstractmethod
def double_click(self, x: int, y: int) -> None:
def double_click(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
pass

@abc.abstractmethod
def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
def scroll(
self,
x: int,
y: int,
scroll_x: int,
scroll_y: int,
*,
keys: list[str] | None = None,
) -> None:
pass

@abc.abstractmethod
Expand All @@ -44,15 +52,15 @@ def wait(self) -> None:
pass

@abc.abstractmethod
def move(self, x: int, y: int) -> None:
def move(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
pass

@abc.abstractmethod
def keypress(self, keys: list[str]) -> None:
pass

@abc.abstractmethod
def drag(self, path: list[tuple[int, int]]) -> None:
def drag(self, path: list[tuple[int, int]], *, keys: list[str] | None = None) -> None:
pass


Expand All @@ -75,15 +83,23 @@ async def screenshot(self) -> str:
pass

@abc.abstractmethod
async def click(self, x: int, y: int, button: Button) -> None:
async def click(self, x: int, y: int, button: Button, *, keys: list[str] | None = None) -> None:
pass

@abc.abstractmethod
async def double_click(self, x: int, y: int) -> None:
async def double_click(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
pass

@abc.abstractmethod
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
async def scroll(
self,
x: int,
y: int,
scroll_x: int,
scroll_y: int,
*,
keys: list[str] | None = None,
) -> None:
pass

@abc.abstractmethod
Expand All @@ -95,13 +111,13 @@ async def wait(self) -> None:
pass

@abc.abstractmethod
async def move(self, x: int, y: int) -> None:
async def move(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
pass

@abc.abstractmethod
async def keypress(self, keys: list[str]) -> None:
pass

@abc.abstractmethod
async def drag(self, path: list[tuple[int, int]]) -> None:
async def drag(self, path: list[tuple[int, int]], *, keys: list[str] | None = None) -> None:
pass
49 changes: 47 additions & 2 deletions src/agents/run_internal/tool_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,30 +185,47 @@ async def _execute_action_and_capture(
) -> str:
"""Execute computer actions (sync or async drivers) and return the final screenshot."""

async def maybe_call(method_name: str, *args: Any) -> Any:
async def maybe_call(method_name: str, *args: Any, **kwargs: Any) -> Any:
method = getattr(computer, method_name, None)
if method is None or not callable(method):
raise ModelBehaviorError(f"Computer driver missing method {method_name}")
result = method(*args)
supported_kwargs = cls._supported_keyword_arguments(method)
filtered_kwargs = {key: value for key, value in kwargs.items() if value is not None}
unsupported_kwargs = [
key
for key in filtered_kwargs
if key not in supported_kwargs and None not in supported_kwargs
]
if unsupported_kwargs:
unsupported = ", ".join(sorted(unsupported_kwargs))
raise ModelBehaviorError(
"Computer driver method "
f"{method_name!r} does not accept keyword argument(s) {unsupported}. "
"Update the driver to support modifier keys for computer actions."
)
result = method(*args, **filtered_kwargs)
return await result if inspect.isawaitable(result) else result

last_action_was_screenshot = False
last_screenshot_result: Any = None
for action in cls._iter_actions(tool_call):
action_type = get_mapping_or_attr(action, "type")
action_keys = cls._normalize_modifier_keys(get_mapping_or_attr(action, "keys"))
last_action_was_screenshot = False
if action_type == "click":
await maybe_call(
"click",
get_mapping_or_attr(action, "x"),
get_mapping_or_attr(action, "y"),
get_mapping_or_attr(action, "button"),
keys=action_keys,
)
elif action_type == "double_click":
await maybe_call(
"double_click",
get_mapping_or_attr(action, "x"),
get_mapping_or_attr(action, "y"),
keys=action_keys,
)
elif action_type == "drag":
path = get_mapping_or_attr(action, "path") or []
Expand All @@ -221,6 +238,7 @@ async def maybe_call(method_name: str, *args: Any) -> Any:
)
for point in path
],
keys=action_keys,
)
elif action_type == "keypress":
await maybe_call("keypress", get_mapping_or_attr(action, "keys"))
Expand All @@ -229,6 +247,7 @@ async def maybe_call(method_name: str, *args: Any) -> Any:
"move",
get_mapping_or_attr(action, "x"),
get_mapping_or_attr(action, "y"),
keys=action_keys,
)
elif action_type == "screenshot":
last_screenshot_result = await maybe_call("screenshot")
Expand All @@ -240,6 +259,7 @@ async def maybe_call(method_name: str, *args: Any) -> Any:
get_mapping_or_attr(action, "y"),
get_mapping_or_attr(action, "scroll_x"),
get_mapping_or_attr(action, "scroll_y"),
keys=action_keys,
)
elif action_type == "type":
await maybe_call("type", get_mapping_or_attr(action, "text"))
Expand Down Expand Up @@ -285,6 +305,31 @@ def _serialize_action_payload(action: Any) -> Any:
return dataclasses.asdict(action)
return action

@staticmethod
def _normalize_modifier_keys(keys: Any) -> list[str] | None:
if not keys:
return None
return cast(list[str], keys)

@staticmethod
def _supported_keyword_arguments(method: Any) -> set[str | None]:
signature = inspect.signature(method)
supported: set[str | None] = {
parameter.name
for parameter in signature.parameters.values()
if parameter.kind
in {
inspect.Parameter.KEYWORD_ONLY,
inspect.Parameter.POSITIONAL_OR_KEYWORD,
}
}
if any(
parameter.kind == inspect.Parameter.VAR_KEYWORD
for parameter in signature.parameters.values()
):
supported.add(None)
return supported


class LocalShellAction:
"""Execute local shell commands via the LocalShellTool with lifecycle hooks."""
Expand Down
18 changes: 13 additions & 5 deletions tests/test_agent_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3891,22 +3891,30 @@ def dimensions(self) -> tuple[int, int]:
def screenshot(self) -> str:
return "screenshot"

def click(self, x: int, y: int, button: str) -> None:
def click(self, x: int, y: int, button: str, *, keys: list[str] | None = None) -> None:
pass

def double_click(self, x: int, y: int) -> None:
def double_click(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
pass

def drag(self, path: list[tuple[int, int]]) -> None:
def drag(self, path: list[tuple[int, int]], *, keys: list[str] | None = None) -> None:
pass

def keypress(self, keys: list[str]) -> None:
pass

def move(self, x: int, y: int) -> None:
def move(self, x: int, y: int, *, keys: list[str] | None = None) -> None:
pass

def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
def scroll(
self,
x: int,
y: int,
scroll_x: int,
scroll_y: int,
*,
keys: list[str] | None = None,
) -> None:
pass

def type(self, text: str) -> None:
Expand Down
Loading