From f0327a22fb66b10012503bceca500abf5e2a796d Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Fri, 6 Mar 2026 15:01:47 +0100 Subject: [PATCH 1/8] Add computer-use plugin for model-agnostic desktop control New plugin that registers desktop automation tools (click, double_click, type_text, key_press, scroll, mouse_move, open_path) on any LLM via the standard FunctionRegistry. Backed by pyautogui and designed to work with Realtime models that receive screen-share frames. Made-with: Cursor --- plugins/computer_use/README.md | 42 ++++++ plugins/computer_use/pyproject.toml | 40 ++++++ plugins/computer_use/tests/__init__.py | 0 .../computer_use/tests/test_computer_use.py | 112 +++++++++++++++ .../plugins/computer_use/__init__.py | 3 + .../plugins/computer_use/_actions.py | 135 ++++++++++++++++++ .../plugins/computer_use/_toolkit.py | 49 +++++++ pyproject.toml | 2 + 8 files changed, 383 insertions(+) create mode 100644 plugins/computer_use/README.md create mode 100644 plugins/computer_use/pyproject.toml create mode 100644 plugins/computer_use/tests/__init__.py create mode 100644 plugins/computer_use/tests/test_computer_use.py create mode 100644 plugins/computer_use/vision_agents/plugins/computer_use/__init__.py create mode 100644 plugins/computer_use/vision_agents/plugins/computer_use/_actions.py create mode 100644 plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py diff --git a/plugins/computer_use/README.md b/plugins/computer_use/README.md new file mode 100644 index 000000000..3941cdfc5 --- /dev/null +++ b/plugins/computer_use/README.md @@ -0,0 +1,42 @@ +# Computer Use Plugin + +Model-agnostic desktop control tools for Vision Agents. Lets any LLM with vision (via screen share) interact with the user's desktop — clicking, typing, scrolling, and opening files. 
+ +## Install + +```bash +pip install vision-agents-plugins-computer-use +``` + +## Usage + +Register the tools on any LLM, then use with an agent that receives screen-share frames: + +```python +from vision_agents.plugins import gemini, computer_use + +llm = gemini.Realtime("gemini-2.0-flash-live-001") +computer_use.ComputerUseToolkit().register(llm) + +agent = Agent(llm=llm) +``` + +With screen sharing active, the model sees the desktop and can call: + +| Tool | Description | +|------|-------------| +| `click(x, y, button)` | Click at coordinates | +| `double_click(x, y)` | Double-click at coordinates | +| `type_text(text)` | Type text into the focused element | +| `key_press(keys)` | Press a key combo, e.g. `"cmd+c"` | +| `scroll(x, y, clicks, direction)` | Scroll at coordinates | +| `mouse_move(x, y)` | Move cursor to coordinates | +| `open_path(path)` | Open a file/folder with the OS default handler | + +## How it works + +The SDK's screen-share pipeline (`TrackType.SCREEN_SHARE`) feeds frames to the VLM/Realtime model continuously. The model sees the screen, decides what to do, and calls action tools backed by [PyAutoGUI](https://pyautogui.readthedocs.io/). + +## Platform support + +Actions use PyAutoGUI (macOS, Linux, Windows). `open_path` uses `open` (macOS), `xdg-open` (Linux), or `explorer` (Windows). 
diff --git a/plugins/computer_use/pyproject.toml b/plugins/computer_use/pyproject.toml new file mode 100644 index 000000000..5adb151c1 --- /dev/null +++ b/plugins/computer_use/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "vision-agents-plugins-computer-use" +dynamic = ["version"] +description = "Model-agnostic computer use (desktop control) plugin for Vision Agents" +readme = "README.md" +keywords = ["computer use", "desktop automation", "AI", "agents"] +requires-python = ">=3.10" +license = "MIT" +dependencies = [ + "vision-agents", + "pyautogui", +] + +[project.urls] +Documentation = "https://visionagents.ai/" +Website = "https://visionagents.ai/" +Source = "https://github.com/GetStream/Vision-Agents" + +[tool.hatch.version] +source = "vcs" +raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" } + +[tool.hatch.build.targets.wheel] +packages = ["."] + +[tool.hatch.build.targets.sdist] +include = ["/vision_agents"] + +[tool.uv.sources] +vision-agents = { workspace = true } + +[dependency-groups] +dev = [ + "pytest>=8.4.1", + "pytest-asyncio>=1.0.0", +] diff --git a/plugins/computer_use/tests/__init__.py b/plugins/computer_use/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/plugins/computer_use/tests/test_computer_use.py b/plugins/computer_use/tests/test_computer_use.py new file mode 100644 index 000000000..2dabc823d --- /dev/null +++ b/plugins/computer_use/tests/test_computer_use.py @@ -0,0 +1,112 @@ +"""Tests for the computer-use plugin.""" + +import pytest +from vision_agents.core.llm import FunctionRegistry + +from vision_agents.plugins.computer_use import ComputerUseToolkit +from vision_agents.plugins.computer_use._actions import ( + click, + double_click, + key_press, + mouse_move, + scroll, + type_text, +) + +EXPECTED_TOOLS = { + "click", + "double_click", + "type_text", + "key_press", + 
"scroll", + "mouse_move", + "open_path", +} + + +class _FakeLLM: + """Minimal stand-in that exposes a real FunctionRegistry.""" + + def __init__(self): + self.function_registry = FunctionRegistry() + + +class TestComputerUseToolkit: + def test_register_adds_all_tools(self): + llm = _FakeLLM() + ComputerUseToolkit().register(llm) + + registered = set(llm.function_registry._functions.keys()) + assert registered == EXPECTED_TOOLS + + def test_register_tool_schemas_have_descriptions(self): + llm = _FakeLLM() + ComputerUseToolkit().register(llm) + + schemas = llm.function_registry.get_tool_schemas() + for schema in schemas: + assert schema["description"], f"{schema['name']} has no description" + + def test_register_is_idempotent(self): + llm = _FakeLLM() + toolkit = ComputerUseToolkit() + toolkit.register(llm) + toolkit.register(llm) + + schemas = llm.function_registry.get_tool_schemas() + names = [s["name"] for s in schemas] + assert len(names) == len(EXPECTED_TOOLS) + + def test_click_schema_has_parameters(self): + llm = _FakeLLM() + ComputerUseToolkit().register(llm) + + schemas = {s["name"]: s for s in llm.function_registry.get_tool_schemas()} + click_schema = schemas["click"] + props = click_schema["parameters_schema"]["properties"] + assert "x" in props + assert "y" in props + assert "button" in props + + def test_scroll_schema_has_direction(self): + llm = _FakeLLM() + ComputerUseToolkit().register(llm) + + schemas = {s["name"]: s for s in llm.function_registry.get_tool_schemas()} + scroll_schema = schemas["scroll"] + props = scroll_schema["parameters_schema"]["properties"] + assert "direction" in props + + +class TestActions: + """Test that action functions are valid async callables with correct signatures.""" + + @pytest.mark.integration + async def test_click_executes(self): + result = await click(0, 0, button="left") + assert "Clicked" in result + + @pytest.mark.integration + async def test_double_click_executes(self): + result = await double_click(0, 0) + 
assert "Double-clicked" in result + + @pytest.mark.integration + async def test_type_text_executes(self): + result = await type_text("") + assert "Typed" in result + + @pytest.mark.integration + async def test_key_press_executes(self): + result = await key_press("shift") + assert "Pressed" in result + + @pytest.mark.integration + async def test_scroll_executes(self): + result = await scroll(0, 0, clicks=1, direction="down") + assert "Scrolled" in result + + @pytest.mark.integration + async def test_mouse_move_executes(self): + result = await mouse_move(0, 0) + assert "Moved" in result diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/__init__.py b/plugins/computer_use/vision_agents/plugins/computer_use/__init__.py new file mode 100644 index 000000000..98cf7477b --- /dev/null +++ b/plugins/computer_use/vision_agents/plugins/computer_use/__init__.py @@ -0,0 +1,3 @@ +from ._toolkit import ComputerUseToolkit + +__all__ = ["ComputerUseToolkit"] diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py new file mode 100644 index 000000000..acbb2db3e --- /dev/null +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py @@ -0,0 +1,135 @@ +"""Low-level desktop actions backed by pyautogui.""" + +import asyncio +import logging +import platform +import subprocess +from typing import Literal + +import pyautogui + +logger = logging.getLogger(__name__) + +pyautogui.FAILSAFE = True +pyautogui.PAUSE = 0.1 + + +def _run_sync(func, *args, **kwargs): + """Run a blocking pyautogui call in a thread executor.""" + loop = asyncio.get_running_loop() + return loop.run_in_executor(None, lambda: func(*args, **kwargs)) + + +async def click( + x: int, + y: int, + button: str = "left", +) -> str: + """Click at the given screen coordinates. + + Args: + x: Horizontal pixel coordinate. + y: Vertical pixel coordinate. + button: Mouse button — "left", "right", or "middle". 
+ """ + await _run_sync(pyautogui.click, x, y, button=button) + logger.debug("click(%d, %d, button=%s)", x, y, button) + return f"Clicked at ({x}, {y}) with {button} button" + + +async def double_click(x: int, y: int) -> str: + """Double-click at the given screen coordinates. + + Args: + x: Horizontal pixel coordinate. + y: Vertical pixel coordinate. + """ + await _run_sync(pyautogui.doubleClick, x, y) + logger.debug("double_click(%d, %d)", x, y) + return f"Double-clicked at ({x}, {y})" + + +async def type_text(text: str) -> str: + """Type a string of text into the currently focused element. + + Args: + text: The text to type. + """ + await _run_sync(pyautogui.write, text, interval=0.03) + logger.debug("type_text(%r)", text[:80]) + return f"Typed {len(text)} characters" + + +async def key_press(keys: str) -> str: + """Press a key or key combination. + + Args: + keys: Key combo separated by "+", e.g. "cmd+c", "ctrl+shift+t", "enter". + """ + parts = [k.strip() for k in keys.split("+")] + await _run_sync(pyautogui.hotkey, *parts) + logger.debug("key_press(%r)", keys) + return f"Pressed {keys}" + + +async def scroll( + x: int, + y: int, + clicks: int = 3, + direction: Literal["up", "down"] = "down", +) -> str: + """Scroll at the given screen coordinates. + + Args: + x: Horizontal pixel coordinate to scroll at. + y: Vertical pixel coordinate to scroll at. + clicks: Number of scroll increments. + direction: "up" or "down". + """ + amount = clicks if direction == "up" else -clicks + await _run_sync(pyautogui.scroll, amount, x=x, y=y) + logger.debug("scroll(%d, %d, clicks=%d, direction=%s)", x, y, clicks, direction) + return f"Scrolled {direction} {clicks} clicks at ({x}, {y})" + + +async def mouse_move(x: int, y: int) -> str: + """Move the mouse cursor to the given screen coordinates. + + Args: + x: Horizontal pixel coordinate. + y: Vertical pixel coordinate. 
+ """ + await _run_sync(pyautogui.moveTo, x, y) + logger.debug("mouse_move(%d, %d)", x, y) + return f"Moved mouse to ({x}, {y})" + + +async def open_path(path: str) -> str: + """Open a file or folder using the OS default handler. + + Args: + path: Absolute path to the file or folder to open. + """ + system = platform.system() + if system == "Darwin": + cmd = ["open", path] + elif system == "Linux": + cmd = ["xdg-open", path] + elif system == "Windows": + cmd = ["explorer", path] + else: + return f"Unsupported platform: {system}" + + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + _, stderr = await proc.communicate() + if proc.returncode != 0: + err = stderr.decode().strip() + logger.error("open_path(%r) failed: %s", path, err) + return f"Failed to open {path}: {err}" + + logger.debug("open_path(%r)", path) + return f"Opened {path}" diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py b/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py new file mode 100644 index 000000000..23bda7447 --- /dev/null +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py @@ -0,0 +1,49 @@ +"""ComputerUseToolkit — registers desktop control tools on any LLM.""" + +import logging + +from vision_agents.core.llm.llm import LLM + +from . import _actions + +logger = logging.getLogger(__name__) + +_TOOL_DESCRIPTIONS = { + "click": "Click at screen coordinates (x, y) with the specified mouse button.", + "double_click": "Double-click at screen coordinates (x, y).", + "type_text": "Type a string of text into the currently focused element.", + "key_press": 'Press a key or key combination, e.g. 
"cmd+c", "enter", "ctrl+shift+t".', + "scroll": "Scroll at screen coordinates (x, y) in the given direction.", + "mouse_move": "Move the mouse cursor to screen coordinates (x, y).", + "open_path": "Open a file or folder by its absolute path using the OS default handler.", +} + + +class ComputerUseToolkit: + """Bundles desktop-control tools and registers them on an LLM. + + Usage:: + + from vision_agents.plugins import computer_use + + computer_use.ComputerUseToolkit().register(llm) + """ + + def register(self, llm: LLM) -> None: + """Register all computer-use action tools on *llm*.""" + tools = { + "click": _actions.click, + "double_click": _actions.double_click, + "type_text": _actions.type_text, + "key_press": _actions.key_press, + "scroll": _actions.scroll, + "mouse_move": _actions.mouse_move, + "open_path": _actions.open_path, + } + for name, func in tools.items(): + llm.function_registry.register( + name=name, description=_TOOL_DESCRIPTIONS[name] + )(func) + logger.info( + "Registered %d computer-use tools on %s", len(tools), type(llm).__name__ + ) diff --git a/pyproject.toml b/pyproject.toml index 8eed9d206..74ad030d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ vision-agents-plugins-turbopuffer = { workspace = true } vision-agents-plugins-nvidia = { workspace = true } vision-agents-plugins-mistral = { workspace = true } vision-agents-plugins-assemblyai = { workspace = true } +vision-agents-plugins-computer-use = { workspace = true } [tool.uv] # Workspace-level override to resolve numpy version conflicts @@ -79,6 +80,7 @@ members = [ "plugins/nvidia", "plugins/mistral", "plugins/assemblyai", + "plugins/computer_use", ] exclude = [ "**/__pycache__", From 37048f7241e8810b9e6d3ab8edb4355e1b8d2635 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Wed, 11 Mar 2026 12:44:04 +0100 Subject: [PATCH 2/8] Refactor computer-use plugin: grid overlay, cell-based API, clean registration - Replace x/y virtual coordinates with cell-based targeting (e.g. 
"H8") - Add configurable Grid class with cols/rows params (default 15x15) - Add GridOverlayProcessor to draw labeled grid on screen share frames - Add sub-cell positioning (top-left, center, bottom-right, etc.) - Replace ComputerUseToolkit class with plain `register(llm)` function - Add computer use example with Gemini Realtime + grid overlay - Update README with new API and grid documentation Made-with: Cursor --- examples/10_computer_use_example/.gitignore | 1 + examples/10_computer_use_example/README.md | 64 ++++++ .../computer_use_example.py | 53 +++++ .../10_computer_use_example/instructions.md | 21 ++ .../10_computer_use_example/pyproject.toml | 19 ++ plugins/computer_use/README.md | 28 ++- .../computer_use/tests/test_computer_use.py | 111 ++++++++-- .../plugins/computer_use/__init__.py | 6 +- .../plugins/computer_use/_actions.py | 193 ++++++++++++------ .../plugins/computer_use/_grid.py | 137 +++++++++++++ .../plugins/computer_use/_processor.py | 96 +++++++++ .../plugins/computer_use/_toolkit.py | 83 ++++---- uv.lock | 153 +++++++++++++- 13 files changed, 835 insertions(+), 130 deletions(-) create mode 100644 examples/10_computer_use_example/.gitignore create mode 100644 examples/10_computer_use_example/README.md create mode 100644 examples/10_computer_use_example/computer_use_example.py create mode 100644 examples/10_computer_use_example/instructions.md create mode 100644 examples/10_computer_use_example/pyproject.toml create mode 100644 plugins/computer_use/vision_agents/plugins/computer_use/_grid.py create mode 100644 plugins/computer_use/vision_agents/plugins/computer_use/_processor.py diff --git a/examples/10_computer_use_example/.gitignore b/examples/10_computer_use_example/.gitignore new file mode 100644 index 000000000..4c49bd78f --- /dev/null +++ b/examples/10_computer_use_example/.gitignore @@ -0,0 +1 @@ +.env diff --git a/examples/10_computer_use_example/README.md b/examples/10_computer_use_example/README.md new file mode 100644 index 
000000000..7573c6b70 --- /dev/null +++ b/examples/10_computer_use_example/README.md @@ -0,0 +1,64 @@ +# Computer Use Example + +An AI desktop assistant that can see your screen and control your computer. Share your screen in a video call and ask the agent to perform actions like opening folders, clicking buttons, typing text, or using keyboard shortcuts. + +## How it works + +1. You join a video call and share your screen +2. The agent receives your screen-share frames via Gemini Realtime +3. You ask the agent to do something (e.g. "open my Downloads folder") +4. The agent sees your screen, identifies what to interact with, and calls action tools +5. PyAutoGUI executes the actions on the host machine + +## Prerequisites + +- Python 3.10+ +- A display environment (the agent controls the machine it runs on) +- API keys for: + - [Google AI (Gemini)](https://ai.google.dev/) — for the Realtime LLM + - [Stream](https://getstream.io/) — for video infrastructure + +## Setup + +1. Navigate to this example: + ```bash + cd examples/10_computer_use_example + ``` + +2. Install dependencies: + ```bash + uv sync + ``` + +3. Set up your `.env`: + ``` + GOOGLE_API_KEY=your_google_key + STREAM_API_KEY=your_stream_key + STREAM_API_SECRET=your_stream_secret + ``` + +## Run + +```bash +uv run computer_use_example.py run +``` + +The agent will create a call and open a demo UI. Share your screen in the call, then ask the agent to perform actions. + +## Available actions + +| Tool | What it does | +|------|-------------| +| `click(cell, position, button)` | Click at a grid cell | +| `double_click(cell, position)` | Double-click at a grid cell | +| `type_text(text)` | Type into the focused element | +| `key_press(keys)` | Press a key combo, e.g. 
`"cmd+c"` | +| `scroll(cell, position, clicks, direction)` | Scroll at a grid cell | +| `mouse_move(cell, position)` | Move the cursor to a grid cell | +| `open_path(path)` | Open a file or folder with the OS default handler | + +## Important notes + +- The agent controls the machine it runs on, not the caller's machine. For remote control, run the agent on the target machine. +- PyAutoGUI requires accessibility permissions on macOS (System Settings > Privacy & Security > Accessibility). +- Consider running in a sandboxed environment (VM or container) for safety. diff --git a/examples/10_computer_use_example/computer_use_example.py b/examples/10_computer_use_example/computer_use_example.py new file mode 100644 index 000000000..8e78a7d34 --- /dev/null +++ b/examples/10_computer_use_example/computer_use_example.py @@ -0,0 +1,53 @@ +""" +Computer use example — the agent sees your screen share and can control your desktop. + +Uses: +- Gemini Realtime for live screen-share vision + tool calling +- Stream's edge network for video transport +- Computer-use plugin for desktop actions (click, type, scroll, etc.) +- Grid overlay processor so the LLM can reference labeled cells + +Share your screen in the call, then ask the agent to perform actions +like "open my Downloads folder" or "click on the Safari icon". 
+""" + +import logging + +from dotenv import load_dotenv +from vision_agents.core import Agent, AgentLauncher, Runner, User +from vision_agents.plugins import computer_use, gemini, getstream + +logger = logging.getLogger(__name__) + +load_dotenv() + + +def setup_llm() -> gemini.Realtime: + llm = gemini.Realtime(fps=2) + computer_use.register(llm) + return llm + + +async def create_agent(**kwargs) -> Agent: + agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="Desktop Assistant", id="desktop-agent"), + instructions="Read @examples/10_computer_use_example/instructions.md", + llm=setup_llm(), + processors=[computer_use.GridOverlayProcessor(fps=2)], + ) + return agent + + +async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None: + call = await agent.create_call(call_type, call_id) + + async with agent.join(call): + await agent.llm.simple_response( + text="Say hi and let the user know they can share their screen and ask you to perform actions on their computer." + ) + await agent.finish() + + +if __name__ == "__main__": + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/examples/10_computer_use_example/instructions.md b/examples/10_computer_use_example/instructions.md new file mode 100644 index 000000000..c2ee648cd --- /dev/null +++ b/examples/10_computer_use_example/instructions.md @@ -0,0 +1,21 @@ +You are a **Desktop Assistant** that controls the user's computer by calling your tools. + +## Critical Rule + +When the user asks you to do something on screen, you MUST call the appropriate tool function (click, mouse_move, type_text, key_press, scroll, open_path). Never just describe what you would do — actually call the tool. If the user says "click on X", call the `click` tool. If they say "move cursor to X", call `mouse_move`. + +## Grid system + +The screen has a **grid overlay** with columns **A-O** (left to right) and rows **1-15** (top to bottom). 
Each cell is labeled in its top-left corner (e.g. A1, C5, O15). When you want to interact with a UI element, identify which grid cell it falls in and pass that as the `cell` parameter (e.g. `cell="C2"`). + +For finer accuracy, use the `position` parameter to target a specific part of the cell: top-left, top, top-right, left, center (default), right, bottom-left, bottom, or bottom-right. For example, if a button is in the top-right area of cell C2, use `cell="C2", position="top-right"`. + +## Rules + +1. **Always use tools.** When asked to perform an action, call the tool immediately. Say briefly what you'll do, then call the tool. +2. **Use cell references.** Look at the grid labels on screen and pass the `cell` parameter (e.g. "C2") for coordinate-based tools. +3. **Prefer open_path for files and folders.** If the user asks to open something by name or path, use `open_path` instead of trying to find and double-click an icon. +4. **Use keyboard shortcuts.** When possible, prefer `key_press` over clicking through menus (e.g. `cmd+c` to copy, `cmd+tab` to switch apps, `cmd+space` to open Spotlight). +5. **One action at a time.** Perform a single action, then observe the result before deciding on the next step. +6. **Ask when unsure.** If you can't clearly identify a UI element or aren't confident about which cell it's in, ask the user for guidance. +7. **Keep responses short.** The user is watching you in real time — don't narrate at length. 
diff --git a/examples/10_computer_use_example/pyproject.toml b/examples/10_computer_use_example/pyproject.toml new file mode 100644 index 000000000..fbc344b7c --- /dev/null +++ b/examples/10_computer_use_example/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "computer-use-example" +version = "0.1.0" +description = "AI desktop assistant that can see and control your screen" +requires-python = ">=3.10" + +dependencies = [ + "python-dotenv>=1.0", + "vision-agents", + "vision-agents-plugins-gemini", + "vision-agents-plugins-getstream", + "vision-agents-plugins-computer-use", +] + +[tool.uv.sources] +"vision-agents" = { path = "../../agents-core", editable = true } +"vision-agents-plugins-gemini" = { path = "../../plugins/gemini", editable = true } +"vision-agents-plugins-getstream" = { path = "../../plugins/getstream", editable = true } +"vision-agents-plugins-computer-use" = { path = "../../plugins/computer_use", editable = true } diff --git a/plugins/computer_use/README.md b/plugins/computer_use/README.md index 3941cdfc5..c11a36862 100644 --- a/plugins/computer_use/README.md +++ b/plugins/computer_use/README.md @@ -15,27 +15,37 @@ Register the tools on any LLM, then use with an agent that receives screen-share ```python from vision_agents.plugins import gemini, computer_use -llm = gemini.Realtime("gemini-2.0-flash-live-001") -computer_use.ComputerUseToolkit().register(llm) +llm = gemini.Realtime(fps=2) +computer_use.register(llm) -agent = Agent(llm=llm) +agent = Agent( + llm=llm, + processors=[computer_use.GridOverlayProcessor(fps=2)], +) ``` -With screen sharing active, the model sees the desktop and can call: +The `GridOverlayProcessor` draws a labeled grid on screen frames so the model can reference cells by name. 
Grid size is customizable: + +```python +computer_use.register(llm, cols=10, rows=10) +computer_use.GridOverlayProcessor(cols=10, rows=10, fps=2) +``` + +With screen sharing active, the model sees the grid and can call: | Tool | Description | |------|-------------| -| `click(x, y, button)` | Click at coordinates | -| `double_click(x, y)` | Double-click at coordinates | +| `click(cell, position, button)` | Click at a grid cell | +| `double_click(cell, position)` | Double-click at a grid cell | | `type_text(text)` | Type text into the focused element | | `key_press(keys)` | Press a key combo, e.g. `"cmd+c"` | -| `scroll(x, y, clicks, direction)` | Scroll at coordinates | -| `mouse_move(x, y)` | Move cursor to coordinates | +| `scroll(cell, position, clicks, direction)` | Scroll at a grid cell | +| `mouse_move(cell, position)` | Move cursor to a grid cell | | `open_path(path)` | Open a file/folder with the OS default handler | ## How it works -The SDK's screen-share pipeline (`TrackType.SCREEN_SHARE`) feeds frames to the VLM/Realtime model continuously. The model sees the screen, decides what to do, and calls action tools backed by [PyAutoGUI](https://pyautogui.readthedocs.io/). +The SDK's screen-share pipeline (`TrackType.SCREEN_SHARE`) feeds frames to the VLM/Realtime model continuously. The `GridOverlayProcessor` annotates these frames with a labeled grid (e.g. A-O / 1-15). The model reads the grid labels, picks the right cell, and calls action tools backed by [PyAutoGUI](https://pyautogui.readthedocs.io/). 
## Platform support diff --git a/plugins/computer_use/tests/test_computer_use.py b/plugins/computer_use/tests/test_computer_use.py index 2dabc823d..a6e7cebd1 100644 --- a/plugins/computer_use/tests/test_computer_use.py +++ b/plugins/computer_use/tests/test_computer_use.py @@ -3,13 +3,10 @@ import pytest from vision_agents.core.llm import FunctionRegistry -from vision_agents.plugins.computer_use import ComputerUseToolkit +from vision_agents.plugins.computer_use import Grid, register from vision_agents.plugins.computer_use._actions import ( - click, - double_click, key_press, - mouse_move, - scroll, + make_grid_actions, type_text, ) @@ -31,17 +28,82 @@ def __init__(self): self.function_registry = FunctionRegistry() -class TestComputerUseToolkit: +class TestGrid: + def test_defaults(self): + grid = Grid() + assert grid.cols == 15 + assert grid.rows == 15 + assert grid.col_labels[0] == "A" + assert grid.col_labels[-1] == "O" + + def test_custom_size(self): + grid = Grid(cols=5, rows=5) + assert grid.cols == 5 + assert grid.rows == 5 + assert grid.col_labels == ["A", "B", "C", "D", "E"] + assert grid.label == "A-E / 1-5" + + def test_invalid_cols(self): + with pytest.raises(ValueError, match="cols must be 1-26"): + Grid(cols=0) + with pytest.raises(ValueError, match="cols must be 1-26"): + Grid(cols=27) + + def test_invalid_rows(self): + with pytest.raises(ValueError, match="rows must be 1-99"): + Grid(rows=0) + with pytest.raises(ValueError, match="rows must be 1-99"): + Grid(rows=100) + + def test_cell_to_virtual_center(self): + grid = Grid(cols=10, rows=10) + vx, vy = grid.cell_to_virtual("A1") + assert vx == 50 + assert vy == 50 + + def test_cell_to_virtual_position(self): + grid = Grid(cols=10, rows=10) + vx, vy = grid.cell_to_virtual("A1", position="top-left") + assert vx == 20 + assert vy == 20 + + def test_cell_to_virtual_last_cell(self): + grid = Grid(cols=10, rows=10) + vx, vy = grid.cell_to_virtual("J10") + assert vx == 950 + assert vy == 950 + + def 
test_cell_to_virtual_invalid_cell(self): + grid = Grid(cols=5, rows=5) + with pytest.raises(ValueError, match="Invalid cell reference"): + grid.cell_to_virtual("Z1") + + def test_cell_to_virtual_out_of_range_row(self): + grid = Grid(cols=5, rows=5) + with pytest.raises(ValueError, match="Row must be 1-5"): + grid.cell_to_virtual("A6") + + def test_cell_to_virtual_invalid_position(self): + grid = Grid(cols=5, rows=5) + with pytest.raises(ValueError, match="Invalid position"): + grid.cell_to_virtual("A1", position="middle") + + def test_label(self): + assert Grid(cols=15, rows=15).label == "A-O / 1-15" + assert Grid(cols=26, rows=1).label == "A-Z / 1-1" + + +class TestRegister: def test_register_adds_all_tools(self): llm = _FakeLLM() - ComputerUseToolkit().register(llm) + register(llm) registered = set(llm.function_registry._functions.keys()) assert registered == EXPECTED_TOOLS def test_register_tool_schemas_have_descriptions(self): llm = _FakeLLM() - ComputerUseToolkit().register(llm) + register(llm) schemas = llm.function_registry.get_tool_schemas() for schema in schemas: @@ -49,9 +111,8 @@ def test_register_tool_schemas_have_descriptions(self): def test_register_is_idempotent(self): llm = _FakeLLM() - toolkit = ComputerUseToolkit() - toolkit.register(llm) - toolkit.register(llm) + register(llm) + register(llm) schemas = llm.function_registry.get_tool_schemas() names = [s["name"] for s in schemas] @@ -59,36 +120,46 @@ def test_register_is_idempotent(self): def test_click_schema_has_parameters(self): llm = _FakeLLM() - ComputerUseToolkit().register(llm) + register(llm) schemas = {s["name"]: s for s in llm.function_registry.get_tool_schemas()} click_schema = schemas["click"] props = click_schema["parameters_schema"]["properties"] - assert "x" in props - assert "y" in props + assert "cell" in props + assert "position" in props assert "button" in props def test_scroll_schema_has_direction(self): llm = _FakeLLM() - ComputerUseToolkit().register(llm) + register(llm) 
schemas = {s["name"]: s for s in llm.function_registry.get_tool_schemas()} scroll_schema = schemas["scroll"] props = scroll_schema["parameters_schema"]["properties"] assert "direction" in props + def test_custom_grid_size_in_descriptions(self): + llm = _FakeLLM() + register(llm, cols=8, rows=8) + + schemas = {s["name"]: s for s in llm.function_registry.get_tool_schemas()} + assert "A-H" in schemas["click"]["description"] + assert "1-8" in schemas["click"]["description"] + class TestActions: """Test that action functions are valid async callables with correct signatures.""" @pytest.mark.integration async def test_click_executes(self): - result = await click(0, 0, button="left") + actions = make_grid_actions(Grid()) + result = await actions["click"](cell="A1") assert "Clicked" in result @pytest.mark.integration async def test_double_click_executes(self): - result = await double_click(0, 0) + actions = make_grid_actions(Grid()) + result = await actions["double_click"](cell="A1") assert "Double-clicked" in result @pytest.mark.integration @@ -103,10 +174,12 @@ async def test_key_press_executes(self): @pytest.mark.integration async def test_scroll_executes(self): - result = await scroll(0, 0, clicks=1, direction="down") + actions = make_grid_actions(Grid()) + result = await actions["scroll"](cell="A1", clicks=1, direction="down") assert "Scrolled" in result @pytest.mark.integration async def test_mouse_move_executes(self): - result = await mouse_move(0, 0) + actions = make_grid_actions(Grid()) + result = await actions["mouse_move"](cell="A1") assert "Moved" in result diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/__init__.py b/plugins/computer_use/vision_agents/plugins/computer_use/__init__.py index 98cf7477b..72f2c3410 100644 --- a/plugins/computer_use/vision_agents/plugins/computer_use/__init__.py +++ b/plugins/computer_use/vision_agents/plugins/computer_use/__init__.py @@ -1,3 +1,5 @@ -from ._toolkit import ComputerUseToolkit +from ._grid import 
Grid +from ._processor import GridOverlayProcessor +from ._toolkit import register -__all__ = ["ComputerUseToolkit"] +__all__ = ["Grid", "GridOverlayProcessor", "register"] diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py index acbb2db3e..f7c193ff6 100644 --- a/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py @@ -4,49 +4,150 @@ import logging import platform import subprocess -from typing import Literal +from collections.abc import Callable, Coroutine +from typing import Any, Literal import pyautogui +from ._grid import VIRTUAL_SIZE, Grid + logger = logging.getLogger(__name__) -pyautogui.FAILSAFE = True +pyautogui.FAILSAFE = False pyautogui.PAUSE = 0.1 -def _run_sync(func, *args, **kwargs): +def _run_sync(func: Callable[..., Any], *args: Any, **kwargs: Any) -> Any: """Run a blocking pyautogui call in a thread executor.""" loop = asyncio.get_running_loop() return loop.run_in_executor(None, lambda: func(*args, **kwargs)) -async def click( - x: int, - y: int, - button: str = "left", -) -> str: - """Click at the given screen coordinates. - - Args: - x: Horizontal pixel coordinate. - y: Vertical pixel coordinate. - button: Mouse button — "left", "right", or "middle". - """ - await _run_sync(pyautogui.click, x, y, button=button) - logger.debug("click(%d, %d, button=%s)", x, y, button) - return f"Clicked at ({x}, {y}) with {button} button" - - -async def double_click(x: int, y: int) -> str: - """Double-click at the given screen coordinates. - - Args: - x: Horizontal pixel coordinate. - y: Vertical pixel coordinate. 
def _to_screen(vx: int, vy: int) -> tuple[int, int]:
    """Scale a point from the virtual coordinate space to screen pixels.

    Args:
        vx: Horizontal virtual coordinate; clamped to ``0..VIRTUAL_SIZE``.
        vy: Vertical virtual coordinate; clamped to ``0..VIRTUAL_SIZE``.

    Returns:
        ``(x, y)`` in screen pixels, clamped to the last addressable pixel.

    Raises:
        RuntimeError: If pyautogui is unavailable (the module-level import
            guard sets ``pyautogui`` to ``None`` when the import fails).
    """
    if pyautogui is None:
        raise RuntimeError(
            "pyautogui is not available; desktop actions cannot be executed "
            "in this environment"
        )
    vx = max(0, min(vx, VIRTUAL_SIZE))
    vy = max(0, min(vy, VIRTUAL_SIZE))
    sw, sh = pyautogui.size()
    # vx == VIRTUAL_SIZE would map to sw (one past the last pixel), so clamp.
    screen_x = max(0, min(int(vx * sw / VIRTUAL_SIZE), sw - 1))
    screen_y = max(0, min(int(vy * sh / VIRTUAL_SIZE), sh - 1))
    return screen_x, screen_y


ActionFunc = Callable[..., Coroutine[Any, Any, str]]


def make_grid_actions(grid: Grid) -> dict[str, ActionFunc]:
    """Create coordinate-based action functions bound to *grid*.

    Each returned coroutine resolves a grid cell (plus an optional sub-cell
    position) to screen pixels, logs the resolved target at INFO level, runs
    the pyautogui call through ``_run_sync`` and returns a short
    human-readable result string.

    Args:
        grid: Grid used to translate cell references to virtual coordinates.

    Returns:
        Mapping of tool name (``click``, ``double_click``, ``scroll``,
        ``mouse_move``) to its async implementation.
    """

    def _cell_to_screen(cell: str, position: str = "center") -> tuple[int, int]:
        vx, vy = grid.cell_to_virtual(cell, position=position)
        return _to_screen(vx, vy)

    async def click(
        cell: str,
        position: str = "center",
        button: str = "left",
    ) -> str:
        """Click at a grid cell.

        Args:
            cell: Grid cell reference, e.g. "H8".
            position: Where within the cell to click. One of: top-left, top,
                top-right, left, center, right, bottom-left, bottom, bottom-right.
            button: Mouse button — "left", "right", or "middle".
        """
        sx, sy = _cell_to_screen(cell, position)
        logger.info(
            "click(cell=%s, position=%s -> screen=%d,%d, button=%s)",
            cell,
            position,
            sx,
            sy,
            button,
        )
        await _run_sync(pyautogui.click, sx, sy, button=button)
        return f"Clicked at {cell} ({position}) with {button} button"

    async def double_click(
        cell: str,
        position: str = "center",
    ) -> str:
        """Double-click at a grid cell.

        Args:
            cell: Grid cell reference, e.g. "H8".
            position: Where within the cell to click. One of: top-left, top,
                top-right, left, center, right, bottom-left, bottom, bottom-right.
        """
        sx, sy = _cell_to_screen(cell, position)
        logger.info(
            "double_click(cell=%s, position=%s -> screen=%d,%d)",
            cell,
            position,
            sx,
            sy,
        )
        await _run_sync(pyautogui.doubleClick, sx, sy)
        return f"Double-clicked at {cell} ({position})"

    async def scroll(
        cell: str,
        position: str = "center",
        clicks: int = 3,
        direction: Literal["up", "down"] = "down",
    ) -> str:
        """Scroll at a grid cell.

        Args:
            cell: Grid cell reference, e.g. "H8".
            position: Where within the cell to scroll. One of: top-left, top,
                top-right, left, center, right, bottom-left, bottom, bottom-right.
            clicks: Number of scroll increments.
            direction: "up" or "down".
        """
        sx, sy = _cell_to_screen(cell, position)
        # Only `direction` decides the sign: a negative `clicks` must not
        # silently flip the scroll the caller asked for.
        steps = abs(clicks)
        amount = steps if direction == "up" else -steps
        # Log before acting and at INFO, like the other grid actions.
        logger.info(
            "scroll(cell=%s, position=%s -> screen=%d,%d, clicks=%d, direction=%s)",
            cell,
            position,
            sx,
            sy,
            steps,
            direction,
        )
        await _run_sync(pyautogui.scroll, amount, x=sx, y=sy)
        return f"Scrolled {direction} {steps} clicks at {cell} ({position})"

    async def mouse_move(
        cell: str,
        position: str = "center",
    ) -> str:
        """Move the mouse cursor to a grid cell.

        Args:
            cell: Grid cell reference, e.g. "H8".
            position: Where within the cell to move. One of: top-left, top,
                top-right, left, center, right, bottom-left, bottom, bottom-right.
        """
        sx, sy = _cell_to_screen(cell, position)
        logger.info(
            "mouse_move(cell=%s, position=%s -> screen=%d,%d)",
            cell,
            position,
            sx,
            sy,
        )
        await _run_sync(pyautogui.moveTo, sx, sy)
        return f"Moved mouse to {cell} ({position})"

    return {
        "click": click,
        "double_click": double_click,
        "scroll": scroll,
        "mouse_move": mouse_move,
    }
"""Configurable grid overlay and cell-to-coordinate conversion."""

import functools
import logging
import re

import av
from PIL import Image, ImageDraw, ImageFont
from PIL.ImageFont import FreeTypeFont, ImageFont as PILImageFont

logger = logging.getLogger(__name__)

_LINE_COLOR = (255, 255, 0, 100)
_LABEL_COLOR = (255, 255, 0, 200)
_LABEL_SHADOW_COLOR = (0, 0, 0, 160)

# Fractional (x, y) offset inside a cell for each named sub-cell position.
_POSITION_OFFSETS: dict[str, tuple[float, float]] = {
    "top-left": (0.2, 0.2),
    "top": (0.5, 0.2),
    "top-right": (0.8, 0.2),
    "left": (0.2, 0.5),
    "center": (0.5, 0.5),
    "right": (0.8, 0.5),
    "bottom-left": (0.2, 0.8),
    "bottom": (0.5, 0.8),
    "bottom-right": (0.8, 0.8),
}

VIRTUAL_SIZE = 1000

# One letter followed by a 1-2 digit row; range checks happen separately so
# out-of-range references get a specific error message.
_CELL_RE = re.compile(r"([A-Za-z])(\d{1,2})")

# Tried in order; the first one Pillow can open wins. Bare file names rely on
# Pillow's own system font lookup. Presumably present on typical Linux and
# Windows installs respectively — falls back to Pillow's default otherwise.
_FONT_CANDIDATES = (
    "/System/Library/Fonts/Menlo.ttc",
    "DejaVuSansMono.ttf",
    "consola.ttf",
)


@functools.lru_cache(maxsize=16)
def _load_font(size: int) -> FreeTypeFont | PILImageFont:
    """Return a label font of *size*, cached so frames don't reload it."""
    for candidate in _FONT_CANDIDATES:
        try:
            return ImageFont.truetype(candidate, size=size)
        except OSError:
            continue
    return ImageFont.load_default()


class Grid:
    """A configurable grid for mapping cell references to screen coordinates.

    Args:
        cols: Number of columns (1-26). Default 15.
        rows: Number of rows (1-99). Default 15.

    Raises:
        ValueError: If *cols* or *rows* is outside its allowed range.
    """

    def __init__(self, cols: int = 15, rows: int = 15) -> None:
        if cols < 1 or cols > 26:
            raise ValueError(f"cols must be 1-26, got {cols}")
        if rows < 1 or rows > 99:
            raise ValueError(f"rows must be 1-99, got {rows}")

        self.cols = cols
        self.rows = rows
        self.col_labels = [chr(ord("A") + i) for i in range(cols)]
        self.row_labels = list(range(1, rows + 1))
        # Exact (fractional) cell size in virtual units. Floor division here
        # would accumulate error across cells (e.g. 15 * 66 = 990, not 1000)
        # and drift away from the overlay, which is drawn with float steps.
        self._cell_w = VIRTUAL_SIZE / cols
        self._cell_h = VIRTUAL_SIZE / rows
        self._cell_pattern = _CELL_RE

    @property
    def label(self) -> str:
        """Short description, e.g. 'A-O / 1-15'."""
        return f"{self.col_labels[0]}-{self.col_labels[-1]} / 1-{self.rows}"

    def cell_to_virtual(self, cell: str, position: str = "center") -> tuple[int, int]:
        """Convert a cell reference like 'C2' to virtual (x, y).

        Args:
            cell: Grid cell, e.g. "C2". Case-insensitive; surrounding
                whitespace is ignored.
            position: Sub-cell target. One of: top-left, top, top-right, left,
                center, right, bottom-left, bottom, bottom-right.

        Returns:
            ``(x, y)`` in the ``0..VIRTUAL_SIZE`` virtual coordinate space.

        Raises:
            ValueError: If the cell reference or position is invalid.
        """
        m = self._cell_pattern.fullmatch(cell.strip())
        if m is None:
            raise ValueError(f"Invalid cell reference: {cell!r}. Use format like 'C2'.")
        col_letter = m.group(1).upper()
        row_num = int(m.group(2))

        col_idx = ord(col_letter) - ord("A")
        if col_idx >= self.cols:
            raise ValueError(
                f"Column must be {self.col_labels[0]}-{self.col_labels[-1]}, "
                f"got {col_letter!r}"
            )
        if row_num < 1 or row_num > self.rows:
            raise ValueError(f"Row must be 1-{self.rows}, got {row_num}")

        offsets = _POSITION_OFFSETS.get(position.lower().strip())
        if offsets is None:
            raise ValueError(
                f"Invalid position: {position!r}. "
                f"Choose from: {', '.join(_POSITION_OFFSETS)}"
            )
        ox, oy = offsets

        # Multiply before dividing so the only rounding is the final int().
        vx = int((col_idx + ox) * VIRTUAL_SIZE / self.cols)
        vy = int((row_num - 1 + oy) * VIRTUAL_SIZE / self.rows)
        return vx, vy

    def draw_overlay(self, frame: av.VideoFrame) -> av.VideoFrame:
        """Draw a labeled grid overlay on *frame*.

        Args:
            frame: Source video frame; it is not modified.

        Returns:
            A new RGB frame with grid lines and a cell label (e.g. "C2")
            in the top-left corner of every cell.
        """
        img = frame.to_image().convert("RGBA")
        w, h = img.size

        overlay = Image.new("RGBA", (w, h), (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)
        font = _load_font(max(8, h // (self.rows * 4)))

        col_step = w / self.cols
        row_step = h / self.rows

        for i in range(1, self.cols):
            x = int(i * col_step)
            draw.line([(x, 0), (x, h)], fill=_LINE_COLOR, width=1)

        for i in range(1, self.rows):
            y = int(i * row_step)
            draw.line([(0, y), (w, y)], fill=_LINE_COLOR, width=1)

        for ci, col in enumerate(self.col_labels):
            lx = int(ci * col_step) + 2
            for ri, row in enumerate(self.row_labels):
                text = f"{col}{row}"
                ly = int(ri * row_step) + 1
                # Dark drop shadow first so labels stay readable on light UIs.
                draw.text((lx + 1, ly + 1), text, fill=_LABEL_SHADOW_COLOR, font=font)
                draw.text((lx, ly), text, fill=_LABEL_COLOR, font=font)

        composited = Image.alpha_composite(img, overlay).convert("RGB")
        return av.VideoFrame.from_image(composited)
"""GridOverlayProcessor — draws a labeled grid on screen share frames."""

import logging
from typing import Optional

import aiortc
import av
from vision_agents.core.processors.base_processor import VideoProcessorPublisher
from vision_agents.core.utils.video_forwarder import VideoForwarder
from vision_agents.core.utils.video_track import QueuedVideoTrack

from ._grid import Grid

logger = logging.getLogger(__name__)


class GridOverlayProcessor(VideoProcessorPublisher):
    """Draws a configurable grid on incoming video frames.

    The annotated frames are forwarded to the LLM so it can reference
    grid cells (e.g. "C2") instead of guessing raw pixel coordinates.

    Args:
        cols: Number of grid columns (1-26). Default 15.
        rows: Number of grid rows (1-99). Default 15.
        fps: Frame rate for processing; must be positive. Default 2.

    Raises:
        ValueError: If *cols*/*rows* are out of range or *fps* is not positive.

    Usage::

        from vision_agents.plugins.computer_use import GridOverlayProcessor

        agent = Agent(
            ...,
            processors=[GridOverlayProcessor(cols=15, rows=15)],
        )
    """

    name = "grid_overlay"

    def __init__(self, cols: int = 15, rows: int = 15, fps: float = 2):
        if fps <= 0:
            raise ValueError(f"fps must be positive, got {fps}")
        self._grid = Grid(cols=cols, rows=rows)
        self._fps = fps
        self._video_forwarder: Optional[VideoForwarder] = None
        self._video_track = QueuedVideoTrack()
        self._shutdown = False

    def publish_video_track(self) -> aiortc.VideoStreamTrack:
        """Return the track that carries the grid-annotated frames."""
        return self._video_track

    async def process_video(
        self,
        track: aiortc.VideoStreamTrack,
        participant_id: Optional[str],
        shared_forwarder: Optional[VideoForwarder] = None,
    ) -> None:
        """Start annotating frames from *track*.

        Any handler registered by a previous call is removed first, so only
        one source feeds the output track at a time.

        Args:
            track: Incoming video track; used only when no shared forwarder
                is supplied.
            participant_id: Unused by this processor.
            shared_forwarder: Existing forwarder to attach to instead of
                creating a dedicated one.
        """
        if self._video_forwarder is not None:
            await self._video_forwarder.remove_frame_handler(self._on_frame)

        logger.info(
            "Starting grid overlay processor at %.1f FPS (grid %s)",
            self._fps,
            self._grid.label,
        )
        if shared_forwarder is not None:
            self._video_forwarder = shared_forwarder
        else:
            self._video_forwarder = VideoForwarder(
                track,
                # int() truncates to 0 for fps < 1; keep at least one slot.
                max_buffer=max(1, int(self._fps)),
                fps=self._fps,
                name="grid_overlay_forwarder",
            )
        self._video_forwarder.add_frame_handler(
            self._on_frame, fps=self._fps, name="grid_overlay"
        )

    async def _on_frame(self, frame: av.VideoFrame) -> None:
        """Annotate one frame and queue it; fall back to the raw frame on error."""
        if self._shutdown:
            return
        try:
            annotated = self._grid.draw_overlay(frame)
        except Exception:
            # Best effort: a drawing failure must not stall the video feed.
            logger.exception("draw_overlay failed, forwarding original frame")
            annotated = frame
        await self._video_track.add_frame(annotated)

    async def stop_processing(self) -> None:
        """Detach from the current forwarder, if any."""
        if self._video_forwarder is not None:
            await self._video_forwarder.remove_frame_handler(self._on_frame)
            self._video_forwarder = None
            logger.info("Stopped grid overlay processor")

    async def close(self) -> None:
        """Stop processing and drop any frames that arrive afterwards."""
        self._shutdown = True
        await self.stop_processing()
- - Usage:: - - from vision_agents.plugins import computer_use - - computer_use.ComputerUseToolkit().register(llm) + Args: + llm: The LLM to register tools on. + cols: Number of grid columns (1-26). Default 15. + rows: Number of grid rows (1-99). Default 15. """ - - def register(self, llm: LLM) -> None: - """Register all computer-use action tools on *llm*.""" - tools = { - "click": _actions.click, - "double_click": _actions.double_click, - "type_text": _actions.type_text, - "key_press": _actions.key_press, - "scroll": _actions.scroll, - "mouse_move": _actions.mouse_move, - "open_path": _actions.open_path, - } - for name, func in tools.items(): - llm.function_registry.register( - name=name, description=_TOOL_DESCRIPTIONS[name] - )(func) - logger.info( - "Registered %d computer-use tools on %s", len(tools), type(llm).__name__ - ) + grid = Grid(cols=cols, rows=rows) + + cell_hint = ( + f" The screen has a grid overlay with columns" + f" {grid.col_labels[0]}-{grid.col_labels[-1]}" + f" and rows 1-{grid.rows}." + " Provide the 'cell' parameter (e.g. 'H8') to target a grid cell." + " Use 'position' to refine within the cell" + " (top-left, top, top-right, left, center, right," + " bottom-left, bottom, bottom-right)." + ) + + descriptions = { + "click": "Click at a grid cell." + cell_hint, + "double_click": "Double-click at a grid cell." + cell_hint, + "type_text": "Type a string of text into the currently focused element.", + "key_press": 'Press a key or key combination, e.g. "cmd+c", "enter", "ctrl+shift+t".', + "scroll": "Scroll at a grid cell." + cell_hint, + "mouse_move": "Move the mouse cursor to a grid cell." 
+ cell_hint, + "open_path": "Open a file or folder by its absolute path using the OS default handler.", + } + + grid_actions = _actions.make_grid_actions(grid) + tools: dict[str, _actions.ActionFunc] = { + **grid_actions, + "type_text": _actions.type_text, + "key_press": _actions.key_press, + "open_path": _actions.open_path, + } + + for name, func in tools.items(): + llm.function_registry.register(name=name, description=descriptions[name])(func) + logger.info( + "Registered %d computer-use tools on %s (grid %s)", + len(tools), + type(llm).__name__, + grid.label, + ) diff --git a/uv.lock b/uv.lock index 736c9d66b..375a2701b 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.12" resolution-markers = [ "python_full_version >= '3.14' and sys_platform == 'win32'", @@ -17,6 +17,7 @@ members = [ "vision-agents-plugins-assemblyai", "vision-agents-plugins-aws", "vision-agents-plugins-cartesia", + "vision-agents-plugins-computer-use", "vision-agents-plugins-decart", "vision-agents-plugins-deepgram", "vision-agents-plugins-elevenlabs", @@ -3201,6 +3202,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/8e/469e5a4a2f5855992e425f3cb33804cc07bf18d48f2db061aec61ce50270/more_itertools-10.8.0-py3-none-any.whl", hash = "sha256:52d4362373dcf7c52546bc4af9a86ee7c4579df9a8dc268be0a2f949d376cc9b", size = 69667, upload-time = "2025-09-02T15:23:09.635Z" }, ] +[[package]] +name = "mouseinfo" +version = "0.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyperclip" }, + { name = "python3-xlib", marker = "sys_platform == 'linux'" }, + { name = "rubicon-objc", marker = "sys_platform == 'darwin'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/fa/b2ba8229b9381e8f6381c1dcae6f4159a7f72349e414ed19cfbbd1817173/MouseInfo-0.1.3.tar.gz", hash = "sha256:2c62fb8885062b8e520a3cce0a297c657adcc08c60952eb05bc8256ef6f7f6e7", size = 10850, upload-time = 
"2020-03-27T21:20:10.136Z" } + [[package]] name = "mpmath" version = "1.3.0" @@ -4704,6 +4716,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, ] +[[package]] +name = "pyautogui" +version = "0.9.54" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mouseinfo" }, + { name = "pygetwindow" }, + { name = "pymsgbox" }, + { name = "pyobjc-core", marker = "sys_platform == 'darwin'" }, + { name = "pyobjc-framework-quartz", marker = "sys_platform == 'darwin'" }, + { name = "pyscreeze" }, + { name = "python3-xlib", marker = "sys_platform == 'linux'" }, + { name = "pytweening" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/65/ff/cdae0a8c2118a0de74b6cf4cbcdcaf8fd25857e6c3f205ce4b1794b27814/PyAutoGUI-0.9.54.tar.gz", hash = "sha256:dd1d29e8fd118941cb193f74df57e5c6ff8e9253b99c7b04f39cfc69f3ae04b2", size = 61236, upload-time = "2023-05-24T20:11:32.972Z" } + [[package]] name = "pybase64" version = "1.4.3" @@ -4995,6 +5023,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9b/4d/b9add7c84060d4c1906abe9a7e5359f2a60f7a9a4f67268b2766673427d8/pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498", size = 15730, upload-time = "2025-03-17T18:53:14.532Z" }, ] +[[package]] +name = "pygetwindow" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyrect" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e1/70/c7a4f46dbf06048c6d57d9489b8e0f9c4c3d36b7479f03c5ca97eaa2541d/PyGetWindow-0.0.9.tar.gz", hash = "sha256:17894355e7d2b305cd832d717708384017c1698a90ce24f6f7fbf0242dd0a688", size = 9699, upload-time = "2020-10-04T02:12:50.806Z" } + [[package]] name = 
"pygments" version = "2.19.2" @@ -5099,6 +5136,61 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/ec/6e02b2561d056ea5b33046e3cad21238e6a9097b97d6ccc0fbe52b50c858/pylibsrtp-1.0.0-cp310-abi3-win_arm64.whl", hash = "sha256:2696bdb2180d53ac55d0eb7b58048a2aa30cd4836dd2ca683669889137a94d2a", size = 1159246, upload-time = "2025-10-13T16:12:30.285Z" }, ] +[[package]] +name = "pymsgbox" +version = "2.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/6a/e80da7594ee598a776972d09e2813df2b06b3bc29218f440631dfa7c78a8/pymsgbox-2.0.1.tar.gz", hash = "sha256:98d055c49a511dcc10fa08c3043e7102d468f5e4b3a83c6d3c61df722c7d798d", size = 20768, upload-time = "2025-09-09T00:38:56.863Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6f/3e/08c8cac81b2b2f7502746e6b9c8e5b0ec6432cd882c605560fc409aaf087/pymsgbox-2.0.1-py3-none-any.whl", hash = "sha256:5de8ec19bca2ca7e6c09d39c817c83f17c75cee80275235f43a9931db699f73b", size = 9994, upload-time = "2025-09-09T00:38:55.672Z" }, +] + +[[package]] +name = "pyobjc-core" +version = "12.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b8/b6/d5612eb40be4fd5ef88c259339e6313f46ba67577a95d86c3470b951fce0/pyobjc_core-12.1.tar.gz", hash = "sha256:2bb3903f5387f72422145e1466b3ac3f7f0ef2e9960afa9bcd8961c5cbf8bd21", size = 1000532, upload-time = "2025-11-14T10:08:28.292Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/5a/6b15e499de73050f4a2c88fff664ae154307d25dc04da8fb38998a428358/pyobjc_core-12.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:818bcc6723561f207e5b5453efe9703f34bc8781d11ce9b8be286bb415eb4962", size = 678335, upload-time = "2025-11-14T09:32:20.107Z" }, + { url = "https://files.pythonhosted.org/packages/f4/d2/29e5e536adc07bc3d33dd09f3f7cf844bf7b4981820dc2a91dd810f3c782/pyobjc_core-12.1-cp313-cp313-macosx_10_13_universal2.whl", hash = 
"sha256:01c0cf500596f03e21c23aef9b5f326b9fb1f8f118cf0d8b66749b6cf4cbb37a", size = 677370, upload-time = "2025-11-14T09:33:05.273Z" }, + { url = "https://files.pythonhosted.org/packages/1b/f0/4b4ed8924cd04e425f2a07269943018d43949afad1c348c3ed4d9d032787/pyobjc_core-12.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:177aaca84bb369a483e4961186704f64b2697708046745f8167e818d968c88fc", size = 719586, upload-time = "2025-11-14T09:33:53.302Z" }, + { url = "https://files.pythonhosted.org/packages/25/98/9f4ed07162de69603144ff480be35cd021808faa7f730d082b92f7ebf2b5/pyobjc_core-12.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:844515f5d86395b979d02152576e7dee9cc679acc0b32dc626ef5bda315eaa43", size = 670164, upload-time = "2025-11-14T09:34:37.458Z" }, + { url = "https://files.pythonhosted.org/packages/62/50/dc076965c96c7f0de25c0a32b7f8aa98133ed244deaeeacfc758783f1f30/pyobjc_core-12.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:453b191df1a4b80e756445b935491b974714456ae2cbae816840bd96f86db882", size = 712204, upload-time = "2025-11-14T09:35:24.148Z" }, +] + +[[package]] +name = "pyobjc-framework-cocoa" +version = "12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyobjc-core", marker = "sys_platform != 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/02/a3/16ca9a15e77c061a9250afbae2eae26f2e1579eb8ca9462ae2d2c71e1169/pyobjc_framework_cocoa-12.1.tar.gz", hash = "sha256:5556c87db95711b985d5efdaaf01c917ddd41d148b1e52a0c66b1a2e2c5c1640", size = 2772191, upload-time = "2025-11-14T10:13:02.069Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/bf/ee4f27ec3920d5c6fc63c63e797c5b2cc4e20fe439217085d01ea5b63856/pyobjc_framework_cocoa-12.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:547c182837214b7ec4796dac5aee3aa25abc665757b75d7f44f83c994bcb0858", size = 384590, upload-time = "2025-11-14T09:41:17.336Z" }, + { url = 
"https://files.pythonhosted.org/packages/ad/31/0c2e734165abb46215797bd830c4bdcb780b699854b15f2b6240515edcc6/pyobjc_framework_cocoa-12.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:5a3dcd491cacc2f5a197142b3c556d8aafa3963011110102a093349017705118", size = 384689, upload-time = "2025-11-14T09:41:41.478Z" }, + { url = "https://files.pythonhosted.org/packages/23/3b/b9f61be7b9f9b4e0a6db18b3c35c4c4d589f2d04e963e2174d38c6555a92/pyobjc_framework_cocoa-12.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:914b74328c22d8ca261d78c23ef2befc29776e0b85555973927b338c5734ca44", size = 388843, upload-time = "2025-11-14T09:42:05.719Z" }, + { url = "https://files.pythonhosted.org/packages/59/bb/f777cc9e775fc7dae77b569254570fe46eb842516b3e4fe383ab49eab598/pyobjc_framework_cocoa-12.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:03342a60fc0015bcdf9b93ac0b4f457d3938e9ef761b28df9564c91a14f0129a", size = 384932, upload-time = "2025-11-14T09:42:29.771Z" }, + { url = "https://files.pythonhosted.org/packages/58/27/b457b7b37089cad692c8aada90119162dfb4c4a16f513b79a8b2b022b33b/pyobjc_framework_cocoa-12.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:6ba1dc1bfa4da42d04e93d2363491275fb2e2be5c20790e561c8a9e09b8cf2cc", size = 388970, upload-time = "2025-11-14T09:42:53.964Z" }, +] + +[[package]] +name = "pyobjc-framework-quartz" +version = "12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyobjc-core", marker = "sys_platform != 'win32'" }, + { name = "pyobjc-framework-cocoa", marker = "sys_platform != 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/94/18/cc59f3d4355c9456fc945eae7fe8797003c4da99212dd531ad1b0de8a0c6/pyobjc_framework_quartz-12.1.tar.gz", hash = "sha256:27f782f3513ac88ec9b6c82d9767eef95a5cf4175ce88a1e5a65875fee799608", size = 3159099, upload-time = "2025-11-14T10:21:24.31Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e9/9b/780f057e5962f690f23fdff1083a4cfda5a96d5b4d3bb49505cac4f624f2/pyobjc_framework_quartz-12.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:7730cdce46c7e985535b5a42c31381af4aa6556e5642dc55b5e6597595e57a16", size = 218798, upload-time = "2025-11-14T10:00:01.236Z" }, + { url = "https://files.pythonhosted.org/packages/ba/2d/e8f495328101898c16c32ac10e7b14b08ff2c443a756a76fd1271915f097/pyobjc_framework_quartz-12.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:629b7971b1b43a11617f1460cd218bd308dfea247cd4ee3842eb40ca6f588860", size = 219206, upload-time = "2025-11-14T10:00:15.623Z" }, + { url = "https://files.pythonhosted.org/packages/67/43/b1f0ad3b842ab150a7e6b7d97f6257eab6af241b4c7d14cb8e7fde9214b8/pyobjc_framework_quartz-12.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:53b84e880c358ba1ddcd7e8d5ea0407d760eca58b96f0d344829162cda5f37b3", size = 224317, upload-time = "2025-11-14T10:00:30.703Z" }, + { url = "https://files.pythonhosted.org/packages/4a/00/96249c5c7e5aaca5f688ca18b8d8ad05cd7886ebd639b3c71a6a4cadbe75/pyobjc_framework_quartz-12.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:42d306b07f05ae7d155984503e0fb1b701fecd31dcc5c79fe8ab9790ff7e0de0", size = 219558, upload-time = "2025-11-14T10:00:45.476Z" }, + { url = "https://files.pythonhosted.org/packages/4d/a6/708a55f3ff7a18c403b30a29a11dccfed0410485a7548c60a4b6d4cc0676/pyobjc_framework_quartz-12.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:0cc08fddb339b2760df60dea1057453557588908e42bdc62184b6396ce2d6e9a", size = 224580, upload-time = "2025-11-14T10:01:00.091Z" }, +] + [[package]] name = "pyopenssl" version = "25.3.0" @@ -5139,6 +5231,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = 
"2024-09-19T02:40:08.598Z" }, ] +[[package]] +name = "pyrect" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cb/04/2ba023d5f771b645f7be0c281cdacdcd939fe13d1deb331fc5ed1a6b3a98/PyRect-0.2.0.tar.gz", hash = "sha256:f65155f6df9b929b67caffbd57c0947c5ae5449d3b580d178074bffb47a09b78", size = 17219, upload-time = "2022-03-16T04:45:52.36Z" } + +[[package]] +name = "pyscreeze" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/f0/cb456ac4f1a73723d5b866933b7986f02bacea27516629c00f8e7da94c2d/pyscreeze-1.0.1.tar.gz", hash = "sha256:cf1662710f1b46aa5ff229ee23f367da9e20af4a78e6e365bee973cad0ead4be", size = 27826, upload-time = "2024-08-20T23:03:07.291Z" } + [[package]] name = "pytest" version = "9.0.0" @@ -5232,6 +5336,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" }, ] +[[package]] +name = "python3-xlib" +version = "0.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/c6/2c5999de3bb1533521f1101e8fe56fd9c266732f4d48011c7c69b29d12ae/python3-xlib-0.15.tar.gz", hash = "sha256:dc4245f3ae4aa5949c1d112ee4723901ade37a96721ba9645f2bfa56e5b383f8", size = 132828, upload-time = "2014-05-31T12:28:59.603Z" } + +[[package]] +name = "pytweening" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/0c/c16bc93ac2755bac0066a8ecbd2a2931a1735a6fffd99a2b9681c7e83e90/pytweening-1.2.0.tar.gz", hash = "sha256:243318b7736698066c5f362ec5c2b6434ecf4297c3c8e7caa8abfe6af4cac71b", size = 171241, upload-time = "2024-02-20T03:37:56.809Z" } + 
[[package]] name = "pytz" version = "2025.2" @@ -5675,6 +5791,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, ] +[[package]] +name = "rubicon-objc" +version = "0.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4f/d2/d39ecd205661a5c14c90dbd92a722a203848a3621785c9783716341de427/rubicon_objc-0.5.3.tar.gz", hash = "sha256:74c25920c5951a05db9d3a1aac31d23816ec7dacc841a5b124d911b99ea71b9a", size = 171512, upload-time = "2025-12-03T03:51:10.264Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/93/ab/e834c01138c272fb2e37d2f3c7cba708bc694dbc7b3f03b743f29ceb92d5/rubicon_objc-0.5.3-py3-none-any.whl", hash = "sha256:31dedcda9be38435f5ec067906e1eea5d0ddb790330e98a22e94ff424758b415", size = 64414, upload-time = "2025-12-03T03:51:09.082Z" }, +] + [[package]] name = "ruff" version = "0.14.4" @@ -7267,6 +7392,32 @@ dev = [ { name = "pytest-asyncio", specifier = ">=1.0.0" }, ] +[[package]] +name = "vision-agents-plugins-computer-use" +source = { editable = "plugins/computer_use" } +dependencies = [ + { name = "pyautogui" }, + { name = "vision-agents" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, + { name = "pytest-asyncio" }, +] + +[package.metadata] +requires-dist = [ + { name = "pyautogui" }, + { name = "vision-agents", editable = "agents-core" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=8.4.1" }, + { name = "pytest-asyncio", specifier = ">=1.0.0" }, +] + [[package]] name = "vision-agents-plugins-decart" source = { editable = "plugins/decart" } From d9bf77a832c412ee64195d69a80bc3406d12cc98 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Wed, 11 Mar 2026 12:59:35 +0100 Subject: [PATCH 
3/8] Add computer-use plugin to core optional dependencies Made-with: Cursor --- agents-core/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/agents-core/pyproject.toml b/agents-core/pyproject.toml index eba9dbf74..159f065af 100644 --- a/agents-core/pyproject.toml +++ b/agents-core/pyproject.toml @@ -72,6 +72,7 @@ twilio = ["vision-agents-plugins-twilio"] turbopuffer = ["vision-agents-plugins-turbopuffer"] mistral = ["vision-agents-plugins-mistral"] assemblyai = ["vision-agents-plugins-assemblyai"] +computer-use = ["vision-agents-plugins-computer-use"] redis = ["redis[hiredis]>=5.0.0"] all-plugins = [ @@ -107,6 +108,7 @@ all-plugins = [ "vision-agents-plugins-turbopuffer", "vision-agents-plugins-mistral", "vision-agents-plugins-assemblyai", + "vision-agents-plugins-computer-use", ] [tool.hatch.metadata] From 5c3cd84cffa427aae9b789a7dbc2bb2231ef94f4 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Wed, 11 Mar 2026 13:10:09 +0100 Subject: [PATCH 4/8] Guard pyautogui import for headless CI environments Made-with: Cursor --- .../vision_agents/plugins/computer_use/_actions.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py index f7c193ff6..93040f502 100644 --- a/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py @@ -7,14 +7,17 @@ from collections.abc import Callable, Coroutine from typing import Any, Literal -import pyautogui - from ._grid import VIRTUAL_SIZE, Grid logger = logging.getLogger(__name__) -pyautogui.FAILSAFE = False -pyautogui.PAUSE = 0.1 +try: + import pyautogui + + pyautogui.FAILSAFE = False + pyautogui.PAUSE = 0.1 +except (KeyError, ImportError): + pyautogui = None # type: ignore[assignment] def _run_sync(func: Callable[..., Any], *args: Any, **kwargs: Any) -> Any: From 
64fcc62e02e546223eec0e185e8eac324c4148c2 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Wed, 11 Mar 2026 13:12:55 +0100 Subject: [PATCH 5/8] Fix review feedback: update example README table and add double_click to instructions Made-with: Cursor --- examples/10_computer_use_example/README.md | 10 +++++----- examples/10_computer_use_example/instructions.md | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/10_computer_use_example/README.md b/examples/10_computer_use_example/README.md index 7573c6b70..e1a82cf00 100644 --- a/examples/10_computer_use_example/README.md +++ b/examples/10_computer_use_example/README.md @@ -31,7 +31,7 @@ An AI desktop assistant that can see your screen and control your computer. Shar ``` 3. Set up your `.env`: - ``` + ```bash GOOGLE_API_KEY=your_google_key STREAM_API_KEY=your_stream_key STREAM_API_SECRET=your_stream_secret @@ -49,12 +49,12 @@ The agent will create a call and open a demo UI. Share your screen in the call, | Tool | What it does | |------|-------------| -| `click(x, y)` | Click at screen coordinates | -| `double_click(x, y)` | Double-click at coordinates | +| `click(cell, position, button)` | Click at a grid cell | +| `double_click(cell, position)` | Double-click at a grid cell | | `type_text(text)` | Type into the focused element | | `key_press(keys)` | Press a key combo, e.g. 
`"cmd+c"` | -| `scroll(x, y, clicks, direction)` | Scroll at coordinates | -| `mouse_move(x, y)` | Move the cursor | +| `scroll(cell, position, clicks, direction)` | Scroll at a grid cell | +| `mouse_move(cell, position)` | Move the cursor to a grid cell | | `open_path(path)` | Open a file or folder with the OS default handler | ## Important notes diff --git a/examples/10_computer_use_example/instructions.md b/examples/10_computer_use_example/instructions.md index c2ee648cd..dd0136cf5 100644 --- a/examples/10_computer_use_example/instructions.md +++ b/examples/10_computer_use_example/instructions.md @@ -2,7 +2,7 @@ You are a **Desktop Assistant** that controls the user's computer by calling you ## Critical Rule -When the user asks you to do something on screen, you MUST call the appropriate tool function (click, mouse_move, type_text, key_press, scroll, open_path). Never just describe what you would do — actually call the tool. If the user says "click on X", call the `click` tool. If they say "move cursor to X", call `mouse_move`. +When the user asks you to do something on screen, you MUST call the appropriate tool function (click, double_click, mouse_move, type_text, key_press, scroll, open_path). Never just describe what you would do — actually call the tool. If the user says "click on X", call the `click` tool. If they say "move cursor to X", call `mouse_move`. ## Grid system From 8f73eccdf95018bc6944b9b58c26c6c1f5b4dc98 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Wed, 11 Mar 2026 13:24:50 +0100 Subject: [PATCH 6/8] Add shared Grid instance support to register() and GridOverlayProcessor Both now accept an optional `grid=` parameter so tools and overlay share a single source of truth for grid dimensions. 
Made-with: Cursor --- .../computer_use_example.py | 7 +++++-- plugins/computer_use/README.md | 7 ++++--- plugins/computer_use/tests/test_computer_use.py | 9 +++++++++ .../plugins/computer_use/_processor.py | 16 ++++++++++++---- .../plugins/computer_use/_toolkit.py | 11 +++++++++-- uv.lock | 8 +++++++- 6 files changed, 46 insertions(+), 12 deletions(-) diff --git a/examples/10_computer_use_example/computer_use_example.py b/examples/10_computer_use_example/computer_use_example.py index 8e78a7d34..cce530b75 100644 --- a/examples/10_computer_use_example/computer_use_example.py +++ b/examples/10_computer_use_example/computer_use_example.py @@ -22,9 +22,12 @@ load_dotenv() +grid = computer_use.Grid(cols=15, rows=15) + + def setup_llm() -> gemini.Realtime: llm = gemini.Realtime(fps=2) - computer_use.register(llm) + computer_use.register(llm, grid=grid) return llm @@ -34,7 +37,7 @@ async def create_agent(**kwargs) -> Agent: agent_user=User(name="Desktop Assistant", id="desktop-agent"), instructions="Read @examples/10_computer_use_example/instructions.md", llm=setup_llm(), - processors=[computer_use.GridOverlayProcessor(fps=2)], + processors=[computer_use.GridOverlayProcessor(grid=grid, fps=2)], ) return agent diff --git a/plugins/computer_use/README.md b/plugins/computer_use/README.md index c11a36862..bf7938327 100644 --- a/plugins/computer_use/README.md +++ b/plugins/computer_use/README.md @@ -24,11 +24,12 @@ agent = Agent( ) ``` -The `GridOverlayProcessor` draws a labeled grid on screen frames so the model can reference cells by name. Grid size is customizable: +The `GridOverlayProcessor` draws a labeled grid on screen frames so the model can reference cells by name. 
Grid size is customizable — share a `Grid` instance to keep tools and overlay in sync: ```python -computer_use.register(llm, cols=10, rows=10) -computer_use.GridOverlayProcessor(cols=10, rows=10, fps=2) +grid = computer_use.Grid(cols=20, rows=20) +computer_use.register(llm, grid=grid) +computer_use.GridOverlayProcessor(grid=grid, fps=2) ``` With screen sharing active, the model sees the grid and can call: diff --git a/plugins/computer_use/tests/test_computer_use.py b/plugins/computer_use/tests/test_computer_use.py index a6e7cebd1..8ea814b66 100644 --- a/plugins/computer_use/tests/test_computer_use.py +++ b/plugins/computer_use/tests/test_computer_use.py @@ -146,6 +146,15 @@ def test_custom_grid_size_in_descriptions(self): assert "A-H" in schemas["click"]["description"] assert "1-8" in schemas["click"]["description"] + def test_register_with_shared_grid(self): + llm = _FakeLLM() + grid = Grid(cols=10, rows=10) + register(llm, grid=grid) + + schemas = {s["name"]: s for s in llm.function_registry.get_tool_schemas()} + assert "A-J" in schemas["click"]["description"] + assert "1-10" in schemas["click"]["description"] + class TestActions: """Test that action functions are valid async callables with correct signatures.""" diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/_processor.py b/plugins/computer_use/vision_agents/plugins/computer_use/_processor.py index dc76d12ad..3987ca249 100644 --- a/plugins/computer_use/vision_agents/plugins/computer_use/_processor.py +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_processor.py @@ -21,24 +21,32 @@ class GridOverlayProcessor(VideoProcessorPublisher): grid cells (e.g. "C2") instead of guessing raw pixel coordinates. Args: + grid: Shared Grid instance. If provided, cols/rows are ignored. cols: Number of grid columns (1-26). Default 15. rows: Number of grid rows (1-99). Default 15. fps: Frame rate for processing. Default 2. 
Usage:: - from vision_agents.plugins.computer_use import GridOverlayProcessor + from vision_agents.plugins.computer_use import Grid, GridOverlayProcessor + grid = Grid(cols=20, rows=20) agent = Agent( ..., - processors=[GridOverlayProcessor(cols=15, rows=15)], + processors=[GridOverlayProcessor(grid=grid)], ) """ name = "grid_overlay" - def __init__(self, cols: int = 15, rows: int = 15, fps: float = 2): - self._grid = Grid(cols=cols, rows=rows) + def __init__( + self, + grid: Grid | None = None, + cols: int = 15, + rows: int = 15, + fps: float = 2, + ): + self._grid = grid if grid is not None else Grid(cols=cols, rows=rows) self._fps = fps self._video_forwarder: Optional[VideoForwarder] = None self._video_track = QueuedVideoTrack() diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py b/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py index db67b629c..0c67974dc 100644 --- a/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py @@ -10,15 +10,22 @@ logger = logging.getLogger(__name__) -def register(llm: LLM, cols: int = 15, rows: int = 15) -> None: +def register( + llm: LLM, + grid: Grid | None = None, + cols: int = 15, + rows: int = 15, +) -> None: """Register all computer-use action tools on *llm*. Args: llm: The LLM to register tools on. + grid: Shared Grid instance. If provided, cols/rows are ignored. cols: Number of grid columns (1-26). Default 15. rows: Number of grid rows (1-99). Default 15. 
""" - grid = Grid(cols=cols, rows=rows) + if grid is None: + grid = Grid(cols=cols, rows=rows) cell_hint = ( f" The screen has a grid overlay with columns" diff --git a/uv.lock b/uv.lock index 375a2701b..d0acd7b13 100644 --- a/uv.lock +++ b/uv.lock @@ -7059,6 +7059,7 @@ all-plugins = [ { name = "vision-agents-plugins-assemblyai" }, { name = "vision-agents-plugins-aws" }, { name = "vision-agents-plugins-cartesia" }, + { name = "vision-agents-plugins-computer-use" }, { name = "vision-agents-plugins-decart" }, { name = "vision-agents-plugins-deepgram" }, { name = "vision-agents-plugins-elevenlabs" }, @@ -7100,6 +7101,9 @@ aws = [ cartesia = [ { name = "vision-agents-plugins-cartesia" }, ] +computer-use = [ + { name = "vision-agents-plugins-computer-use" }, +] decart = [ { name = "vision-agents-plugins-decart" }, ] @@ -7219,6 +7223,8 @@ requires-dist = [ { name = "vision-agents-plugins-aws", marker = "extra == 'aws'", editable = "plugins/aws" }, { name = "vision-agents-plugins-cartesia", marker = "extra == 'all-plugins'", editable = "plugins/cartesia" }, { name = "vision-agents-plugins-cartesia", marker = "extra == 'cartesia'", editable = "plugins/cartesia" }, + { name = "vision-agents-plugins-computer-use", marker = "extra == 'all-plugins'", editable = "plugins/computer_use" }, + { name = "vision-agents-plugins-computer-use", marker = "extra == 'computer-use'", editable = "plugins/computer_use" }, { name = "vision-agents-plugins-decart", marker = "extra == 'all-plugins'", editable = "plugins/decart" }, { name = "vision-agents-plugins-decart", marker = "extra == 'decart'", editable = "plugins/decart" }, { name = "vision-agents-plugins-deepgram", marker = "extra == 'all-plugins'", editable = "plugins/deepgram" }, @@ -7276,7 +7282,7 @@ requires-dist = [ { name = "vision-agents-plugins-xai", marker = "extra == 'all-plugins'", editable = "plugins/xai" }, { name = "vision-agents-plugins-xai", marker = "extra == 'xai'", editable = "plugins/xai" }, ] -provides-extras = 
["all-plugins", "anthropic", "assemblyai", "aws", "cartesia", "decart", "deepgram", "dev", "elevenlabs", "fast-whisper", "fish", "gemini", "getstream", "heygen", "huggingface", "inworld", "kokoro", "lemonslice", "mistral", "moondream", "moonshine", "nvidia", "openai", "openrouter", "pocket", "qwen", "redis", "roboflow", "smart-turn", "turbopuffer", "twilio", "ultralytics", "vogent", "wizper", "xai"] +provides-extras = ["all-plugins", "anthropic", "assemblyai", "aws", "cartesia", "computer-use", "decart", "deepgram", "dev", "elevenlabs", "fast-whisper", "fish", "gemini", "getstream", "heygen", "huggingface", "inworld", "kokoro", "lemonslice", "mistral", "moondream", "moonshine", "nvidia", "openai", "openrouter", "pocket", "qwen", "redis", "roboflow", "smart-turn", "turbopuffer", "twilio", "ultralytics", "vogent", "wizper", "xai"] [[package]] name = "vision-agents-plugins-anthropic" From 9980adc24130f169bd465c31c466866db551a70e Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Wed, 11 Mar 2026 13:27:38 +0100 Subject: [PATCH 7/8] Make position a required parameter for grid-based actions Made-with: Cursor --- plugins/computer_use/tests/test_computer_use.py | 10 ++++++---- .../plugins/computer_use/_actions.py | 16 ++++++++-------- .../plugins/computer_use/_toolkit.py | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/plugins/computer_use/tests/test_computer_use.py b/plugins/computer_use/tests/test_computer_use.py index 8ea814b66..37c31943e 100644 --- a/plugins/computer_use/tests/test_computer_use.py +++ b/plugins/computer_use/tests/test_computer_use.py @@ -162,13 +162,13 @@ class TestActions: @pytest.mark.integration async def test_click_executes(self): actions = make_grid_actions(Grid()) - result = await actions["click"](cell="A1") + result = await actions["click"](cell="A1", position="center") assert "Clicked" in result @pytest.mark.integration async def test_double_click_executes(self): actions = make_grid_actions(Grid()) - result = await 
actions["double_click"](cell="A1") + result = await actions["double_click"](cell="A1", position="center") assert "Double-clicked" in result @pytest.mark.integration @@ -184,11 +184,13 @@ async def test_key_press_executes(self): @pytest.mark.integration async def test_scroll_executes(self): actions = make_grid_actions(Grid()) - result = await actions["scroll"](cell="A1", clicks=1, direction="down") + result = await actions["scroll"]( + cell="A1", position="center", clicks=1, direction="down" + ) assert "Scrolled" in result @pytest.mark.integration async def test_mouse_move_executes(self): actions = make_grid_actions(Grid()) - result = await actions["mouse_move"](cell="A1") + result = await actions["mouse_move"](cell="A1", position="center") assert "Moved" in result diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py index 93040f502..6de25ed91 100644 --- a/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py @@ -50,14 +50,14 @@ def _cell_to_screen(cell: str, position: str = "center") -> tuple[int, int]: async def click( cell: str, - position: str = "center", + position: str, button: str = "left", ) -> str: """Click at a grid cell. Args: cell: Grid cell reference, e.g. "H8". - position: Where within the cell to click. One of: top-left, top, + position: Required. Where within the cell to click. One of: top-left, top, top-right, left, center, right, bottom-left, bottom, bottom-right. button: Mouse button — "left", "right", or "middle". """ @@ -75,13 +75,13 @@ async def click( async def double_click( cell: str, - position: str = "center", + position: str, ) -> str: """Double-click at a grid cell. Args: cell: Grid cell reference, e.g. "H8". - position: Where within the cell to click. One of: top-left, top, + position: Required. Where within the cell to click. 
One of: top-left, top, top-right, left, center, right, bottom-left, bottom, bottom-right. """ sx, sy = _cell_to_screen(cell, position) @@ -97,7 +97,7 @@ async def double_click( async def scroll( cell: str, - position: str = "center", + position: str, clicks: int = 3, direction: Literal["up", "down"] = "down", ) -> str: @@ -105,7 +105,7 @@ async def scroll( Args: cell: Grid cell reference, e.g. "H8". - position: Where within the cell to scroll. One of: top-left, top, + position: Required. Where within the cell to scroll. One of: top-left, top, top-right, left, center, right, bottom-left, bottom, bottom-right. clicks: Number of scroll increments. direction: "up" or "down". @@ -125,13 +125,13 @@ async def scroll( async def mouse_move( cell: str, - position: str = "center", + position: str, ) -> str: """Move the mouse cursor to a grid cell. Args: cell: Grid cell reference, e.g. "H8". - position: Where within the cell to move. One of: top-left, top, + position: Required. Where within the cell to move. One of: top-left, top, top-right, left, center, right, bottom-left, bottom, bottom-right. """ sx, sy = _cell_to_screen(cell, position) diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py b/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py index 0c67974dc..72f1ab00d 100644 --- a/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py @@ -32,7 +32,7 @@ def register( f" {grid.col_labels[0]}-{grid.col_labels[-1]}" f" and rows 1-{grid.rows}." " Provide the 'cell' parameter (e.g. 'H8') to target a grid cell." - " Use 'position' to refine within the cell" + " You MUST also provide 'position' to specify where within the cell" " (top-left, top, top-right, left, center, right," " bottom-left, bottom, bottom-right)." 
) From b68129917f237123f8336a538418c868ca70a281 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Wed, 11 Mar 2026 13:31:42 +0100 Subject: [PATCH 8/8] Revert "Make position a required parameter for grid-based actions" This reverts commit 9980adc24130f169bd465c31c466866db551a70e. --- plugins/computer_use/tests/test_computer_use.py | 10 ++++------ .../plugins/computer_use/_actions.py | 16 ++++++++-------- .../plugins/computer_use/_toolkit.py | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/plugins/computer_use/tests/test_computer_use.py b/plugins/computer_use/tests/test_computer_use.py index 37c31943e..8ea814b66 100644 --- a/plugins/computer_use/tests/test_computer_use.py +++ b/plugins/computer_use/tests/test_computer_use.py @@ -162,13 +162,13 @@ class TestActions: @pytest.mark.integration async def test_click_executes(self): actions = make_grid_actions(Grid()) - result = await actions["click"](cell="A1", position="center") + result = await actions["click"](cell="A1") assert "Clicked" in result @pytest.mark.integration async def test_double_click_executes(self): actions = make_grid_actions(Grid()) - result = await actions["double_click"](cell="A1", position="center") + result = await actions["double_click"](cell="A1") assert "Double-clicked" in result @pytest.mark.integration @@ -184,13 +184,11 @@ async def test_key_press_executes(self): @pytest.mark.integration async def test_scroll_executes(self): actions = make_grid_actions(Grid()) - result = await actions["scroll"]( - cell="A1", position="center", clicks=1, direction="down" - ) + result = await actions["scroll"](cell="A1", clicks=1, direction="down") assert "Scrolled" in result @pytest.mark.integration async def test_mouse_move_executes(self): actions = make_grid_actions(Grid()) - result = await actions["mouse_move"](cell="A1", position="center") + result = await actions["mouse_move"](cell="A1") assert "Moved" in result diff --git 
a/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py index 6de25ed91..93040f502 100644 --- a/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_actions.py @@ -50,14 +50,14 @@ def _cell_to_screen(cell: str, position: str = "center") -> tuple[int, int]: async def click( cell: str, - position: str, + position: str = "center", button: str = "left", ) -> str: """Click at a grid cell. Args: cell: Grid cell reference, e.g. "H8". - position: Required. Where within the cell to click. One of: top-left, top, + position: Where within the cell to click. One of: top-left, top, top-right, left, center, right, bottom-left, bottom, bottom-right. button: Mouse button — "left", "right", or "middle". """ @@ -75,13 +75,13 @@ async def click( async def double_click( cell: str, - position: str, + position: str = "center", ) -> str: """Double-click at a grid cell. Args: cell: Grid cell reference, e.g. "H8". - position: Required. Where within the cell to click. One of: top-left, top, + position: Where within the cell to click. One of: top-left, top, top-right, left, center, right, bottom-left, bottom, bottom-right. """ sx, sy = _cell_to_screen(cell, position) @@ -97,7 +97,7 @@ async def double_click( async def scroll( cell: str, - position: str, + position: str = "center", clicks: int = 3, direction: Literal["up", "down"] = "down", ) -> str: @@ -105,7 +105,7 @@ async def scroll( Args: cell: Grid cell reference, e.g. "H8". - position: Required. Where within the cell to scroll. One of: top-left, top, + position: Where within the cell to scroll. One of: top-left, top, top-right, left, center, right, bottom-left, bottom, bottom-right. clicks: Number of scroll increments. direction: "up" or "down". 
@@ -125,13 +125,13 @@ async def scroll( async def mouse_move( cell: str, - position: str, + position: str = "center", ) -> str: """Move the mouse cursor to a grid cell. Args: cell: Grid cell reference, e.g. "H8". - position: Required. Where within the cell to move. One of: top-left, top, + position: Where within the cell to move. One of: top-left, top, top-right, left, center, right, bottom-left, bottom, bottom-right. """ sx, sy = _cell_to_screen(cell, position) diff --git a/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py b/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py index 72f1ab00d..0c67974dc 100644 --- a/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py +++ b/plugins/computer_use/vision_agents/plugins/computer_use/_toolkit.py @@ -32,7 +32,7 @@ def register( f" {grid.col_labels[0]}-{grid.col_labels[-1]}" f" and rows 1-{grid.rows}." " Provide the 'cell' parameter (e.g. 'H8') to target a grid cell." - " You MUST also provide 'position' to specify where within the cell" + " Use 'position' to refine within the cell" " (top-left, top, top-right, left, center, right," " bottom-left, bottom, bottom-right)." )