Skip to content

Commit 862df3e

Browse files
Documentation of how to customize computer use
1 parent 23a6e96 commit 862df3e

3 files changed

Lines changed: 114 additions & 13 deletions

File tree

README.md

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Lower-level feature improvements include:
1414
- Support for calling tools via remote MCP servers
1515
- Support for more models
1616

17-
Note that this template implements support for MCP servers, but does not yet support web search, image generation, or computer use.
17+
Note that this template does not yet support image generation.
1818

1919
## Quickstart Setup
2020

@@ -111,3 +111,50 @@ After defining your function (and optionally a template), start the server and n
111111
![Function Registration](./docs/functions.png)
112112

113113
Don't forget to click "Regenerate tool.config.json" to save your changes. (This will regenerate the `tool.config.json` file with your new configuration.)
114+
115+
## Customizing Computer Use
116+
117+
The computer use tool lets the assistant control a virtual screen by issuing mouse and keyboard actions (click, type, scroll, etc.) and receiving screenshots in return.
118+
119+
By default, this template uses a headless Playwright browser as the computer use backend. When the assistant starts a computer use session, it sees a landing page with a URL input so it can navigate to websites.
120+
121+
To swap in a different backend (e.g., a full virtual desktop via VNC/xdotool, or a GUI automation library like pyautogui), implement the `ComputerSession` and `ComputerSessionManager` protocols defined in `utils/computer_use.py`:
122+
123+
```python
124+
from utils.computer_use import ComputerSession, ComputerSessionManager, Action
125+
126+
class MySession:
127+
"""Your custom session — must implement ComputerSession protocol."""
128+
129+
async def screenshot(self) -> str:
130+
"""Capture the screen and return a base64-encoded PNG string."""
131+
...
132+
133+
async def execute(self, action: Action) -> str:
134+
"""Perform an action (click, type, etc.) and return a screenshot."""
135+
...
136+
137+
async def close(self) -> None:
138+
"""Release resources."""
139+
...
140+
141+
class MySessionManager:
142+
"""Your custom manager — must implement ComputerSessionManager protocol."""
143+
144+
def get_or_create(self, conversation_id: str, width: int = 1024, height: int = 768) -> MySession:
145+
...
146+
147+
async def close(self, conversation_id: str) -> None:
148+
...
149+
150+
async def close_all(self) -> None:
151+
...
152+
```
153+
154+
Then replace the `session_manager` singleton in `utils/computer_use.py`:
155+
156+
```python
157+
session_manager: ComputerSessionManager = MySessionManager()
158+
```
159+
160+
No other changes are needed — the chat router and shutdown logic use `session_manager` through the protocol interface.

tests/test_computer_use.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
from utils.computer_use import (
3333
BrowserSession,
3434
BrowserSessionManager,
35+
ComputerSession,
36+
ComputerSessionManager,
3537
build_computer_tool,
3638
describe_action,
3739
execute_computer_actions,
@@ -336,6 +338,16 @@ def test_only_type_key(self):
336338
assert tool == {"type": "computer"}
337339

338340

341+
class TestProtocolConformance:
342+
"""Tests that concrete classes satisfy the Protocol interfaces."""
343+
344+
def test_browser_session_is_computer_session(self):
345+
assert isinstance(BrowserSession(), ComputerSession)
346+
347+
def test_browser_session_manager_is_computer_session_manager(self):
348+
assert isinstance(BrowserSessionManager(), ComputerSessionManager)
349+
350+
339351
class TestExecuteComputerAction:
340352
"""Integration tests for execute_computer_action()."""
341353

@@ -361,18 +373,22 @@ async def test_returns_valid_png(self):
361373
async def test_reuses_session_for_same_conversation(self):
362374
from utils.computer_use import session_manager
363375

376+
manager = session_manager
377+
assert isinstance(manager, BrowserSessionManager)
364378
action = Screenshot(type="screenshot")
365379
await execute_computer_actions([action],"test-conv")
366380
await execute_computer_actions([action],"test-conv")
367-
assert "test-conv" in session_manager._sessions
381+
assert "test-conv" in manager._sessions
368382

369383
@pytest.mark.anyio
370384
async def test_different_conversations_different_sessions(self):
371385
from utils.computer_use import session_manager
372386

387+
manager = session_manager
388+
assert isinstance(manager, BrowserSessionManager)
373389
action = Screenshot(type="screenshot")
374390
await execute_computer_actions([action],"conv-a")
375391
await execute_computer_actions([action],"conv-b")
376-
assert "conv-a" in session_manager._sessions
377-
assert "conv-b" in session_manager._sessions
378-
assert session_manager._sessions["conv-a"] is not session_manager._sessions["conv-b"]
392+
assert "conv-a" in manager._sessions
393+
assert "conv-b" in manager._sessions
394+
assert manager._sessions["conv-a"] is not manager._sessions["conv-b"]

utils/computer_use.py

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import asyncio
22
import base64
33
import logging
4-
from typing import Union
4+
from typing import Protocol, Union, runtime_checkable
55

66
from playwright.async_api import async_playwright, Browser, Page, Playwright
77

@@ -19,6 +19,48 @@
1919

2020
logger = logging.getLogger("uvicorn.error")
2121

22+
Action = Union[
23+
Click, DoubleClick, Drag, Keypress,
24+
Move, Screenshot, Scroll, Type, Wait,
25+
]
26+
27+
28+
@runtime_checkable
29+
class ComputerSession(Protocol):
30+
"""Protocol for computer use sessions.
31+
32+
Implement this to provide a custom backend (e.g., VNC, xdotool, pyautogui)
33+
instead of the default Playwright-based BrowserSession.
34+
35+
Methods:
36+
screenshot: Capture the current screen as a base64-encoded PNG string.
37+
execute: Perform an action and return a base64-encoded PNG screenshot.
38+
close: Release resources held by this session.
39+
"""
40+
41+
async def screenshot(self) -> str: ...
42+
async def execute(self, action: Action) -> str: ...
43+
async def close(self) -> None: ...
44+
45+
46+
@runtime_checkable
47+
class ComputerSessionManager(Protocol):
48+
"""Protocol for managing computer use sessions by conversation ID.
49+
50+
Implement this to provide custom session lifecycle management.
51+
52+
Methods:
53+
get_or_create: Return an existing session or create a new one.
54+
close: Close and remove a specific session.
55+
close_all: Close all sessions (called during app shutdown).
56+
"""
57+
58+
def get_or_create(
59+
self, conversation_id: str, width: int = 1024, height: int = 768,
60+
) -> ComputerSession: ...
61+
async def close(self, conversation_id: str) -> None: ...
62+
async def close_all(self) -> None: ...
63+
2264
_LANDING_PAGE_HTML = """\
2365
<!DOCTYPE html>
2466
<html>
@@ -45,11 +87,6 @@
4587
</html>
4688
"""
4789

48-
Action = Union[
49-
Click, DoubleClick, Drag, Keypress,
50-
Move, Screenshot, Scroll, Type, Wait,
51-
]
52-
5390
# Map OpenAI key names to Playwright key names
5491
_KEY_MAP: dict[str, str] = {
5592
"ctrl": "Control",
@@ -227,8 +264,9 @@ async def close_all(self) -> None:
227264
self._sessions.clear()
228265

229266

230-
# Module-level singleton
231-
session_manager = BrowserSessionManager()
267+
# Module-level singleton — replace with your own ComputerSessionManager implementation
268+
# to use a different backend (e.g., VNC, xdotool, pyautogui).
269+
session_manager: ComputerSessionManager = BrowserSessionManager()
232270

233271

234272
def build_computer_tool(**kwargs: object) -> dict[str, str]:

0 commit comments

Comments
 (0)