Skip to content

Commit 58c81c9

Browse files
authored
Merge branch 'main' into hypeship/start-url-cli
2 parents ed7a71f + 295a8d0 commit 58c81c9

10 files changed

Lines changed: 530 additions & 193 deletions

File tree

pkg/create/templates.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ var Templates = map[string]TemplateInfo{
8787
Languages: []string{LanguageTypeScript, LanguagePython},
8888
},
8989
TemplateYutoriComputerUse: {
90-
Name: "Yutori n1 Computer Use",
91-
Description: "Implements a Yutori n1 computer use agent",
90+
Name: "Yutori n1.5 Computer Use",
91+
Description: "Implements a Yutori n1.5 computer use agent",
9292
Languages: []string{LanguageTypeScript, LanguagePython},
9393
},
9494
TemplateTzafonComputerUse: {

pkg/templates/python/yutori/README.md

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
# Kernel Python Sample App - Yutori n1 Computer Use
1+
# Kernel Python Sample App - Yutori n1.5 Computer Use
22

3-
This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API.
3+
This Kernel app implements a prompt loop using Yutori's Navigator n1.5 with Kernel's Computer Controls API.
44

5-
[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
5+
[Navigator n1.5](https://yutori.com/blog/introducing-n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
6+
7+
This template runs n1.5 in **computer-use-only mode**. n1.5 also supports a hybrid vision + DOM/JavaScript path (page-state extraction, custom JS, structured JSON output) for multi-field forms and bulk data extraction, but those tools are intentionally disabled here — see [Disabled tools](#disabled-tools).
68

79
## Setup
810

@@ -55,35 +57,46 @@ kernel invoke python-yutori-cua cua-task --payload '{"query": "Enter https://exa
5557

5658
## Viewport Configuration
5759

58-
Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy.
60+
Yutori n1.5 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy.
5961

60-
> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions.
62+
> **Note:** n1.5 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions.
6163
6264
See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations.
6365

6466
## Screenshots
6567

6668
Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori.
6769

68-
## n1-latest Supported Actions
70+
## n1.5-latest Supported Actions
71+
72+
This template uses the `browser_tools_core-20260403` tool set — coordinate-based browser actions that operate on screenshots only.
6973

7074
| Action | Description |
7175
|--------|-------------|
72-
| `left_click` | Left mouse click at coordinates |
73-
| `double_click` | Double-click at coordinates |
74-
| `triple_click` | Triple-click at coordinates |
76+
| `left_click` | Left mouse click at coordinates (supports `modifier`) |
77+
| `double_click` | Double-click at coordinates (supports `modifier`) |
78+
| `triple_click` | Triple-click at coordinates (supports `modifier`) |
79+
| `middle_click` | Middle mouse click at coordinates |
7580
| `right_click` | Right mouse click at coordinates |
81+
| `mouse_move` | Move mouse to coordinates without clicking |
82+
| `mouse_down` | Press the left mouse button at coordinates |
83+
| `mouse_up` | Release the left mouse button at coordinates |
7684
| `scroll` | Scroll page in a direction |
7785
| `type` | Type text into focused element |
78-
| `key_press` | Send keyboard input |
79-
| `hover` | Move mouse without clicking |
86+
| `key_press` | Send a single key or key combination |
87+
| `hold_key` | Hold a key for a duration |
8088
| `drag` | Click-and-drag operation |
8189
| `wait` | Pause for UI to update |
8290
| `refresh` | Reload current page |
8391
| `go_back` | Navigate back in history |
92+
| `go_forward` | Navigate forward in history |
8493
| `goto_url` | Navigate to a URL |
8594

95+
### Disabled tools
96+
97+
The DOM/Playwright-based "expanded" tools (`extract_elements`, `find`, `set_element_value`, `execute_js`) are intentionally disabled via the `disable_tools` request parameter — this template runs computer-use only and does not expose a Playwright page to the model.
98+
8699
## Resources
87100

88-
- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1)
101+
- [Yutori n1.5 API Documentation](https://docs.yutori.com/reference/n1-5)
89102
- [Kernel Documentation](https://www.kernel.sh/docs/quickstart)

pkg/templates/python/yutori/loop.py

Lines changed: 113 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,42 @@
11
"""
2-
Yutori n1 Sampling Loop
2+
Yutori n1.5 Sampling Loop
33
4-
Implements the agent loop for Yutori's n1-latest computer use model.
5-
n1-latest uses an OpenAI-compatible API with tool_calls:
4+
Implements the agent loop for Yutori's n1.5-latest computer use model.
5+
n1.5-latest uses an OpenAI-compatible API with tool_calls:
66
- Actions are returned via tool_calls in the assistant message
77
- Tool results use role: "tool" with matching tool_call_id
88
- The model stops by returning content without tool_calls
99
- Coordinates are returned in 1000x1000 space and need scaling
1010
11-
@see https://docs.yutori.com/reference/n1
11+
@see https://docs.yutori.com/reference/n1-5
1212
"""
1313

14+
import copy
1415
import json
1516
from typing import Any, Optional
1617

1718
from kernel import Kernel
1819
from openai import OpenAI
1920

20-
from tools import ComputerTool, N1Action, ToolResult
21+
from tools import ComputerTool, N15Action, ToolResult
22+
23+
# Tools that require a Playwright page / DOM access. The default core tool set
24+
# already excludes them, but we also list them in `disable_tools` so the
25+
# exclusion is explicit and survives if the default ever changes.
26+
DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"]
27+
TOOL_SET = "browser_tools_core-20260403"
28+
29+
# Screenshot-trimming defaults mirror Yutori's reference loop:
30+
# https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/payload.py
31+
# Trimming is size-triggered — we only drop old screenshots when the payload
32+
# exceeds MAX_REQUEST_BYTES. The most recent KEEP_RECENT_SCREENSHOTS are dropped only as a last resort if the payload is still too large; the latest screenshot is always kept.
33+
MAX_REQUEST_BYTES = 9_500_000
34+
KEEP_RECENT_SCREENSHOTS = 6
2135

2236

2337
async def sampling_loop(
2438
*,
25-
model: str = "n1-latest",
39+
model: str = "n1.5-latest",
2640
task: str,
2741
api_key: str,
2842
kernel: Kernel,
@@ -63,12 +77,23 @@ async def sampling_loop(
6377
iteration += 1
6478
print(f"\n=== Iteration {iteration} ===")
6579

80+
request_messages, dropped = _trimmed_for_request(conversation_messages)
81+
if dropped:
82+
print(f"Trimmed {dropped} old screenshot(s) to fit request size limit")
83+
6684
try:
6785
response = client.chat.completions.create(
6886
model=model,
69-
messages=conversation_messages,
87+
messages=request_messages,
7088
max_completion_tokens=max_completion_tokens,
7189
temperature=0.3,
90+
# n1.5-specific knobs go in extra_body.
91+
# tool_set selects the core (coordinate-based) tools.
92+
# disable_tools is a defense-in-depth exclusion of DOM/Playwright tools.
93+
extra_body={
94+
"tool_set": TOOL_SET,
95+
"disable_tools": DISABLED_TOOLS,
96+
},
7297
)
7398
except Exception as api_error:
7499
print(f"API call failed: {api_error}")
@@ -108,7 +133,7 @@ async def sampling_loop(
108133
})
109134
continue
110135

111-
action: N1Action = {"action_type": action_name, **args}
136+
action: N15Action = {"action_type": action_name, **args}
112137
print(f"Executing action: {action_name}", args)
113138

114139
scaled_action = _scale_coordinates(action, viewport_width, viewport_height)
@@ -155,7 +180,86 @@ async def sampling_loop(
155180
}
156181

157182

158-
def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action:
183+
def _trimmed_for_request(
    messages: list[dict[str, Any]],
) -> tuple[list[dict[str, Any]], int]:
    """Return a deep copy of ``messages`` with old screenshots stripped to fit MAX_REQUEST_BYTES.

    The caller's ``messages`` list is never modified, so the full history is
    still available for the caller's return value. Images are removed
    oldest-first in two stages, stopping as soon as the payload fits:

    1. Images in messages older than the most recent KEEP_RECENT_SCREENSHOTS
       image-bearing messages.
    2. If the payload is still too large, images inside that protected window
       too, except for the latest image-bearing message, which is never touched.

    Args:
        messages: Chat messages in request order.

    Returns:
        ``(trimmed, removed)``: the copied message list and the number of
        images removed from it. The result can still exceed MAX_REQUEST_BYTES
        when nothing strippable remains.
    """
    trimmed = copy.deepcopy(messages)
    size = _estimate_size(trimmed)
    if size <= MAX_REQUEST_BYTES:
        return trimmed, 0

    image_indices = [i for i, m in enumerate(trimmed) if _message_has_image(m)]
    if not image_indices:
        return trimmed, 0

    keep = max(1, KEEP_RECENT_SCREENSHOTS)
    removed = 0

    def strip_until_fits(candidates: list[int]) -> None:
        """Strip images from ``candidates`` (oldest first) until the payload fits."""
        nonlocal size, removed
        for idx in candidates:
            msg = trimmed[idx]
            # A message may carry several images; keep stripping this one
            # until it has none left or the payload fits.
            while size > MAX_REQUEST_BYTES:
                before = _estimate_size([msg])
                if not _strip_one_image(msg):
                    break
                # Compact JSON of a list is the sum of its items plus fixed
                # punctuation, so the per-message delta is exact and avoids
                # re-serializing the whole multi-megabyte payload per strip.
                size -= before - _estimate_size([msg])
                removed += 1
            if size <= MAX_REQUEST_BYTES:
                return

    # Stage 1: everything older than the protected window.
    strip_until_fits(image_indices[:-keep])
    # Stage 2: the protected window itself, but never the latest screenshot.
    strip_until_fits(image_indices[-keep:-1])

    return trimmed, removed
225+
226+
227+
def _estimate_size(messages: list[dict[str, Any]]) -> int:
228+
return len(json.dumps(messages, separators=(",", ":"), ensure_ascii=False).encode("utf-8"))
229+
230+
231+
def _message_has_image(msg: dict[str, Any]) -> bool:
232+
content = msg.get("content")
233+
if not isinstance(content, list):
234+
return False
235+
return any(isinstance(p, dict) and p.get("type") == "image_url" for p in content)
236+
237+
238+
def _strip_one_image(msg: dict[str, Any]) -> bool:
239+
content = msg.get("content")
240+
if not isinstance(content, list):
241+
return False
242+
243+
removed = False
244+
new_content: list[dict[str, Any]] = []
245+
for part in content:
246+
if not removed and isinstance(part, dict) and part.get("type") == "image_url":
247+
removed = True
248+
continue
249+
new_content.append(part)
250+
251+
if not removed:
252+
return False
253+
254+
has_text = any(isinstance(p, dict) and p.get("type") == "text" for p in new_content)
255+
if not has_text:
256+
new_content.append({"type": "text", "text": "Screenshot omitted to stay under request size limit."})
257+
258+
msg["content"] = new_content
259+
return True
260+
261+
262+
def _scale_coordinates(action: N15Action, viewport_width: int, viewport_height: int) -> N15Action:
159263
scaled = dict(action)
160264

161265
if "coordinates" in scaled and scaled["coordinates"]:

pkg/templates/python/yutori/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ async def cua_task(
3030
payload: QueryInput,
3131
) -> QueryOutput:
3232
"""
33-
Process a user query using Yutori n1 Computer Use with Kernel's browser automation.
33+
Process a user query using Yutori n1.5 Computer Use with Kernel's browser automation.
3434
3535
Args:
3636
ctx: Kernel context containing invocation information
@@ -58,7 +58,7 @@ async def cua_task(
5858
print("Kernel browser live view url:", session.live_view_url)
5959

6060
loop_result = await sampling_loop(
61-
model="n1-latest",
61+
model="n1.5-latest",
6262
task=payload["query"],
6363
api_key=str(api_key),
6464
kernel=session.kernel,
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
"""Yutori n1 Computer Tools."""
1+
"""Yutori n1.5 Computer Tools."""
22

33
from .base import ToolError, ToolResult
4-
from .computer import ComputerTool, N1Action
4+
from .computer import ComputerTool, N15Action
55

66
__all__ = [
77
"ToolError",
88
"ToolResult",
99
"ComputerTool",
10-
"N1Action",
10+
"N15Action",
1111
]

0 commit comments

Comments
 (0)