cli/pkg/templates/python/yutori/loop.py at 24ae144d3ba3b2c97ff413f421e8a4d78ae796b4 · dhruvbatra/cli · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
"""
Yutori n1.5 Sampling Loop

Implements the agent loop for Yutori's n1.5-latest computer use model.
n1.5-latest uses an OpenAI-compatible API with tool_calls:
- Actions are returned via tool_calls in the assistant message
- Tool results use role: "tool" with matching tool_call_id
- The model stops by returning content without tool_calls
- Coordinates are returned in 1000x1000 space and need scaling

@see https://docs.yutori.com/reference/n1-5
"""

from __future__ import annotations

import copy
import json
import platform
from datetime import datetime
from typing import Any, Optional
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError

from kernel import Kernel
from openai import OpenAI

from tools import ComputerTool, N15Action, ToolResult

# Tools that require a Playwright page / DOM access. The default core tool set
# already excludes them, but we also list them in `disable_tools` so the
# exclusion is explicit and survives if the default ever changes.
DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"]
# Tool-set identifier sent as `tool_set` in the `extra_body` of every request.
TOOL_SET = "browser_tools_core-20260403"

# The model reports coordinates in a 1000x1000 space; this is the divisor used
# when mapping them onto viewport pixels.
NAVIGATOR_COORDINATE_SCALE = 1000

# Screenshot-trimming defaults mirror Yutori's reference loop:
# https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/payload.py
# Trimming is size-triggered — we only drop old screenshots when the payload
# exceeds MAX_REQUEST_BYTES, and we always keep at least KEEP_RECENT_SCREENSHOTS.
# MAX_REQUEST_BYTES is compared against the UTF-8 size of the compact JSON
# encoding of the messages list.
MAX_REQUEST_BYTES = 9_500_000
KEEP_RECENT_SCREENSHOTS = 6


async def sampling_loop(
    *,
    model: str = "n1.5-latest",
    task: str,
    api_key: str,
    kernel: Kernel,
    session_id: str,
    max_completion_tokens: int = 4096,
    max_iterations: int = 100,
    viewport_width: int = 1280,
    viewport_height: int = 800,
    kiosk_mode: bool = False,
    user_timezone: str = "America/Los_Angeles",
    user_location: str = "San Francisco, CA, US",
) -> dict[str, Any]:
    """Run the n1.5 sampling loop until the model stops calling tools or max iterations.

    Each iteration sends the (size-trimmed) conversation to the model, executes
    every returned tool_call through ``ComputerTool`` and appends one
    ``role: "tool"`` message per tool_call. The loop ends when the assistant
    message carries no tool_calls. If the iteration budget is used up without a
    final answer, one extra request asks the model to summarize its progress.

    Args:
        model: Model name passed to the chat-completions endpoint.
        task: Natural-language task; location/timezone/date context is appended.
        api_key: API key for the Yutori endpoint.
        kernel: Kernel client handed to ``ComputerTool``.
        session_id: Browser session identifier handed to ``ComputerTool``.
        max_completion_tokens: Per-request completion token cap.
        max_iterations: Maximum number of model turns before summarizing.
        viewport_width: Viewport width in pixels, used to scale coordinates.
        viewport_height: Viewport height in pixels, used to scale coordinates.
        kiosk_mode: Forwarded to ``ComputerTool``.
        user_timezone: IANA timezone name used for the date/time context.
        user_location: Free-form location string included in the context.

    Returns:
        A dict with ``"messages"`` (the full, untrimmed conversation) and
        ``"final_answer"`` (the model's final text, or ``None``).

    Raises:
        ValueError: If a response has no choices or no assistant message.
        Exception: Any error raised by the chat-completions call in the main
            loop is logged and re-raised. Errors in the final summary call are
            logged and swallowed.
    """
    # NOTE(review): OpenAI is the synchronous client, so each
    # chat.completions.create call below blocks the event loop while it runs.
    client = OpenAI(
        api_key=api_key,
        base_url="https://api.yutori.com/v1",
    )

    computer_tool = ComputerTool(kernel, session_id, viewport_width, viewport_height, kiosk_mode=kiosk_mode)

    initial_screenshot = await computer_tool.screenshot()

    # Append location/timezone/current-date context to the task — mirrors Yutori's
    # format_task_with_context helper and helps the model with date-sensitive
    # judgments. https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/context.py
    task_with_context = _format_task_with_context(task, user_timezone, user_location)

    user_content: list[dict[str, Any]] = [{"type": "text", "text": task_with_context}]
    if initial_screenshot.get("base64_image"):
        user_content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/webp;base64,{initial_screenshot['base64_image']}"
            },
        })

    conversation_messages: list[dict[str, Any]] = [
        {"role": "user", "content": user_content}
    ]

    iteration = 0
    final_answer: Optional[str] = None

    while iteration < max_iterations:
        iteration += 1
        print(f"\n=== Iteration {iteration} ===")

        # Only the request copy is trimmed; conversation_messages keeps every
        # screenshot for the caller.
        request_messages, dropped = _trimmed_for_request(conversation_messages)
        if dropped:
            print(f"Trimmed {dropped} old screenshot(s) to fit request size limit")

        try:
            response = client.chat.completions.create(
                model=model,
                messages=request_messages,
                max_completion_tokens=max_completion_tokens,
                temperature=0.3,
                # n1.5-specific knobs go in extra_body.
                # tool_set selects the core (coordinate-based) tools.
                # disable_tools is a defense-in-depth exclusion of DOM/Playwright tools.
                extra_body={
                    "tool_set": TOOL_SET,
                    "disable_tools": DISABLED_TOOLS,
                },
            )
        except Exception as api_error:
            print(f"API call failed: {api_error}")
            raise

        if not response.choices:
            print(f"No choices in response: {response}")
            raise ValueError("No choices in API response")

        choice = response.choices[0]
        assistant_message = choice.message
        if not assistant_message:
            raise ValueError("No response from model")

        print("Assistant content:", assistant_message.content or "(none)")

        conversation_messages.append(assistant_message.model_dump(exclude_none=True))

        tool_calls = assistant_message.tool_calls

        # No tool_calls means the model is done
        if not tool_calls:
            final_answer = assistant_message.content or None
            print(f"No tool_calls, model is done. Final answer: {final_answer}")
            break

        for tc in tool_calls:
            action_name = tc.function.name
            try:
                args = json.loads(tc.function.arguments)
            except (json.JSONDecodeError, TypeError):
                args = None

            # Valid JSON that is not an object (null, list, number, ...) cannot
            # be unpacked into an action, so treat it like a parse failure. Every
            # tool_call still gets a matching tool message.
            if not isinstance(args, dict):
                print(f"Failed to parse tool_call arguments: {tc.function.arguments}")
                conversation_messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": "Error: failed to parse arguments",
                })
                continue

            action: N15Action = {"action_type": action_name, **args}
            print(f"Executing action: {action_name}", args)

            result: ToolResult
            try:
                # Scaling is inside the try so malformed coordinates are
                # reported back to the model instead of aborting the loop.
                scaled_action = _scale_coordinates(action, viewport_width, viewport_height)
                result = await computer_tool.execute(scaled_action)
            except Exception as e:
                print(f"Action failed: {e}")
                # An exception with an empty message must still be truthy here,
                # otherwise the failure would fall through to the "OK" branch.
                result = {"error": str(e) or type(e).__name__}

            if result.get("base64_image"):
                conversation_messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/webp;base64,{result['base64_image']}"
                            },
                        }
                    ],
                })
            elif result.get("error"):
                conversation_messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": f"Action failed: {result['error']}",
                })
            else:
                conversation_messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    # Fall back to "OK" when output is missing, None or empty.
                    "content": result.get("output") or "OK",
                })

    # If the loop exhausted iterations, prompt the model for a final summary so
    # the caller gets a usable answer instead of empty content. Mirrors Yutori's
    # format_stop_and_summarize helper.
    if iteration >= max_iterations and not final_answer:
        print("Max iterations reached — requesting summary")
        try:
            final_screenshot = await computer_tool.screenshot()
            stop_content: list[dict[str, Any]] = [
                {"type": "text", "text": _format_stop_and_summarize(task)}
            ]
            if final_screenshot.get("base64_image"):
                stop_content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/webp;base64,{final_screenshot['base64_image']}"
                    },
                })
            conversation_messages.append({"role": "user", "content": stop_content})

            summary_messages, _ = _trimmed_for_request(conversation_messages)
            summary_response = client.chat.completions.create(
                model=model,
                messages=summary_messages,
                max_completion_tokens=max_completion_tokens,
                temperature=0.3,
                extra_body={"tool_set": TOOL_SET, "disable_tools": DISABLED_TOOLS},
            )
            summary = summary_response.choices[0].message if summary_response.choices else None
            if summary:
                conversation_messages.append(summary.model_dump(exclude_none=True))
                final_answer = summary.content or None
        except Exception as summary_error:
            # Best effort: a failed summary still returns the conversation.
            print(f"Stop-and-summarize call failed: {summary_error}")

    return {
        "messages": conversation_messages,
        "final_answer": final_answer,
    }


def _format_task_with_context(task: str, user_timezone: str, user_location: str) -> str:
    """Append location, timezone, and current date/time to the task message.

    The timezone is resolved by trying ``user_timezone``, then
    ``"America/Los_Angeles"``, then ``"UTC"``; the first name that loads is
    used and reported in the context. If none can be loaded, ``task`` is
    returned unchanged.

    Args:
        task: The original task text.
        user_timezone: Preferred IANA timezone name.
        user_location: Free-form location string, included verbatim.

    Returns:
        ``task`` followed by a blank line and five context lines (location,
        timezone, date, time, weekday), or ``task`` alone when no timezone
        could be loaded.
    """
    for timezone_name in [user_timezone, "America/Los_Angeles", "UTC"]:
        try:
            tz = ZoneInfo(timezone_name)
        except (ZoneInfoNotFoundError, ValueError, OSError, TypeError):
            # Unknown, malformed or non-string name: try the next candidate.
            continue
        tz_label = timezone_name
        break
    else:
        return task

    now = datetime.now(tz)
    # Use now.day for the unpadded day of month: the "%-d" / "%#d" strftime
    # flags are platform-specific extensions, whereas the attribute is portable.
    current_date = f"{now.strftime('%B')} {now.day}, {now.strftime('%Y')}"
    context = "\n".join([
        f"User's location: {user_location}",
        f"User's timezone: {tz_label}",
        f"Current Date: {current_date}",
        f"Current Time: {now.strftime('%H:%M:%S %Z')}",
        f"Today is: {now.strftime('%A')}",
    ])
    return f"{task}\n\n{context}"


def _format_stop_and_summarize(task: str) -> str:
    """Build the prompt that tells the model to stop and summarize its findings for ``task``."""
    preamble = (
        "Stop here. Summarize your current progress and list in detail "
        "all the findings relevant to the given task:"
    )
    citation_rules = " ".join((
        "Provide URLs for all relevant results you find and return them in your response.",
        "If there is no specific URL for a result,",
        "cite the page URL that the information was found on.",
    ))
    # Layout: preamble, the task on its own line(s), then the citation rules.
    return f"{preamble}\n{task}\n{citation_rules}"


def _trimmed_for_request(
    messages: list[dict[str, Any]],
) -> tuple[list[dict[str, Any]], int]:
    """Return a deep copy of ``messages`` with old screenshots removed until it fits MAX_REQUEST_BYTES.

    Screenshots are removed oldest-first. The first pass skips the most recent
    KEEP_RECENT_SCREENSHOTS image-bearing messages; if the payload is still too
    large, a second pass may strip from any image-bearing message except the
    latest one. The input list is never modified.

    Returns:
        ``(trimmed_messages, number_of_images_removed)``.
    """
    working = copy.deepcopy(messages)
    current_size = _estimate_size(working)
    if current_size <= MAX_REQUEST_BYTES:
        return working, 0

    with_images = [pos for pos, message in enumerate(working) if _message_has_image(message)]
    if not with_images:
        return working, 0

    keep_count = max(1, KEEP_RECENT_SCREENSHOTS)
    recent = set(with_images[-keep_count:])
    newest = with_images[-1]

    # Pass 1: everything outside the protected window.
    # Pass 2: everything except the latest screenshot (only does work if still over).
    passes = (
        [pos for pos in with_images if pos not in recent],
        [pos for pos in with_images if pos != newest],
    )

    dropped = 0
    for candidates in passes:
        for pos in candidates:
            if current_size <= MAX_REQUEST_BYTES:
                break
            if _strip_one_image(working[pos]):
                dropped += 1
                current_size = _estimate_size(working)

    return working, dropped


def _estimate_size(messages: list[dict[str, Any]]) -> int:
    """Return the UTF-8 byte length of ``messages`` serialized as compact JSON."""
    compact = json.dumps(messages, ensure_ascii=False, separators=(",", ":"))
    encoded = compact.encode("utf-8")
    return len(encoded)


def _message_has_image(msg: dict[str, Any]) -> bool:
    """Return True when the message's list-style content holds at least one ``image_url`` part."""
    parts = msg.get("content")
    if isinstance(parts, list):
        for part in parts:
            if isinstance(part, dict) and part.get("type") == "image_url":
                return True
    return False


def _strip_one_image(msg: dict[str, Any]) -> bool:
    """Remove the first ``image_url`` part from ``msg["content"]`` in place.

    If no text part remains afterwards, a placeholder text part is appended so
    the message content is never left empty.

    Returns:
        True if an image was removed, False if the content is not a list or
        holds no image part (in which case ``msg`` is left untouched).
    """
    parts = msg.get("content")
    if not isinstance(parts, list):
        return False

    target = next(
        (
            pos
            for pos, part in enumerate(parts)
            if isinstance(part, dict) and part.get("type") == "image_url"
        ),
        None,
    )
    if target is None:
        return False

    # Build a new list rather than mutating the original content list.
    remaining: list[dict[str, Any]] = parts[:target] + parts[target + 1:]

    if not any(isinstance(part, dict) and part.get("type") == "text" for part in remaining):
        remaining.append({"type": "text", "text": "Screenshot omitted to stay under request size limit."})

    msg["content"] = remaining
    return True


def _scale_coordinates(action: N15Action, viewport_width: int, viewport_height: int) -> N15Action:
    """Return a shallow copy of ``action`` with its coordinate fields mapped to viewport pixels.

    Only ``coordinates`` and ``start_coordinates`` are converted, and only when
    present and non-empty; every other field is copied through unchanged.
    """
    converted = dict(action)

    for field in ("coordinates", "start_coordinates"):
        point = converted.get(field)
        if point:
            converted[field] = _denormalize(point, viewport_width, viewport_height)

    return converted


def _denormalize(coords: list[int] | tuple[int, int], width: int, height: int) -> list[int]:
    """Convert an (x, y) pair from the model's [0, 1000] space to viewport pixels.

    Each axis is scaled, rounded, and then clamped to [0, dim-1], so a boundary
    value such as 1000 cannot land one pixel outside the viewport.
    """

    def _to_pixel(value: int, extent: int) -> int:
        # Scale along one axis, then clamp to the last valid pixel index.
        pixel = round((value / NAVIGATOR_COORDINATE_SCALE) * extent)
        return max(0, min(extent - 1, pixel))

    return [_to_pixel(coords[0], width), _to_pixel(coords[1], height)]