Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/askui/android_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ def __init__(
) -> None:
reporter = CompositeReporter(reporters=reporters)
self.os = PpadbAgentOs(device_identifier=device, reporter=reporter)
self.act_agent_os_facade = AndroidAgentOsFacade(self.os)
super().__init__(
reporter=reporter,
retry=retry,
Expand All @@ -97,6 +96,10 @@ def __init__(
callbacks=callbacks,
truncation_strategy=truncation_strategy,
)
self.act_agent_os_facade = AndroidAgentOsFacade(
self.os,
coordinate_space=self._vlm_provider.coordinate_space,
)
self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
# Override default act settings with Android-specific settings
self.act_settings = ActSettings(
Expand Down
3 changes: 2 additions & 1 deletion src/askui/computer_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ def __init__(
truncation_strategy=truncation_strategy,
)
self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade(
self.tools.os
self.tools.os,
coordinate_space=self._vlm_provider.coordinate_space,
)
self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
# Override default act settings with computer-specific settings
Expand Down
36 changes: 36 additions & 0 deletions src/askui/model_providers/ollama_vlm_provider.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,45 @@
"""OllamaVlmProvider — VLM access via a local Ollama instance."""

from openai import OpenAI
from typing_extensions import override

from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
from askui.models.shared.coordinate_space import (
NormalizedCoordinateSpace,
PixelCoordinateSpace,
ScaledCoordinateSpace,
VlmCoordinateSpace,
)

# Base URL of the OpenAI-compatible API exposed by a local Ollama instance.
_DEFAULT_BASE_URL = "http://localhost:11434/v1"
# Ollama model used when the caller does not pass ``model_id``.
_DEFAULT_MODEL_ID = "qwen3.5"

# Coordinate spaces picked by model-id auto-detection: Qwen and Holo use a
# 1000x1000 scaled grid, Kimi uses normalised floats.
_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
_KIMI_COORDINATE_SPACE = NormalizedCoordinateSpace()


class OllamaVlmProvider(OpenAIVlmProvider):
"""VLM provider that routes requests to a local Ollama instance.

Thin convenience wrapper around `OpenAIVlmProvider` with Ollama
defaults (``base_url``, ``api_key``, ``model_id``).

Qwen and Holo models are automatically detected and their coordinate
space is set to ``ScaledCoordinateSpace(width=1000, height=1000)``.
Kimi models use ``NormalizedCoordinateSpace()``.
Pass ``coordinate_space`` explicitly to override auto-detection.

Args:
model_id (str, optional): Ollama model to use. Defaults to
``"qwen3.5"``.
base_url (str, optional): Base URL for the Ollama OpenAI-compatible
API. Defaults to ``"http://localhost:11434/v1"``.
client (`OpenAI` | None, optional): Pre-configured OpenAI client.
If provided, ``base_url`` is ignored.
coordinate_space (VlmCoordinateSpace | None, optional): The coordinate
grid the model emits coordinates in. ``None`` (the default)
enables auto-detection based on ``model_id``.

Example:
```python
Expand All @@ -40,10 +59,27 @@ def __init__(
model_id: str = _DEFAULT_MODEL_ID,
base_url: str = _DEFAULT_BASE_URL,
client: OpenAI | None = None,
coordinate_space: VlmCoordinateSpace | None = None,
) -> None:
self._coordinate_space_override = coordinate_space
super().__init__(
model_id=model_id,
api_key="ollama", # Ollama requires no auth; OpenAI SDK needs a value
base_url=base_url,
client=client,
coordinate_space=coordinate_space or PixelCoordinateSpace(),
)

@property
@override
def coordinate_space(self) -> VlmCoordinateSpace:
    """Resolve the coordinate space for the configured Ollama model.

    Resolution order:

    1. The ``coordinate_space`` passed to the constructor, if it was not
       ``None``.
    2. A space matched on the lower-cased model id (``"qwen"``, then
       ``"holo"``, then ``"kimi"`` as a substring).
    3. The space stored by the parent constructor.

    Returns:
        VlmCoordinateSpace: The coordinate space to use for this model.
    """
    explicit = self._coordinate_space_override
    if explicit is not None:
        return explicit

    model_name = self._model_id_value.lower()
    # Checked in order; the first substring hit wins.
    known_families = (
        ("qwen", _QWEN_COORDINATE_SPACE),
        ("holo", _HOLO_COORDINATE_SPACE),
        ("kimi", _KIMI_COORDINATE_SPACE),
    )
    for marker, space in known_families:
        if marker in model_name:
            return space
    return self._coordinate_space
26 changes: 26 additions & 0 deletions src/askui/model_providers/openai_vlm_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,17 @@
ThinkingConfigParam,
ToolChoiceParam,
)
from askui.models.shared.coordinate_space import (
SCREENSHOT_RESOLUTION,
PixelCoordinateSpace,
VlmCoordinateSpace,
)
from askui.models.shared.prompts import SystemPrompt
from askui.models.shared.tools import ToolCollection
from askui.utils.model_pricing import ModelPricing

# Fallback model id, used when neither the ``model_id`` argument nor the
# ``VLM_PROVIDER_MODEL_ID`` environment variable provides one.
_DEFAULT_MODEL_ID = "gpt-5.4"
# Default value of the ``coordinate_space`` constructor argument.
# NOTE(review): this is a single module-level instance, so every provider
# built without an explicit ``coordinate_space`` shares the same object.
_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()


class OpenAIVlmProvider(VlmProvider):
Expand All @@ -36,6 +42,9 @@ class OpenAIVlmProvider(VlmProvider):
to the OpenAI API (``https://api.openai.com/v1``).
client (`OpenAI` | None, optional): Pre-configured OpenAI client.
If provided, ``api_key`` and ``base_url`` are ignored.
coordinate_space (VlmCoordinateSpace, optional): The coordinate grid
the model emits coordinates in. Defaults to the screenshot
resolution (native pixel coordinates).

Example:
```python
Expand All @@ -57,6 +66,7 @@ def __init__(
api_key: str | None = None,
base_url: str | None = None,
client: OpenAI | None = None,
coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE,
input_cost_per_million_tokens: float | None = None,
output_cost_per_million_tokens: float | None = None,
cache_write_cost_per_million_tokens: float | None = None,
Expand All @@ -65,6 +75,7 @@ def __init__(
self._model_id_value = (
model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
)
self._coordinate_space = coordinate_space
if client is not None:
self._client = client
else:
Expand All @@ -86,6 +97,11 @@ def __init__(
def model_id(self) -> str:
return self._model_id_value

@property
@override
def coordinate_space(self) -> VlmCoordinateSpace:
    """Return the coordinate space stored on this provider at construction.

    Returns:
        VlmCoordinateSpace: The value kept in ``self._coordinate_space``.
    """
    return self._coordinate_space

@property
@override
def pricing(self) -> ModelPricing | None:
Expand All @@ -96,6 +112,14 @@ def _messages_api(self) -> OpenAIMessagesApi:
"""Lazily initialise the `OpenAIMessagesApi` on first use."""
return OpenAIMessagesApi(client=self._client)

@override
def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
    """Append coordinate and resolution info to the system prompt.

    The coordinate section is rendered by this provider's
    ``coordinate_space`` for ``SCREENSHOT_RESOLUTION`` and is separated
    from the original prompt text by a blank line.

    Args:
        system (SystemPrompt): The prompt to extend. It is only read via
            ``str()``.

    Returns:
        SystemPrompt: A new prompt built from the combined text.
    """
    coordinate_section = self.coordinate_space.build_prompt_section(
        screenshot_resolution=SCREENSHOT_RESOLUTION,
    )
    augmented_text = f"{system!s}\n\n{coordinate_section}"
    return SystemPrompt(prompt=augmented_text)

@override
def create_message(
self,
Expand All @@ -108,6 +132,8 @@ def create_message(
temperature: float | None = None,
provider_options: dict[str, Any] | None = None,
) -> MessageParam:
if system is not None:
system = self.augment_system_prompt(system)
return self._messages_api.create_message(
messages=messages,
model_id=self._model_id_value,
Expand Down
31 changes: 31 additions & 0 deletions src/askui/model_providers/vlm_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@
ThinkingConfigParam,
ToolChoiceParam,
)
from askui.models.shared.coordinate_space import (
PixelCoordinateSpace,
VlmCoordinateSpace,
)
from askui.models.shared.prompts import SystemPrompt
from askui.models.shared.tools import ToolCollection
from askui.utils.model_pricing import ModelPricing

# Shared instance returned by the base ``VlmProvider.coordinate_space``
# property when a subclass does not override it.
_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()


class VlmProvider(ABC):
"""Interface for Vision Language Model providers.
Expand Down Expand Up @@ -44,6 +50,17 @@ class VlmProvider(ABC):
def model_id(self) -> str:
"""The model identifier used by this provider."""

@property
def coordinate_space(self) -> VlmCoordinateSpace:
    """The coordinate space this model emits coordinates in.

    Returns a `VlmCoordinateSpace` describing the grid the model uses.
    The default is `PixelCoordinateSpace` (native pixel coordinates).
    Override in subclasses when the model uses a different grid
    (e.g. ``ScaledCoordinateSpace(1000, 1000)`` for Qwen).

    Returns:
        VlmCoordinateSpace: The module-level default
            `PixelCoordinateSpace` instance unless overridden.
    """
    return _DEFAULT_COORDINATE_SPACE

@property
def pricing(self) -> ModelPricing | None:
"""Pricing information for this provider's model.
Expand All @@ -53,6 +70,20 @@ def pricing(self) -> ModelPricing | None:
"""
return None

def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
    """Hook for providers to augment the system prompt before sending.

    Called by ``create_message()`` implementations. The base
    implementation returns the prompt unchanged. Override in
    subclasses that need to inject provider-specific information
    (e.g. coordinate bounds for non-Anthropic models).

    The original ``SystemPrompt`` object is **not** mutated —
    implementations should create a new ``SystemPrompt`` wrapping
    the augmented text.

    Args:
        system (SystemPrompt): The system prompt about to be sent.

    Returns:
        SystemPrompt: The prompt to send; in this base implementation
            the very same object that was passed in.
    """
    return system

@abstractmethod
def create_message(
self,
Expand Down
10 changes: 10 additions & 0 deletions src/askui/models/shared/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
from .android_base_tool import AndroidBaseTool
from .computer_base_tool import ComputerBaseTool
from .coordinate_space import (
NormalizedCoordinateSpace,
PixelCoordinateSpace,
ScaledCoordinateSpace,
VlmCoordinateSpace,
)
from .tool_tags import ToolTags

try:
Expand All @@ -13,6 +19,10 @@
# Public names re-exported by this package, kept in alphabetical order.
__all__ = [
    "AndroidBaseTool",
    "ComputerBaseTool",
    "NormalizedCoordinateSpace",
    "PixelCoordinateSpace",
    "ScaledCoordinateSpace",
    "ToolTags",
    "VlmCoordinateSpace",
]

Expand Down
104 changes: 104 additions & 0 deletions src/askui/models/shared/coordinate_space.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from __future__ import annotations

from abc import ABC, abstractmethod

from pydantic import BaseModel, Field

# The resolution screenshots are scaled to before being sent to the model.
# Used by all agent OS facades (computer, Android, Playwright).
SCREENSHOT_RESOLUTION: tuple[int, int] = (1024, 768)


def _common_prompt_lines(screenshot_resolution: tuple[int, int]) -> list[str]:
    """Return the prompt bullet lines shared by every coordinate space.

    Args:
        screenshot_resolution: ``(width, height)`` of the screenshot, in
            pixels, as reported to the model.

    Returns:
        A new list with three bullet lines: the screenshot resolution, a
        note about padding bars, and the coordinate origin.
    """
    width, height = screenshot_resolution
    resolution_line = f"* Screenshot resolution: {width}x{height} pixels"
    padding_line = (
        "* Screenshots may contain black padding bars to preserve the "
        "original aspect ratio. UI elements are NOT located in the "
        "padding area."
    )
    origin_line = "* Coordinate origin is the top-left corner (0, 0)"
    return [resolution_line, padding_line, origin_line]


class VlmCoordinateSpace(BaseModel, ABC):
    """Abstract base for VLM coordinate conventions.

    Each subclass describes one coordinate grid a VLM may emit and knows
    how to map those coordinates back to pixel space and how to render
    the matching prompt section.
    """

    @abstractmethod
    def map_to_target(
        self, x: float, y: float, target_resolution: tuple[int, int]
    ) -> tuple[int, int]:
        """Map model coordinates to pixel coordinates in *target_resolution*.

        Args:
            x: Horizontal coordinate as emitted by the model.
            y: Vertical coordinate as emitted by the model.
            target_resolution: ``(width, height)`` of the pixel space to
                map into.

        Returns:
            The mapped ``(x, y)`` as integers.
        """

    @abstractmethod
    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
        """Build prompt text describing coordinate bounds for the model.

        Args:
            screenshot_resolution: ``(width, height)`` of the screenshot
                the model is shown.

        Returns:
            The prompt section as a single string.
        """


class PixelCoordinateSpace(VlmCoordinateSpace):
    """Identity mapping -- coordinates already in pixel space.

    Used by Anthropic/Claude which emit coordinates matching the
    screenshot resolution.
    """

    def map_to_target(
        self,
        x: float,
        y: float,
        target_resolution: tuple[int, int],  # noqa: ARG002
    ) -> tuple[int, int]:
        """Truncate the model coordinates to integers without rescaling.

        Args:
            x: Horizontal coordinate emitted by the model.
            y: Vertical coordinate emitted by the model.
            target_resolution: Accepted for interface compatibility; not
                used by this implementation.

        Returns:
            ``(int(x), int(y))``.
        """
        pixel_x = int(x)
        pixel_y = int(y)
        return pixel_x, pixel_y

    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
        """Render the shared prompt lines plus pixel coordinate bounds.

        Args:
            screenshot_resolution: ``(width, height)`` of the screenshot.

        Returns:
            The newline-joined prompt section.
        """
        width, height = screenshot_resolution
        bounds_line = f"* Coordinate bounds: 0 <= x < {width}, 0 <= y < {height}"
        return "\n".join([*_common_prompt_lines(screenshot_resolution), bounds_line])


class ScaledCoordinateSpace(VlmCoordinateSpace):
    """Integer grid (e.g. 1000x1000 for Qwen). Linear scaling."""

    width: int = Field(gt=0, description="Width of the coordinate grid")
    height: int = Field(gt=0, description="Height of the coordinate grid")

    def map_to_target(
        self, x: float, y: float, target_resolution: tuple[int, int]
    ) -> tuple[int, int]:
        """Linearly rescale grid coordinates onto *target_resolution*.

        Args:
            x: Horizontal coordinate on the ``width``-wide grid.
            y: Vertical coordinate on the ``height``-tall grid.
            target_resolution: ``(width, height)`` of the pixel space to
                map into.

        Returns:
            The rescaled ``(x, y)``, truncated to integers.
        """
        target_w, target_h = target_resolution
        mapped_x = int(x * target_w / self.width)
        mapped_y = int(y * target_h / self.height)
        return mapped_x, mapped_y

    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
        """Render the shared prompt lines plus this grid's bounds.

        When the grid differs from the screenshot resolution the model is
        told to emit coordinates on the grid; otherwise plain pixel bounds
        are stated.

        Args:
            screenshot_resolution: ``(width, height)`` of the screenshot.

        Returns:
            The newline-joined prompt section.
        """
        shared = _common_prompt_lines(screenshot_resolution)
        if (self.width, self.height) != screenshot_resolution:
            grid_line = (
                f"* Emit coordinates in a {self.width}x{self.height} "
                f"normalised grid: 0 <= x < {self.width}, "
                f"0 <= y < {self.height}"
            )
            return "\n".join([*shared, grid_line])

        shot_w, shot_h = screenshot_resolution
        bounds_line = f"* Coordinate bounds: 0 <= x < {shot_w}, 0 <= y < {shot_h}"
        return "\n".join([*shared, bounds_line])


class NormalizedCoordinateSpace(VlmCoordinateSpace):
    """0.0-1.0 float grid (Kimi). No fields."""

    def map_to_target(
        self, x: float, y: float, target_resolution: tuple[int, int]
    ) -> tuple[int, int]:
        """Scale normalised coordinates onto *target_resolution*.

        The prompt built by this class allows the closed interval
        ``[0.0, 1.0]``, so an input of exactly ``1.0`` is valid. Plain
        scaling would place it at ``width`` / ``height``, one past the last
        pixel, so in-range inputs are pulled back onto the last pixel.
        Inputs above ``1.0`` and negative inputs are scaled without
        clamping, exactly as before.

        Args:
            x: Horizontal coordinate, expected in ``[0.0, 1.0]``.
            y: Vertical coordinate, expected in ``[0.0, 1.0]``.
            target_resolution: ``(width, height)`` of the pixel space to
                map into.

        Returns:
            The scaled ``(x, y)``, truncated to integers.
        """
        tw, th = target_resolution
        px, py = int(x * tw), int(y * th)
        # Only in-range inputs are adjusted; the max() guard keeps a
        # zero-sized axis from producing -1.
        if x <= 1.0:
            px = min(px, max(tw - 1, 0))
        if y <= 1.0:
            py = min(py, max(th - 1, 0))
        return px, py

    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
        """Render the shared prompt lines plus the normalised-float rule.

        Args:
            screenshot_resolution: ``(width, height)`` of the screenshot.

        Returns:
            The newline-joined prompt section.
        """
        lines = _common_prompt_lines(screenshot_resolution)
        lines.append(
            "* Emit coordinates as normalised floats: 0.0 <= x <= 1.0, 0.0 <= y <= 1.0"
        )
        return "\n".join(lines)
Loading
Loading