askui · philipph-askui · Jun 9, 2026
diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py
@@ -87,7 +87,6 @@ def __init__(
     ) -> None:
         reporter = CompositeReporter(reporters=reporters)
         self.os = PpadbAgentOs(device_identifier=device, reporter=reporter)
-        self.act_agent_os_facade = AndroidAgentOsFacade(self.os)
         super().__init__(
             reporter=reporter,
             retry=retry,
@@ -97,6 +96,10 @@ def __init__(
             callbacks=callbacks,
             truncation_strategy=truncation_strategy,
         )
+        self.act_agent_os_facade = AndroidAgentOsFacade(
+            self.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
+        )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with Android-specific settings
         self.act_settings = ActSettings(

diff --git a/src/askui/computer_agent.py b/src/askui/computer_agent.py
@@ -130,7 +130,8 @@ def __init__(
             truncation_strategy=truncation_strategy,
         )
         self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade(
-            self.tools.os
+            self.tools.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
         )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with computer-specific settings

diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py
@@ -1,26 +1,45 @@
 """OllamaVlmProvider — VLM access via a local Ollama instance."""
 
 from openai import OpenAI
+from typing_extensions import override
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+    VlmCoordinateSpace,
+)
 
 _DEFAULT_BASE_URL = "http://localhost:11434/v1"
 _DEFAULT_MODEL_ID = "qwen3.5"
 
+_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
+_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
+_KIMI_COORDINATE_SPACE = NormalizedCoordinateSpace()
+
 
 class OllamaVlmProvider(OpenAIVlmProvider):
     """VLM provider that routes requests to a local Ollama instance.
 
     Thin convenience wrapper around `OpenAIVlmProvider` with Ollama
     defaults (``base_url``, ``api_key``, ``model_id``).
 
+    Qwen and Holo models are automatically detected and their coordinate
+    space is set to ``ScaledCoordinateSpace(width=1000, height=1000)``.
+    Kimi models use ``NormalizedCoordinateSpace()``.
+    Pass ``coordinate_space`` explicitly to override auto-detection.
+
     Args:
         model_id (str, optional): Ollama model to use. Defaults to
             ``"qwen3.5"``.
         base_url (str, optional): Base URL for the Ollama OpenAI-compatible
             API. Defaults to ``"http://localhost:11434/v1"``.
         client (`OpenAI` | None, optional): Pre-configured OpenAI client.
             If provided, ``base_url`` is ignored.
+        coordinate_space (VlmCoordinateSpace | None, optional): The coordinate
+            grid the model emits coordinates in.  ``None`` (the default)
+            enables auto-detection based on ``model_id``.
 
     Example:
         ```python
@@ -40,10 +59,27 @@ def __init__(
         model_id: str = _DEFAULT_MODEL_ID,
         base_url: str = _DEFAULT_BASE_URL,
         client: OpenAI | None = None,
+        coordinate_space: VlmCoordinateSpace | None = None,
     ) -> None:
+        self._coordinate_space_override = coordinate_space
         super().__init__(
             model_id=model_id,
             api_key="ollama",  # Ollama requires no auth; OpenAI SDK needs a value
             base_url=base_url,
             client=client,
+            coordinate_space=coordinate_space or PixelCoordinateSpace(),
         )
+
+    @property
+    @override
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        """Return the explicit override, else the space detected from the model id."""
+        if (explicit := self._coordinate_space_override) is not None:
+            return explicit
+        known = (
+            ("qwen", _QWEN_COORDINATE_SPACE),
+            ("holo", _HOLO_COORDINATE_SPACE),
+            ("kimi", _KIMI_COORDINATE_SPACE),
+        )
+        name = self._model_id_value.lower()
+        return next((sp for key, sp in known if key in name), self._coordinate_space)
diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py
@@ -14,11 +14,17 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.coordinate_space import (
+    SCREENSHOT_RESOLUTION,
+    PixelCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_MODEL_ID = "gpt-5.4"
+_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
 
 
 class OpenAIVlmProvider(VlmProvider):
@@ -36,6 +42,9 @@ class OpenAIVlmProvider(VlmProvider):
             to the OpenAI API (``https://api.openai.com/v1``).
         client (`OpenAI` | None, optional): Pre-configured OpenAI client.
             If provided, ``api_key`` and ``base_url`` are ignored.
+        coordinate_space (VlmCoordinateSpace, optional): The coordinate grid
+            the model emits coordinates in.  Defaults to the screenshot
+            resolution (native pixel coordinates).
 
     Example:
         ```python
@@ -57,6 +66,7 @@ def __init__(
         api_key: str | None = None,
         base_url: str | None = None,
         client: OpenAI | None = None,
+        coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE,
         input_cost_per_million_tokens: float | None = None,
         output_cost_per_million_tokens: float | None = None,
         cache_write_cost_per_million_tokens: float | None = None,
@@ -65,6 +75,7 @@ def __init__(
         self._model_id_value = (
             model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
         )
+        self._coordinate_space = coordinate_space
         if client is not None:
             self._client = client
         else:
@@ -86,6 +97,11 @@ def __init__(
     def model_id(self) -> str:
         return self._model_id_value
 
+    @property
+    @override
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        return self._coordinate_space  # the value passed to __init__
+
     @property
     @override
     def pricing(self) -> ModelPricing | None:
@@ -96,6 +112,14 @@ def _messages_api(self) -> OpenAIMessagesApi:
         """Lazily initialise the `OpenAIMessagesApi` on first use."""
         return OpenAIMessagesApi(client=self._client)
 
+    @override
+    def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
+        """Return a new prompt: *system* plus this provider's coordinate section."""
+        section = self.coordinate_space.build_prompt_section(
+            screenshot_resolution=SCREENSHOT_RESOLUTION,
+        )
+        return SystemPrompt(prompt=f"{system!s}\n\n{section}")
+
     @override
     def create_message(
         self,
@@ -108,6 +132,8 @@ def create_message(
         temperature: float | None = None,
         provider_options: dict[str, Any] | None = None,
     ) -> MessageParam:
+        if system is not None:
+            system = self.augment_system_prompt(system)
         return self._messages_api.create_message(
             messages=messages,
             model_id=self._model_id_value,

diff --git a/src/askui/model_providers/vlm_provider.py b/src/askui/model_providers/vlm_provider.py
@@ -8,10 +8,16 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.coordinate_space import (
+    PixelCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
 from askui.utils.model_pricing import ModelPricing
 
+_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
+
 
 class VlmProvider(ABC):
     """Interface for Vision Language Model providers.
@@ -44,6 +50,17 @@ class VlmProvider(ABC):
     def model_id(self) -> str:
         """The model identifier used by this provider."""
 
+    @property
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        """The coordinate space this model emits coordinates in.
+
+        Returns a `VlmCoordinateSpace` describing the grid the model uses.
+        The default is `PixelCoordinateSpace` (native pixel coordinates).
+        Override in subclasses when the model uses a different grid
+        (e.g. ``ScaledCoordinateSpace(width=1000, height=1000)`` for Qwen).
+        """
+        return _DEFAULT_COORDINATE_SPACE
+
     @property
     def pricing(self) -> ModelPricing | None:
         """Pricing information for this provider's model.
@@ -53,6 +70,20 @@ def pricing(self) -> ModelPricing | None:
         """
         return None
 
+    def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
+        """Hook for providers to augment the system prompt before sending.
+
+        Intended to be called by ``create_message()`` implementations.  The
+        base implementation returns *system* itself, unchanged.  Override in
+        subclasses that need to inject provider-specific information
+        (e.g. coordinate bounds for non-Anthropic models).
+
+        Overrides must **not** mutate the ``SystemPrompt`` they receive —
+        they should create a new ``SystemPrompt`` wrapping the augmented
+        text and return that instead.
+        """
+        return system
+
     @abstractmethod
     def create_message(
         self,

diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py
@@ -1,5 +1,11 @@
 from .android_base_tool import AndroidBaseTool
 from .computer_base_tool import ComputerBaseTool
+from .coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from .tool_tags import ToolTags
 
 try:
@@ -13,6 +19,10 @@
 __all__ = [
     "AndroidBaseTool",
     "ComputerBaseTool",
+    "NormalizedCoordinateSpace",
+    "PixelCoordinateSpace",
+    "ScaledCoordinateSpace",
+    "VlmCoordinateSpace",
     "ToolTags",
 ]
 

diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py
@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel, Field
+
+# The resolution screenshots are scaled to before being sent to the model.
+# Used by all agent OS facades (computer, Android, Playwright).
+SCREENSHOT_RESOLUTION: tuple[int, int] = (1024, 768)
+
+
+def _common_prompt_lines(screenshot_resolution: tuple[int, int]) -> list[str]:
+    """Return the prompt bullets shared by every coordinate space."""
+    width, height = screenshot_resolution
+    return [
+        f"* Screenshot resolution: {width}x{height} pixels",
+        "* Screenshots may contain black padding bars to preserve the original "
+        "aspect ratio. UI elements are NOT located in the padding area.",
+        "* Coordinate origin is the top-left corner (0, 0)",
+    ]
+
+
+class VlmCoordinateSpace(BaseModel, ABC):
+    """Abstract base for VLM coordinate conventions.
+
+    Each subclass describes one coordinate grid a VLM may emit: it maps
+    those coordinates back to pixel space and renders the matching prompt
+    section.  Resolutions are ``(width, height)`` tuples.
+    """
+
+    @abstractmethod
+    def map_to_target(
+        self, x: float, y: float, target_resolution: tuple[int, int]
+    ) -> tuple[int, int]:
+        """Map model point (*x*, *y*) to integer pixels in *target_resolution*."""
+
+    @abstractmethod
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        """Return the prompt text that tells the model which coordinates to emit."""
+
+
+class PixelCoordinateSpace(VlmCoordinateSpace):
+    """Identity mapping for models that already emit screenshot pixels.
+
+    Used by Anthropic/Claude; values are only truncated to ``int``.
+    """
+
+    def map_to_target(
+        self,
+        x: float,
+        y: float,
+        target_resolution: tuple[int, int],  # noqa: ARG002
+    ) -> tuple[int, int]:
+        """Truncate *x* and *y* to ints; *target_resolution* is ignored."""
+        return (int(x), int(y))
+
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        """Return the shared bullets followed by the pixel-bounds bullet."""
+        width, height = screenshot_resolution
+        bounds = f"* Coordinate bounds: 0 <= x < {width}, 0 <= y < {height}"
+        return "\n".join([*_common_prompt_lines(screenshot_resolution), bounds])
+
+
+class ScaledCoordinateSpace(VlmCoordinateSpace):
+    """Integer grid (e.g. 1000x1000 for Qwen) scaled linearly onto the target."""
+
+    width: int = Field(gt=0, description="Width of the coordinate grid")
+    height: int = Field(gt=0, description="Height of the coordinate grid")
+
+    def map_to_target(
+        self, x: float, y: float, target_resolution: tuple[int, int]
+    ) -> tuple[int, int]:
+        """Scale grid point (*x*, *y*) to *target_resolution*, truncating to int."""
+        target_w, target_h = target_resolution
+        return int(x * target_w / self.width), int(y * target_h / self.height)
+
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        """Return the shared bullets plus a bullet describing this grid."""
+        if (self.width, self.height) == screenshot_resolution:
+            sw, sh = screenshot_resolution
+            bullet = f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}"
+        else:
+            bullet = (
+                f"* Emit coordinates in a {self.width}x{self.height} "
+                f"normalised grid: 0 <= x < {self.width}, 0 <= y < {self.height}"
+            )
+        return "\n".join([*_common_prompt_lines(screenshot_resolution), bullet])
+
+
+class NormalizedCoordinateSpace(VlmCoordinateSpace):
+    """0.0-1.0 float grid (Kimi). No fields."""
+
+    def map_to_target(
+        self, x: float, y: float, target_resolution: tuple[int, int]
+    ) -> tuple[int, int]:
+        """Scale to pixels, capped at the last pixel so 1.0 stays in bounds."""
+        tw, th = target_resolution
+        return min(int(x * tw), tw - 1), min(int(y * th), th - 1)
+
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        """Return the shared bullets plus the normalised-float instruction."""
+        bounds = "0.0 <= x <= 1.0, 0.0 <= y <= 1.0"
+        bullet = f"* Emit coordinates as normalised floats: {bounds}"
+        return "\n".join([*_common_prompt_lines(screenshot_resolution), bullet])