Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
2b252c6
feat: add description, class and text locators
adi-wan-askui Apr 3, 2025
e9c5a97
feat(locators): optimise serialization for anthropic claude 3.5 sonnet
adi-wan-askui Apr 3, 2025
da923d7
feat(locators): add relations
adi-wan-askui Apr 3, 2025
9e82706
fix(locators): fix serialization + restructure
adi-wan-askui Apr 7, 2025
ffd329c
fix(locators): fix nested relation serialization
adi-wan-askui Apr 8, 2025
1cc944c
fix(locators): serializations
adi-wan-askui Apr 8, 2025
3ec2db2
feat(locators): add image locator + locator validation
adi-wan-askui Apr 9, 2025
8d2d08e
feat(locators): improve error messages when loading images
adi-wan-askui Apr 9, 2025
fa2363d
refactor(locators): rm unused redirection
adi-wan-askui Apr 9, 2025
7062a2d
feat(locators): add serialization of image locators for AskUI API
adi-wan-askui Apr 9, 2025
bb98ab1
feat(locators): add ai element locator
adi-wan-askui Apr 10, 2025
9776d8d
test(unit): disable telemetry and set workspace id env to fix tests
adi-wan-askui Apr 11, 2025
9a7c4e1
chore(locators): remove everything but public locators from public pk…
adi-wan-askui Apr 11, 2025
3526cf7
docs(locators): add missing doc strings
adi-wan-askui Apr 11, 2025
12e0c44
fix(router): allow "Locator" only with "askui" model
adi-wan-askui Apr 11, 2025
1d6837d
fix(locators): add cycle detection to locators
adi-wan-askui Apr 11, 2025
33f176e
feat(agent)!: use dep. inj. to make better testable/configurable
adi-wan-askui Apr 11, 2025
49e0f44
refactor: rename `LocatingError` to `ElementNotFoundError`
adi-wan-askui Apr 11, 2025
f7ecb3c
feat(agent): do not report by default
adi-wan-askui Apr 11, 2025
324d5cb
feat!: change default model selection
adi-wan-askui Apr 11, 2025
89a30b2
docs: update README (add locators, new agent actions, new reporters e…
adi-wan-askui Apr 11, 2025
ea64305
test: reset device id tests correctly
adi-wan-askui Apr 11, 2025
e906570
feat!(agent): enable `askui` model + response format with `get()`
adi-wan-askui Apr 14, 2025
ba524ec
feat!(agent): raise error with get() if response schema is not implem…
adi-wan-askui Apr 15, 2025
3d805c0
refactor(utils): clean up where functions, classes etc. are defined
adi-wan-askui Apr 15, 2025
9fa65c2
refactor: remove obsolete code
adi-wan-askui Apr 15, 2025
7c5c061
feat!(agent): switch `get()` from dictionary to pydantic.BaseModel fo…
adi-wan-askui Apr 15, 2025
47da12e
docs: document `VisionAgent.get()`
adi-wan-askui Apr 15, 2025
e80fc50
refactor!(agent): rename `model_name` parameter to `model`
adi-wan-askui Apr 15, 2025
b6f7837
feat!(agent): enable selecting models using composition / for whole a…
adi-wan-askui Apr 16, 2025
b4a584a
refactor!(locators): rename `Class` to `Element`
adi-wan-askui Apr 17, 2025
0e3238c
feat(agent): support primitive types as response_schema in get method
adi-wan-askui Apr 17, 2025
469058c
refactor: validate all public methods & make locators non-pydantic based
adi-wan-askui Apr 17, 2025
5fb40b4
fix(reporting): fix reports overriding each other
adi-wan-askui Apr 17, 2025
c04b4b9
refactor(agent): make agent more modular / better testable
adi-wan-askui Apr 17, 2025
e4bbf11
feat(agent): make it easier to pass image to locate() and get()
adi-wan-askui Apr 17, 2025
30bac04
docs(locators): improve docs of relations
adi-wan-askui Apr 22, 2025
5406f26
feat(reporting): add image for get() to report
adi-wan-askui Apr 22, 2025
3712cfd
refactor(locators)!: rename Description to Prompt
adi-wan-askui Apr 22, 2025
5ff4774
feat(locators): change default reference point to center for right_of…
adi-wan-askui Apr 22, 2025
a256e5e
docs(locators): document all parameters
adi-wan-askui Apr 22, 2025
b7e9c57
feat(reporting): add image of Image / AIElement to report
adi-wan-askui Apr 23, 2025
2768389
Merge pull request #42 from askui/general-optimisations
adi-wan-askui Apr 23, 2025
ae4ae46
Merge pull request #41 from askui/model-selection
adi-wan-askui Apr 23, 2025
a19e477
Merge pull request #40 from askui/get-command
adi-wan-askui Apr 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,10 @@ path = "src/askui/__init__.py"
distribution = true

[tool.pdm.scripts]
test = "pytest"
"test:unit" = "pytest tests/unit"
"test:integration" = "pytest tests/integration"
test = "pytest -n auto"
"test:e2e" = "pytest -n auto tests/e2e"
"test:integration" = "pytest -n auto tests/integration"
"test:unit" = "pytest -n auto tests/unit"
sort = "isort ."
format = "black ."
lint = "ruff check ."
Expand All @@ -56,6 +57,7 @@ test = [
"black>=25.1.0",
"ruff>=0.9.5",
"pytest-mock>=3.14.0",
"pytest-xdist>=3.6.1",
]
chat = [
"streamlit>=1.42.0",
Expand Down
54 changes: 30 additions & 24 deletions src/askui/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pydantic import Field, validate_call

from askui.container import telemetry
from askui.locators import Locator

from .tools.askui.askui_controller import (
AskUiControllerClient,
Expand All @@ -15,7 +16,7 @@
from .models.anthropic.claude import ClaudeHandler
from .logger import logger, configure_logging
from .tools.toolbox import AgentToolbox
from .models.router import ModelRouter
from .models.router import ModelRouter, Point
from .reporting.report import SimpleReportGenerator
import time
from dotenv import load_dotenv
Expand Down Expand Up @@ -59,13 +60,13 @@ def _check_askui_controller_enabled(self) -> None:
"AskUI Controller is not initialized. Please, set `enable_askui_controller` to `True` when initializing the `VisionAgent`."
)

@telemetry.record_call(exclude={"instruction"})
def click(self, instruction: Optional[str] = None, button: Literal['left', 'middle', 'right'] = 'left', repeat: int = 1, model_name: Optional[str] = None) -> None:
@telemetry.record_call(exclude={"locator"})
def click(self, locator: Optional[str | Locator] = None, button: Literal['left', 'middle', 'right'] = 'left', repeat: int = 1, model_name: Optional[str] = None) -> None:
"""
Simulates a mouse click on the user interface element identified by the provided instruction.
Simulates a mouse click on the user interface element identified by the provided locator.

Parameters:
instruction (str | None): The identifier or description of the element to click.
locator (str | Locator | None): The identifier or description of the element to click.
button ('left' | 'middle' | 'right'): Specifies which mouse button to click. Defaults to 'left'.
repeat (int): The number of times to click. Must be greater than 0. Defaults to 1.
model_name (str | None): The model name to be used for element detection. Optional.
Expand All @@ -92,29 +93,34 @@ def click(self, instruction: Optional[str] = None, button: Literal['left', 'midd
msg = f'{button} ' + msg
if repeat > 1:
msg += f' {repeat}x times'
if instruction is not None:
msg += f' on "{instruction}"'
if locator is not None:
msg += f' on "{locator}"'
self.report.add_message("User", msg)
if instruction is not None:
logger.debug("VisionAgent received instruction to click '%s'", instruction)
self.__mouse_move(instruction, model_name)
if locator is not None:
logger.debug("VisionAgent received instruction to click '%s'", locator)
self._mouse_move(locator, model_name)
self.client.click(button, repeat) # type: ignore

def __mouse_move(self, instruction: str, model_name: Optional[str] = None) -> None:
self._check_askui_controller_enabled()
screenshot = self.client.screenshot() # type: ignore
x, y = self.model_router.locate(screenshot, instruction, model_name)

def locate(self, locator: str | Locator, screenshot: Optional[Image.Image] = None, model_name: Optional[str] = None) -> Point:
if screenshot is None:
self._check_askui_controller_enabled()
screenshot = self.client.screenshot() # type: ignore
point = self.model_router.locate(screenshot, locator, model_name)
if self.report is not None:
self.report.add_message("ModelRouter", f"locate: ({x}, {y})")
self.client.mouse(x, y) # type: ignore
self.report.add_message("ModelRouter", f"locate: ({point[0]}, {point[1]})")
return point

def _mouse_move(self, locator: str | Locator, model_name: Optional[str] = None) -> None:
point = self.locate(locator=locator, model_name=model_name)
self.client.mouse(point[0], point[1]) # type: ignore

@telemetry.record_call(exclude={"instruction"})
def mouse_move(self, instruction: str, model_name: Optional[str] = None) -> None:
@telemetry.record_call(exclude={"locator"})
def mouse_move(self, locator: str | Locator, model_name: Optional[str] = None) -> None:
"""
Moves the mouse cursor to the UI element identified by the provided instruction.
Moves the mouse cursor to the UI element identified by the provided locator.

Parameters:
instruction (str): The identifier or description of the element to move to.
locator (str | Locator): The identifier or description of the element to move to.
model_name (str | None): The model name to be used for element detection. Optional.

Example:
Expand All @@ -126,9 +132,9 @@ def mouse_move(self, instruction: str, model_name: Optional[str] = None) -> None
```
"""
if self.report is not None:
self.report.add_message("User", f'mouse_move: "{instruction}"')
logger.debug("VisionAgent received instruction to mouse_move '%s'", instruction)
self.__mouse_move(instruction, model_name)
self.report.add_message("User", f'mouse_move: "{locator}"')
logger.debug("VisionAgent received instruction to mouse_move to '%s'", locator)
self._mouse_move(locator, model_name)

@telemetry.record_call()
def mouse_scroll(self, x: int, y: int) -> None:
Expand Down
2 changes: 1 addition & 1 deletion src/askui/chat/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def rerun():
image=screenshot_with_crosshair,
)
agent.mouse_move(
instruction=element_description.replace('"', ""),
locator=element_description.replace('"', ""),
model_name="anthropic-claude-3-5-sonnet-20241022",
)
else:
Expand Down
13 changes: 13 additions & 0 deletions src/askui/locators/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from .relatable import ReferencePoint
from .locators import Class, Description, Locator, Text, TextMatchType
from . import serializers

__all__ = [
"Class",
"Description",
"Locator",
"ReferencePoint",
"Text",
"TextMatchType",
"serializers",
]
77 changes: 77 additions & 0 deletions src/askui/locators/locators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from abc import ABC, abstractmethod
from typing import Generic, Literal, TypeVar

from askui.locators.relatable import Relatable


# Type variable for the value that a LocatorSerializer produces from a Locator.
SerializedLocator = TypeVar("SerializedLocator")


class LocatorSerializer(Generic[SerializedLocator], ABC):
    """Abstract interface for turning a ``Locator`` into a serialized form.

    The type parameter ``SerializedLocator`` is the type returned by
    ``serialize``.
    """

    @abstractmethod
    def serialize(self, locator: "Locator") -> SerializedLocator:
        """Return the serialized representation of ``locator``.

        Subclasses must override this method; the base implementation
        always raises ``NotImplementedError``.
        """
        raise NotImplementedError


class Locator(Relatable, ABC):
    """Abstract base class of locators; a ``Relatable`` that can be serialized."""

    def serialize(
        self, serializer: LocatorSerializer[SerializedLocator]
    ) -> SerializedLocator:
        """Serialize this locator by handing it to ``serializer``.

        Parameters:
            serializer: The serializer whose ``serialize`` method is called
                with this locator.

        Returns:
            Whatever ``serializer.serialize(self)`` returns.
        """
        serialized = serializer.serialize(self)
        return serialized


class Description(Locator):
    """Locator that identifies an element by a textual description."""

    def __init__(self, description: str) -> None:
        """Store ``description`` after initializing the base locator."""
        super().__init__()
        self.description = description

    def __str__(self) -> str:
        """Return the description phrase followed by the relations string."""
        phrase = f'element with description "{self.description}"'
        relations = super()._relations_str()
        return phrase + relations


class Class(Locator):
    """Locator that identifies an element by its class.

    A ``class_name`` of ``None`` indicates an element that has a class,
    but not one specific class.
    """

    def __init__(
        self, class_name: Literal["text", "textfield"] | None = None
    ) -> None:
        """Store ``class_name`` after initializing the base locator."""
        super().__init__()
        self.class_name = class_name

    def __str__(self) -> str:
        """Return the class phrase followed by the relations string."""
        if self.class_name:
            phrase = f'element with class "{self.class_name}"'
        else:
            # No (or an empty) class name: describe any element with a class.
            phrase = "element that has a class"
        return phrase + super()._relations_str()


# The text-matching modes accepted by Text(match_type=...).
TextMatchType = Literal["similar", "exact", "contains", "regex"]


class Text(Class):
    """Locator for an element of class ``"text"``, optionally matched by content.

    Parameters:
        text: The text to match, or ``None`` to describe any text element.
        match_type: How ``text`` is matched; one of ``TextMatchType``.
            Defaults to ``"similar"``.
        similarity_threshold: Threshold shown as a percentage in the string
            form when ``match_type`` is ``"similar"``. Defaults to 70.
    """

    def __init__(
        self,
        text: str | None = None,
        match_type: TextMatchType = "similar",
        similarity_threshold: int = 70,
    ) -> None:
        super().__init__(class_name="text")
        self.text = text
        self.match_type = match_type
        self.similarity_threshold = similarity_threshold

    def __str__(self) -> str:
        """Return a description of the text match plus the relations string."""
        # Without a text to match, the description is just "text".
        if self.text is None:
            return "text" + super()._relations_str()

        mode = self.match_type
        if mode == "similar":
            detail = f'similar to "{self.text}" (similarity >= {self.similarity_threshold}%)'
        elif mode == "exact":
            detail = f'"{self.text}"'
        elif mode == "contains":
            detail = f'containing text "{self.text}"'
        elif mode == "regex":
            detail = f'matching regex "{self.text}"'
        else:
            # An unrecognized match type adds nothing after the "text " prefix.
            detail = ""
        return "text " + detail + super()._relations_str()
Loading