Skip to content

Commit 3cee669

Browse files
authored
Merge pull request #405 from hud-evals/j/fix-gemini-computer
Fix coordinate scaling for gemini
2 parents d2e68f8 + 81da598 commit 3cee669

4 files changed

Lines changed: 112 additions & 46 deletions

File tree

hud/tools/computer/gemini.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -228,8 +228,14 @@ async def __call__(
228228

229229
# Helper to finalize ContentResult: rescale if requested and ensure URL metadata
230230
async def _finalize(
231-
result: ContentResult, requested_url: str | None = None
231+
result: ContentResult,
232+
requested_url: str | None = None,
233+
output: str | None = None,
232234
) -> list[ContentBlock]:
235+
if output is not None and result.error is None:
236+
result.output = output
237+
elif result.error == "":
238+
result.error = "Tool execution failed with no error output"
233239
if result.base64_image and self.rescale_images:
234240
try:
235241
result.base64_image = await self._rescale_screenshot(result.base64_image)
@@ -253,14 +259,14 @@ async def _finalize(
253259
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
254260
sx, sy = self._scale_coordinates(x, y)
255261
result = await self.executor.click(x=sx, y=sy)
256-
return await _finalize(result)
262+
return await _finalize(result, output=f"Clicked at ({x}, {y})")
257263

258264
elif action == "hover_at":
259265
if x is None or y is None:
260266
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
261267
sx, sy = self._scale_coordinates(x, y)
262268
result = await self.executor.move(x=sx, y=sy)
263-
return await _finalize(result)
269+
return await _finalize(result, output=f"Hovered at ({x}, {y})")
264270

265271
elif action == "type_text_at":
266272
if x is None or y is None:
@@ -284,7 +290,7 @@ async def _finalize(
284290

285291
# Type (optionally press enter after)
286292
result = await self.executor.write(text=text, enter_after=bool(press_enter))
287-
return await _finalize(result)
293+
return await _finalize(result, output=f"Typed text at ({x}, {y})")
288294

289295
elif action == "scroll_document":
290296
if direction is None:
@@ -317,7 +323,7 @@ async def _finalize(
317323
ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
318324
)
319325
result = await self.executor.scroll(scroll_x=scroll_x, scroll_y=scroll_y)
320-
return await _finalize(result)
326+
return await _finalize(result, output=f"Scrolled document {direction}")
321327

322328
elif action == "scroll_at":
323329
if direction is None:
@@ -351,7 +357,7 @@ async def _finalize(
351357
ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
352358
)
353359
result = await self.executor.scroll(x=sx, y=sy, scroll_x=scroll_x, scroll_y=scroll_y)
354-
return await _finalize(result)
360+
return await _finalize(result, output=f"Scrolled {direction} at ({x}, {y})")
355361

356362
elif action == "wait_5_seconds":
357363
result = await self.executor.wait(time=5000)
@@ -419,7 +425,10 @@ async def _finalize(
419425
)
420426
path = self._inset_scaled_drag_path(path)
421427
result = await self.executor.drag(path=path)
422-
return await _finalize(result)
428+
return await _finalize(
429+
result,
430+
output=f"Dragged from ({x}, {y}) to ({destination_x}, {destination_y})",
431+
)
423432

424433
else:
425434
raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))

hud/tools/computer/hud.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525

2626
class AgentCoordinate(int):
27-
"""Carry both execution and model-visible coordinate values."""
27+
"""Execution pixel coordinate with optional model-coordinate metadata."""
2828

2929
agent_value: int
3030

@@ -33,15 +33,6 @@ def __new__(cls, value: int, agent_value: int) -> Self:
3333
obj.agent_value = agent_value
3434
return obj
3535

36-
def __format__(self, format_spec: str) -> str:
37-
return format(self.agent_value, format_spec)
38-
39-
def __str__(self) -> str:
40-
return str(int(self))
41-
42-
def __repr__(self) -> str:
43-
return repr(self.agent_value)
44-
4536

4637
class HudComputerTool(BaseTool):
4738
"""

hud/tools/computer/tests/test_computer.py

Lines changed: 81 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from hud.tools.computer.anthropic import AnthropicComputerTool
99
from hud.tools.computer.gemini import GeminiComputerTool
1010
from hud.tools.computer.glm import GLMComputerTool
11-
from hud.tools.computer.hud import HudComputerTool
11+
from hud.tools.computer.hud import AgentCoordinate, HudComputerTool
1212
from hud.tools.computer.openai import OpenAIComputerTool
1313
from hud.tools.computer.qwen import QwenComputerTool
1414
from hud.tools.executors.base import BaseExecutor
@@ -36,6 +36,11 @@ async def drag(self, path, pattern=None, hold_keys=None, take_screenshot=True):
3636
return await super().drag(path, pattern, hold_keys, take_screenshot=False)
3737

3838

39+
class EmptyErrorExecutor(BaseExecutor):
40+
async def click(self, *args, **kwargs):
41+
return ContentResult(error="")
42+
43+
3944
@pytest.mark.asyncio
4045
async def test_hud_computer_screenshot():
4146
comp = HudComputerTool()
@@ -75,15 +80,42 @@ async def test_anthropic_computer_screenshot():
7580

7681

7782
@pytest.mark.asyncio
78-
async def test_gemini_computer_click_reports_agent_coordinates():
83+
async def test_gemini_computer_scaling_preserves_model_coordinates():
7984
comp = GeminiComputerTool()
85+
x, y = comp._scale_coordinates(214, 420)
86+
87+
assert x is not None
88+
assert y is not None
89+
assert int(x) != 214
90+
assert int(y) != 420
91+
assert getattr(x, "agent_value") == 214
92+
assert getattr(y, "agent_value") == 420
93+
94+
95+
@pytest.mark.asyncio
96+
async def test_gemini_computer_click_reports_model_coordinates():
97+
comp = GeminiComputerTool(executor=BaseExecutor())
98+
8099
blocks = await comp(action="click_at", x=214, y=420)
81100

82101
assert any(
83-
"(214, 420)" in content.text for content in blocks if isinstance(content, TextContent)
102+
"Clicked at (214, 420)" in content.text
103+
for content in blocks
104+
if isinstance(content, TextContent)
84105
)
85106

86107

108+
@pytest.mark.asyncio
109+
async def test_gemini_computer_does_not_mask_empty_error():
110+
comp = GeminiComputerTool(executor=EmptyErrorExecutor())
111+
112+
blocks = await comp(action="click_at", x=214, y=420)
113+
text = "\n".join(content.text for content in blocks if isinstance(content, TextContent))
114+
115+
assert "Clicked at (214, 420)" not in text
116+
assert "Tool execution failed with no error output" in text
117+
118+
87119
@pytest.mark.asyncio
88120
async def test_anthropic_computer_zoom():
89121
"""Test zoom action on AnthropicComputerTool.
@@ -125,40 +157,44 @@ async def test_anthropic_computer_zoom():
125157

126158
@pytest.mark.asyncio
127159
async def test_openai_computer_click():
128-
comp = OpenAIComputerTool()
160+
comp = OpenAIComputerTool(executor=BaseExecutor())
129161
blocks = await comp(type="click", x=5, y=5)
130162
assert blocks
131-
assert any("(5, 5)" in content.text for content in blocks if isinstance(content, TextContent))
132163

133164

134165
@pytest.mark.asyncio
135-
async def test_anthropic_computer_click_reports_agent_coordinates():
136-
comp = AnthropicComputerTool()
137-
blocks = await comp(action="left_click", coordinate=[123, 456], text=None)
166+
async def test_anthropic_computer_scaling_preserves_agent_coordinates():
167+
comp = AnthropicComputerTool(executor=BaseExecutor())
168+
x, y = comp._scale_coordinates(123, 456)
138169

139-
assert any(
140-
"(123, 456)" in content.text for content in blocks if isinstance(content, TextContent)
141-
)
170+
assert x is not None
171+
assert y is not None
172+
assert getattr(x, "agent_value") == 123
173+
assert getattr(y, "agent_value") == 456
142174

143175

144176
@pytest.mark.asyncio
145-
async def test_qwen_computer_click_reports_agent_coordinates():
146-
comp = QwenComputerTool()
147-
blocks = await comp(action="left_click", coordinate=[123, 456])
177+
async def test_qwen_computer_scaling_preserves_agent_coordinates():
178+
comp = QwenComputerTool(executor=BaseExecutor())
179+
x, y = comp._scale_coordinates(123, 456)
148180

149-
assert any(
150-
"(123, 456)" in content.text for content in blocks if isinstance(content, TextContent)
151-
)
181+
assert x is not None
182+
assert y is not None
183+
assert getattr(x, "agent_value") == 123
184+
assert getattr(y, "agent_value") == 456
152185

153186

154187
@pytest.mark.asyncio
155-
async def test_glm_computer_click_reports_agent_coordinates():
156-
comp = GLMComputerTool()
157-
blocks = await comp(action="left_click", start_box="[123,456]")
188+
async def test_glm_computer_scaling_preserves_model_coordinates():
189+
comp = GLMComputerTool(executor=BaseExecutor())
190+
x, y = comp._scale_coordinates(123, 456)
158191

159-
assert any(
160-
"(123, 456)" in content.text for content in blocks if isinstance(content, TextContent)
161-
)
192+
assert x is not None
193+
assert y is not None
194+
assert int(x) != 123
195+
assert int(y) != 456
196+
assert getattr(x, "agent_value") == 123
197+
assert getattr(y, "agent_value") == 456
162198

163199

164200
def test_normalized_coordinate_max_stays_in_display_bounds():
@@ -217,6 +253,28 @@ async def test_xdo_drag_executes_interpolated_mouse_moves():
217253
assert mouse_moves[-1] == "mousemove 120 0"
218254

219255

256+
@pytest.mark.asyncio
257+
async def test_xdo_commands_use_execution_pixels_for_agent_coordinates():
258+
executor = RecordingXDOExecutor()
259+
260+
await executor.click(x=AgentCoordinate(309, 214), y=AgentCoordinate(396, 420))
261+
262+
assert executor.commands[-1] == "mousemove 309 396 click 1"
263+
264+
265+
@pytest.mark.asyncio
266+
async def test_xdo_nonzero_empty_stderr_surfaces_error(monkeypatch):
267+
async def fake_run(command: str):
268+
return 1, "", ""
269+
270+
monkeypatch.setattr("hud.tools.executors.xdo.run", fake_run)
271+
executor = XDOExecutor()
272+
273+
result = await executor.execute("mousemove 1 2", take_screenshot=False)
274+
275+
assert result.error == "Command failed with exit code 1"
276+
277+
220278
class TestHudComputerToolExtended:
221279
"""Extended tests for HudComputerTool covering edge cases and platform logic."""
222280

hud/tools/executors/xdo.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,13 @@
55
import logging
66
import os
77
import shlex
8-
from pathlib import Path
8+
from contextlib import suppress
99
from tempfile import gettempdir
1010
from typing import Literal
1111
from uuid import uuid4
1212

13+
from anyio import Path
14+
1315
from hud.tools.types import ContentResult
1416
from hud.tools.utils import run
1517

@@ -141,9 +143,14 @@ async def execute(self, command: str, take_screenshot: bool = True) -> ContentRe
141143
# Execute command
142144
returncode, stdout, stderr = await run(full_command)
143145

146+
error = None
147+
if returncode != 0:
148+
error = stderr or f"Command failed with exit code {returncode}"
149+
144150
# Prepare result
145151
result = ContentResult(
146-
output=stdout if stdout else None, error=stderr if stderr or returncode != 0 else None
152+
output=stdout if stdout else None,
153+
error=error,
147154
)
148155

149156
# Take screenshot if requested
@@ -167,7 +174,7 @@ async def screenshot(self) -> str | None:
167174
# Real screenshot using scrot
168175
if OUTPUT_DIR:
169176
output_dir = Path(OUTPUT_DIR)
170-
output_dir.mkdir(parents=True, exist_ok=True)
177+
await output_dir.mkdir(parents=True, exist_ok=True)
171178
screenshot_path = output_dir / f"screenshot_{uuid4().hex}.png"
172179
else:
173180
# Generate a unique path in system temp dir without opening a file
@@ -177,12 +184,13 @@ async def screenshot(self) -> str | None:
177184

178185
returncode, _, _stderr = await run(screenshot_cmd)
179186

180-
if returncode == 0 and screenshot_path.exists():
187+
if returncode == 0 and await screenshot_path.exists():
181188
try:
182-
image_data = screenshot_path.read_bytes()
189+
image_data = await screenshot_path.read_bytes()
183190
# Remove the file unless user requested persistence via env var
184191
if not OUTPUT_DIR:
185-
screenshot_path.unlink(missing_ok=True)
192+
with suppress(FileNotFoundError):
193+
await screenshot_path.unlink()
186194
return base64.b64encode(image_data).decode()
187195
except Exception:
188196
return None

0 commit comments

Comments
 (0)