Skip to content

Commit 35d6b36

Browse files
authored
Merge pull request #404 from hud-evals/yang/gemini-tool-fix-drag-and-drop
Yang/gemini tool fix drag and drop
2 parents 289b4f1 + 06faab7 commit 35d6b36

6 files changed

Lines changed: 164 additions & 29 deletions

File tree

hud/tools/computer/gemini.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@
2222

2323
logger = logging.getLogger(__name__)
2424

25+
# Margin, in Gemini's normalized coordinate units, that _inset_drag_coordinate
# keeps between a drag endpoint and either end of the coordinate space.
GEMINI_DRAG_INSET = 25
26+
# Margin, in display pixels, that _inset_scaled_drag_path keeps between a
# scaled drag point and the edges of the environment display.
DISPLAY_DRAG_INSET_PIXELS = 20
27+
2528
SUPPORTED_GEMINI_COMPUTER_USE_MODELS = (
2629
"gemini-2.5-computer-use-preview-10-2025",
2730
"gemini-3-flash-preview",
@@ -168,6 +171,30 @@ def __init__(
168171
**kwargs,
169172
)
170173

174+
def _inset_drag_coordinate(self, value: int) -> int:
    """Pull a normalized drag coordinate away from the ends of the coordinate space.

    The value is returned untouched when no coordinate space is configured,
    when it is not a number, or when it already lies outside
    ``[0, coordinate_space]``. Otherwise it is clamped so that it sits at
    least ``GEMINI_DRAG_INSET`` units from either end.
    """
    space = self.coordinate_space
    if space is None:
        return value
    if not isinstance(value, (int, float)):
        return value
    # Chained comparison kept on purpose: NaN fails it and is passed through.
    if not (0 <= value <= space):
        return value

    lower = GEMINI_DRAG_INSET
    # For a space narrower than twice the inset, the upper bound falls back to
    # the inset itself so the range never inverts.
    upper = max(space - lower, lower)
    raised = max(value, lower)
    return min(raised, upper)
185+
186+
def _inset_scaled_drag_path(self, path: list[tuple[int, int]]) -> list[tuple[int, int]]:
    """Keep scaled drag points inside the display so they do not hit OS/window edges.

    Each coordinate is truncated with ``int()`` and clamped to
    ``[inset, extent - 1 - inset]`` on its axis, where ``inset`` is normally
    ``DISPLAY_DRAG_INSET_PIXELS``. When an axis is too short to honour the
    full inset on both sides, the inset shrinks to at most half of that axis.
    This keeps the clamp range from inverting; an inverted range would
    collapse every point onto a single coordinate and erase the drag's
    direction on that axis.

    Args:
        path: Drag points that have already been scaled for the display.

    Returns:
        A new list of ``(x, y)`` integer tuples, one per input point.
    """

    def axis_bounds(extent: int) -> tuple[int, int]:
        # Largest valid pixel index on this axis, never negative.
        last = max(extent - 1, 0)
        inset = min(DISPLAY_DRAG_INSET_PIXELS, last // 2)
        return inset, last - inset

    # The bounds do not depend on the point, so compute them once per call.
    min_x, max_x = axis_bounds(self.environment_width)
    min_y, max_y = axis_bounds(self.environment_height)
    return [
        (
            min(max(int(x), min_x), max_x),
            min(max(int(y), min_y), max_y),
        )
        for x, y in path
    ]
197+
171198
async def __call__(
172199
self,
173200
action: str = ACTION_FIELD,
@@ -381,7 +408,16 @@ async def _finalize(
381408
message="x, y, destination_x, and destination_y are required",
382409
)
383410
)
384-
path = self._scale_path([(x, y), (destination_x, destination_y)])
411+
path = self._scale_path(
412+
[
413+
(self._inset_drag_coordinate(x), self._inset_drag_coordinate(y)),
414+
(
415+
self._inset_drag_coordinate(destination_x),
416+
self._inset_drag_coordinate(destination_y),
417+
),
418+
]
419+
)
420+
path = self._inset_scaled_drag_path(path)
385421
result = await self.executor.drag(path=path)
386422
return await _finalize(result)
387423

hud/tools/computer/tests/test_computer.py

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,28 @@
1212
from hud.tools.computer.openai import OpenAIComputerTool
1313
from hud.tools.computer.qwen import QwenComputerTool
1414
from hud.tools.executors.base import BaseExecutor
15-
from hud.tools.types import Coordinate
15+
from hud.tools.executors.xdo import XDOExecutor
16+
from hud.tools.types import ContentResult, Coordinate
17+
18+
19+
class RecordingXDOExecutor(XDOExecutor):
    """XDOExecutor test double that logs command strings instead of running them."""

    def __init__(self):
        super().__init__()
        # Every command handed to execute(), in call order.
        self.commands: list[str] = []

    async def execute(self, command: str, take_screenshot: bool = True):
        """Record ``command`` and echo it back as the result's output."""
        self.commands.append(command)
        echoed = ContentResult(output=command)
        return echoed
27+
28+
29+
class RecordingExecutor(BaseExecutor):
    """BaseExecutor test double that remembers every drag path it is given."""

    def __init__(self):
        super().__init__()
        # Paths passed to drag(), in call order, exactly as the caller supplied them.
        self.drag_paths: list[list[tuple[int, int]]] = []

    async def drag(self, path, pattern=None, hold_keys=None, take_screenshot=True):
        """Record ``path``, then delegate to the base drag with screenshots disabled."""
        self.drag_paths.append(path)
        outcome = await super().drag(path, pattern, hold_keys, take_screenshot=False)
        return outcome
1637

1738

1839
@pytest.mark.asyncio
@@ -151,6 +172,51 @@ def test_normalized_coordinate_max_stays_in_display_bounds():
151172
assert int(y) <= comp.environment_height - 1
152173

153174

175+
def test_drag_path_interpolation_adds_intermediate_points():
    """A 120px horizontal segment is densified to 11 points with both endpoints kept."""
    start, end = (0, 0), (120, 0)

    points = BaseExecutor()._interpolate_drag_path([start, end])

    assert points[0] == start
    assert points[-1] == end
    assert len(points) == 11
183+
184+
185+
@pytest.mark.asyncio
async def test_gemini_drag_clamps_edges_and_interpolates_executor_path():
    """Gemini drag_and_drop hands the executor a path inset from every display edge.

    The drag starts on the left edge (x=0) and ends on the right edge of the
    normalized space (x=1000). Every point the executor receives must stay
    within the pixel inset on both axes, and interpolating that path must add
    intermediate points while keeping its endpoints.
    """
    # Imported here so the test tracks the real inset instead of a copied literal.
    from hud.tools.computer.gemini import DISPLAY_DRAG_INSET_PIXELS

    executor = RecordingExecutor()
    comp = GeminiComputerTool(executor=executor, width=1400, height=850)

    blocks = await comp(
        action="drag_and_drop",
        x=0,
        y=500,
        destination_x=1000,
        destination_y=500,
    )

    assert blocks
    path = executor.drag_paths[0]

    # Check both bounds on both axes for every point, not just one bound per endpoint.
    max_x = comp.environment_width - 1 - DISPLAY_DRAG_INSET_PIXELS
    max_y = comp.environment_height - 1 - DISPLAY_DRAG_INSET_PIXELS
    for point_x, point_y in path:
        assert DISPLAY_DRAG_INSET_PIXELS <= point_x <= max_x
        assert DISPLAY_DRAG_INSET_PIXELS <= point_y <= max_y

    interpolated = executor._interpolate_drag_path(path)
    assert len(interpolated) > 2
    # Interpolation may only add points between the endpoints, never move them.
    assert interpolated[0] == path[0]
    assert interpolated[-1] == path[-1]
205+
206+
207+
@pytest.mark.asyncio
async def test_xdo_drag_executes_interpolated_mouse_moves():
    """An XDO drag over 120px issues one mousemove per interpolated point."""
    recorder = RecordingXDOExecutor()

    outcome = await recorder.drag([(0, 0), (120, 0)], take_screenshot=False)

    moves = list(filter(lambda issued: issued.startswith("mousemove "), recorder.commands))
    assert outcome.output == "Dragged along 11 points"
    assert len(moves) == 11
    assert moves[0] == "mousemove 0 0"
    assert moves[-1] == "mousemove 120 0"
218+
219+
154220
class TestHudComputerToolExtended:
155221
"""Extended tests for HudComputerTool covering edge cases and platform logic."""
156222

hud/tools/executors/base.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import asyncio
44
import base64
55
import logging
6+
import math
67
from io import BytesIO
8+
from itertools import pairwise
79
from typing import TYPE_CHECKING, Literal, TypeAlias
810

911
from hud.tools.types import ContentResult
@@ -13,6 +15,8 @@
1315

1416
logger = logging.getLogger(__name__)
1517

18+
# Default target spacing, in pixels, between consecutive points generated by
# BaseExecutor._interpolate_drag_path along each drag segment.
DRAG_STEP_PIXELS = 12
19+
1620

1721
class BaseExecutor:
1822
"""
@@ -42,6 +46,31 @@ def __init__(self, display_num: int | None = None) -> None:
4246
self._screenshot_delay = 0.5
4347
logger.info("BaseExecutor initialized")
4448

49+
def _interpolate_drag_path(
    self, path: list[tuple[int, int]], step_pixels: int = DRAG_STEP_PIXELS
) -> list[tuple[int, int]]:
    """Densify a drag path so pointer-delta UIs see many small moves.

    Each segment is divided into equal steps of roughly ``step_pixels``
    pixels (values below 1 are treated as 1). Generated points are rounded
    to integers, and a point equal to the one before it is dropped. A path
    with fewer than two points is returned unchanged.
    """
    if len(path) < 2:
        return path

    spacing = max(step_pixels, 1)
    dense: list[tuple[int, int]] = [path[0]]

    for (x0, y0), (x1, y1) in pairwise(path):
        dx = x1 - x0
        dy = y1 - y0
        # Always take at least one step so each segment's end point is emitted.
        count = max(1, math.ceil(math.hypot(dx, dy) / spacing))

        for k in range(1, count + 1):
            frac = k / count
            candidate = (round(x0 + dx * frac), round(y0 + dy * frac))
            if candidate == dense[-1]:
                # Rounding (or a zero-length segment) produced a repeat.
                continue
            dense.append(candidate)

    return dense
73+
4574
# ===== Core CLA Actions =====
4675

4776
async def click(

hud/tools/executors/pyautogui.py

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -478,33 +478,26 @@ async def drag(
478478
return ContentResult(error="Drag path must have at least 2 points")
479479

480480
try:
481+
drag_path = self._interpolate_drag_path(path)
482+
481483
# Hold keys if specified
482484
self._hold_keys_context(hold_keys)
483485

484486
try:
485487
# Move to start
486-
start_x, start_y = path[0]
488+
start_x, start_y = drag_path[0]
487489
self.pyautogui.moveTo(start_x, start_y)
488490

489-
# Handle multi-point drag
490-
if len(path) == 2:
491-
# Simple drag
492-
end_x, end_y = path[1]
493-
self.pyautogui.dragTo(end_x, end_y, duration=0.5, button="left")
494-
result = ContentResult(
495-
output=f"Dragged from ({start_x}, {start_y}) to ({end_x}, {end_y})"
496-
)
497-
else:
498-
# Multi-point drag
499-
self.pyautogui.mouseDown(button="left")
500-
for i, (x, y) in enumerate(path[1:], 1):
501-
duration = 0.1
502-
if pattern and i - 1 < len(pattern):
503-
duration = pattern[i - 1] / 1000.0 # Convert ms to seconds
504-
self.pyautogui.moveTo(x, y, duration=duration)
505-
self.pyautogui.mouseUp(button="left")
506-
507-
result = ContentResult(output=f"Dragged along {len(path)} points")
491+
# Move through enough points for pointer-delta-sensitive UIs.
492+
self.pyautogui.mouseDown(button="left")
493+
for i, (x, y) in enumerate(drag_path[1:], 1):
494+
duration = 0.01
495+
if pattern and i - 1 < len(pattern):
496+
duration = pattern[i - 1] / 1000.0 # Convert ms to seconds
497+
self.pyautogui.moveTo(x, y, duration=duration)
498+
self.pyautogui.mouseUp(button="left")
499+
500+
result = ContentResult(output=f"Dragged along {len(drag_path)} points")
508501

509502
if hold_keys:
510503
result = ContentResult(output=f"{result.output} while holding {hold_keys}")

hud/tools/executors/tests/test_pyautogui_executor.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,15 +116,22 @@ async def test_drag_with_pyautogui(self):
116116
"""Test drag when pyautogui is available."""
117117
executor = PyAutoGUIExecutor()
118118

119-
with patch("pyautogui.dragTo") as mock_drag:
119+
with (
120+
patch("pyautogui.moveTo") as mock_move,
121+
patch("pyautogui.mouseDown") as mock_down,
122+
patch("pyautogui.mouseUp") as mock_up,
123+
):
120124
# drag expects a path (list of coordinate tuples)
121125
path = [(100, 100), (300, 400)]
122126
result = await executor.drag(path)
123127

124128
assert isinstance(result, ContentResult)
125129
assert result.output and "Dragged" in result.output
126-
# Implementation uses dragTo to move to each point
127-
mock_drag.assert_called()
130+
# Implementation holds the button and moves through interpolated points.
131+
mock_move.assert_any_call(100, 100)
132+
assert mock_move.call_count > len(path)
133+
mock_down.assert_called_once_with(button="left")
134+
mock_up.assert_called_once_with(button="left")
128135

129136
@pytest.mark.asyncio
130137
async def test_wait(self):

hud/tools/executors/xdo.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -425,20 +425,24 @@ async def drag(
425425
if len(path) < 2:
426426
return ContentResult(error="Drag path must have at least 2 points")
427427

428+
drag_path = self._interpolate_drag_path(path)
429+
428430
# Hold keys if specified
429431
await self._hold_keys_context(hold_keys)
430432

431433
try:
432434
# Start drag
433-
start_x, start_y = path[0]
435+
start_x, start_y = drag_path[0]
434436
await self.execute(f"mousemove {start_x} {start_y}", take_screenshot=False)
435437
await self.execute("mousedown 1", take_screenshot=False)
436438

437439
# Move through intermediate points
438-
for i, (x, y) in enumerate(path[1:], 1):
440+
for i, (x, y) in enumerate(drag_path[1:], 1):
439441
# Apply delay if pattern is specified
440442
if pattern and i - 1 < len(pattern):
441443
await asyncio.sleep(pattern[i - 1] / 1000.0) # Convert ms to seconds
444+
else:
445+
await asyncio.sleep(0.008)
442446

443447
await self.execute(f"mousemove {x} {y}", take_screenshot=False)
444448

@@ -449,10 +453,10 @@ async def drag(
449453
if take_screenshot:
450454
screenshot = await self.screenshot()
451455
result = ContentResult(
452-
output=f"Dragged along {len(path)} points", base64_image=screenshot
456+
output=f"Dragged along {len(drag_path)} points", base64_image=screenshot
453457
)
454458
else:
455-
result = ContentResult(output=f"Dragged along {len(path)} points")
459+
result = ContentResult(output=f"Dragged along {len(drag_path)} points")
456460

457461
finally:
458462
# Release held keys

0 commit comments

Comments
 (0)