Skip to content

Commit 1c9a1bb

Browse files
abrichr and claude authored
fix(agent): add double_click support and anti-loop recovery to PlannerGrounderAgent (#148)
- Add double_click to planner prompt action types, with guidance to use it for opening applications, files, and desktop icons (fixes Windows 11 desktop icons requiring double-click).
- Handle double_click in _build_action_from_structured by routing through the grounder for coordinates and overriding the returned click type.
- Add anti-loop detection that checks the last 3 planner instructions for an exact string match and injects a WARNING into the planner prompt, forcing a different strategy (fixes the agent repeating the same failed action 15+ times).
- Add dialog dismissal awareness to the planner prompt (dismiss popups, notifications, and dialog boxes before attempting target actions).
- Add 10 new tests covering double_click parsing, anti-loop detection, and dialog dismissal prompt content.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 76501cc commit 1c9a1bb

2 files changed

Lines changed: 288 additions & 8 deletions

File tree

openadapt_evals/agents/planner_grounder_agent.py

Lines changed: 102 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,21 +72,34 @@
7272
7373
Output a JSON object with exactly these fields:
7474
{{"decision": "COMMAND" | "DONE" | "FAIL",
75-
"action_type": "click" | "type" | "key" | "scroll",
76-
"action_value": "<text to type, key to press, or empty for click>",
75+
"action_type": "click" | "double_click" | "type" | "key" | "scroll",
76+
"action_value": "<text to type, key to press, or empty for click/double_click>",
7777
"target_description": "<what element to interact with>",
7878
"reasoning": "<brief explanation>"}}
7979
8080
Rules:
81-
- action_type must be exactly ONE of: click, type, key, scroll
82-
- For click: target_description describes WHAT to click. action_value is empty.
81+
- action_type must be exactly ONE of: click, double_click, type, key, scroll
82+
- For click: target_description describes WHAT to click. action_value is empty. Use for buttons, menus, links, and UI controls.
83+
- For double_click: use to open/launch applications, files, or desktop icons. action_value is empty.
8384
- For type: action_value is the text to type. Append \\n to submit/press Enter after typing.
8485
- For key: action_value is the key (e.g., "Enter", "Tab", "Ctrl+A").
8586
- For scroll: action_value is "up" or "down".
8687
- Output ONE action per response. Never combine multiple actions.
8788
- Do NOT include pixel coordinates — a grounding model handles that.
88-
- If your last 3 actions were the same, try a completely different approach.
89-
"""
89+
- If there are dialog boxes, notifications, or popups blocking your target, dismiss them first (click X, press Escape, or click 'Not now'/'Later'/'Skip').
90+
- If your last 3 actions were the same and failed, you MUST try a completely different approach: dismiss any dialogs, try keyboard shortcuts, or interact with different UI elements.
91+
{anti_loop_warning}"""
92+
93+
# How many consecutive identical planner instructions trigger the warning.
_ANTI_LOOP_THRESHOLD = 3

# Template for the warning that is substituted into the planner prompt's
# ``{anti_loop_warning}`` placeholder once a loop is detected. ``{n}`` is
# filled in with the number of repeated actions via ``str.format``.
_ANTI_LOOP_WARNING = "".join(
    (
        "\nWARNING: Your last {n} actions were identical and failed. ",
        "You MUST try a completely different approach: dismiss any dialogs, ",
        "try keyboard shortcuts, or interact with different UI elements. ",
        "Do NOT repeat the same action again.\n",
    )
)
90103

91104
_GROUNDER_SYSTEM = (
92105
"You are a GUI grounding model. Given a screenshot and a natural-language "
@@ -310,6 +323,18 @@ def act(
310323
# Call grounder for click-type actions
311324
action = self._call_grounder(observation, instruction)
312325

326+
# If the planner requested double_click, override the
327+
# grounder's returned "click" type to "double_click".
328+
if action_type == "double_click" and action.type == "click":
329+
action = BenchmarkAction(
330+
type="double_click",
331+
x=action.x,
332+
y=action.y,
333+
target_node_id=action.target_node_id,
334+
target_bbox=action.target_bbox,
335+
raw_action=action.raw_action,
336+
)
337+
313338
# Record for history.
314339
action_str = action_to_string(action)
315340
self._action_history.append(f"{action_str} (instruction: {instruction})")
@@ -323,18 +348,32 @@ def act(
323348

324349
@staticmethod
325350
def _parse_non_click_action(instruction: str) -> BenchmarkAction | None:
326-
"""Detect type/key/scroll actions from planner instruction.
351+
"""Detect type/key/scroll/double-click actions from planner instruction.
327352
328353
The grounder only returns click coordinates. For type/key/scroll
329354
actions, we parse the action directly from the planner's
330355
instruction text.
331356
332-
Returns None if the instruction is a click action (needs grounder).
357+
Returns None if the instruction is a plain click action (needs grounder).
358+
Also returns None when the instruction mentions "double-click" or
359+
"double click": coordinates still have to come from the grounder.
333360
"""
334361
import re
335362

336363
lower = instruction.lower()
337364

365+
# Detect explicit "double-click" or "double click" instructions.
366+
# These need the grounder for coordinates, so no action is built
367+
# here: the function returns None and the caller is expected to
368+
# route the instruction through the grounder.
369+
if re.search(r"double[\s-]?click", lower):
370+
logger.info(
371+
"Planner instruction parsed as DOUBLE_CLICK: %r", instruction,
372+
)
373+
# Return None so the grounder is called, but signal double_click
374+
# via a special return. The caller checks instruction text too.
375+
return None
376+
338377
# Detect "type 'text'" or "type "text"" patterns
339378
type_match = re.search(
340379
r"(?:type|enter|input|write)\s+['\"]([^'\"]+)['\"]",
@@ -425,6 +464,12 @@ def _build_action_from_structured(
425464
# using target_description as the instruction.
426465
return None
427466

467+
if action_type == "double_click":
468+
# Double-click also needs grounding for coordinates — return None
469+
# so the caller invokes the grounder, but store the action type
470+
# so we can set it on the returned action.
471+
return None
472+
428473
logger.warning("Unknown structured action_type: %r", action_type)
429474
return None
430475

@@ -463,6 +508,51 @@ def reset(self) -> None:
463508

464509
# -- Private helpers ---------------------------------------------------
465510

511+
def _check_action_loop(self) -> str:
512+
"""Detect repeated identical planner instructions and return a warning.
513+
514+
Compares the last ``_ANTI_LOOP_THRESHOLD`` entries in the action
515+
history. If they all share the same instruction text (extracted
516+
from the ``(instruction: ...)`` suffix appended by ``act()``),
517+
returns an anti-loop warning string to inject into the planner
518+
prompt. Otherwise returns an empty string.
519+
520+
The comparison uses exact string matching on the instruction
521+
portion of the history entry (the text after ``(instruction: ``
522+
and before the closing ``)``).
523+
"""
524+
threshold = _ANTI_LOOP_THRESHOLD
525+
if len(self._action_history) < threshold:
526+
return ""
527+
528+
recent = self._action_history[-threshold:]
529+
530+
# Extract instruction text from history entries.
531+
import re
532+
533+
instructions: list[str] = []
534+
for entry in recent:
535+
m = re.search(r"\(instruction:\s*(.+)\)\s*$", entry)
536+
if m:
537+
instructions.append(m.group(1).strip())
538+
else:
539+
# Entry without instruction suffix (e.g. DONE, queued).
540+
return ""
541+
542+
if len(instructions) < threshold:
543+
return ""
544+
545+
# Check if all instructions are identical.
546+
if len(set(instructions)) == 1:
547+
logger.warning(
548+
"Anti-loop: last %d instructions are identical: %r",
549+
threshold,
550+
instructions[0],
551+
)
552+
return _ANTI_LOOP_WARNING.format(n=threshold)
553+
554+
return ""
555+
466556
def _call_planner(
467557
self,
468558
observation: BenchmarkObservation,
@@ -502,10 +592,14 @@ def _call_planner(
502592
for i, a in enumerate(self._action_history[-self._max_history :])
503593
)
504594

595+
# Check for repeated identical actions and inject anti-loop warning.
596+
anti_loop_warning = self._check_action_loop()
597+
505598
prompt = _PLANNER_PROMPT.format(
506599
task_instruction=task.instruction,
507600
action_history=history_text,
508601
a11y_tree=a11y_text,
602+
anti_loop_warning=anti_loop_warning,
509603
)
510604

511605
images = [observation.screenshot] if observation.screenshot else None

tests/test_planner_grounder_agent.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
)
1414
from openadapt_evals.agents.planner_grounder_agent import (
1515
PlannerGrounderAgent,
16+
_ANTI_LOOP_THRESHOLD,
17+
_PLANNER_PROMPT,
1618
_action_to_planner_output,
1719
)
1820

@@ -905,3 +907,187 @@ def test_continuation_queues_followup(
905907
action2 = agent.act(observation, task)
906908
assert action2.type == "key"
907909
assert action2.key == "enter"
910+
911+
912+
# -- Tests: double_click action type ------------------------------------------
913+
914+
915+
class TestDoubleClick:
    """Checks that double_click is advertised to the planner and grounded."""

    @patch("openadapt_evals.vlm.vlm_call")
    @patch("openadapt_evals.vlm.extract_json")
    def test_structured_double_click_calls_grounder_and_overrides_type(
        self, mock_extract, mock_vlm, observation, task
    ):
        """A structured double_click is grounded and keeps its action type."""
        planner_reply = {
            "decision": "COMMAND",
            "action_type": "double_click",
            "action_value": "",
            "target_description": "the Notepad icon on desktop",
            "reasoning": "Need to open Notepad",
        }
        mock_extract.return_value = planner_reply
        mock_vlm.return_value = "{}"

        agent = PlannerGrounderAgent(
            planner="claude-sonnet-4-20250514",
            grounder=MockGrounderAgent(x=0.3, y=0.7),
            planner_provider="anthropic",
        )
        result = agent.act(observation, task)

        # Coordinates come from the grounder; the type from the planner.
        assert result.type == "double_click"
        assert result.x == 0.3
        assert result.y == 0.7

    @patch("openadapt_evals.vlm.vlm_call")
    @patch("openadapt_evals.vlm.extract_json")
    def test_double_click_in_planner_prompt(
        self, mock_extract, mock_vlm, observation, task
    ):
        """The prompt actually sent to the planner mentions double_click."""
        mock_vlm.return_value = "{}"
        mock_extract.return_value = {
            "decision": "DONE",
            "instruction": "",
            "reasoning": "",
        }

        agent = PlannerGrounderAgent(
            planner="claude-sonnet-4-20250514",
            grounder=MockGrounderAgent(),
            planner_provider="anthropic",
        )
        agent.act(observation, task)

        # The prompt may be passed positionally or as the ``prompt`` keyword.
        first_call = mock_vlm.call_args_list[0]
        if first_call.args:
            sent_prompt = first_call.args[0]
        else:
            sent_prompt = first_call.kwargs.get("prompt", "")
        assert "double_click" in sent_prompt

    def test_double_click_in_prompt_template(self):
        """The raw prompt template lists double_click and when to use it."""
        for expected in ("double_click", "open/launch applications"):
            assert expected in _PLANNER_PROMPT

    def test_parse_non_click_returns_none_for_double_click_instruction(self):
        """'double-click' text yields None so the grounder supplies coordinates."""
        parsed = PlannerGrounderAgent._parse_non_click_action(
            "Double-click the Notepad icon"
        )
        assert parsed is None
982+
983+
984+
# -- Tests: Anti-loop detection ------------------------------------------------
985+
986+
987+
class TestAntiLoopDetection:
    """Tests for the anti-loop detection that triggers after repeated identical actions.

    Histories are sized from ``_ANTI_LOOP_THRESHOLD`` instead of a
    hard-coded three entries, so the tests keep exercising the boundary
    if the threshold is ever tuned.
    """

    # History entry in the format ``act()`` records for a planner command.
    _REPEATED = "CLICK(0.5, 0.5) (instruction: Click Settings)"

    @staticmethod
    def _agent_with_history(history):
        """Build an agent with mock planner/grounder and a seeded history."""
        agent = PlannerGrounderAgent(
            planner=MockPlannerAgent(),
            grounder=MockGrounderAgent(),
        )
        agent._action_history = list(history)
        return agent

    @classmethod
    def _history_with_interruption(cls, other_entry):
        """Return a threshold-length history with one differing middle entry."""
        history = [cls._REPEATED] * _ANTI_LOOP_THRESHOLD
        history[_ANTI_LOOP_THRESHOLD // 2] = other_entry
        return history

    def test_no_warning_with_few_actions(self):
        """No anti-loop warning when fewer than threshold actions recorded."""
        agent = self._agent_with_history(
            [self._REPEATED] * (_ANTI_LOOP_THRESHOLD - 1)
        )
        assert agent._check_action_loop() == ""

    def test_warning_after_threshold_identical_actions(self):
        """Anti-loop warning triggers after threshold identical instructions."""
        agent = self._agent_with_history(
            [self._REPEATED] * _ANTI_LOOP_THRESHOLD
        )
        warning = agent._check_action_loop()
        assert "WARNING" in warning
        assert "completely different approach" in warning

    def test_no_warning_with_varied_actions(self):
        """No anti-loop warning when actions differ."""
        agent = self._agent_with_history(
            self._history_with_interruption(
                "TYPE('hello') (instruction: Type hello)"
            )
        )
        assert agent._check_action_loop() == ""

    def test_no_warning_with_queued_entries(self):
        """No anti-loop warning when entries include queued actions (no instruction suffix)."""
        agent = self._agent_with_history(
            self._history_with_interruption("KEY(enter) (queued)")
        )
        assert agent._check_action_loop() == ""

    @patch("openadapt_evals.vlm.vlm_call")
    @patch("openadapt_evals.vlm.extract_json")
    def test_anti_loop_warning_injected_in_planner_prompt(
        self, mock_extract, mock_vlm, observation, task
    ):
        """When anti-loop triggers, the warning is injected into the planner prompt."""
        mock_extract.return_value = {
            "decision": "DONE",
            "instruction": "",
            "reasoning": "",
        }
        mock_vlm.return_value = "{}"

        agent = PlannerGrounderAgent(
            planner="claude-sonnet-4-20250514",
            grounder=MockGrounderAgent(),
            planner_provider="anthropic",
        )

        # Seed history with exactly threshold identical instructions.
        agent._action_history = [self._REPEATED] * _ANTI_LOOP_THRESHOLD

        agent.act(observation, task)

        # The prompt may be passed positionally or as the ``prompt`` keyword.
        call_args = mock_vlm.call_args_list[0]
        prompt = call_args.args[0] if call_args.args else call_args.kwargs.get("prompt", "")
        assert "WARNING" in prompt
        assert "completely different approach" in prompt
1081+
1082+
1083+
# -- Tests: Dialog dismissal in planner prompt ----------------------------------
1084+
1085+
1086+
class TestDialogDismissalPrompt:
    """Verifies the planner prompt tells the model to clear blocking dialogs."""

    def test_dialog_dismissal_in_prompt_template(self):
        """The template names dialogs, the dismiss-first rule, and Escape."""
        required_phrases = ("dialog boxes", "dismiss them first", "Escape")
        for phrase in required_phrases:
            assert phrase in _PLANNER_PROMPT

0 commit comments

Comments
 (0)