fix(agent): add double_click support and anti-loop recovery to PlannerGrounderAgent (#148)

abrichr · claude · web-flow · commit 1c9a1bbbbd05 · 2026-03-19T14:54:38.000-04:00
- Add double_click to planner prompt action types with guidance to use
  it for opening applications, files, and desktop icons (fixes Windows
  11 desktop icons requiring double-click)
- Handle double_click in _build_action_from_structured by routing through
  the grounder for coordinates and overriding the returned click type
- Add anti-loop detection that checks the last 3 planner instructions
  for exact string match and injects a WARNING into the planner prompt
  forcing a different strategy (fixes agent repeating the same failed
  action 15+ times)
- Add dialog dismissal awareness to planner prompt (dismiss popups,
  notifications, and dialog boxes before attempting target actions)
- Add 10 new tests covering double_click parsing, anti-loop detection,
  and dialog dismissal prompt content

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/openadapt_evals/agents/planner_grounder_agent.py b/openadapt_evals/agents/planner_grounder_agent.py
@@ -72,21 +72,34 @@
 
 Output a JSON object with exactly these fields:
 {{"decision": "COMMAND" | "DONE" | "FAIL",
-  "action_type": "click" | "type" | "key" | "scroll",
-  "action_value": "<text to type, key to press, or empty for click>",
+  "action_type": "click" | "double_click" | "type" | "key" | "scroll",
+  "action_value": "<text to type, key to press, or empty for click/double_click>",
   "target_description": "<what element to interact with>",
   "reasoning": "<brief explanation>"}}
 
 Rules:
-- action_type must be exactly ONE of: click, type, key, scroll
-- For click: target_description describes WHAT to click. action_value is empty.
+- action_type must be exactly ONE of: click, double_click, type, key, scroll
+- For click: target_description describes WHAT to click. action_value is empty. Use for buttons, menus, links, and UI controls.
+- For double_click: use to open/launch applications, files, or desktop icons. action_value is empty.
 - For type: action_value is the text to type. Append \\n to submit/press Enter after typing.
 - For key: action_value is the key (e.g., "Enter", "Tab", "Ctrl+A").
 - For scroll: action_value is "up" or "down".
 - Output ONE action per response. Never combine multiple actions.
 - Do NOT include pixel coordinates — a grounding model handles that.
-- If your last 3 actions were the same, try a completely different approach.
-"""
+- If there are dialog boxes, notifications, or popups blocking your target, dismiss them first (click X, press Escape, or click 'Not now'/'Later'/'Skip').
+- If your last 3 actions were the same and failed, you MUST try a completely different approach: dismiss any dialogs, try keyboard shortcuts, or interact with different UI elements.
+{anti_loop_warning}"""
+
+# Warning injected when repeated identical actions are detected.
+_ANTI_LOOP_WARNING = (
+    "\nWARNING: Your last {n} actions were identical and failed. "
+    "You MUST try a completely different approach: dismiss any dialogs, "
+    "try keyboard shortcuts, or interact with different UI elements. "
+    "Do NOT repeat the same action again.\n"
+)
+
+# Number of consecutive identical actions that triggers the anti-loop warning.
+_ANTI_LOOP_THRESHOLD = 3
 
 _GROUNDER_SYSTEM = (
     "You are a GUI grounding model. Given a screenshot and a natural-language "
@@ -310,6 +323,18 @@ def act(
                 # Call grounder for click-type actions
                 action = self._call_grounder(observation, instruction)
 
+                # If the planner requested double_click, override the
+                # grounder's returned "click" type to "double_click".
+                if action_type == "double_click" and action.type == "click":
+                    action = BenchmarkAction(
+                        type="double_click",
+                        x=action.x,
+                        y=action.y,
+                        target_node_id=action.target_node_id,
+                        target_bbox=action.target_bbox,
+                        raw_action=action.raw_action,
+                    )
+
         # Record for history.
         action_str = action_to_string(action)
         self._action_history.append(f"{action_str} (instruction: {instruction})")
@@ -323,18 +348,32 @@ def act(
 
     @staticmethod
     def _parse_non_click_action(instruction: str) -> BenchmarkAction | None:
-        """Detect type/key/scroll actions from planner instruction.
+        """Detect type/key/scroll/double-click actions from planner instruction.
 
         The grounder only returns click coordinates. For type/key/scroll
         actions, we parse the action directly from the planner's
         instruction text.
 
-        Returns None if the instruction is a click action (needs grounder).
+        Returns None if the instruction is a plain click action (needs grounder).
+        Also returns None for explicit "double-click" instructions, which
+        likewise need the grounder to supply coordinates.
         """
         import re
 
         lower = instruction.lower()
 
+        # Detect explicit "double-click" or "double click" instructions.
+        # These need the grounder for coordinates, so we only log the
+        # match and return None (same as a plain click). No marker
+        # action is returned; the caller routes through the grounder.
+        if re.search(r"double[\s-]?click", lower):
+            logger.info(
+                "Planner instruction parsed as DOUBLE_CLICK: %r", instruction,
+            )
+            # Return None so the grounder is called. None carries no
+            # double_click signal, so the caller must set the click type.
+            return None
+
         # Detect "type 'text'" or "type "text"" patterns
         type_match = re.search(
             r"(?:type|enter|input|write)\s+['\"]([^'\"]+)['\"]",
@@ -425,6 +464,12 @@ def _build_action_from_structured(
             # using target_description as the instruction.
             return None
 
+        if action_type == "double_click":
+            # Double-click also needs grounding for coordinates — return None
+            # so the caller invokes the grounder. Nothing is stored here; the
+            # caller overrides the grounder's "click" type to "double_click".
+            return None
+
         logger.warning("Unknown structured action_type: %r", action_type)
         return None
 
@@ -463,6 +508,51 @@ def reset(self) -> None:
 
     # -- Private helpers ---------------------------------------------------
 
+    def _check_action_loop(self) -> str:
+        """Detect repeated identical planner instructions and return a warning.
+
+        Compares the last ``_ANTI_LOOP_THRESHOLD`` entries in the action
+        history. If they all share the same instruction text (extracted
+        from the ``(instruction: ...)`` suffix appended by ``act()``),
+        returns an anti-loop warning string to inject into the planner
+        prompt. Otherwise returns an empty string.
+
+        The comparison uses exact string matching on the instruction
+        portion of the history entry (the text after ``(instruction: ``
+        and before the closing ``)``).
+        """
+        threshold = _ANTI_LOOP_THRESHOLD
+        if len(self._action_history) < threshold:
+            return ""
+
+        recent = self._action_history[-threshold:]
+
+        # Extract instruction text from history entries.
+        import re
+
+        instructions: list[str] = []
+        for entry in recent:
+            m = re.search(r"\(instruction:\s*(.+)\)\s*$", entry)
+            if m:
+                instructions.append(m.group(1).strip())
+            else:
+                # Entry without instruction suffix (e.g. DONE, queued).
+                return ""
+
+        if len(instructions) < threshold:
+            return ""
+
+        # Check if all instructions are identical.
+        if len(set(instructions)) == 1:
+            logger.warning(
+                "Anti-loop: last %d instructions are identical: %r",
+                threshold,
+                instructions[0],
+            )
+            return _ANTI_LOOP_WARNING.format(n=threshold)
+
+        return ""
+
     def _call_planner(
         self,
         observation: BenchmarkObservation,
@@ -502,10 +592,14 @@ def _call_planner(
             for i, a in enumerate(self._action_history[-self._max_history :])
         )
 
+        # Check for repeated identical actions and inject anti-loop warning.
+        anti_loop_warning = self._check_action_loop()
+
         prompt = _PLANNER_PROMPT.format(
             task_instruction=task.instruction,
             action_history=history_text,
             a11y_tree=a11y_text,
+            anti_loop_warning=anti_loop_warning,
         )
 
         images = [observation.screenshot] if observation.screenshot else None
diff --git a/tests/test_planner_grounder_agent.py b/tests/test_planner_grounder_agent.py
@@ -13,6 +13,8 @@
 )
 from openadapt_evals.agents.planner_grounder_agent import (
     PlannerGrounderAgent,
+    _ANTI_LOOP_THRESHOLD,
+    _PLANNER_PROMPT,
     _action_to_planner_output,
 )
 
@@ -905,3 +907,187 @@ def test_continuation_queues_followup(
         action2 = agent.act(observation, task)
         assert action2.type == "key"
         assert action2.key == "enter"
+
+
+# -- Tests: double_click action type ------------------------------------------
+
+
+class TestDoubleClick:
+    """Tests for double_click action type parsing and handling."""
+
+    @patch("openadapt_evals.vlm.vlm_call")
+    @patch("openadapt_evals.vlm.extract_json")
+    def test_structured_double_click_calls_grounder_and_overrides_type(
+        self, mock_extract, mock_vlm, observation, task
+    ):
+        """Structured double_click calls grounder and sets action type to double_click."""
+        mock_vlm.return_value = "{}"
+        mock_extract.return_value = {
+            "decision": "COMMAND",
+            "action_type": "double_click",
+            "action_value": "",
+            "target_description": "the Notepad icon on desktop",
+            "reasoning": "Need to open Notepad",
+        }
+
+        grounder = MockGrounderAgent(x=0.3, y=0.7)
+        agent = PlannerGrounderAgent(
+            planner="claude-sonnet-4-20250514",
+            grounder=grounder,
+            planner_provider="anthropic",
+        )
+        action = agent.act(observation, task)
+
+        assert action.type == "double_click"
+        assert action.x == 0.3
+        assert action.y == 0.7
+
+    @patch("openadapt_evals.vlm.vlm_call")
+    @patch("openadapt_evals.vlm.extract_json")
+    def test_double_click_in_planner_prompt(
+        self, mock_extract, mock_vlm, observation, task
+    ):
+        """Planner prompt includes double_click as a valid action type."""
+        mock_extract.return_value = {
+            "decision": "DONE",
+            "instruction": "",
+            "reasoning": "",
+        }
+        mock_vlm.return_value = "{}"
+
+        agent = PlannerGrounderAgent(
+            planner="claude-sonnet-4-20250514",
+            grounder=MockGrounderAgent(),
+            planner_provider="anthropic",
+        )
+        agent.act(observation, task)
+
+        # Check that the prompt sent to vlm_call mentions double_click.
+        call_args = mock_vlm.call_args_list[0]
+        prompt = call_args.args[0] if call_args.args else call_args.kwargs.get("prompt", "")
+        assert "double_click" in prompt
+
+    def test_double_click_in_prompt_template(self):
+        """The planner prompt template includes double_click as a valid action type."""
+        assert "double_click" in _PLANNER_PROMPT
+        assert "open/launch applications" in _PLANNER_PROMPT
+
+    def test_parse_non_click_returns_none_for_double_click_instruction(self):
+        """_parse_non_click_action returns None for 'double-click' text (needs grounder)."""
+        result = PlannerGrounderAgent._parse_non_click_action(
+            "Double-click the Notepad icon"
+        )
+        # Should return None so the grounder is called for coordinates.
+        assert result is None
+
+
+# -- Tests: Anti-loop detection ------------------------------------------------
+
+
+class TestAntiLoopDetection:
+    """Tests for the anti-loop detection that triggers after repeated identical actions."""
+
+    def test_no_warning_with_few_actions(self, observation, task):
+        """No anti-loop warning when fewer than threshold actions recorded."""
+        planner = MockPlannerAgent()
+        grounder = MockGrounderAgent()
+        agent = PlannerGrounderAgent(planner=planner, grounder=grounder)
+
+        # Only 2 actions — below the threshold of 3.
+        agent._action_history = [
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+        ]
+        warning = agent._check_action_loop()
+        assert warning == ""
+
+    def test_warning_after_threshold_identical_actions(self, observation, task):
+        """Anti-loop warning triggers after 3 identical instructions."""
+        planner = MockPlannerAgent()
+        grounder = MockGrounderAgent()
+        agent = PlannerGrounderAgent(planner=planner, grounder=grounder)
+
+        # Simulate 3 identical instruction entries.
+        agent._action_history = [
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+        ]
+        warning = agent._check_action_loop()
+        assert "WARNING" in warning
+        assert "completely different approach" in warning
+
+    def test_no_warning_with_varied_actions(self, observation, task):
+        """No anti-loop warning when actions differ."""
+        planner = MockPlannerAgent()
+        grounder = MockGrounderAgent()
+        agent = PlannerGrounderAgent(planner=planner, grounder=grounder)
+
+        agent._action_history = [
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+            "TYPE('hello') (instruction: Type hello)",
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+        ]
+        warning = agent._check_action_loop()
+        assert warning == ""
+
+    def test_no_warning_with_queued_entries(self, observation, task):
+        """No anti-loop warning when entries include queued actions (no instruction suffix)."""
+        planner = MockPlannerAgent()
+        grounder = MockGrounderAgent()
+        agent = PlannerGrounderAgent(planner=planner, grounder=grounder)
+
+        agent._action_history = [
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+            "KEY(enter) (queued)",
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+        ]
+        warning = agent._check_action_loop()
+        assert warning == ""
+
+    @patch("openadapt_evals.vlm.vlm_call")
+    @patch("openadapt_evals.vlm.extract_json")
+    def test_anti_loop_warning_injected_in_planner_prompt(
+        self, mock_extract, mock_vlm, observation, task
+    ):
+        """When anti-loop triggers, the warning is injected into the planner prompt."""
+        mock_extract.return_value = {
+            "decision": "DONE",
+            "instruction": "",
+            "reasoning": "",
+        }
+        mock_vlm.return_value = "{}"
+
+        agent = PlannerGrounderAgent(
+            planner="claude-sonnet-4-20250514",
+            grounder=MockGrounderAgent(),
+            planner_provider="anthropic",
+        )
+
+        # Seed history with 3 identical instructions.
+        agent._action_history = [
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+            "CLICK(0.5, 0.5) (instruction: Click Settings)",
+        ]
+
+        agent.act(observation, task)
+
+        # Check the prompt sent to vlm_call includes the warning.
+        call_args = mock_vlm.call_args_list[0]
+        prompt = call_args.args[0] if call_args.args else call_args.kwargs.get("prompt", "")
+        assert "WARNING" in prompt
+        assert "completely different approach" in prompt
+
+
+# -- Tests: Dialog dismissal in planner prompt ----------------------------------
+
+
+class TestDialogDismissalPrompt:
+    """Tests for dialog dismissal guidance in the planner prompt."""
+
+    def test_dialog_dismissal_in_prompt_template(self):
+        """Planner prompt includes guidance to dismiss blocking dialogs."""
+        assert "dialog boxes" in _PLANNER_PROMPT
+        assert "dismiss them first" in _PLANNER_PROMPT
+        assert "Escape" in _PLANNER_PROMPT