|
13 | 13 | ) |
14 | 14 | from openadapt_evals.agents.planner_grounder_agent import ( |
15 | 15 | PlannerGrounderAgent, |
| 16 | + _ANTI_LOOP_THRESHOLD, |
| 17 | + _PLANNER_PROMPT, |
16 | 18 | _action_to_planner_output, |
17 | 19 | ) |
18 | 20 |
|
@@ -905,3 +907,187 @@ def test_continuation_queues_followup( |
905 | 907 | action2 = agent.act(observation, task) |
906 | 908 | assert action2.type == "key" |
907 | 909 | assert action2.key == "enter" |
| 910 | + |
| 911 | + |
| 912 | +# -- Tests: double_click action type ------------------------------------------ |
| 913 | + |
| 914 | + |
class TestDoubleClick:
    """Exercise how the ``double_click`` action type is prompted for and emitted."""

    @staticmethod
    def _first_prompt(vlm_mock):
        """Return the prompt of the first recorded call on ``vlm_mock``.

        The prompt is taken from the first positional argument when any
        positional arguments were passed; otherwise from the ``prompt``
        keyword, falling back to an empty string.
        """
        first_call = vlm_mock.call_args_list[0]
        if first_call.args:
            return first_call.args[0]
        return first_call.kwargs.get("prompt", "")

    @patch("openadapt_evals.vlm.vlm_call")
    @patch("openadapt_evals.vlm.extract_json")
    def test_structured_double_click_calls_grounder_and_overrides_type(
        self, mock_extract, mock_vlm, observation, task
    ):
        """A structured ``double_click`` command yields a double_click action.

        The resulting action must carry the coordinates the mock grounder
        was configured with.
        """
        planner_reply = {
            "decision": "COMMAND",
            "action_type": "double_click",
            "action_value": "",
            "target_description": "the Notepad icon on desktop",
            "reasoning": "Need to open Notepad",
        }
        mock_extract.return_value = planner_reply
        # The raw VLM text is irrelevant here: extract_json is mocked too.
        mock_vlm.return_value = "{}"

        agent = PlannerGrounderAgent(
            planner="claude-sonnet-4-20250514",
            grounder=MockGrounderAgent(x=0.3, y=0.7),
            planner_provider="anthropic",
        )
        result = agent.act(observation, task)

        assert result.type == "double_click"
        assert result.x == 0.3
        assert result.y == 0.7

    @patch("openadapt_evals.vlm.vlm_call")
    @patch("openadapt_evals.vlm.extract_json")
    def test_double_click_in_planner_prompt(
        self, mock_extract, mock_vlm, observation, task
    ):
        """The prompt actually passed to ``vlm_call`` mentions double_click."""
        mock_vlm.return_value = "{}"
        mock_extract.return_value = {
            "decision": "DONE",
            "instruction": "",
            "reasoning": "",
        }

        agent = PlannerGrounderAgent(
            planner="claude-sonnet-4-20250514",
            grounder=MockGrounderAgent(),
            planner_provider="anthropic",
        )
        agent.act(observation, task)

        # Inspect the first vlm_call invocation made during act().
        sent_prompt = self._first_prompt(mock_vlm)
        assert "double_click" in sent_prompt

    def test_double_click_in_prompt_template(self):
        """The static planner prompt template advertises double_click."""
        for required_text in ("double_click", "open/launch applications"):
            assert required_text in _PLANNER_PROMPT

    def test_parse_non_click_returns_none_for_double_click_instruction(self):
        """``_parse_non_click_action`` yields None for a 'Double-click ...' instruction."""
        parsed = PlannerGrounderAgent._parse_non_click_action(
            "Double-click the Notepad icon"
        )
        # None means the instruction is not handled as a non-click action,
        # so the grounder is expected to supply coordinates.
        assert parsed is None
| 982 | + |
| 983 | + |
| 984 | +# -- Tests: Anti-loop detection ------------------------------------------------ |
| 985 | + |
| 986 | + |
class TestAntiLoopDetection:
    """Tests for the anti-loop check that fires after repeated identical actions.

    Histories that are meant to sit just below or exactly at the trigger
    point are sized from ``_ANTI_LOOP_THRESHOLD`` rather than a hard-coded
    count, so the tests keep tracking the agent if the constant changes.
    NOTE(review): assumes ``_ANTI_LOOP_THRESHOLD`` is an int >= 2 -- confirm.
    """

    # History entry reused wherever a test needs "the same action again".
    _REPEATED_ENTRY = "CLICK(0.5, 0.5) (instruction: Click Settings)"

    @staticmethod
    def _mock_backed_agent():
        """Build an agent wired to the mock planner and mock grounder."""
        return PlannerGrounderAgent(
            planner=MockPlannerAgent(),
            grounder=MockGrounderAgent(),
        )

    def test_no_warning_with_few_actions(self, observation, task):
        """No anti-loop warning when fewer than threshold actions are recorded."""
        agent = self._mock_backed_agent()

        # One entry short of the threshold.
        agent._action_history = [self._REPEATED_ENTRY] * (_ANTI_LOOP_THRESHOLD - 1)

        assert agent._check_action_loop() == ""

    def test_warning_after_threshold_identical_actions(self, observation, task):
        """Anti-loop warning triggers once the threshold of identical entries is hit."""
        agent = self._mock_backed_agent()

        # Exactly threshold-many identical instruction entries.
        agent._action_history = [self._REPEATED_ENTRY] * _ANTI_LOOP_THRESHOLD

        warning = agent._check_action_loop()
        assert "WARNING" in warning
        assert "completely different approach" in warning

    def test_no_warning_with_varied_actions(self, observation, task):
        """No anti-loop warning when the recorded actions differ."""
        agent = self._mock_backed_agent()

        agent._action_history = [
            self._REPEATED_ENTRY,
            "TYPE('hello') (instruction: Type hello)",
            self._REPEATED_ENTRY,
        ]

        assert agent._check_action_loop() == ""

    def test_no_warning_with_queued_entries(self, observation, task):
        """No anti-loop warning when a queued entry (no instruction suffix) intervenes."""
        agent = self._mock_backed_agent()

        agent._action_history = [
            self._REPEATED_ENTRY,
            "KEY(enter) (queued)",
            self._REPEATED_ENTRY,
        ]

        assert agent._check_action_loop() == ""

    @patch("openadapt_evals.vlm.vlm_call")
    @patch("openadapt_evals.vlm.extract_json")
    def test_anti_loop_warning_injected_in_planner_prompt(
        self, mock_extract, mock_vlm, observation, task
    ):
        """When anti-loop triggers, the warning appears in the prompt sent to vlm_call."""
        mock_extract.return_value = {
            "decision": "DONE",
            "instruction": "",
            "reasoning": "",
        }
        mock_vlm.return_value = "{}"

        agent = PlannerGrounderAgent(
            planner="claude-sonnet-4-20250514",
            grounder=MockGrounderAgent(),
            planner_provider="anthropic",
        )

        # Seed the history with threshold-many identical instructions.
        agent._action_history = [self._REPEATED_ENTRY] * _ANTI_LOOP_THRESHOLD

        agent.act(observation, task)

        # The prompt may have been passed positionally or as ``prompt=``.
        first_call = mock_vlm.call_args_list[0]
        if first_call.args:
            prompt = first_call.args[0]
        else:
            prompt = first_call.kwargs.get("prompt", "")
        assert "WARNING" in prompt
        assert "completely different approach" in prompt
| 1081 | + |
| 1082 | + |
| 1083 | +# -- Tests: Dialog dismissal in planner prompt ---------------------------------- |
| 1084 | + |
| 1085 | + |
class TestDialogDismissalPrompt:
    """Check that the planner prompt carries dialog-dismissal guidance."""

    def test_dialog_dismissal_in_prompt_template(self):
        """The prompt template contains each dialog-dismissal phrase."""
        # Checked in order; the first missing phrase fails the test.
        expected_phrases = ("dialog boxes", "dismiss them first", "Escape")
        for phrase in expected_phrases:
            assert phrase in _PLANNER_PROMPT
| 1093 | + assert "Escape" in _PLANNER_PROMPT |
0 commit comments