3434
3535from __future__ import annotations
3636
37+ import json
3738import re
3839from typing import Any , Dict , List , Optional , Tuple
3940
5758 "exit status" ,
5859 "is not recognized" ,
5960 "could not be found" ,
61+ "no results" ,
62+ "no results found" ,
6063)
6164
6265_EXIT_CODE_RE = re .compile (r"exit code[:\s]+([1-9]\d*)" , re .IGNORECASE )
6972_NON_RETRYABLE = frozenset ({"timeout" , "permission" , "missing_command" , "limit" })
7073_NONRETRY_THRESHOLD = 2
7174
75+ # Idempotent tools that are especially prone to content-free repetition and that
76+ # the issue evidence shows spiraling with no progress even when individual calls
77+ # return "success". Count these as non-progress after a shorter run so the model
78+ # is nudged toward a different query / tool / strategy.
79+ _SHORT_CIRCUIT_IDEMPOTENT = frozenset ({"search_files" , "web_search" , "web_extract" })
80+ _SHORT_CIRCUIT_REPEAT_THRESHOLD = 4
81+
7282# Mutating tools get LOWER thresholds than idempotent tools because a fixation
7383# on mutating operations (writing files, running commands) is more costly and
7484# indicates a deeper strategy problem (#432).
75- _IDEMPOTENT_TOOLS = frozenset (
76- {
77- "read_file" ,
78- "search_files" ,
79- "web_search" ,
80- "web_extract" ,
81- "session_search" ,
82- "browser_snapshot" ,
83- "browser_console" ,
84- "browser_get_images" ,
85- "mcp_filesystem_read_file" ,
86- "mcp_filesystem_read_text_file" ,
87- "mcp_filesystem_read_multiple_files" ,
88- "mcp_filesystem_list_directory" ,
89- "mcp_filesystem_list_directory_with_sizes" ,
90- "mcp_filesystem_directory_tree" ,
91- "mcp_filesystem_get_file_info" ,
92- "mcp_filesystem_search_files" ,
93- }
94- )
95- _MUTATING_TOOLS = frozenset (
96- {
97- "terminal" ,
98- "execute_code" ,
99- "write_file" ,
100- "patch" ,
101- "todo" ,
102- "memory" ,
103- "skill_manage" ,
104- "browser_click" ,
105- "browser_type" ,
106- "browser_press" ,
107- "browser_scroll" ,
108- "browser_navigate" ,
109- "send_message" ,
110- "cronjob" ,
111- "delegate_task" ,
112- "process" ,
113- }
114- )
85+ _IDEMPOTENT_TOOLS = frozenset ({
86+ "read_file" ,
87+ "search_files" ,
88+ "web_search" ,
89+ "web_extract" ,
90+ "session_search" ,
91+ "browser_snapshot" ,
92+ "browser_console" ,
93+ "browser_get_images" ,
94+ "mcp_filesystem_read_file" ,
95+ "mcp_filesystem_read_text_file" ,
96+ "mcp_filesystem_read_multiple_files" ,
97+ "mcp_filesystem_list_directory" ,
98+ "mcp_filesystem_list_directory_with_sizes" ,
99+ "mcp_filesystem_directory_tree" ,
100+ "mcp_filesystem_get_file_info" ,
101+ "mcp_filesystem_search_files" ,
102+ })
103+ _MUTATING_TOOLS = frozenset ({
104+ "terminal" ,
105+ "execute_code" ,
106+ "write_file" ,
107+ "patch" ,
108+ "todo" ,
109+ "memory" ,
110+ "skill_manage" ,
111+ "browser_click" ,
112+ "browser_type" ,
113+ "browser_press" ,
114+ "browser_scroll" ,
115+ "browser_navigate" ,
116+ "send_message" ,
117+ "cronjob" ,
118+ "delegate_task" ,
119+ "process" ,
120+ })
115121# Default thresholds: lower for mutating tools, higher for idempotent (#432).
116122# Mutating: repeat at 4, fail at 2, escalate at 8
117123# Idempotent: repeat at 8, fail at 4, escalate at 15
@@ -143,45 +149,107 @@ def _looks_like_failure(content: Any) -> bool:
143149 return bool (_EXIT_CODE_RE .search (content ))
144150
145151
146- def _recent_tool_runs (messages : List [Dict [str , Any ]]) -> List [Tuple [str , bool , Optional [str ]]]:
147- """Most-recent-first list of (single_tool_name, result_failed, failure_class)
def _canonical_call_args(raw: Any) -> Optional[str]:
    """Return a canonical text form of ONE tool call's ``arguments``, or None.

    ``raw`` may be a JSON string (the wire format) or an already-parsed value.
    Both normalize to the same compact, key-sorted JSON text. Returns None when
    the arguments are absent (``None``) or a blank string.
    """
    if raw is None:
        return None
    parsed: Any = raw
    if isinstance(raw, str):
        text = raw.strip()
        if not text:
            return None
        try:
            parsed = json.loads(text)
        except (ValueError, RecursionError):
            # Unparseable args: the stripped raw text is still a stable identity.
            return text
    try:
        return json.dumps(
            parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":")
        )
    except (TypeError, ValueError, RecursionError):
        # Not JSON-serializable (e.g. mixed-type keys with sort_keys, circular
        # references, exotic objects): fall back to repr as a best-effort key.
        return repr(parsed)


def _tool_call_arg_hash(tool_calls: List[Dict[str, Any]]) -> Optional[str]:
    """Canonical key of the INPUT arguments of an assistant turn's tool call(s).

    Used to detect identical-query repetition for spiral-prone idempotent tools
    like web_search / web_extract (#467): the same query produces the same
    non-progressing result and drives a loop.

    Identity is read from ``tool_calls[].function.arguments`` (the actual call
    inputs), NOT from the tool result: results do not carry the input args.

    ``arguments`` may be a JSON string or an already-parsed dict; both
    normalize to the same canonical key, and key ordering is irrelevant. An
    unparseable string is used verbatim (after stripping).

    The returned key is opaque and only meaningful for equality comparison. It
    is an unambiguous JSON array with one slot per call, in order:

    * the slot is the call's canonical argument text, or ``null`` when that
      call's arguments could not be recovered, so a turn with a missing-args
      call never compares equal to a turn without that call;
    * because the slots are JSON-encoded rather than joined with a delimiter,
      argument text that happens to contain the delimiter cannot make two
      different turns look identical.

    Args:
        tool_calls: The ``tool_calls`` list of one assistant message. Entries
            that are not dicts, or that lack a ``function`` dict, count as
            calls with unrecoverable arguments.

    Returns:
        The canonical key, or None when NO arguments can be recovered from any
        call (or ``tool_calls`` is empty/None), so a turn with missing args
        never yields a false identity match (fail-safe: no spurious
        short-circuit).
    """
    if not tool_calls:
        return None

    per_call: List[Optional[str]] = []
    for tc in tool_calls:
        fn = tc.get("function") if isinstance(tc, dict) else None
        raw = fn.get("arguments") if isinstance(fn, dict) else None
        per_call.append(_canonical_call_args(raw))

    if all(key is None for key in per_call):
        return None
    # JSON-encode the per-call list so the composition is injective.
    return json.dumps(per_call, ensure_ascii=False, separators=(",", ":"))
205+
206+
207+ def _recent_tool_runs (
208+ messages : List [Dict [str , Any ]],
209+ ) -> List [Tuple [str , bool , Optional [str ], Optional [str ]]]:
210+ """Most-recent-first list of
211+ (single_tool_name, result_failed, failure_class, arg_hash)
148212 for the trailing run of assistant turns that each called EXACTLY ONE tool.
149213 ``failure_class`` is the tool_diagnostics category of the failing result (or
150- None when the turn did not fail).
214+ None when the turn did not fail). ``arg_hash`` is a canonical key of the
215+ assistant tool-call INPUT arguments for the turn (``function.arguments``),
216+ when they can be recovered.
151217
152218 Stops at the first assistant turn that is not a single-tool call (a text
153219 reply, or a multi-tool turn) — that breaks the "stuck on one tool" run.
154220 Multi-tool turns are normal varied work, not a single-tool spiral.
155221 """
156- runs : List [Tuple [str , bool , Optional [str ]]] = []
222+ runs : List [Tuple [str , bool , Optional [str ], Optional [ str ] ]] = []
157223 i = len (messages ) - 1
158224 # Collect tool results by id as we walk back so we can mark failures.
159225 while i >= 0 :
160226 msg = messages [i ]
161227 if msg .get ("role" ) == "assistant" and msg .get ("tool_calls" ):
162228 tcs = [tc for tc in msg ["tool_calls" ] if isinstance (tc , dict )]
163229 names = [
164- tc .get ("function" , {}).get ("name" )
165- for tc in tcs
166- if tc .get ("function" )
230+ tc .get ("function" , {}).get ("name" ) for tc in tcs if tc .get ("function" )
167231 ]
168232 names = [n for n in names if n ]
169233 if len (set (names )) != 1 :
170234 break # text turn or multi-tool turn — run ends
171235 tool = names [0 ]
172236 if runs and tool != runs [0 ][0 ]:
173237 break # tool changed — the same-tool run ends here
238+ # Identity for #467 same-query detection comes from the call INPUT
239+ # args of THIS assistant turn, not the result that follows it.
240+ arg_hash = _tool_call_arg_hash (tcs )
174241 # Results for this turn are the "tool" messages that follow it.
175242 failed = False
176243 category : Optional [str ] = None
177244 for j in range (i + 1 , len (messages )):
178245 tm = messages [j ]
179246 if tm .get ("role" ) != "tool" :
180247 break
181- if _looks_like_failure (tm .get ("content" )):
248+ content = tm .get ("content" )
249+ if _looks_like_failure (content ):
182250 failed = True
183- category = _failure_category (tm . get ( " content" ) ) or category
184- runs .append ((tool , failed , category ))
251+ category = _failure_category (content ) or category
252+ runs .append ((tool , failed , category , arg_hash ))
185253 i -= 1
186254 elif msg .get ("role" ) == "tool" :
187255 i -= 1 # skip result messages; handled with their assistant turn
@@ -223,11 +291,13 @@ def maybe_nudge(
223291) -> Optional [str ]:
224292 """Return a nudge string if the trailing single-tool run is stuck, else None.
225293
226- Three trigger levels (each is lower for mutating tools than idempotent):
294+ Trigger levels (each is lower for mutating tools than idempotent):
227295 1. Non-retryable failure class repeated twice (highest priority, #231)
228296 2. Generic failures >= fail_threshold
229297 3. Same tool called >= repeat_threshold times in a row
230298 4. Escalated interrupt at higher counts (#432)
299+ 5. Same *arguments* repeated for short-circuit idempotent tools
300+ (search_files / web_search / web_extract) >= 4 times (#467)
231301
232302 Returns None when the agent is making varied progress (not stuck).
233303 """
@@ -243,18 +313,30 @@ def maybe_nudge(
243313 is_unknown = cat == "unknown"
244314 if repeat_threshold is None :
245315 repeat_threshold = (
246- _MUTATING_REPEAT_THRESHOLD if (is_mutating or is_unknown )
316+ _MUTATING_REPEAT_THRESHOLD
317+ if (is_mutating or is_unknown )
247318 else _IDEMPOTENT_REPEAT_THRESHOLD
248319 )
249320 if fail_threshold is None :
250321 fail_threshold = (
251- _MUTATING_FAIL_THRESHOLD if (is_mutating or is_unknown )
322+ _MUTATING_FAIL_THRESHOLD
323+ if (is_mutating or is_unknown )
252324 else _IDEMPOTENT_FAIL_THRESHOLD
253325 )
254326 escalate_threshold = (
255- _MUTATING_ESCALATE_THRESHOLD if (is_mutating or is_unknown )
327+ _MUTATING_ESCALATE_THRESHOLD
328+ if (is_mutating or is_unknown )
256329 else _IDEMPOTENT_ESCALATE_THRESHOLD
257330 )
331+ short_circuit_threshold = (
332+ _MUTATING_REPEAT_THRESHOLD
333+ if (is_mutating or is_unknown )
334+ else (
335+ _SHORT_CIRCUIT_REPEAT_THRESHOLD
336+ if tool in _SHORT_CIRCUIT_IDEMPOTENT
337+ else repeat_threshold
338+ )
339+ )
258340
259341 # All entries in `runs` share the same tool (run breaks on tool change),
260342 # but guard anyway:
@@ -264,7 +346,7 @@ def maybe_nudge(
264346 consec_nonretry = 0
265347 nonretry_class : Optional [str ] = None
266348 counting_nonretry = True
267- for _t , failed , category in same :
349+ for _t , failed , category , _arg_hash in same :
268350 if failed :
269351 consec_fail += 1
270352 else :
@@ -312,6 +394,23 @@ def maybe_nudge(
312394 f"again the same way."
313395 )
314396
397+ # Same-argument repetition for known spiral-prone idempotent tools (#467).
398+ # This catches web_search returning "no results" / search_files returning
399+ # nothing, where each individual call technically "succeeded" but repeating
400+ # the exact same query is still a loop.
401+ if tool in _SHORT_CIRCUIT_IDEMPOTENT and count >= short_circuit_threshold :
402+ arg_hashes = [r [3 ] for r in same if r [3 ] is not None ]
403+ if arg_hashes and len (set (arg_hashes )) == 1 :
404+ score = _tool_spiral_score (tool , count , short_circuit_threshold )
405+ score_line = f"\n { score } " if score else ""
406+ return (
407+ f"[loop-guard] You have called `{ tool } ` { count } times with the "
408+ f"SAME arguments and the result is not making progress.{ score_line } "
409+ f"Do NOT repeat `{ tool } ` with those identical arguments. Rephrase "
410+ f"the query, broaden or narrow it, switch to a different information "
411+ f"source, or state the blocker if no relevant results are available."
412+ )
413+
315414 if count >= repeat_threshold :
316415 # Build diversity score for the nudge.
317416 score = _tool_spiral_score (tool , count , repeat_threshold )
0 commit comments