Skip to content

Commit e9016b0

Browse files
Lexus2016 and Hermes Evolution authored
feat: web search fallback chain + loop-guard rework (Closes #467 #544 #571) (#574)
* feat: web search fallback chain + loop-guard rework (#467 #544 #571)

  - _search_with_fallbacks now tries the configured fallback chain when the primary backend reports failure OR when it returns empty results and a fallback chain is configured. Empty results with no fallback chain remain success=True so real 'no hits' outcomes are not turned into provider-dead errors (#467 rework).
  - DDGS provider surfaces provider-dead only when empty results occur and a fallback chain exists.
  - Update fallback-chain tests to match the new semantics and keep regression coverage.
  - Carry forward prior loop-guard / tool-diagnostics improvements from #544.

  Closes #467
  Closes #544
  Closes #571

  Co-Authored-By: Hermes Evolution <evolution@hermes.ai>

* fix(loop-guard): make #467 same-query short-circuit actually fire

  The #467 same-query short-circuit for spiral-prone idempotent tools (web_search / web_extract / search_files) was functionally inert: the dedup identity was computed by `_tool_result_arg_hash`, which parsed the tool RESULT with `json.loads` to recover input args. But tool results do not carry the input args, and web_search / web_extract outputs are XML-wrapped in `<untrusted_tool_result>`, so the parse always raised and the arg hash was always None -> the short-circuit never triggered. With zero test coverage, CI stayed green over a dead feature.

  Fix: derive the identity hash from the assistant tool-call INPUT args (`tool_calls[].function.arguments`) instead of the result. New `_tool_call_arg_hash` canonicalizes JSON-string OR dict args (key-order insensitive), hashes unparseable string args verbatim, and returns None on missing args (fail-safe: no spurious short-circuit). The generic repeat/spiral detection is untouched.

  Adds TDD coverage in tests/agent/test_loop_guard.py: same-query fires (string args, dict args, key-order insensitive, web_extract), and negatives (different queries do not short-circuit; varied queries still hit the generic spiral nudge at the repeat threshold).

---------

Co-authored-by: Hermes Evolution <evolution@hermes.ai>
1 parent cdfe707 commit e9016b0

9 files changed

Lines changed: 2194 additions & 1096 deletions

File tree

agent/loop_guard.py

Lines changed: 154 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@
3434

3535
from __future__ import annotations
3636

37+
import json
3738
import re
3839
from typing import Any, Dict, List, Optional, Tuple
3940

@@ -57,6 +58,8 @@
5758
"exit status",
5859
"is not recognized",
5960
"could not be found",
61+
"no results",
62+
"no results found",
6063
)
6164

6265
_EXIT_CODE_RE = re.compile(r"exit code[:\s]+([1-9]\d*)", re.IGNORECASE)
@@ -69,49 +72,52 @@
6972
_NON_RETRYABLE = frozenset({"timeout", "permission", "missing_command", "limit"})
7073
_NONRETRY_THRESHOLD = 2
7174

75+
# Idempotent tools that are especially prone to content-free repetition and that
76+
# the issue evidence shows spiraling with no progress even when individual calls
77+
# return "success". Count these as non-progress after a shorter run so the model
78+
# is nudged toward a different query / tool / strategy.
79+
_SHORT_CIRCUIT_IDEMPOTENT = frozenset({"search_files", "web_search", "web_extract"})
80+
_SHORT_CIRCUIT_REPEAT_THRESHOLD = 4
81+
7282
# Mutating tools get LOWER thresholds than idempotent tools because a fixation
7383
# on mutating operations (writing files, running commands) is more costly and
7484
# indicates a deeper strategy problem (#432).
75-
_IDEMPOTENT_TOOLS = frozenset(
76-
{
77-
"read_file",
78-
"search_files",
79-
"web_search",
80-
"web_extract",
81-
"session_search",
82-
"browser_snapshot",
83-
"browser_console",
84-
"browser_get_images",
85-
"mcp_filesystem_read_file",
86-
"mcp_filesystem_read_text_file",
87-
"mcp_filesystem_read_multiple_files",
88-
"mcp_filesystem_list_directory",
89-
"mcp_filesystem_list_directory_with_sizes",
90-
"mcp_filesystem_directory_tree",
91-
"mcp_filesystem_get_file_info",
92-
"mcp_filesystem_search_files",
93-
}
94-
)
95-
_MUTATING_TOOLS = frozenset(
96-
{
97-
"terminal",
98-
"execute_code",
99-
"write_file",
100-
"patch",
101-
"todo",
102-
"memory",
103-
"skill_manage",
104-
"browser_click",
105-
"browser_type",
106-
"browser_press",
107-
"browser_scroll",
108-
"browser_navigate",
109-
"send_message",
110-
"cronjob",
111-
"delegate_task",
112-
"process",
113-
}
114-
)
85+
_IDEMPOTENT_TOOLS = frozenset({
86+
"read_file",
87+
"search_files",
88+
"web_search",
89+
"web_extract",
90+
"session_search",
91+
"browser_snapshot",
92+
"browser_console",
93+
"browser_get_images",
94+
"mcp_filesystem_read_file",
95+
"mcp_filesystem_read_text_file",
96+
"mcp_filesystem_read_multiple_files",
97+
"mcp_filesystem_list_directory",
98+
"mcp_filesystem_list_directory_with_sizes",
99+
"mcp_filesystem_directory_tree",
100+
"mcp_filesystem_get_file_info",
101+
"mcp_filesystem_search_files",
102+
})
103+
_MUTATING_TOOLS = frozenset({
104+
"terminal",
105+
"execute_code",
106+
"write_file",
107+
"patch",
108+
"todo",
109+
"memory",
110+
"skill_manage",
111+
"browser_click",
112+
"browser_type",
113+
"browser_press",
114+
"browser_scroll",
115+
"browser_navigate",
116+
"send_message",
117+
"cronjob",
118+
"delegate_task",
119+
"process",
120+
})
115121
# Default thresholds: lower for mutating tools, higher for idempotent (#432).
116122
# Mutating: repeat at 4, fail at 2, escalate at 8
117123
# Idempotent: repeat at 8, fail at 4, escalate at 15
@@ -143,45 +149,107 @@ def _looks_like_failure(content: Any) -> bool:
143149
return bool(_EXIT_CODE_RE.search(content))
144150

145151

146-
def _recent_tool_runs(messages: List[Dict[str, Any]]) -> List[Tuple[str, bool, Optional[str]]]:
147-
"""Most-recent-first list of (single_tool_name, result_failed, failure_class)
152+
def _tool_call_arg_hash(tool_calls: List[Dict[str, Any]]) -> Optional[str]:
153+
"""Canonical key of the INPUT arguments of an assistant turn's tool call(s).
154+
155+
Used to detect identical-query repetition for spiral-prone idempotent tools
156+
like web_search / web_extract (#467): the same query produces the same
157+
non-progressing result and drives a loop.
158+
159+
Identity is read from ``tool_calls[].function.arguments`` — the ACTUAL call
160+
inputs — NOT from the tool result. Tool results do not carry the input args,
161+
and web_search / web_extract outputs are XML-wrapped in
162+
``<untrusted_tool_result>`` so they can never be parsed back into arguments;
163+
reading identity from the result left this short-circuit permanently inert.
164+
165+
``arguments`` may be a JSON string (the OpenAI wire format) or an already
166+
parsed dict; both normalize to the same canonical key, and key ordering is
167+
irrelevant. An unparseable string is hashed verbatim (still a stable
168+
identity). Returns None when NO arguments can be recovered from any call, so
169+
a turn with missing args never yields a false identity match (fail-safe: no
170+
spurious short-circuit).
171+
"""
172+
keys: List[str] = []
173+
for tc in tool_calls:
174+
if not isinstance(tc, dict):
175+
continue
176+
fn = tc.get("function")
177+
if not isinstance(fn, dict):
178+
continue
179+
raw = fn.get("arguments")
180+
if raw is None:
181+
continue
182+
parsed: Any
183+
if isinstance(raw, str):
184+
s = raw.strip()
185+
if not s:
186+
continue
187+
try:
188+
parsed = json.loads(s)
189+
except Exception:
190+
keys.append(s) # unparseable args: hash the raw string verbatim
191+
continue
192+
else:
193+
parsed = raw
194+
try:
195+
keys.append(
196+
json.dumps(
197+
parsed, sort_keys=True, ensure_ascii=False, separators=(",", ":")
198+
)
199+
)
200+
except (TypeError, ValueError):
201+
keys.append(repr(parsed))
202+
if not keys:
203+
return None
204+
return "|".join(keys)
205+
206+
207+
def _recent_tool_runs(
208+
messages: List[Dict[str, Any]],
209+
) -> List[Tuple[str, bool, Optional[str], Optional[str]]]:
210+
"""Most-recent-first list of
211+
(single_tool_name, result_failed, failure_class, arg_hash)
148212
for the trailing run of assistant turns that each called EXACTLY ONE tool.
149213
``failure_class`` is the tool_diagnostics category of the failing result (or
150-
None when the turn did not fail).
214+
None when the turn did not fail). ``arg_hash`` is a canonical key of the
215+
assistant tool-call INPUT arguments for the turn (``function.arguments``),
216+
when they can be recovered.
151217
152218
Stops at the first assistant turn that is not a single-tool call (a text
153219
reply, or a multi-tool turn) — that breaks the "stuck on one tool" run.
154220
Multi-tool turns are normal varied work, not a single-tool spiral.
155221
"""
156-
runs: List[Tuple[str, bool, Optional[str]]] = []
222+
runs: List[Tuple[str, bool, Optional[str], Optional[str]]] = []
157223
i = len(messages) - 1
158224
# Collect tool results by id as we walk back so we can mark failures.
159225
while i >= 0:
160226
msg = messages[i]
161227
if msg.get("role") == "assistant" and msg.get("tool_calls"):
162228
tcs = [tc for tc in msg["tool_calls"] if isinstance(tc, dict)]
163229
names = [
164-
tc.get("function", {}).get("name")
165-
for tc in tcs
166-
if tc.get("function")
230+
tc.get("function", {}).get("name") for tc in tcs if tc.get("function")
167231
]
168232
names = [n for n in names if n]
169233
if len(set(names)) != 1:
170234
break # text turn or multi-tool turn — run ends
171235
tool = names[0]
172236
if runs and tool != runs[0][0]:
173237
break # tool changed — the same-tool run ends here
238+
# Identity for #467 same-query detection comes from the call INPUT
239+
# args of THIS assistant turn, not the result that follows it.
240+
arg_hash = _tool_call_arg_hash(tcs)
174241
# Results for this turn are the "tool" messages that follow it.
175242
failed = False
176243
category: Optional[str] = None
177244
for j in range(i + 1, len(messages)):
178245
tm = messages[j]
179246
if tm.get("role") != "tool":
180247
break
181-
if _looks_like_failure(tm.get("content")):
248+
content = tm.get("content")
249+
if _looks_like_failure(content):
182250
failed = True
183-
category = _failure_category(tm.get("content")) or category
184-
runs.append((tool, failed, category))
251+
category = _failure_category(content) or category
252+
runs.append((tool, failed, category, arg_hash))
185253
i -= 1
186254
elif msg.get("role") == "tool":
187255
i -= 1 # skip result messages; handled with their assistant turn
@@ -223,11 +291,13 @@ def maybe_nudge(
223291
) -> Optional[str]:
224292
"""Return a nudge string if the trailing single-tool run is stuck, else None.
225293
226-
Three trigger levels (each is lower for mutating tools than idempotent):
294+
Trigger levels (each is lower for mutating tools than idempotent):
227295
1. Non-retryable failure class repeated twice (highest priority, #231)
228296
2. Generic failures >= fail_threshold
229297
3. Same tool called >= repeat_threshold times in a row
230298
4. Escalated interrupt at higher counts (#432)
299+
5. Same *arguments* repeated for short-circuit idempotent tools
300+
(search_files / web_search / web_extract) >= 4 times (#467)
231301
232302
Returns None when the agent is making varied progress (not stuck).
233303
"""
@@ -243,18 +313,30 @@ def maybe_nudge(
243313
is_unknown = cat == "unknown"
244314
if repeat_threshold is None:
245315
repeat_threshold = (
246-
_MUTATING_REPEAT_THRESHOLD if (is_mutating or is_unknown)
316+
_MUTATING_REPEAT_THRESHOLD
317+
if (is_mutating or is_unknown)
247318
else _IDEMPOTENT_REPEAT_THRESHOLD
248319
)
249320
if fail_threshold is None:
250321
fail_threshold = (
251-
_MUTATING_FAIL_THRESHOLD if (is_mutating or is_unknown)
322+
_MUTATING_FAIL_THRESHOLD
323+
if (is_mutating or is_unknown)
252324
else _IDEMPOTENT_FAIL_THRESHOLD
253325
)
254326
escalate_threshold = (
255-
_MUTATING_ESCALATE_THRESHOLD if (is_mutating or is_unknown)
327+
_MUTATING_ESCALATE_THRESHOLD
328+
if (is_mutating or is_unknown)
256329
else _IDEMPOTENT_ESCALATE_THRESHOLD
257330
)
331+
short_circuit_threshold = (
332+
_MUTATING_REPEAT_THRESHOLD
333+
if (is_mutating or is_unknown)
334+
else (
335+
_SHORT_CIRCUIT_REPEAT_THRESHOLD
336+
if tool in _SHORT_CIRCUIT_IDEMPOTENT
337+
else repeat_threshold
338+
)
339+
)
258340

259341
# All entries in `runs` share the same tool (run breaks on tool change),
260342
# but guard anyway:
@@ -264,7 +346,7 @@ def maybe_nudge(
264346
consec_nonretry = 0
265347
nonretry_class: Optional[str] = None
266348
counting_nonretry = True
267-
for _t, failed, category in same:
349+
for _t, failed, category, _arg_hash in same:
268350
if failed:
269351
consec_fail += 1
270352
else:
@@ -312,6 +394,23 @@ def maybe_nudge(
312394
f"again the same way."
313395
)
314396

397+
# Same-argument repetition for known spiral-prone idempotent tools (#467).
398+
# This catches web_search returning "no results" / search_files returning
399+
# nothing, where each individual call technically "succeeded" but repeating
400+
# the exact same query is still a loop.
401+
if tool in _SHORT_CIRCUIT_IDEMPOTENT and count >= short_circuit_threshold:
402+
arg_hashes = [r[3] for r in same if r[3] is not None]
403+
if arg_hashes and len(set(arg_hashes)) == 1:
404+
score = _tool_spiral_score(tool, count, short_circuit_threshold)
405+
score_line = f"\n{score}" if score else ""
406+
return (
407+
f"[loop-guard] You have called `{tool}` {count} times with the "
408+
f"SAME arguments and the result is not making progress.{score_line} "
409+
f"Do NOT repeat `{tool}` with those identical arguments. Rephrase "
410+
f"the query, broaden or narrow it, switch to a different information "
411+
f"source, or state the blocker if no relevant results are available."
412+
)
413+
315414
if count >= repeat_threshold:
316415
# Build diversity score for the nudge.
317416
score = _tool_spiral_score(tool, count, repeat_threshold)

0 commit comments

Comments
 (0)