Skip to content

Commit 66e679e

Browse files
Nichol4s and claude committed
[argus] loop_detector: drop layer-2 frequency detection
Layer 2 counted every call to a tool *name* regardless of args, then warned at 30 calls and hard-stopped at 50. The intent (per the docstring) was to "catch cross-file read loops that hash-based detection misses": the model gets lost and calls read_file across 40 different files, and layer 1 doesn't fire because each call has a distinct key.

In practice it false-positives on legitimate high-cardinality work — argus thread 5077dd8e (Webfuse docs audit) tripped the warning twice while crawling distinct URLs, and the model misread the warning as a hard quota and stopped.

Layer 1 + the edit-aware reset shipped in 3bac95e are now strong enough to handle the verifier-iteration false positive that originally pushed us toward layer 2 as a backstop. The remaining layer-2-only signal (genuine cross-arg drift) is rare on a single-user local box, and the recursion limit catches the truly pathological cases.

Removes:
- _DEFAULT_TOOL_FREQ_WARN / _DEFAULT_TOOL_FREQ_HARD_LIMIT constants
- _TOOL_FREQ_WARNING_MSG / _TOOL_FREQ_HARD_STOP_MSG constants
- tool_freq_warn / tool_freq_hard_limit constructor params
- self._tool_freq / self._tool_freq_warned per-thread state
- the layer-2 block at the end of _track_and_check
- TestToolFrequencyDetection (10 tests) plus the LRU-eviction assertions on _tool_freq[*]

46 loop-detection tests still pass (was 56; the 10 dropped tests were all in TestToolFrequencyDetection, and 2 _tool_freq assertions nested in the LRU-eviction test were removed as well).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3bac95e commit 66e679e

2 files changed

Lines changed: 2 additions & 256 deletions

File tree

backend/packages/harness/deerflow/agents/middlewares/loop_detection_middleware.py

Lines changed: 2 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@
3232
_DEFAULT_HARD_LIMIT = 8 # force-stop after 8 identical calls
3333
_DEFAULT_WINDOW_SIZE = 20 # track last N tool calls
3434
_DEFAULT_MAX_TRACKED_THREADS = 100 # LRU eviction limit
35-
_DEFAULT_TOOL_FREQ_WARN = 30 # warn after 30 calls to the same tool type
36-
_DEFAULT_TOOL_FREQ_HARD_LIMIT = 50 # force-stop after 50 calls to the same tool type
3735

3836

3937
def _normalize_tool_call_args(raw_args: object) -> tuple[dict, str | None]:
@@ -171,18 +169,8 @@ def _hash_tool_calls(tool_calls: list[dict]) -> str:
171169
"If the task genuinely cannot be completed, summarize what you accomplished and stop."
172170
)
173171

174-
_TOOL_FREQ_WARNING_MSG = (
175-
"[REPEAT TOOL CALL DETECTED] You have called {tool_name} {count} times in this conversation. "
176-
"Step back: are these calls converging on a result, or are you cycling through similar variations? "
177-
"If you are cycling, switch strategy — instrument, reduce the test surface, or pick a clearly different angle. "
178-
"Do not rewrite the artifact from scratch; that usually introduces new bugs without fixing the original. "
179-
"If the task genuinely cannot be completed, summarize what you accomplished and stop."
180-
)
181-
182172
_HARD_STOP_MSG = "[FORCED STOP] Repeated tool calls exceeded the safety limit. Producing final answer with results collected so far."
183173

184-
_TOOL_FREQ_HARD_STOP_MSG = "[FORCED STOP] Tool {tool_name} called {count} times — exceeded the per-tool safety limit. Producing final answer with results collected so far."
185-
186174

187175
class LoopDetectionMiddleware(AgentMiddleware[AgentState]):
188176
"""Detects and breaks repetitive tool call loops.
@@ -196,12 +184,6 @@ class LoopDetectionMiddleware(AgentMiddleware[AgentState]):
196184
Default: 20.
197185
max_tracked_threads: Maximum number of threads to track before
198186
evicting the least recently used. Default: 100.
199-
tool_freq_warn: Number of calls to the same tool *type* (regardless
200-
of arguments) before injecting a frequency warning. Catches
201-
cross-file read loops that hash-based detection misses.
202-
Default: 30.
203-
tool_freq_hard_limit: Number of calls to the same tool type before
204-
forcing a stop. Default: 50.
205187
"""
206188

207189
def __init__(
@@ -210,23 +192,16 @@ def __init__(
210192
hard_limit: int = _DEFAULT_HARD_LIMIT,
211193
window_size: int = _DEFAULT_WINDOW_SIZE,
212194
max_tracked_threads: int = _DEFAULT_MAX_TRACKED_THREADS,
213-
tool_freq_warn: int = _DEFAULT_TOOL_FREQ_WARN,
214-
tool_freq_hard_limit: int = _DEFAULT_TOOL_FREQ_HARD_LIMIT,
215195
):
216196
super().__init__()
217197
self.warn_threshold = warn_threshold
218198
self.hard_limit = hard_limit
219199
self.window_size = window_size
220200
self.max_tracked_threads = max_tracked_threads
221-
self.tool_freq_warn = tool_freq_warn
222-
self.tool_freq_hard_limit = tool_freq_hard_limit
223201
self._lock = threading.Lock()
224202
# Per-thread tracking using OrderedDict for LRU eviction
225203
self._history: OrderedDict[str, list[str]] = OrderedDict()
226204
self._warned: dict[str, set[str]] = defaultdict(set)
227-
# Per-thread, per-tool-type cumulative call counts
228-
self._tool_freq: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
229-
self._tool_freq_warned: dict[str, set[str]] = defaultdict(set)
230205
# Per-thread set of paths mutated since the last reset. A subsequent call
231206
# that references one of these paths clears identical-hash history (the
232207
# file changed, so re-probing it isn't a loop), and consumes the path so
@@ -248,19 +223,14 @@ def _evict_if_needed(self) -> None:
248223
while len(self._history) > self.max_tracked_threads:
249224
evicted_id, _ = self._history.popitem(last=False)
250225
self._warned.pop(evicted_id, None)
251-
self._tool_freq.pop(evicted_id, None)
252-
self._tool_freq_warned.pop(evicted_id, None)
253226
self._mutated_paths.pop(evicted_id, None)
254227
logger.debug("Evicted loop tracking for thread %s (LRU)", evicted_id)
255228

256229
def _track_and_check(self, state: AgentState, runtime: Runtime) -> tuple[str | None, bool]:
257230
"""Track tool calls and check for loops.
258231
259-
Two detection layers:
260-
1. **Hash-based** (existing): catches identical tool call sets.
261-
2. **Frequency-based** (new): catches the same *tool type* being
262-
called many times with varying arguments (e.g. ``read_file``
263-
on 40 different files).
232+
Hash-based detection: identical tool call sets in a sliding window
233+
trigger a warning at warn_threshold and a hard stop at hard_limit.
264234
265235
Edit-aware reset: if any tool call references a path that was mutated
266236
(by a prior write_file/str_replace) since the last reset for that path,
@@ -331,7 +301,6 @@ def _track_and_check(self, state: AgentState, runtime: Runtime) -> tuple[str | N
331301
if mp:
332302
self._mutated_paths[thread_id].add(mp)
333303

334-
# --- Layer 1: hash-based (identical call sets) ---
335304
if count >= self.hard_limit:
336305
logger.error(
337306
"Loop hard limit reached — forcing stop",
@@ -359,40 +328,6 @@ def _track_and_check(self, state: AgentState, runtime: Runtime) -> tuple[str | N
359328
)
360329
return _WARNING_MSG, False
361330

362-
# --- Layer 2: per-tool-type frequency ---
363-
freq = self._tool_freq[thread_id]
364-
for tc in tool_calls:
365-
name = tc.get("name", "")
366-
if not name:
367-
continue
368-
freq[name] += 1
369-
tc_count = freq[name]
370-
371-
if tc_count >= self.tool_freq_hard_limit:
372-
logger.error(
373-
"Tool frequency hard limit reached — forcing stop",
374-
extra={
375-
"thread_id": thread_id,
376-
"tool_name": name,
377-
"count": tc_count,
378-
},
379-
)
380-
return _TOOL_FREQ_HARD_STOP_MSG.format(tool_name=name, count=tc_count), True
381-
382-
if tc_count >= self.tool_freq_warn:
383-
warned = self._tool_freq_warned[thread_id]
384-
if name not in warned:
385-
warned.add(name)
386-
logger.warning(
387-
"Tool frequency warning — too many calls to same tool type",
388-
extra={
389-
"thread_id": thread_id,
390-
"tool_name": name,
391-
"count": tc_count,
392-
},
393-
)
394-
return _TOOL_FREQ_WARNING_MSG.format(tool_name=name, count=tc_count), False
395-
396331
return None, False
397332

398333
@staticmethod
@@ -468,12 +403,8 @@ def reset(self, thread_id: str | None = None) -> None:
468403
if thread_id:
469404
self._history.pop(thread_id, None)
470405
self._warned.pop(thread_id, None)
471-
self._tool_freq.pop(thread_id, None)
472-
self._tool_freq_warned.pop(thread_id, None)
473406
self._mutated_paths.pop(thread_id, None)
474407
else:
475408
self._history.clear()
476409
self._warned.clear()
477-
self._tool_freq.clear()
478410
self._mutated_paths.clear()
479-
self._tool_freq_warned.clear()

backend/tests/test_loop_detection_middleware.py

Lines changed: 0 additions & 185 deletions
Original file line numberDiff line numberDiff line change
@@ -328,8 +328,6 @@ def test_lru_eviction(self):
328328
mw._apply(_make_state(tool_calls=call), runtime_new)
329329

330330
assert "thread-0" not in mw._history
331-
assert "thread-0" not in mw._tool_freq
332-
assert "thread-0" not in mw._tool_freq_warned
333331
assert "thread-new" in mw._history
334332
assert len(mw._history) == 3
335333

@@ -501,189 +499,6 @@ def _make_provider_state():
501499
assert msg.response_metadata["finish_reason"] == "stop"
502500

503501

504-
class TestToolFrequencyDetection:
505-
"""Tests for per-tool-type frequency detection (Layer 2).
506-
507-
This catches the case where an agent calls the same tool type many times
508-
with *different* arguments (e.g. read_file on 40 different files), which
509-
bypasses hash-based detection.
510-
"""
511-
512-
def _read_call(self, path):
513-
return {"name": "read_file", "id": f"call_read_{path}", "args": {"path": path}}
514-
515-
def test_below_freq_warn_returns_none(self):
516-
mw = LoopDetectionMiddleware(tool_freq_warn=5, tool_freq_hard_limit=10)
517-
runtime = _make_runtime()
518-
519-
for i in range(4):
520-
result = mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
521-
assert result is None
522-
523-
def test_freq_warn_at_threshold(self):
524-
mw = LoopDetectionMiddleware(tool_freq_warn=5, tool_freq_hard_limit=10)
525-
runtime = _make_runtime()
526-
527-
for i in range(4):
528-
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
529-
530-
# 5th call to read_file (different file each time) triggers freq warning
531-
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_4.py")]), runtime)
532-
assert result is not None
533-
msg = result["messages"][0]
534-
assert isinstance(msg, HumanMessage)
535-
assert "read_file" in msg.content
536-
assert "REPEAT TOOL CALL DETECTED" in msg.content
537-
538-
def test_freq_warn_only_injected_once(self):
539-
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
540-
runtime = _make_runtime()
541-
542-
for i in range(2):
543-
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
544-
545-
# 3rd triggers warning
546-
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_2.py")]), runtime)
547-
assert result is not None
548-
assert "REPEAT TOOL CALL DETECTED" in result["messages"][0].content
549-
550-
# 4th should not re-warn (already warned for read_file)
551-
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_3.py")]), runtime)
552-
assert result is None
553-
554-
def test_freq_hard_stop_at_limit(self):
555-
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=6)
556-
runtime = _make_runtime()
557-
558-
for i in range(5):
559-
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
560-
561-
# 6th call triggers hard stop
562-
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_5.py")]), runtime)
563-
assert result is not None
564-
msg = result["messages"][0]
565-
assert isinstance(msg, AIMessage)
566-
assert msg.tool_calls == []
567-
assert "FORCED STOP" in msg.content
568-
assert "read_file" in msg.content
569-
570-
def test_different_tools_tracked_independently(self):
571-
"""read_file and bash should have independent frequency counters."""
572-
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
573-
runtime = _make_runtime()
574-
575-
# 2 read_file calls
576-
for i in range(2):
577-
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
578-
579-
# 2 bash calls — should not trigger (bash count = 2, read_file count = 2)
580-
for i in range(2):
581-
result = mw._apply(_make_state(tool_calls=[_bash_call(f"cmd_{i}")]), runtime)
582-
assert result is None
583-
584-
# 3rd read_file triggers (read_file count = 3)
585-
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_2.py")]), runtime)
586-
assert result is not None
587-
assert "read_file" in result["messages"][0].content
588-
589-
def test_freq_reset_clears_state(self):
590-
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
591-
runtime = _make_runtime()
592-
593-
for i in range(2):
594-
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime)
595-
596-
mw.reset()
597-
598-
# After reset, count restarts — should not trigger
599-
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_new.py")]), runtime)
600-
assert result is None
601-
602-
def test_freq_reset_per_thread_clears_only_target(self):
603-
"""reset(thread_id=...) should clear frequency state for that thread only."""
604-
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
605-
runtime_a = _make_runtime("thread-A")
606-
runtime_b = _make_runtime("thread-B")
607-
608-
# 2 calls on each thread
609-
for i in range(2):
610-
mw._apply(_make_state(tool_calls=[self._read_call(f"/a_{i}.py")]), runtime_a)
611-
mw._apply(_make_state(tool_calls=[self._read_call(f"/b_{i}.py")]), runtime_b)
612-
613-
# Reset only thread-A
614-
mw.reset(thread_id="thread-A")
615-
616-
assert "thread-A" not in mw._tool_freq
617-
assert "thread-A" not in mw._tool_freq_warned
618-
619-
# thread-B state should still be intact — 3rd call triggers warn
620-
result = mw._apply(_make_state(tool_calls=[self._read_call("/b_2.py")]), runtime_b)
621-
assert result is not None
622-
assert "REPEAT TOOL CALL DETECTED" in result["messages"][0].content
623-
624-
# thread-A restarted from 0 — should not trigger
625-
result = mw._apply(_make_state(tool_calls=[self._read_call("/a_new.py")]), runtime_a)
626-
assert result is None
627-
628-
def test_freq_per_thread_isolation(self):
629-
"""Frequency counts should be independent per thread."""
630-
mw = LoopDetectionMiddleware(tool_freq_warn=3, tool_freq_hard_limit=10)
631-
runtime_a = _make_runtime("thread-A")
632-
runtime_b = _make_runtime("thread-B")
633-
634-
# 2 calls on thread A
635-
for i in range(2):
636-
mw._apply(_make_state(tool_calls=[self._read_call(f"/file_{i}.py")]), runtime_a)
637-
638-
# 2 calls on thread B — should NOT push thread A over threshold
639-
for i in range(2):
640-
mw._apply(_make_state(tool_calls=[self._read_call(f"/other_{i}.py")]), runtime_b)
641-
642-
# 3rd call on thread A — triggers (count=3 for thread A only)
643-
result = mw._apply(_make_state(tool_calls=[self._read_call("/file_2.py")]), runtime_a)
644-
assert result is not None
645-
assert "REPEAT TOOL CALL DETECTED" in result["messages"][0].content
646-
647-
def test_multi_tool_single_response_counted(self):
648-
"""When a single response has multiple tool calls, each is counted."""
649-
mw = LoopDetectionMiddleware(tool_freq_warn=5, tool_freq_hard_limit=10)
650-
runtime = _make_runtime()
651-
652-
# Response 1: 2 read_file calls → count = 2
653-
call = [self._read_call("/a.py"), self._read_call("/b.py")]
654-
result = mw._apply(_make_state(tool_calls=call), runtime)
655-
assert result is None
656-
657-
# Response 2: 2 more → count = 4
658-
call = [self._read_call("/c.py"), self._read_call("/d.py")]
659-
result = mw._apply(_make_state(tool_calls=call), runtime)
660-
assert result is None
661-
662-
# Response 3: 1 more → count = 5 → triggers warn
663-
result = mw._apply(_make_state(tool_calls=[self._read_call("/e.py")]), runtime)
664-
assert result is not None
665-
assert "read_file" in result["messages"][0].content
666-
667-
def test_hash_detection_takes_priority(self):
668-
"""Hash-based hard stop fires before frequency check for identical calls."""
669-
mw = LoopDetectionMiddleware(
670-
warn_threshold=2,
671-
hard_limit=3,
672-
tool_freq_warn=100,
673-
tool_freq_hard_limit=200,
674-
)
675-
runtime = _make_runtime()
676-
call = [self._read_call("/same_file.py")]
677-
678-
for _ in range(2):
679-
mw._apply(_make_state(tool_calls=call), runtime)
680-
681-
# 3rd identical call → hash hard_limit=3 fires (not freq)
682-
result = mw._apply(_make_state(tool_calls=call), runtime)
683-
assert result is not None
684-
msg = result["messages"][0]
685-
assert isinstance(msg, AIMessage)
686-
assert _HARD_STOP_MSG in msg.content
687502

688503

689504
class TestEditAwareReset:

0 commit comments

Comments
 (0)