Preserve valid tool-call history windows during truncation

TatsuKo-Tsukimi · Y1fe1Zh0u · commit ced38f67689a · 2026-05-12T15:51:08.000+08:00
Single-agent chat history is currently sliced by message count, which can split assistant tool_calls from their required tool result messages. This refreshes PR #487 on current upstream/main by introducing a pair-aware history window helper and routing web chat plus Feishu history truncation through it. Constraint: Provider APIs require every role=tool message to match a preceding assistant tool_call in the same request. Rejected: Keep only the existing head-pop guard | it only fixes orphan tool messages at the first position and misses cuts inside multi-message tool-call blocks. Confidence: high Scope-risk: narrow Directive: Future token-budget truncation must reuse the pair-aware walker instead of slicing conversation lists directly. Tested: /Users/zhou/Code/clawith/backend/.venv/bin/python -m pytest tests/test_history_window.py from /tmp/clawith-pr487-work/backend, 16 passed Not-tested: Full backend suite and live web/Feishu chat flows
diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py
@@ -18,6 +18,7 @@
 from app.models.identity import IdentityProvider
 from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut
 from app.services.feishu_service import feishu_service
+from app.services.history_window import truncate_by_message_count
 
 router = APIRouter(tags=["feishu"])
 
@@ -1634,7 +1635,13 @@ async def _call_agent_llm(
     from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
     ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE
     if history:
-        messages.extend(_normalize_history_messages(history)[-ctx_size:])
+        # Pair-aware truncation keeps assistant.tool_calls ↔ role=tool pairs intact.
+        # Today _normalize_history_messages drops DB role="tool_call" rows, so this
+        # path carries no tool messages and the helper behaves like plain count
+        # truncation; it takes effect once a Feishu reorganization helper emits them.
+        messages.extend(
+            truncate_by_message_count(_normalize_history_messages(history), ctx_size)
+        )
     messages.append({"role": "user", "content": user_text})
 
     # Use actual user_id so the system prompt knows who it's chatting with
diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py
@@ -19,6 +19,7 @@
 from app.models.llm import LLMModel
 from app.models.user import User
 from app.services.chat_session_service import ensure_primary_platform_session
+from app.services.history_window import truncate_by_message_count
 from app.services.llm import call_llm, call_llm_with_failover
 
 router = APIRouter(tags=["websocket"])
@@ -775,10 +776,12 @@ async def _call_with_failover():
                         async def _on_failover(reason: str):
                             await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"})
 
-                        # To prevent tool call message pairs(assistant + tool) from being broken down.
-                        _truncated = conversation[-ctx_size:]
-                        while _truncated and _truncated[0].get("role") == "tool":
-                            _truncated.pop(0)
+                        # Pair-aware truncation: keep the last `ctx_size` messages while
+                        # preserving assistant.tool_calls ↔ role=tool blocks atomically.
+                        # Naive [-ctx_size:] slicing can leave orphan tool messages at the
+                        # head when the cut lands mid-pair, which OpenAI rejects with
+                        # "No tool call found for function call output" (issue #446).
+                        _truncated = truncate_by_message_count(conversation, ctx_size)
 
                         # Per-(user, agent) onboarding. With no row, prepend the
                         # greeting prompt and mark the pair as "greeted" once it
diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py
@@ -0,0 +1,166 @@
+"""Pair-aware conversation history truncation.
+
+Replaces naive ``conversation[-N:]`` slicing with a walker that keeps
+``assistant.tool_calls`` and their matching ``role="tool"`` messages as an
+atomic block — never half a pair, never orphan tool messages.
+
+Why: OpenAI Responses API and Chat Completions both reject input where a
+``function_call_output`` / ``role="tool"`` message has no matching
+``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive
+``[-N:]`` slicing can leave such orphans at the head when the cut lands
+between an assistant message and its tool results. This is the failure mode
+reported in issue #446.
+
+Orphan detection is by ``tool_call_id`` matching, not by adjacency — a
+tool message inserted between a valid pair and other messages (from
+malformed persistence or upstream truncation) is dropped, not folded
+into an adjacent block. This makes the helper robust against orphans
+at any position, not just at the slice head.
+
+Input is expected to be in OpenAI chat-completion format (post-reorganization
+from DB ``role="tool_call"`` rows).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def _nearest_tool_owners(messages: list[dict[str, Any]]) -> dict[int, int]:
+    """Map each owned ``role="tool"`` message index to its assistant's index.
+
+    The owner is the nearest preceding ``assistant`` message whose
+    ``tool_calls`` contains the tool message's ``tool_call_id``. Tool messages
+    with a missing/empty id, or with no such assistant before them, get no
+    entry — those are the orphans.
+
+    Built in one forward pass so callers do not need a backward scan per
+    tool message (which was quadratic on long tool-heavy histories).
+
+    Args:
+        messages: Conversation list in OpenAI chat-completion format.
+
+    Returns:
+        ``{tool_index: assistant_index}`` for every tool message with an owner.
+    """
+    latest_by_id: dict[Any, int] = {}
+    owners: dict[int, int] = {}
+    for i, msg in enumerate(messages):
+        role = msg.get("role")
+        if role == "assistant" and msg.get("tool_calls"):
+            # Later assistants overwrite earlier ones, so a lookup always
+            # yields the nearest preceding owner even if an id is reused.
+            for tc in msg["tool_calls"]:
+                latest_by_id[tc.get("id")] = i
+        elif role == "tool":
+            tcid = msg.get("tool_call_id")
+            if tcid and tcid in latest_by_id:
+                owners[i] = latest_by_id[tcid]
+    return owners
+
+
+def _identify_orphans(messages: list[dict[str, Any]]) -> set[int]:
+    """Return indices of ``role="tool"`` messages whose ``tool_call_id`` has
+    no matching ``assistant.tool_calls`` earlier in the conversation.
+
+    OpenAI rejects the request the moment a ``function_call_output`` is
+    sent without its matching ``function_call``, regardless of whether
+    that tool message is at the head, middle, or end. So orphan detection
+    is by ID matching, not by position. A missing/empty id is an orphan too.
+    """
+    owners = _nearest_tool_owners(messages)
+    return {
+        i
+        for i, msg in enumerate(messages)
+        if msg.get("role") == "tool" and i not in owners
+    }
+
+
+def truncate_by_message_count(
+    messages: list[dict[str, Any]],
+    max_messages: int,
+) -> list[dict[str, Any]]:
+    """Keep at most ``max_messages`` recent messages, preserving tool-call pairs.
+
+    A "block" is either:
+      - a single non-tool message (user / system / assistant text, or an
+        ``assistant`` whose ``tool_calls`` have no results after it), or
+      - an ``assistant`` with ``tool_calls`` plus every matching ``role="tool"``
+        message after it (identified by ``tool_call_id``, not adjacency).
+
+    Blocks are atomic: included whole or not at all. Orphan ``role="tool"``
+    messages — those whose ``tool_call_id`` has no matching assistant — are
+    silently dropped regardless of budget. Sending them to OpenAI causes the
+    #446 error.
+
+    Blocks are taken newest-first and selection stops at the first block that
+    does not fit, so older messages never leapfrog a skipped block. If the
+    newest block alone exceeds ``max_messages`` the result is empty.
+
+    Args:
+        messages: Conversation list in OpenAI format. Empty list is fine.
+        max_messages: Hard upper bound on the number of returned entries.
+            Values ``<= 0`` return ``[]``.
+
+    Returns:
+        A new list (input is never mutated) of at most ``max_messages`` entries
+        from the tail of ``messages``, in original order, with all tool-call
+        pairs intact.
+
+    Example:
+        >>> call = {"role": "assistant", "tool_calls": [{"id": "c1"}]}
+        >>> result = {"role": "tool", "tool_call_id": "c1"}
+        >>> reply = {"role": "assistant", "content": "done"}
+        >>> history = [call, result, reply]
+        >>> truncate_by_message_count(history, 2) == [reply]
+        True
+        >>> truncate_by_message_count(history, 3) == history
+        True
+    """
+    if max_messages <= 0 or not messages:
+        return []
+
+    owners = _nearest_tool_owners(messages)
+    # Orphans (tool messages without an owner) start out consumed: they drop
+    # unconditionally and can never be folded into a neighbouring block.
+    consumed: set[int] = {
+        i
+        for i, msg in enumerate(messages)
+        if msg.get("role") == "tool" and i not in owners
+    }
+    blocks: list[set[int]] = []  # tail-to-head order
+
+    for i in range(len(messages) - 1, -1, -1):
+        if i in consumed:
+            continue
+        # Every index above ``i`` is already consumed here, so a non-tool
+        # message (including an assistant whose tool results are absent or
+        # still in flight) can only ever form a single-message block.
+        block = {i}
+        if messages[i].get("role") == "tool":
+            # Newest unclaimed result of its call: pull in the owning
+            # assistant and the sibling results sitting between the two.
+            asst_idx = owners[i]
+            asst_tc_ids = {tc.get("id") for tc in messages[asst_idx]["tool_calls"]}
+            block.add(asst_idx)
+            block.update(
+                k
+                for k in range(asst_idx + 1, i)
+                if k not in consumed
+                and messages[k].get("role") == "tool"
+                and messages[k].get("tool_call_id") in asst_tc_ids
+            )
+        consumed |= block
+        blocks.append(block)
+
+    # Walk blocks tail-to-head, taking until budget exhausted.
+    keep: set[int] = set()
+    budget = max_messages
+    for block in blocks:
+        if len(block) > budget:
+            # Block doesn't fit — stop. Do NOT partial-include (would split pair).
+            break
+        keep |= block
+        budget -= len(block)
+
+    return [messages[k] for k in sorted(keep)]
diff --git a/backend/tests/test_history_window.py b/backend/tests/test_history_window.py