fix(mcp): omit reflect tool_trace/llm_trace from responses by default (#2242)

yunanwg · web-flow · commit 44972d32150c · 2026-06-17T12:32:39.000+02:00
The MCP `reflect` tool returned the full `reflect_async` result, which
includes `tool_trace` and `llm_trace` — the entire internal agent loop,
including full mental-model text. A default reflect response measured
59,657 chars (text 5,987 + tool_trace 52,711), silently consuming tens
of KB of MCP-client context on every call, while the REST API omits the
trace by default.

Add an `include_trace: bool = False` flag, mirroring the existing
`include_based_on`, so the trace becomes opt-in for debugging. The flag
is applied to both the multi-bank and single-bank reflect registrations,
and a regression test covers both.
diff --git a/hindsight-api-slim/hindsight_api/mcp_tools.py b/hindsight-api-slim/hindsight_api/mcp_tools.py
@@ -983,6 +983,7 @@ async def reflect(
             tags: list[str] | None = None,
             tags_match: str = "any",
             include_based_on: bool = False,
+            include_trace: bool = False,
             bank_id: str | None = None,
         ) -> str:
             """
@@ -1013,6 +1014,7 @@ async def reflect(
                 tags: Optional tags to filter memories by (e.g., ['project:alpha'])
                 tags_match: How to match tags - 'any' (match any tag) or 'all' (match all tags). Default: 'any'
                 include_based_on: Include source facts used for synthesis. Defaults to false because broad reflections can exceed MCP client result limits.
+                include_trace: Include the reflection's internal tool_trace/llm_trace. Defaults to false because the trace can be tens of KB and overflow MCP client context; enable only for debugging.
                 bank_id: Optional bank to reflect in (defaults to session bank). Use for cross-bank operations.
             """
             try:
@@ -1042,6 +1044,12 @@ async def reflect(
                 result_data = json.loads(reflect_result.model_dump_json(indent=2))
                 if not include_based_on:
                     result_data.pop("based_on", None)
+                if not include_trace:
+                    # The agentic reflect loop's tool_trace/llm_trace can be tens of KB
+                    # (full mental-model text) and silently overflow MCP client context;
+                    # the REST API omits it by default too. Opt in via include_trace.
+                    result_data.pop("tool_trace", None)
+                    result_data.pop("llm_trace", None)
                 if response_schema is not None and hasattr(reflect_result, "structured_output"):
                     result_data["structured_output"] = reflect_result.structured_output
                 return json.dumps(result_data, indent=2)
@@ -1064,6 +1072,7 @@ async def reflect(
             tags: list[str] | None = None,
             tags_match: str = "any",
             include_based_on: bool = False,
+            include_trace: bool = False,
         ) -> dict:
             """
             Generate thoughtful analysis by synthesizing stored memories with the bank's personality.
@@ -1093,6 +1102,7 @@ async def reflect(
                 tags: Optional tags to filter memories by (e.g., ['project:alpha'])
                 tags_match: How to match tags - 'any' (match any tag) or 'all' (match all tags). Default: 'any'
                 include_based_on: Include source facts used for synthesis. Defaults to false because broad reflections can exceed MCP client result limits.
+                include_trace: Include the reflection's internal tool_trace/llm_trace. Defaults to false because the trace can be tens of KB and overflow MCP client context; enable only for debugging.
             """
             try:
                 target_bank = config.bank_id_resolver()
@@ -1121,6 +1131,12 @@ async def reflect(
                 result_data = reflect_result.model_dump()
                 if not include_based_on:
                     result_data.pop("based_on", None)
+                if not include_trace:
+                    # The agentic reflect loop's tool_trace/llm_trace can be tens of KB
+                    # (full mental-model text) and silently overflow MCP client context;
+                    # the REST API omits it by default too. Opt in via include_trace.
+                    result_data.pop("tool_trace", None)
+                    result_data.pop("llm_trace", None)
                 if response_schema is not None and hasattr(reflect_result, "structured_output"):
                     result_data["structured_output"] = reflect_result.structured_output
                 return result_data
diff --git a/hindsight-api-slim/tests/test_mcp_tools.py b/hindsight-api-slim/tests/test_mcp_tools.py
@@ -1947,3 +1947,65 @@ async def test_annotations_apply_in_single_bank_mode(self, mock_memory):
         ann = _tools(_make_mcp_server(mock_memory, {"recall"}, include_bank_id=False))["recall"].annotations
         assert ann is not None
         assert ann.readOnlyHint is True
+
+
+def _reflect_mcp_with_trace(include_bank_id_param: bool):
+    """Build a FastMCP server exposing only ``reflect``, backed by a mocked memory.
+
+    The mocked ``reflect_async`` resolves to an object whose ``model_dump`` and
+    ``model_dump_json`` both carry ``tool_trace`` and ``llm_trace``, so callers
+    can check whether the registered tool strips or keeps those fields.
+
+    Args:
+        include_bank_id_param: Forwarded unchanged to ``MCPToolsConfig``; the
+            tests in this module pass ``True`` for the multi-bank variant.
+
+    Returns:
+        The ``FastMCP`` instance after ``register_mcp_tools`` has run on it.
+    """
+    from fastmcp import FastMCP
+
+    # Stand-in for a dumped reflect result: the trace fields are present and bulky.
+    fake_dump = {
+        "text": "answer",
+        "based_on": {"world": []},
+        "tool_trace": [{"tool": "recall", "output": "x" * 1000}],
+        "llm_trace": [{"model": "test", "output": "y" * 1000}],
+    }
+
+    def _dump_json(indent=None):
+        # ``indent`` is accepted for call compatibility and otherwise ignored.
+        return json.dumps(fake_dump)
+
+    def _dump():
+        # New top-level dict per call, so popping keys never alters ``fake_dump``.
+        return dict(fake_dump)
+
+    fake_result = MagicMock(
+        model_dump_json=_dump_json,
+        model_dump=_dump,
+        structured_output=None,
+    )
+    memory = MagicMock()
+    memory.reflect_async = AsyncMock(return_value=fake_result)
+
+    server = FastMCP("test")
+    register_mcp_tools(
+        server,
+        memory,
+        MCPToolsConfig(
+            bank_id_resolver=lambda: "test-bank",
+            include_bank_id_param=include_bank_id_param,
+            tools={"reflect"},
+        ),
+    )
+    return server
+
+
+def _reflect_result_data(result) -> dict:
+    """Normalize a reflect tool result to a dict.
+
+    The multi-bank reflect returns a JSON string and the single-bank one
+    returns a dict, so a ``str`` is parsed as JSON and anything else is
+    handed back unchanged.
+    """
+    if isinstance(result, str):
+        return json.loads(result)
+    return result
+
+
+@pytest.mark.asyncio
+class TestReflectTraceOmission:
+    """reflect must not leak the agentic tool_trace/llm_trace into MCP responses by default.
+
+    Every test is parametrized over ``multi_bank`` so both reflect registrations
+    (JSON-string result and dict result) are exercised.
+    """
+
+    @pytest.mark.parametrize("multi_bank", [True, False])
+    async def test_trace_omitted_by_default(self, multi_bank):
+        """With no flags set, the answer text survives and both trace fields are gone."""
+        mcp = _reflect_mcp_with_trace(multi_bank)
+        data = _reflect_result_data(await _tools(mcp)["reflect"].fn(query="q"))
+        assert data["text"] == "answer"
+        assert "tool_trace" not in data
+        assert "llm_trace" not in data
+
+    @pytest.mark.parametrize("multi_bank", [True, False])
+    async def test_trace_included_when_requested(self, multi_bank):
+        """``include_trace=True`` keeps both trace fields in the response."""
+        mcp = _reflect_mcp_with_trace(multi_bank)
+        data = _reflect_result_data(await _tools(mcp)["reflect"].fn(query="q", include_trace=True))
+        assert "tool_trace" in data
+        assert "llm_trace" in data
+
+    @pytest.mark.parametrize("multi_bank", [True, False])
+    async def test_based_on_flag_is_independent_of_trace(self, multi_bank):
+        """``include_based_on=True`` keeps based_on without re-exposing either trace field."""
+        # include_based_on keeps based_on but must not pull the trace back in.
+        mcp = _reflect_mcp_with_trace(multi_bank)
+        data = _reflect_result_data(await _tools(mcp)["reflect"].fn(query="q", include_based_on=True))
+        assert "based_on" in data
+        assert "tool_trace" not in data
+        # Check both trace fields: asserting only tool_trace would let a leaked
+        # llm_trace pass unnoticed.
+        assert "llm_trace" not in data