fix: narrow McpError handling, use composition for ConnectionSafeMcpTool

jsonmp-k8 · jsonmp-k8 · commit 48764d85a020 · 2026-04-07T00:15:21.000-04:00
Address review feedback on #1531:

- Only catch transport-level McpErrors (timeouts, stream drops) via keyword inspection of the error message; protocol-level McpErrors (invalid args, validation) now propagate so the LLM can correct its behavior
- Replace the fragile __new__ + __dict__ copy with a composition pattern: store the inner McpTool and delegate via __getattr__
- Add exc_info to logger.error() for operator-visible tracebacks
- Remove unused Dict import, use lowercase dict[str, Any]
- Simplify test setup: mock inner tool directly, remove patch boilerplate
- Add test_protocol_mcp_error_still_raises to verify narrowing

Signed-off-by: Jaison Paul <paul.jaison@gmail.com>
diff --git a/python/packages/kagent-adk/src/kagent/adk/_mcp_toolset.py b/python/packages/kagent-adk/src/kagent/adk/_mcp_toolset.py
@@ -2,7 +2,7 @@
 
 import asyncio
 import logging
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
 import httpx
 from google.adk.tools import BaseTool
@@ -23,16 +23,37 @@
 # - httpx.TransportError: covers httpx.NetworkError (ConnectError, ReadError,
 #   WriteError, CloseError), httpx.TimeoutException, httpx.ProtocolError, etc.
 #   These do NOT inherit from stdlib ConnectionError/OSError.
-# - McpError: raised by mcp.shared.session.send_request() when the underlying
-#   SSE/HTTP stream drops or a tool call hits the session read timeout. The MCP
-#   client wraps the transport-level error into McpError before it reaches us.
+#
+# McpError is handled separately in ConnectionSafeMcpTool.run_async() because
+# it is the general MCP protocol error class. Only transport-level McpErrors
+# (e.g., session read timeouts) should be caught; protocol-level McpErrors
+# (e.g., invalid tool arguments) must propagate so the LLM can correct itself.
 _CONNECTION_ERROR_TYPES = (
     ConnectionError,
     TimeoutError,
     httpx.TransportError,
-    McpError,
 )
 
+# Keywords in McpError messages that indicate transport-level failures
+# (as opposed to protocol-level errors like invalid arguments).
+_TRANSPORT_MCP_ERROR_KEYWORDS = (
+    "timeout", "timed out", "connection", "eof", "reset",
+    "closed", "transport", "stream", "unreachable",
+)
+
+
+def _is_transport_mcp_error(error: McpError) -> bool:
+    """Check if an McpError represents a transport-level failure.
+
+    McpError wraps all MCP protocol errors, but only transport-level failures
+    (e.g., session read timeouts, stream closures) should be caught and
+    returned to the LLM as non-retryable errors. Protocol-level errors
+    (e.g., invalid tool arguments, server validation failures) should
+    propagate so the LLM can correct its behavior.
+    """
+    message = error.error.message.lower()
+    return any(keyword in message for keyword in _TRANSPORT_MCP_ERROR_KEYWORDS)
+
 
 def _enrich_cancelled_error(error: BaseException) -> asyncio.CancelledError:
     message = "Failed to create MCP session: operation cancelled"
@@ -49,26 +70,47 @@ class ConnectionSafeMcpTool(McpTool):
     peer") causes the LLM to retry the tool call in a tight loop, burning
     100% CPU for up to max_llm_calls iterations.
 
+    Uses composition: delegates to an inner McpTool instance via __getattr__,
+    avoiding the fragile __new__ + __dict__ copy pattern that would break if
+    upstream McpTool adds __slots__, properties, or post-init hooks.
+
     See: https://github.com/kagent-dev/kagent/issues/1530
     """
 
+    _inner_tool: McpTool
+
+    def __init__(self, inner_tool: McpTool):
+        # Store the inner tool without calling McpTool.__init__
+        # (which requires connection params we don't have).
+        object.__setattr__(self, "_inner_tool", inner_tool)
+
+    def __getattr__(self, name: str) -> Any:
+        return getattr(self._inner_tool, name)
+
+    def _connection_error_response(self, error: Exception) -> dict[str, Any]:
+        error_message = (
+            f"MCP tool '{self.name}' failed due to a connection error: "
+            f"{type(error).__name__}: {error}. "
+            "The MCP server may be unreachable. "
+            "Do not retry this tool — inform the user about the failure."
+        )
+        logger.error(error_message, exc_info=error)
+        return {"error": error_message}
+
     async def run_async(
         self,
         *,
-        args: Dict[str, Any],
+        args: dict[str, Any],
         tool_context: ToolContext,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         try:
-            return await super().run_async(args=args, tool_context=tool_context)
+            return await self._inner_tool.run_async(args=args, tool_context=tool_context)
         except _CONNECTION_ERROR_TYPES as error:
-            error_message = (
-                f"MCP tool '{self.name}' failed due to a connection error: "
-                f"{type(error).__name__}: {error}. "
-                "The MCP server may be unreachable. "
-                "Do not retry this tool — inform the user about the failure."
-            )
-            logger.error(error_message)
-            return {"error": error_message}
+            return self._connection_error_response(error)
+        except McpError as error:
+            if not _is_transport_mcp_error(error):
+                raise
+            return self._connection_error_response(error)
 
 
 class KAgentMcpToolset(McpToolset):
@@ -87,16 +129,10 @@ async def get_tools(self, readonly_context: Optional[ReadonlyContext] = None) ->
 
         # Wrap each McpTool with ConnectionSafeMcpTool so that connection
         # errors are returned as error text instead of raised.
-        # Uses __new__ + __dict__ copy to re-type the instance without calling
-        # McpTool.__init__ (which requires connection params we don't have).
-        # This is safe because McpTool uses plain instance attributes, not
-        # __slots__ or descriptors.
         wrapped_tools: list[BaseTool] = []
         for tool in tools:
             if isinstance(tool, McpTool) and not isinstance(tool, ConnectionSafeMcpTool):
-                safe_tool = ConnectionSafeMcpTool.__new__(ConnectionSafeMcpTool)
-                safe_tool.__dict__.update(tool.__dict__)
-                wrapped_tools.append(safe_tool)
+                wrapped_tools.append(ConnectionSafeMcpTool(tool))
             else:
                 wrapped_tools.append(tool)
         return wrapped_tools
diff --git a/python/packages/kagent-adk/tests/unittests/test_mcp_connection_error_handling.py b/python/packages/kagent-adk/tests/unittests/test_mcp_connection_error_handling.py
@@ -12,32 +12,25 @@
 from google.adk.tools.mcp_tool.mcp_tool import McpTool
 from google.adk.tools.mcp_tool.mcp_toolset import McpToolset
 from mcp.shared.exceptions import McpError
+from mcp.types import ErrorData
 
 from kagent.adk._mcp_toolset import ConnectionSafeMcpTool, KAgentMcpToolset
 
 
 def _make_connection_safe_tool(side_effect):
-    """Create a ConnectionSafeMcpTool with a mocked super().run_async."""
-    tool = ConnectionSafeMcpTool.__new__(ConnectionSafeMcpTool)
-    tool.name = "test-tool"
-    tool._mcp_tool = MagicMock()
-    tool._mcp_tool.name = "test-tool"
-    tool._mcp_session_manager = AsyncMock()
-    tool._header_provider = None
-    tool._auth_config = None
-    tool._confirmation_config = None
-    tool._progress_callback = None
-    tool._parent_run_async = AsyncMock(side_effect=side_effect)
-    return tool
+    """Create a ConnectionSafeMcpTool wrapping a mock McpTool."""
+    inner_tool = MagicMock(spec=McpTool)
+    inner_tool.name = "test-tool"
+    inner_tool.run_async = AsyncMock(side_effect=side_effect)
+    return ConnectionSafeMcpTool(inner_tool)
 
 
 @pytest.mark.asyncio
 async def test_connection_reset_error_returns_error_dict():
     """ConnectionResetError should be caught and returned as error text."""
     tool = _make_connection_safe_tool(ConnectionResetError("Connection reset by peer"))
 
-    with patch.object(McpTool, "run_async", tool._parent_run_async):
-        result = await tool.run_async(args={"key": "value"}, tool_context=MagicMock())
+    result = await tool.run_async(args={"key": "value"}, tool_context=MagicMock())
 
     assert "error" in result
     assert "ConnectionResetError" in result["error"]
@@ -50,8 +43,7 @@ async def test_connection_refused_error_returns_error_dict():
     """ConnectionRefusedError should be caught and returned as error text."""
     tool = _make_connection_safe_tool(ConnectionRefusedError("Connection refused"))
 
-    with patch.object(McpTool, "run_async", tool._parent_run_async):
-        result = await tool.run_async(args={}, tool_context=MagicMock())
+    result = await tool.run_async(args={}, tool_context=MagicMock())
 
     assert "error" in result
     assert "ConnectionRefusedError" in result["error"]
@@ -62,8 +54,7 @@ async def test_timeout_error_returns_error_dict():
     """TimeoutError should be caught and returned as error text."""
     tool = _make_connection_safe_tool(TimeoutError("timed out"))
 
-    with patch.object(McpTool, "run_async", tool._parent_run_async):
-        result = await tool.run_async(args={}, tool_context=MagicMock())
+    result = await tool.run_async(args={}, tool_context=MagicMock())
 
     assert "error" in result
     assert "TimeoutError" in result["error"]
@@ -74,8 +65,7 @@ async def test_httpx_connect_error_returns_error_dict():
     """httpx.ConnectError should be caught via httpx.TransportError."""
     tool = _make_connection_safe_tool(httpx.ConnectError("connection refused"))
 
-    with patch.object(McpTool, "run_async", tool._parent_run_async):
-        result = await tool.run_async(args={}, tool_context=MagicMock())
+    result = await tool.run_async(args={}, tool_context=MagicMock())
 
     assert "error" in result
     assert "ConnectError" in result["error"]
@@ -86,8 +76,7 @@ async def test_httpx_read_error_returns_error_dict():
     """httpx.ReadError (connection reset by peer) should be caught."""
     tool = _make_connection_safe_tool(httpx.ReadError("peer closed connection"))
 
-    with patch.object(McpTool, "run_async", tool._parent_run_async):
-        result = await tool.run_async(args={}, tool_context=MagicMock())
+    result = await tool.run_async(args={}, tool_context=MagicMock())
 
     assert "error" in result
     assert "ReadError" in result["error"]
@@ -98,57 +87,62 @@ async def test_httpx_connect_timeout_returns_error_dict():
     """httpx.ConnectTimeout should be caught via httpx.TransportError."""
     tool = _make_connection_safe_tool(httpx.ConnectTimeout("timed out"))
 
-    with patch.object(McpTool, "run_async", tool._parent_run_async):
-        result = await tool.run_async(args={}, tool_context=MagicMock())
+    result = await tool.run_async(args={}, tool_context=MagicMock())
 
     assert "error" in result
     assert "ConnectTimeout" in result["error"]
 
 
 @pytest.mark.asyncio
-async def test_mcp_error_returns_error_dict():
-    """McpError (raised by MCP session on stream drop / read timeout) should be caught."""
-    from mcp.types import ErrorData
+async def test_transport_mcp_error_returns_error_dict():
+    """McpError with a transport-level message (e.g., session read timeout) should be caught."""
+    tool = _make_connection_safe_tool(
+        McpError(ErrorData(code=-1, message="session read timeout"))
+    )
 
-    tool = _make_connection_safe_tool(McpError(ErrorData(code=-1, message="session read timeout")))
-
-    with patch.object(McpTool, "run_async", tool._parent_run_async):
-        result = await tool.run_async(args={}, tool_context=MagicMock())
+    result = await tool.run_async(args={}, tool_context=MagicMock())
 
     assert "error" in result
     assert "McpError" in result["error"]
     assert "session read timeout" in result["error"]
 
 
+@pytest.mark.asyncio
+async def test_protocol_mcp_error_still_raises():
+    """McpError with a protocol-level message (e.g., invalid arguments) should propagate."""
+    tool = _make_connection_safe_tool(
+        McpError(ErrorData(code=-32602, message="Invalid params: unknown tool"))
+    )
+
+    with pytest.raises(McpError, match="Invalid params"):
+        await tool.run_async(args={}, tool_context=MagicMock())
+
+
 @pytest.mark.asyncio
 async def test_non_connection_error_still_raises():
     """Non-connection errors (e.g. ValueError) should still propagate."""
     tool = _make_connection_safe_tool(ValueError("bad argument"))
 
-    with patch.object(McpTool, "run_async", tool._parent_run_async):
-        with pytest.raises(ValueError, match="bad argument"):
-            await tool.run_async(args={}, tool_context=MagicMock())
+    with pytest.raises(ValueError, match="bad argument"):
+        await tool.run_async(args={}, tool_context=MagicMock())
 
 
 @pytest.mark.asyncio
 async def test_cancelled_error_still_raises():
     """CancelledError must propagate — it's not a connection error."""
     tool = _make_connection_safe_tool(asyncio.CancelledError("cancelled"))
 
-    with patch.object(McpTool, "run_async", tool._parent_run_async):
-        with pytest.raises(asyncio.CancelledError):
-            await tool.run_async(args={}, tool_context=MagicMock())
+    with pytest.raises(asyncio.CancelledError):
+        await tool.run_async(args={}, tool_context=MagicMock())
 
 
 @pytest.mark.asyncio
 async def test_get_tools_wraps_mcp_tools():
     """KAgentMcpToolset.get_tools should wrap McpTool instances with ConnectionSafeMcpTool."""
-    # Create a real McpTool instance (bypassing __init__) so isinstance checks work
     fake_mcp_tool = McpTool.__new__(McpTool)
     fake_mcp_tool.name = "wrapped-tool"
     fake_mcp_tool._some_attr = "value"
 
-    # A non-McpTool object that should pass through unchanged
     fake_other_tool = MagicMock()
     fake_other_tool.name = "other-tool"
 
@@ -164,5 +158,4 @@ async def mock_super_get_tools(self_arg, readonly_context=None):
     assert isinstance(tools[0], ConnectionSafeMcpTool)
     assert tools[0].name == "wrapped-tool"
     assert tools[0]._some_attr == "value"
-    # Non-McpTool should pass through unchanged
     assert tools[1] is fake_other_tool