google
diff --git a/‎src/google/adk/features/_feature_registry.py‎
Lines changed: 9 additions & 0 deletions b/‎src/google/adk/features/_feature_registry.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/google/adk/tools/mcp_tool/mcp_session_manager.py‎
Lines changed: 72 additions & 16 deletions b/‎src/google/adk/tools/mcp_tool/mcp_session_manager.py‎
Lines changed: 72 additions & 16 deletions
diff --git a/‎src/google/adk/tools/mcp_tool/mcp_tool.py‎
Lines changed: 56 additions & 2 deletions b/‎src/google/adk/tools/mcp_tool/mcp_tool.py‎
Lines changed: 56 additions & 2 deletions
@@ -41,6 +41,12 @@ class FeatureName(str, Enum):
   GOOGLE_CREDENTIALS_CONFIG = "GOOGLE_CREDENTIALS_CONFIG"
   GOOGLE_TOOL = "GOOGLE_TOOL"
   JSON_SCHEMA_FOR_FUNC_DECL = "JSON_SCHEMA_FOR_FUNC_DECL"
+  # Private (leading underscore): not part of the public API surface.
+  # GE flips this on by setting the env var
+  # `ADK_ENABLE_MCP_GRACEFUL_ERROR_HANDLING=1`; code outside ADK should not
+  # reference this member by name. Keeping it private avoids a backward-compat
+  # obligation for what is intended as a temporary, internal kill-switch.
+  _MCP_GRACEFUL_ERROR_HANDLING = "MCP_GRACEFUL_ERROR_HANDLING"
   PROGRESSIVE_SSE_STREAMING = "PROGRESSIVE_SSE_STREAMING"
   PUBSUB_TOOL_CONFIG = "PUBSUB_TOOL_CONFIG"
   PUBSUB_TOOLSET = "PUBSUB_TOOLSET"
@@ -131,6 +137,9 @@ class FeatureConfig:
     FeatureName.JSON_SCHEMA_FOR_FUNC_DECL: FeatureConfig(
         FeatureStage.WIP, default_on=False
     ),
+    FeatureName._MCP_GRACEFUL_ERROR_HANDLING: FeatureConfig(
+        FeatureStage.EXPERIMENTAL, default_on=False
+    ),
     FeatureName.PROGRESSIVE_SSE_STREAMING: FeatureConfig(
         FeatureStage.EXPERIMENTAL, default_on=True
     ),
 
@@ -44,6 +44,8 @@
 from pydantic import BaseModel
 from pydantic import ConfigDict
 
+from ...features import FeatureName
+from ...features import is_feature_enabled
 from .session_context import SessionContext
 
 logger = logging.getLogger('google_adk.' + __name__)
@@ -237,11 +239,18 @@ def __init__(
       self._connection_params = connection_params
     self._errlog = errlog
 
-    # Session pool: maps session keys to (session, exit_stack, loop) tuples
+    # Session pool: maps session keys to (session, exit_stack, loop) tuples.
+    # Kept as a tuple for backward-compatibility with downstream tests
+    # that construct or unpack entries directly.
     self._sessions: Dict[
         str, tuple[ClientSession, AsyncExitStack, asyncio.AbstractEventLoop]
     ] = {}
 
+    # Sibling pool: maps session keys to their SessionContext. Stored
+    # separately from `_sessions` so the tuple shape above stays stable.
+    # Used by McpTool to access `_run_guarded` for transport-crash detection.
+    self._session_contexts: Dict[str, SessionContext] = {}
+
     # Map of event loops to their respective locks to prevent race conditions
     # across different event loops in session creation.
     self._session_lock_map: dict[asyncio.AbstractEventLoop, asyncio.Lock] = {}
@@ -323,6 +332,26 @@ def _is_session_disconnected(self, session: ClientSession) -> bool:
     """
     return session._read_stream._closed or session._write_stream._closed
 
+  def _get_session_context(
+      self, headers: Optional[Dict[str, str]] = None
+  ) -> Optional[SessionContext]:
+    """Returns the SessionContext for the session matching the given headers.
+
+    Note: This method reads from the session-context pool without acquiring
+    ``_session_lock``. This is safe because it is called immediately after
+    ``create_session()`` (which populates the entry under the lock) within
+    the same task, and dict reads are atomic in CPython.
+
+    Args:
+        headers: Optional headers used to identify the session.
+
+    Returns:
+        The SessionContext if a matching session exists, None otherwise.
+    """
+    merged_headers = self._merge_headers(headers)
+    session_key = self._generate_session_key(merged_headers)
+    return self._session_contexts.get(session_key)
+
   async def _cleanup_session(
       self,
       session_key: str,
@@ -378,6 +407,10 @@ def cleanup_done(f: asyncio.Future):
     finally:
       if session_key in self._sessions:
         del self._sessions[session_key]
+      # Also drop the SessionContext reference so we don't leak the
+      # SessionContext after its underlying session is gone.
+      if session_key in self._session_contexts:
+        del self._session_contexts[session_key]
 
   def _create_client(self, merged_headers: Optional[Dict[str, str]] = None):
     """Creates an MCP client based on the connection parameters.
@@ -453,15 +486,30 @@ async def create_session(
       if session_key in self._sessions:
         session, exit_stack, stored_loop = self._sessions[session_key]
 
-        # Check if the existing session is still connected and bound to the current loop
+        # Check if the existing session is still connected and bound to
+        # the current loop. When the feature flag is on, we ALSO check the
+        # SessionContext's background task: a crashed transport can leave
+        # the session's read/write streams open even though the underlying
+        # task has already died (e.g. after a 4xx/5xx HTTP response).
+        # Without that extra check, callers would reuse a dead session and
+        # hang on the next call. The check is gated because it triggers
+        # session re-creation in some test mocks where `_task` looks
+        # "not alive" but the streams are otherwise reusable.
         current_loop = asyncio.get_running_loop()
-        if stored_loop is current_loop and not self._is_session_disconnected(
-            session
+        if is_feature_enabled(FeatureName._MCP_GRACEFUL_ERROR_HANDLING):  # pylint: disable=protected-access
+          ctx = self._session_contexts.get(session_key)
+          ctx_alive = ctx is None or ctx._is_task_alive  # pylint: disable=protected-access
+        else:
+          ctx_alive = True  # Pre-fix: do not consult task aliveness
+        if (
+            stored_loop is current_loop
+            and not self._is_session_disconnected(session)
+            and ctx_alive
         ):
           # Session is still good, return it
           return session
         else:
-          # Session is disconnected or from a different loop, clean it up
+          # Session is disconnected, dead, or from a different loop; clean up.
           logger.info(
               'Cleaning up session (disconnected or different loop): %s',
               session_key,
@@ -485,26 +533,32 @@ async def create_session(
         client = self._create_client(merged_headers)
         is_stdio = isinstance(self._connection_params, StdioConnectionParams)
 
+        session_context = SessionContext(
+            client=client,
+            timeout=timeout_in_seconds,
+            sse_read_timeout=sse_read_timeout_in_seconds,
+            is_stdio=is_stdio,
+            sampling_callback=self._sampling_callback,
+            sampling_capabilities=self._sampling_capabilities,
+        )
+
         session = await asyncio.wait_for(
-            exit_stack.enter_async_context(
-                SessionContext(
-                    client=client,
-                    timeout=timeout_in_seconds,
-                    sse_read_timeout=sse_read_timeout_in_seconds,
-                    is_stdio=is_stdio,
-                    sampling_callback=self._sampling_callback,
-                    sampling_capabilities=self._sampling_capabilities,
-                )
-            ),
+            exit_stack.enter_async_context(session_context),
             timeout=timeout_in_seconds,
         )
 
-        # Store session, exit stack, and loop in the pool
+        # Store session, exit stack, and loop in the pool. The pool storage
+        # remains a tuple for backward-compatibility with downstream tests
+        # that construct or unpack entries directly.
         self._sessions[session_key] = (
             session,
             exit_stack,
             asyncio.get_running_loop(),
         )
+        # Track the SessionContext in a sibling dict so McpTool can call
+        # `_run_guarded` on it. Stored separately to avoid changing the
+        # shape of `_sessions` entries, which downstream tests unpack directly.
+        self._session_contexts[session_key] = session_context
         logger.debug('Created new session: %s', session_key)
         return session
 
@@ -524,6 +578,7 @@ def __getstate__(self):
     state = self.__dict__.copy()
     # Remove unpicklable entries or those that shouldn't persist across pickle
     state['_sessions'] = {}
+    state['_session_contexts'] = {}
     state['_session_lock_map'] = {}
 
     # Locks and file-like objects cannot be pickled
@@ -537,6 +592,7 @@ def __setstate__(self, state):
     self.__dict__.update(state)
     # Re-initialize members that were not pickled
     self._sessions = {}
+    self._session_contexts = {}
     self._session_lock_map = {}
     self._lock_map_lock = threading.Lock()
     # If _errlog was removed during pickling, default to sys.stderr
 
@@ -31,6 +31,7 @@
 
 from fastapi.openapi.models import APIKeyIn
 from google.genai.types import FunctionDeclaration
+from mcp.shared.exceptions import McpError
 from mcp.shared.session import ProgressFnT
 from mcp.types import Tool as McpBaseTool
 from opentelemetry import propagate
@@ -45,11 +46,18 @@
 from ...features import FeatureName
 from ...features import is_feature_enabled
 from ...utils.context_utils import find_context_parameter
+# `is_feature_enabled(FeatureName._MCP_GRACEFUL_ERROR_HANDLING)` gates the
+# error-boundary and transport-crash-detection behavior added in this module.
+# If the flag is off (default, or ADK_DISABLE_MCP_GRACEFUL_ERROR_HANDLING=1),
+# `run_async` and `_run_async_impl` fall back to the pre-fix behavior.
+# The enum member is intentionally private (leading underscore) so it is not
+# part of the ADK public API; consumers flip the env var, not the symbol.
 from .._gemini_schema_util import _to_gemini_schema
 from ..base_authenticated_tool import BaseAuthenticatedTool
 from ..tool_context import ToolContext
 from .mcp_session_manager import MCPSessionManager
 from .mcp_session_manager import retry_on_errors
+from .session_context import SessionContext
 
 logger = logging.getLogger("google_adk." + __name__)
 
@@ -339,7 +347,26 @@ async def run_async(
         }
       elif not tool_context.tool_confirmation.confirmed:
         return {"error": "This tool call is rejected."}
-    return await super().run_async(args=args, tool_context=tool_context)
+
+    if not is_feature_enabled(FeatureName._MCP_GRACEFUL_ERROR_HANDLING):  # pylint: disable=protected-access
+      # Pre-fix behavior: exceptions bubble up to the agent runner.
+      return await super().run_async(args=args, tool_context=tool_context)
+
+    # New behavior: convert MCP-level and unexpected errors into a
+    # structured `{"error": "..."}` dict so the agent loop can continue
+    # gracefully instead of being killed by an unhandled exception. This
+    # is the primary fix for the 5-minute hang seen when Model Armor (or
+    # any AGW policy) returns a 403 mid-tool-call.
+    try:
+      return await super().run_async(args=args, tool_context=tool_context)
+    except McpError as e:
+      logger.warning("MCP tool execution failed with McpError: %s", e)
+      return {"error": f"MCP tool execution failed: {e}"}
+    except Exception as e:  # pylint: disable=broad-exception-caught
+      logger.warning(
+          "Unexpected error during MCP tool execution: %s", e, exc_info=True
+      )
+      return {"error": f"Unexpected error during MCP tool execution: {e}"}
 
   @retry_on_errors
   @override
@@ -384,12 +411,39 @@ async def _run_async_impl(
     # Resolve progress callback (may be a factory that needs runtime context)
     resolved_callback = self._resolve_progress_callback(tool_context)
 
-    response = await session.call_tool(
+    call_coro = session.call_tool(
         self._mcp_tool.name,
         arguments=args,
         progress_callback=resolved_callback,
         meta=meta_trace_context,
     )
+
+    if is_feature_enabled(FeatureName._MCP_GRACEFUL_ERROR_HANDLING):  # pylint: disable=protected-access
+      # Race the tool call against the background session task so that
+      # transport crashes (e.g. non-2xx HTTP responses from an AGW with
+      # Model Armor) surface immediately instead of hanging until
+      # sse_read_timeout (default 5 minutes) expires. ConnectionError is
+      # intentionally NOT caught here; it propagates to retry_on_errors,
+      # which will create a fresh session and retry once before finally
+      # surfacing the failure to the agent (where the run_async wrapper
+      # converts it into an `{"error": ...}` dict).
+      #
+      # The isinstance check is intentional: tests and external subclasses
+      # may inject mock session managers whose `_get_session_context`
+      # returns a Mock instead of a real SessionContext (or None). Falling
+      # back to the direct await keeps those callers working.
+      session_context = self._mcp_session_manager._get_session_context(  # pylint: disable=protected-access
+          headers=final_headers
+      )
+      if isinstance(session_context, SessionContext):
+        response = await session_context._run_guarded(call_coro)  # pylint: disable=protected-access
+      else:
+        response = await call_coro
+    else:
+      # Pre-fix behavior: await the call directly. This is what causes the
+      # ~300s hang when the underlying transport crashes.
+      response = await call_coro
+
     result = response.model_dump(exclude_none=True, mode="json")
 
     # Push UI widget to the event actions if the tool supports it.