Optimizing High Concurrency (#85)

xzrderek · Dylan Huang · web-flow · commit e7149c5e6c19 · 2025-07-31T22:38:07.000-07:00
* working

* changing tests

* updating llm usage

* bug with accessing msg

* temp

* not finished yet

* adding tau2 checks

* removing erroneous tau2 subfolder

* remove workflows folder

* test

* revert

* fix test

* tool warmup + concurrent uvicorn server

---------

Co-authored-by: Dylan Huang <dhuang@fireworks.ai>
diff --git a/eval_protocol/mcp/client/connection.py b/eval_protocol/mcp/client/connection.py
@@ -23,6 +23,10 @@
 class MCPConnectionManager:
     """Manages MCP client connections and session lifecycle."""
 
+    def __init__(self):
+        self._tools_cache: Dict[str, List[Dict]] = {}
+        self._tools_cache_lock = asyncio.Lock()
+
     async def initialize_session(self, session: MCPSession) -> None:
         """
         Initialize a persistent MCP session.
@@ -99,9 +103,40 @@ async def initialize_session(self, session: MCPSession) -> None:
                 session.session_id = server_session_id
                 logger.debug(f"Updated session ID to match server: {server_session_id}")
 
+        # PRE-WARM: Discover and cache tools immediately after session initialization
+        # This prevents concurrent list_tools() calls later
+        await self._prewarm_tools_cache(session)
+
+    async def _prewarm_tools_cache(self, session: MCPSession) -> None:
+        """
+        Pre-warm the tools cache for this session's base URL.
+        This prevents concurrent list_tools() calls during discover_tools().
+        """
+        cache_key = session.base_url
+
+        async with self._tools_cache_lock:
+            # Only fetch tools if not already cached for this base_url
+            if cache_key not in self._tools_cache:
+                logger.debug(f"Pre-warming tools cache for {cache_key}")
+                tools_response = await session._mcp_session.list_tools()
+                tools = tools_response.tools if hasattr(tools_response, "tools") else []
+
+                tool_schemas = []
+                for tool in tools:
+                    tool_schema = {
+                        "name": tool.name,
+                        "description": tool.description,
+                        "input_schema": (tool.inputSchema if hasattr(tool, "inputSchema") else {}),
+                    }
+                    tool_schemas.append(tool_schema)
+
+                self._tools_cache[cache_key] = tool_schemas
+                logger.debug(f"✅ PRE-WARMED {len(tool_schemas)} tools for {cache_key}")
+
     async def discover_tools(self, session: MCPSession) -> List[Dict]:
         """
         Discover available tools from an MCP session.
+        Now uses pre-warmed cache to avoid concurrent list_tools() calls.
 
         Args:
             session: The MCPSession to discover tools from
@@ -112,9 +147,19 @@ async def discover_tools(self, session: MCPSession) -> List[Dict]:
         if not session._mcp_session:
             raise RuntimeError("Session not initialized")
 
+        cache_key = session.base_url
+
+        # Check cache first (should be pre-warmed during initialization)
+        async with self._tools_cache_lock:
+            if cache_key in self._tools_cache:
+                cached_tools = self._tools_cache[cache_key]
+                logger.debug(f"Using cached tools for session {session.session_id} ({len(cached_tools)} tools)")
+                return cached_tools
+
+        # Fallback: if cache miss (shouldn't happen with pre-warming), fetch directly
+        logger.warning(f"Cache miss for {cache_key} - this shouldn't happen with pre-warming")
         mcp_session = session._mcp_session
 
-        # Get available tools from MCP server
         tools_response = await mcp_session.list_tools()
         tools = tools_response.tools if hasattr(tools_response, "tools") else []
 
@@ -129,8 +174,26 @@ async def discover_tools(self, session: MCPSession) -> List[Dict]:
             }
             tool_schemas.append(tool_schema)
 
+        # Cache the result for future use
+        async with self._tools_cache_lock:
+            self._tools_cache[cache_key] = tool_schemas
+
         return tool_schemas
 
+    def clear_tools_cache(self, base_url: Optional[str] = None):
+        """
+        Clear the tools cache for debugging or when server tools change.
+
+        Args:
+            base_url: If provided, clear cache only for this URL. If None, clear all.
+        """
+        if base_url:
+            self._tools_cache.pop(base_url, None)
+            logger.debug(f"Cleared tools cache for {base_url}")
+        else:
+            self._tools_cache.clear()
+            logger.debug("Cleared all tools cache")
+
     async def get_initial_state(self, session: MCPSession) -> Any:
         """
         Get initial state from session-aware control plane endpoint.
@@ -160,8 +223,9 @@ async def get_initial_state(self, session: MCPSession) -> Any:
 
                 # Query initial state endpoint
                 try:
-                    # Use shorter timeout for playback mode
-                    timeout = 3.0 if hasattr(session, "_is_playback_mode") and session._is_playback_mode else 5.0
+                    # Use shorter timeout for playback mode, longer timeout for high-concurrency initialization
+                    # (50+ concurrent sessions need more time for initial state setup)
+                    timeout = 3.0 if hasattr(session, "_is_playback_mode") and session._is_playback_mode else 15.0
                     async with httpx.AsyncClient(timeout=timeout) as client:
                         initial_state_response = await client.get(
                             f"{base_url}/control/initial_state",
diff --git a/eval_protocol/mcp/mcpgym.py b/eval_protocol/mcp/mcpgym.py
@@ -12,6 +12,7 @@
 - Session-aware control plane endpoints via @control_plane_endpoint decorator
 """
 
+import asyncio
 import hashlib
 import inspect
 import json
@@ -21,6 +22,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Optional, Tuple
 
+import uvicorn
 from mcp.server.fastmcp import Context, FastMCP
 from starlette.requests import Request
 from starlette.responses import JSONResponse
@@ -553,29 +555,32 @@ def format_observation(self, obs: Any, env: Any) -> Dict[str, Any]:
             return {"observation": serialized_obs}
 
     def run(self, transport: str = "streamable-http", **kwargs):
-        """
-        Run the unified MCP-Gym server.
-
-        Args:
-            transport: MCP transport protocol ("stdio", "sse", "streamable-http")
-            **kwargs: Additional arguments passed to FastMCP.run()
-        """
-        print(f"🚀 {self.mcp.name} MCP-Gym Server Starting...")
-        print(f"📡 Transport: {transport}")
-        print("🎯 MCP Pattern: HTTP endpoints for control plane, tools for data plane")
-        print("🔗 Session-aware control plane endpoints:")
-
-        # List registered control plane endpoints
-        for endpoint_name, endpoint_func in self._control_plane_endpoints.items():
-            print(f"  - {endpoint_name}: {endpoint_func._control_plane_path}")
-
-        if not self._control_plane_endpoints:
-            print("  - No control plane endpoints registered")
-
-        print()
-
-        # Run the unified server
-        self.mcp.run(transport=transport, **kwargs)
+        """Run the unified MCP-Gym server with high concurrency settings."""
+        if transport == "streamable-http":
+            # Run with custom high-concurrency uvicorn config
+
+            async def run_with_high_concurrency():
+                starlette_app = self.mcp.streamable_http_app()
+
+                config = uvicorn.Config(
+                    starlette_app,
+                    host=self.mcp.settings.host,
+                    port=self.mcp.settings.port,
+                    log_level=self.mcp.settings.log_level.lower(),
+                    # HIGH CONCURRENCY SETTINGS
+                    limit_concurrency=200,  # Increase for HTTP endpoints + MCP
+                    limit_max_requests=100000,  # Higher request limit
+                    timeout_keep_alive=120,  # Longer keep-alive for control plane
+                    timeout_notify=180,
+                    h11_max_incomplete_event_size=4 * 1024 * 1024,  # Handle larger events
+                )
+                server = uvicorn.Server(config)
+                await server.serve()
+
+            asyncio.run(run_with_high_concurrency())
+        else:
+            # Use default FastMCP run for other transports
+            self.mcp.run(transport=transport, **kwargs)
 
     def _to_json_serializable(self, obj: Any) -> Any:
         """Convert any object to JSON-serializable format.