eval-protocol
diff --git a/‎eval_protocol/mcp/execution/policy.py‎
Lines changed: 1 addition & 0 deletions b/‎eval_protocol/mcp/execution/policy.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎eval_protocol/mcp/mcp_multi_client.py‎
Lines changed: 141 additions & 0 deletions b/‎eval_protocol/mcp/mcp_multi_client.py‎
Lines changed: 141 additions & 0 deletions
diff --git a/‎eval_protocol/pytest/__init__.py‎
Lines changed: 15 additions & 0 deletions b/‎eval_protocol/pytest/__init__.py‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎eval_protocol/pytest/default_agent_rollout_processor.py‎
Lines changed: 80 additions & 0 deletions b/‎eval_protocol/pytest/default_agent_rollout_processor.py‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎eval_protocol/pytest/default_no_op_rollout_process.py‎
Lines changed: 12 additions & 0 deletions b/‎eval_protocol/pytest/default_no_op_rollout_process.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎eval_protocol/pytest/default_single_turn_rollout_process.py‎
Lines changed: 30 additions & 0 deletions b/‎eval_protocol/pytest/default_single_turn_rollout_process.py‎
Lines changed: 30 additions & 0 deletions
@@ -185,6 +185,7 @@ async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict:
                 "choices": [
                     {
                         "message": {
+                            "role": response.choices[0].message.role,
                             "content": response.choices[0].message.content,
                             "tool_calls": (
                                 [
 
@@ -0,0 +1,141 @@
+import json
+import os
+from contextlib import AsyncExitStack
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from dotenv import load_dotenv
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+from mcp.types import CallToolResult
+from openai.types import FunctionDefinition
+from openai.types.chat import ChatCompletionToolParam
+
+from eval_protocol.types.types import MCPMultiClientConfiguration
+
+load_dotenv()  # load environment variables from .env
+
+
+class MCPMultiClient:
+    """
+    Implements what clients like Cursor and Claude Desktop do when you configure
+    them to use multiple MCP servers. The difference is that it validates
+    against a list of environment variables rather than injects them into the
+    MCP server process. This is so you can version control your configuration
+    without exposing your environment variables to the MCP server process.
+
+    Environment variables should instead be set in a .env file
+
+    Attributes:
+        sessions: Connected sessions keyed by the server name from the config.
+        tools_to_sessions: Maps each tool name to the session that serves it.
+        exit_stack: Owns every transport/session context; closed by ``cleanup``.
+        config: The loaded configuration (``{"mcpServers": {...}}``).
+    """
+
+    def __init__(self, config_path: Optional[str] = None):
+        # Initialize session and client objects
+        self.sessions: Dict[str, ClientSession] = {}
+        self.tools_to_sessions: Dict[str, ClientSession] = {}
+        self.exit_stack = AsyncExitStack()
+        self.config = self._load_config(config_path)
+
+    def _load_config(self, config_path: Optional[str] = None) -> MCPMultiClientConfiguration:
+        """Load the MCP server configuration from a JSON file.
+
+        Args:
+            config_path: Path to a JSON configuration file, or ``None``.
+
+        Returns:
+            The parsed JSON when ``config_path`` is set and the file exists;
+            otherwise an empty default configuration ``{"mcpServers": {}}``.
+        """
+        if config_path and os.path.exists(config_path):
+            with open(config_path, "r") as f:
+                return json.load(f)
+
+        # Default configuration - can be overridden by config file
+        return {"mcpServers": {}}
+
+    def _validate_environment_variables(self, server_name: str, required_env: List[str]) -> None:
+        """Validate that required environment variables are set in os.environ.
+
+        Args:
+            server_name: Name of the server, used only in the error message.
+            required_env: Names of the variables that must be present.
+
+        Raises:
+            ValueError: If one or more of the variables are missing.
+        """
+        missing_vars = [env_var for env_var in required_env if env_var not in os.environ]
+
+        if missing_vars:
+            raise ValueError(
+                f"Server '{server_name}' requires the following environment variables "
+                f"to be set in os.environ: {missing_vars}. "
+                f"Please set these variables in your environment or .env file."
+            )
+
+    async def connect_to_servers(self):
+        """Connect to all configured MCP servers.
+
+        Connection is best-effort: a failure for one server is printed and the
+        remaining servers are still attempted.
+        """
+        if not self.config.get("mcpServers"):
+            print("No MCP servers configured. Please provide a configuration file.")
+            return
+
+        for server_name, server_config in self.config["mcpServers"].items():
+            try:
+                await self._connect_to_server(server_name, server_config)
+            except Exception as e:
+                print(f"Failed to connect to server '{server_name}': {e}")
+
+    async def _connect_to_server(self, server_name: str, server_config: Dict[str, Any]):
+        """Connect to a specific MCP server using its configuration.
+
+        Args:
+            server_name: Key of the server inside ``mcpServers``.
+            server_config: Mapping with ``command`` (required), ``args`` and
+                ``env`` (a list of required environment variable names).
+
+        Raises:
+            ValueError: If ``command`` is missing, a required environment
+                variable is unset, or a tool name collides with one that is
+                already registered.
+        """
+        command = server_config.get("command")
+        args = server_config.get("args", [])
+        env_config = server_config.get("env", [])
+
+        if not command:
+            raise ValueError(f"Server '{server_name}' must have a 'command' specified")
+
+        # Validate that required environment variables are set
+        if env_config:
+            self._validate_environment_variables(server_name, env_config)
+
+        # Use the current system environment (os.environ) - don't override with config.
+        # A plain dict snapshot is passed rather than the live os.environ mapping.
+        server_params = StdioServerParameters(command=command, args=args, env=dict(os.environ))
+
+        stdio_transport = await self.exit_stack.enter_async_context(stdio_client(server_params))
+        stdio, write = stdio_transport
+        session = await self.exit_stack.enter_async_context(ClientSession(stdio, write))
+
+        await session.initialize()
+
+        # List available tools
+        response = await session.list_tools()
+        tool_names = [tool.name for tool in response.tools]
+
+        # Check every name BEFORE registering anything, so a collision does not
+        # leave a half-registered server whose tools would still be advertised.
+        seen = set()
+        for name in tool_names:
+            if name in self.tools_to_sessions or name in seen:
+                raise ValueError(f"Tool '{name}' already exists")
+            seen.add(name)
+
+        self.sessions[server_name] = session
+        for name in tool_names:
+            self.tools_to_sessions[name] = session
+        print(
+            f"\nConnected to server '{server_name}' with tools:",
+            tool_names,
+        )
+
+    async def get_available_tools(self) -> List[ChatCompletionToolParam]:
+        """Get all available tools from all connected servers.
+
+        Returns:
+            One tool definition per tool reported by each connected session.
+            Servers whose listing fails are reported via ``print`` and skipped.
+        """
+        all_tools = []
+        for server_name, session in self.sessions.items():
+            try:
+                response = await session.list_tools()
+                for tool in response.tools:
+                    all_tools.append(
+                        ChatCompletionToolParam(
+                            function=FunctionDefinition(
+                                # The name is passed through unchanged (no server prefix),
+                                # so it matches the keys of tools_to_sessions.
+                                name=tool.name,
+                                description=tool.description,
+                                parameters=tool.inputSchema,
+                            ),
+                            type="function",
+                        )
+                    )
+            except Exception as e:
+                print(f"Error listing tools from server '{server_name}': {e}")
+
+        return all_tools
+
+    async def call_tool(self, tool_name: str, tool_args: Dict[str, Any]) -> CallToolResult:
+        """Call a specific tool by name with arguments.
+
+        Args:
+            tool_name: Name of a tool registered while connecting.
+            tool_args: Arguments forwarded to the tool.
+
+        Returns:
+            The server's result. When the tool is unknown or the call raises,
+            a ``CallToolResult`` with ``isError=True`` and a single text item
+            describing the failure is returned, so callers always receive the
+            annotated type.
+        """
+        # Local import: TextContent is only needed to build error results.
+        from mcp.types import TextContent
+
+        def _error_result(message: str) -> CallToolResult:
+            return CallToolResult(content=[TextContent(type="text", text=message)], isError=True)
+
+        session = self.tools_to_sessions.get(tool_name)
+        if session is None:
+            return _error_result(f"Error calling tool {tool_name}: unknown tool")
+        try:
+            return await session.call_tool(tool_name, tool_args)
+        except Exception as e:
+            return _error_result(f"Error calling tool {tool_name}: {e}")
+
+    async def cleanup(self):
+        """Clean up resources by closing every context held by the exit stack."""
+        await self.exit_stack.aclose()
@@ -0,0 +1,15 @@
+from .default_agent_rollout_processor import default_agent_rollout_processor
+from .default_no_op_rollout_process import default_no_op_rollout_processor
+from .default_single_turn_rollout_process import default_single_turn_rollout_processor
+from .pytest_utils import evaluate, evaluation_test
+from .types import RolloutProcessor, RolloutProcessorConfig
+
+__all__ = [
+    "default_agent_rollout_processor",
+    "default_no_op_rollout_processor",
+    "default_single_turn_rollout_processor",
+    "RolloutProcessor",
+    "RolloutProcessorConfig",
+    "evaluate",
+    "evaluation_test",
+]
@@ -0,0 +1,80 @@
+import json
+import os
+from typing import Any, List, Optional
+
+from mcp.types import CallToolResult
+from openai.types.chat import ChatCompletionMessage, ChatCompletionToolParam
+from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
+
+from eval_protocol.mcp.execution.policy import LiteLLMPolicy
+from eval_protocol.mcp.mcp_multi_client import MCPMultiClient
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest.types import RolloutProcessorConfig
+
+
+class Agent:
+    """
+    A really simple agent that calls the model until no more tool calls are needed.
+
+    The conversation is kept in ``self.messages``; every assistant message and
+    every tool result is appended to it as the agent runs.
+    """
+
+    def __init__(self, model: str, initial_messages: list[Message], config_path: Optional[str]):
+        self.model = model
+        # Copy the list so appending to the conversation never mutates the
+        # caller's list (which may be shared across several rollouts).
+        self.messages: list[Message] = list(initial_messages)
+        self._policy = LiteLLMPolicy(model_id=model)
+        # Tools are only available when an MCP configuration path is given.
+        self.mcp_client = MCPMultiClient(config_path=config_path) if config_path else None
+
+    async def setup(self):
+        """Connect to the configured MCP servers, if any."""
+        if self.mcp_client:
+            await self.mcp_client.connect_to_servers()
+
+    async def call_agent(self) -> str:
+        """
+        Call the assistant with the current conversation.
+
+        The model is called repeatedly: whenever its reply contains tool calls,
+        each tool is executed, its result is appended as a ``tool`` message, and
+        the model is called again so it can use those results.
+
+        Returns:
+            The content of the first assistant message without tool calls.
+
+        Raises:
+            RuntimeError: If the model requests a tool call but no MCP client
+                is configured.
+        """
+        # The tool list does not depend on the conversation, so fetch it once.
+        tools = await self.mcp_client.get_available_tools() if self.mcp_client else None
+
+        while True:
+            message = await self._call_model(self.messages, tools)
+            self.messages.append(message)
+            tool_calls = message.get("tool_calls")
+            if not tool_calls:
+                return message["content"]
+            if self.mcp_client is None:
+                raise RuntimeError("Model requested a tool call but no MCP client is configured")
+
+            for tool_call in tool_calls:
+                tool_call_id = tool_call["id"]
+                tool_name = tool_call["function"]["name"]
+                tool_args = tool_call["function"]["arguments"]
+                # An empty argument string means "no arguments".
+                tool_args_dict = json.loads(tool_args) if tool_args else {}
+                tool_result = await self.mcp_client.call_tool(tool_name, tool_args_dict)
+                content = self._get_content_from_tool_result(tool_result)
+                self.messages.append(
+                    {
+                        "role": "tool",
+                        "content": content,
+                        "tool_call_id": tool_call_id,
+                    }
+                )
+
+    async def _call_model(
+        self, messages: list[Message], tools: Optional[list[ChatCompletionToolParam]]
+    ) -> ChatCompletionMessage:
+        """Send the conversation to the policy and return the first choice's message."""
+        # The conversation may mix model objects and plain dicts; dump the former.
+        messages = [message.model_dump() if hasattr(message, "model_dump") else message for message in messages]
+        response = await self._policy._make_llm_call(
+            messages=messages,
+            tools=tools,
+        )
+        return response["choices"][0]["message"]
+
+    def _get_content_from_tool_result(self, tool_result: CallToolResult) -> str:
+        """Convert a tool result into the text placed in the ``tool`` message.
+
+        Returns:
+            The JSON-encoded structured content when present, otherwise the
+            text of the single content item; an empty string when the result
+            has no content. A plain string result is returned unchanged.
+
+        Raises:
+            NotImplementedError: For multiple content items or non-text content.
+        """
+        # MCPMultiClient.call_tool may hand back a plain error string.
+        if isinstance(tool_result, str):
+            return tool_result
+        if tool_result.structuredContent:
+            return json.dumps(tool_result.structuredContent)
+        if not tool_result.content:
+            return ""
+        if len(tool_result.content) > 1:
+            raise NotImplementedError("Multiple content is not supported yet")
+        first_content = tool_result.content[0]
+        if first_content.type != "text":
+            raise NotImplementedError("Non-text content is not supported yet")
+        return first_content.text
+
+
+async def default_agent_rollout_processor(row: EvaluationRow, config: RolloutProcessorConfig) -> List[EvaluationRow]:
+    """Run the tool-calling ``Agent`` and return its conversation as one row.
+
+    Args:
+        row: The input row. NOTE(review): it is not read here; the conversation
+            starts from ``config.initial_messages`` - confirm this is intended.
+        config: Supplies ``model``, ``initial_messages`` and ``mcp_config_path``.
+
+    Returns:
+        A single-element list holding a row with the agent's messages.
+    """
+    agent = Agent(model=config.model, initial_messages=config.initial_messages, config_path=config.mcp_config_path)
+    try:
+        await agent.setup()
+        await agent.call_agent()
+    finally:
+        # Always release the MCP transports/sessions, even when the rollout fails.
+        if agent.mcp_client:
+            await agent.mcp_client.cleanup()
+    return [EvaluationRow(messages=agent.messages)]
@@ -0,0 +1,12 @@
+from typing import List
+
+from eval_protocol.models import EvaluationRow
+from eval_protocol.pytest.types import ModelParam, RolloutProcessorConfig
+
+
+def default_no_op_rollout_processor(row: EvaluationRow, config: RolloutProcessorConfig) -> List[EvaluationRow]:
+    """
+    Hand the input row straight to the test function without performing any
+    rollout. Handy when the test wants to run the rollout itself.
+
+    ``config`` is accepted for interface compatibility and is not used.
+    """
+    passthrough: List[EvaluationRow] = [row]
+    return passthrough
@@ -0,0 +1,30 @@
+from typing import List
+
+from openai import OpenAI
+
+from eval_protocol.auth import get_fireworks_api_base, get_fireworks_api_key
+from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
+from eval_protocol.pytest.types import ModelParam, RolloutProcessorConfig
+
+
+def default_single_turn_rollout_processor(row: EvaluationRow, config: RolloutProcessorConfig) -> List[EvaluationRow]:
+    """Generate a single response from a Fireworks model.
+
+    Args:
+        row: Input row; its messages are sent as ``role``/``content`` pairs.
+        config: Supplies ``model`` and optional extra ``input_params`` that
+            are forwarded to the chat-completions call.
+
+    Returns:
+        A single-element list with a new row containing the original messages
+        followed by the assistant reply, plus the row's ground truth.
+
+    Raises:
+        ValueError: If the row has no messages.
+    """
+    # Validate first so an unusable row fails before any client is created.
+    if not row.messages:
+        raise ValueError("Messages is empty. Please provide a non-empty dataset")
+
+    api_key = get_fireworks_api_key()
+    api_base = get_fireworks_api_base()
+    client = OpenAI(api_key=api_key, base_url=f"{api_base}/inference/v1")
+
+    messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
+
+    # input_params may be unset; unpacking None would raise a TypeError.
+    extra_params = config.input_params or {}
+    response = client.chat.completions.create(model=config.model, messages=messages_payload, **extra_params)
+    # The API may return no content; store an empty string in that case.
+    assistant_content = response.choices[0].message.content or ""
+    messages = list(row.messages) + [Message(role="assistant", content=assistant_content)]
+    processed = EvaluationRow(
+        messages=messages,
+        ground_truth=row.ground_truth,
+        input_metadata=InputMetadata(completion_params=CompletionParams(model=config.model)),
+    )
+    return [processed]
Original file line number	Diff line number	Diff line change
`@@ -185,6 +185,7 @@ async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict:`
`185`	`185`	`"choices": [`
`186`	`186`	`{`
`187`	`187`	`"message": {`
	`188`	`+ "role": response.choices[0].message.role,`
`188`	`189`	`"content": response.choices[0].message.content,`
`189`	`190`	`"tool_calls": (`
`190`	`191`	`[`