eval-protocol · xzrderek · Dec 3, 2025 · Dec 3, 2025 · Dec 3, 2025 · Dec 3, 2025
diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py
@@ -59,6 +59,20 @@ def __init__(
         # Initialize conversation state tracking for proper OpenAI trajectories
         self.initialized = False
 
+    def _supports_reasoning_details(self) -> bool:
+        """
+        Return True when ``model_id`` contains "openrouter" or ``base_url`` contains
+        "openrouter.ai"; both checks ignore case. Callers use the result to decide
+        whether top-level reasoning_details is kept on assistant messages.
+        Missing, None, or non-string attribute values never match.
+        """
+        model_id = getattr(self, "model_id", None)
+        base_url = getattr(self, "base_url", None)
+        if isinstance(model_id, str) and "openrouter" in model_id.lower():
+            return True
+        # URL hosts are case-insensitive, so normalize before the substring test.
+        return isinstance(base_url, str) and "openrouter.ai" in base_url.lower()
+
     @abstractmethod
     async def _make_llm_call(self, messages: List[Dict], tools: List[Dict]) -> Dict:
         """
@@ -199,6 +213,10 @@ async def _generate_live_tool_calls(
         if message.get("tool_calls"):
             assistant_message_for_history["tool_calls"] = message["tool_calls"]
 
+        rd = message.get("reasoning_details", None)
+        if rd is not None and self._supports_reasoning_details():
+            assistant_message_for_history["reasoning_details"] = rd
+
         # Add to actual conversation history
         conversation_history.append(assistant_message_for_history)
 

diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py
@@ -148,6 +148,9 @@ def _clean_messages_for_api(self, messages: List[Dict]) -> List[Dict]:
         # Standard OpenAI message fields
         allowed_fields = {"role", "content", "tool_calls", "tool_call_id", "name"}
 
+        if self._supports_reasoning_details():
+            allowed_fields.add("reasoning_details")
+
         clean_messages = []
         for msg in messages:
             # Only keep allowed fields
@@ -217,31 +220,37 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
                 logger.debug(f"🔄 API call for model: {self.model_id}")
 
             # LiteLLM already returns OpenAI-compatible format
+            message_obj = getattr(response.choices[0], "message", object())
+
+            message_dict: Dict[str, Any] = {
+                "role": getattr(message_obj, "role", "assistant"),
+                "content": getattr(message_obj, "content", None),
+                "tool_calls": (
+                    [
+                        {
+                            "id": getattr(tc, "id", None),
+                            "type": getattr(tc, "type", "function"),
+                            "function": {
+                                "name": getattr(getattr(tc, "function", None), "name", "tool"),
+                                "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"),
+                            },
+                        }
+                        for tc in (getattr(message_obj, "tool_calls", []) or [])
+                    ]
+                    if getattr(message_obj, "tool_calls", None)
+                    else []
+                ),
+            }
+
+            if self._supports_reasoning_details():
+                rd = getattr(message_obj, "reasoning_details", None)
+                if rd is not None:
+                    message_dict["reasoning_details"] = rd
+
             return {
                 "choices": [
                     {
-                        "message": {
-                            "role": getattr(getattr(response.choices[0], "message", object()), "role", "assistant"),
-                            "content": getattr(getattr(response.choices[0], "message", object()), "content", None),
-                            "tool_calls": (
-                                [
-                                    {
-                                        "id": getattr(tc, "id", None),
-                                        "type": getattr(tc, "type", "function"),
-                                        "function": {
-                                            "name": getattr(getattr(tc, "function", None), "name", "tool"),
-                                            "arguments": getattr(getattr(tc, "function", None), "arguments", "{}"),
-                                        },
-                                    }
-                                    for tc in (
-                                        getattr(getattr(response.choices[0], "message", object()), "tool_calls", [])
-                                        or []
-                                    )
-                                ]
-                                if getattr(getattr(response.choices[0], "message", object()), "tool_calls", None)
-                                else []
-                            ),
-                        },
+                        "message": message_dict,
                         "finish_reason": getattr(response.choices[0], "finish_reason", None),
                     }
                 ],