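Fix bugs: accept retail order IDs without the '#' prefix, convert assistant tool calls to Google GenAI function_call parts, and skip responses whose content has no parts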

cemde · cemde · commit eb24ce152328 · 2025-12-26T14:31:08.000+01:00
diff --git a/examples/tau2_benchmark/tau2_default_agent_benchmark.py b/examples/tau2_benchmark/tau2_default_agent_benchmark.py
@@ -287,7 +287,7 @@ def run_benchmark(
         callbacks=[logger],
         n_task_repeats=n_task_repeats,
         fail_on_setup_error=True,
-        fail_on_task_error=False,  # Continue on task errors
+        fail_on_task_error=False,  # Set to False to continue on task errors
         fail_on_evaluation_error=True,
     )
 
diff --git a/maseval/benchmark/tau2/domains/retail/tools.py b/maseval/benchmark/tau2/domains/retail/tools.py
@@ -52,7 +52,8 @@ def _get_order(self, order_id: str) -> Order:
         """Get the order from the database.
 
         Args:
-            order_id: The order id, such as '#W0000000'. Be careful there is a '#' symbol at the beginning.
+            order_id: The order id, such as '#W0000000' or 'W0000000'.
+                The '#' prefix is optional and will be added if missing.
 
         Returns:
             The order.
@@ -62,6 +63,9 @@ def _get_order(self, order_id: str) -> Order:
         """
         if self.db is None:
             raise ValueError("Database not initialized")
+        # Normalize order_id: add '#' prefix if missing (LLMs often omit it)
+        if not order_id.startswith("#"):
+            order_id = f"#{order_id}"
         if order_id not in self.db.orders:
             raise ValueError("Order not found")
         return self.db.orders[order_id]
diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py
@@ -168,7 +168,26 @@ def _convert_messages(self, messages: List[Dict[str, Any]]) -> tuple[Optional[st
             if role == "system":
                 system_instruction = content
             elif role == "assistant":
-                contents.append({"role": "model", "parts": [{"text": content}]})
+                # Handle assistant messages with or without tool calls
+                parts = []
+                if content:
+                    parts.append({"text": content})
+                # Convert tool_calls to Google's function_call format
+                tool_calls = msg.get("tool_calls", [])
+                if tool_calls:
+                    import json
+
+                    for tc in tool_calls:
+                        if tc.get("type") == "function":
+                            func = tc.get("function", {})
+                            args_str = func.get("arguments", "{}")
+                            try:
+                                args = json.loads(args_str) if isinstance(args_str, str) else args_str
+                            except json.JSONDecodeError:
+                                args = {}
+                            parts.append({"function_call": {"name": func.get("name", ""), "args": args}})
+                if parts:
+                    contents.append({"role": "model", "parts": parts})
             elif role == "tool":
                 # Tool response in Google format
                 tool_call_id = msg.get("tool_call_id", "")
@@ -237,7 +256,7 @@ def _parse_response(self, response: Any) -> ChatResponse:
         tool_calls = None
         if hasattr(response, "candidates") and response.candidates:
             candidate = response.candidates[0]
-            if hasattr(candidate, "content") and candidate.content:
+            if hasattr(candidate, "content") and candidate.content and candidate.content.parts:
                 for part in candidate.content.parts:
                     if hasattr(part, "function_call") and part.function_call:
                         if tool_calls is None:

Original file line number	Diff line number	Diff line change
`@@ -287,7 +287,7 @@ def run_benchmark(`
`287`	`287`	`callbacks=[logger],`
`288`	`288`	`n_task_repeats=n_task_repeats,`
`289`	`289`	`fail_on_setup_error=True,`
`290`		`- fail_on_task_error=False, # Continue on task errors`
	`290`	`+ fail_on_task_error=False, # Set to False to continue on task errors`
`291`	`291`	`fail_on_evaluation_error=True,`
`292`	`292`	`)`
`293`	`293`