diff --git a/.github/scripts/extract_changelog.py b/.github/scripts/extract_changelog.py index d090de5f..4918cab2 100644 --- a/.github/scripts/extract_changelog.py +++ b/.github/scripts/extract_changelog.py @@ -19,6 +19,7 @@ def extract_section(version: str, changelog_path: Path) -> str: if not match: print(f"No changelog entry found for version {version}", file=sys.stderr) sys.exit(1) + assert match is not None return match.group(0).strip() diff --git a/AGENTS.md b/AGENTS.md index 75cf950f..7036a867 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -102,12 +102,12 @@ uv remove **Framework Adapter Pattern:** -When implementing wrappers for external frameworks, **always use the framework's native message storage as the source of truth**: +When implementing adapters for external frameworks, **always use the framework's native message storage as the source of truth**: **Pattern 1: Persistent State (smolagents)** ```python -class MyFrameworkWrapper(AgentAdapter): +class MyFrameworkAdapter(AgentAdapter): def get_messages(self) -> MessageHistory: """Dynamically fetch from framework's internal storage.""" # Get from framework (e.g., agent.memory, agent.messages) @@ -236,3 +236,7 @@ For lists and dictionaries, use `Dict[...,...]`, `List[...]`, `Sequence[...]` et - DO NOT publicly distribute code or data - DO NOT publish without explicit permission - DO NOT share copyrighted third-party benchmark data + +## Changelog + +When the task is completed, add your changes to the Changelog. diff --git a/CHANGELOG.md b/CHANGELOG.md index 8963237d..5cfb2573 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,12 +9,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- The `logs` property of `SmolAgentAdapter` and `LangGraphAgentAdapter` is now properly filled. 
(PR: #3) + ### Changed ### Fixed +- Consistent naming of agent `adapter` over `wrapper` (PR: #3) + ### Removed +- Removed `set_message_history`, `append_to_message_history` and `clear_message_history` for `AgentAdapter` and subclasses. (PR: #3) + ## [0.1.2] - 2025-11-18 ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 90d20667..5f83f17b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,7 +23,7 @@ The `maseval` package is designed with a strict separation between its core logi 1. **`maseval/core`**: This is the heart of the library. It contains the essential logic and **must not** have any optional dependencies. It should be fully functional with a minimal installation. -2. **`maseval/interface`**: This contains adapters and wrappers for other multi-agent frameworks (like `crewai`, `langgraph`, etc.). All dependencies for these integrations are optional. +2. **`maseval/interface`**: This contains adapters for other multi-agent frameworks (like `crewai`, `langgraph`, etc.). All dependencies for these integrations are optional. > [!WARNING] > Code in `maseval/core` **must never** import from `maseval/interface`. This separation is critical to keep the core package lightweight and dependency-free. Breaking this rule will cause the library to fail. @@ -197,11 +197,11 @@ The pipeline automatically performs the following tasks: ### 6. Implementing Framework Adapters -When creating wrappers for external agent frameworks (in `maseval/interface/agents/`), follow these best practices to ensure consistency and reliability: +When creating adapters for external agent frameworks (in `maseval/interface/agents/`), follow these best practices to ensure consistency and reliability: #### Message History Pattern -**Always use the framework's native message storage as the source of truth.** Do not cache converted messages in the wrapper, as this can lead to inconsistencies if the framework's internal state changes. 
+**Always use the framework's native message storage as the source of truth.** Do not cache converted messages in the adapter, as this can lead to inconsistencies if the framework's internal state changes. **Correct Pattern** (SmolAgents example): @@ -256,13 +256,14 @@ When adding support for a new framework: - [ ] Add conditional import in `maseval/interface/agents/__init__.py` - [ ] Write integration tests in `tests/test_interface/` - [ ] Update documentation with usage examples +- [ ] Provide a `logs` property inside the `AgentAdapter`. #### Framework-Specific Patterns **Pattern 1: Persistent State (smolagents)** ```python -class MyFrameworkWrapper(AgentAdapter): +class MyFrameworkAdapter(AgentAdapter): def get_messages(self) -> MessageHistory: """Dynamically fetch from framework's internal storage.""" # Get from framework (e.g., agent.memory, agent.messages) diff --git a/README.md b/README.md index 399630f4..c7a33371 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Analogous to pytest for testing or MLflow for ML experimentation, MASEval focuse - **Task-Specific Configurations:** Each benchmark task is a self-contained evaluation unit with its own instructions, environment state, success criteria, and custom evaluation logic. One task might measure success by environment state changes, another by programmatic output validation. -- **Framework Agnostic by Design:** MASEval is intentionally unopinionated about agent frameworks, model providers, and system architectures. Simple, standardized interfaces and wrappers enable any agent system to be evaluated without modification to the core library. +- **Framework Agnostic by Design:** MASEval is intentionally unopinionated about agent frameworks, model providers, and system architectures. Simple, standardized interfaces and adapters enable any agent system to be evaluated without modification to the core library. 
- **Lifecycle Hooks via Callbacks:** Inject custom logic at any point in the evaluation lifecycle (e.g., on_run_start, on_task_start, on_agent_step_end) through a callback system. This enables extensibility without modifying core evaluation logic. diff --git a/docs/guides/config-gathering.md b/docs/guides/config-gathering.md index cad44e49..b87688c6 100644 --- a/docs/guides/config-gathering.md +++ b/docs/guides/config-gathering.md @@ -124,8 +124,8 @@ class MyBenchmark(Benchmark): def setup_agents(self, agent_data, environment, task, user): model = MyModelAdapter(...) agent = MyAgent(model=model) - wrapper = AgentAdapter(agent, "agent") - return [wrapper], {"agent": wrapper} + adapter = AgentAdapter(agent, "agent") + return [adapter], {"agent": adapter} # ... other methods # Run benchmark diff --git a/docs/guides/message-tracing.md b/docs/guides/message-tracing.md index 3ca5dd54..85e41563 100644 --- a/docs/guides/message-tracing.md +++ b/docs/guides/message-tracing.md @@ -16,7 +16,7 @@ MASEval provides message tracing to capture agent conversations during benchmark ## Core Concepts -**`MessageHistory`**: OpenAI-compatible message storage that all agent wrappers use internally. +**`MessageHistory`**: OpenAI-compatible message storage that all agent adapters use internally. **`AgentAdapter.get_messages()`**: Standard method to retrieve conversation history from any wrapped agent. 
@@ -26,17 +26,17 @@ MASEval provides message tracing to capture agent conversations during benchmark ### Accessing Message History -Every agent wrapper exposes message history through `get_messages()`: +Every agent adapter exposes message history through `get_messages()`: ```python -from maseval.interface.agents import SmolAgentsWrapper +from maseval.interface.agents import SmolAgentAdapter # Create and run your agent -wrapper = SmolAgentsWrapper(agent, name="researcher") -result = wrapper.run("What's the capital of France?") +agent_adapter = SmolAgentAdapter(agent, name="researcher") +result = agent_adapter.run("What's the capital of France?") # Get the conversation -messages = wrapper.get_messages() +messages = agent_adapter.get_messages() # Inspect messages for msg in messages: @@ -45,18 +45,21 @@ for msg in messages: print(f" Tools called: {[tc['function']['name'] for tc in msg['tool_calls']]}") ``` -### Clearing History Between Tasks +### Fresh Conversations for Multiple Tasks -In benchmarks, you typically want to clear history before each new task: +In benchmarks, you typically want a fresh agent instance for each task: ```python # In your benchmark loop for task in benchmark.tasks: - wrapper.clear_message_history() # Reset for new task - result = wrapper.run(task.query) + # Create a new adapter instance for each task + agent_adapter = YourAgentAdapter(agent_instance=agent, name="task_agent") + result = agent_adapter.run(task.query) evaluate(result, task.ground_truth) ``` +This ensures each task starts with a clean slate and avoids conversation history contamination. 
+ ## Using the Tracing Callback For multi-agent systems or when you need to collect conversations from many runs, use `MessageTracingAgentCallback`: @@ -68,12 +71,12 @@ from maseval.core.callbacks import MessageTracingAgentCallback tracer = MessageTracingAgentCallback() # Attach to your agent(s) -wrapper = SmolAgentsWrapper(agent, name="assistant", callbacks=[tracer]) +agent_adapter = SmolAgentAdapter(agent, name="assistant", callbacks=[tracer]) # Run tasks -wrapper.run("Task 1") -wrapper.run("Task 2") -wrapper.run("Task 3") +agent_adapter.run("Task 1") +agent_adapter.run("Task 2") +agent_adapter.run("Task 3") # Get all conversations conversations = tracer.get_all_conversations() @@ -93,8 +96,8 @@ Share one tracer across multiple agents to collect all conversations: tracer = MessageTracingAgentCallback() # Attach to multiple agents -agent1 = SmolAgentsWrapper(agent1, name="researcher", callbacks=[tracer]) -agent2 = SmolAgentsWrapper(agent2, name="writer", callbacks=[tracer]) +agent1 = SmolAgentAdapter(agent1, name="researcher", callbacks=[tracer]) +agent2 = SmolAgentAdapter(agent2, name="writer", callbacks=[tracer]) # Run both agents agent1.run("Research topic X") @@ -119,7 +122,7 @@ tracer = MessageTracingAgentCallback() for batch in task_batches: for task in batch: - wrapper.run(task.query) + agent_adapter.run(task.query) # Process this batch conversations = tracer.get_all_conversations() @@ -190,9 +193,9 @@ Messages use OpenAI's chat completion format: } ``` -## Custom Agent Wrappers +## Custom Agent Adapters -If you're implementing a custom wrapper, the framework handles message storage automatically via `get_messages()`. Just ensure your `_run_agent()` method returns a `MessageHistory`: +If you're implementing a custom adapter, the framework handles message storage automatically via `get_messages()`. 
Just ensure your `_run_agent()` method returns a `MessageHistory`: ```python from maseval import AgentAdapter, MessageHistory @@ -211,13 +214,13 @@ class MyAgentAdapter(AgentAdapter): return history ``` -See the [Agent Wrapper guide](../reference/agent.md) for details on implementing custom wrappers. +See the [AgentAdapter guide](../reference/agent.md) for details on implementing custom adapters. ## Tips **For debugging**: Use `verbose=True` to see traces in real-time. -**For benchmarks**: Clear history between tasks with `wrapper.clear_message_history()`. +**For benchmarks**: Create a new adapter instance for each task to ensure clean conversation history. **For multi-agent systems**: Use a shared tracer and `get_conversations_by_agent()` to analyze each agent separately. diff --git a/docs/index.md b/docs/index.md index 709fb37c..38d24493 100644 --- a/docs/index.md +++ b/docs/index.md @@ -24,7 +24,7 @@ More details in the [Quickstart](getting-started/quickstart.md) - **Task-Specific Configurations:** Each benchmark task is a self-contained evaluation unit with its own instructions, environment state, success criteria, and custom evaluation logic. One task might measure success by environment state changes, another by programmatic output validation. -- **Framework Agnostic by Design:** MASEval is intentionally unopinionated about agent frameworks, model providers, and system architectures. Simple, standardized interfaces and wrappers enable any agent system to be evaluated without modification to the core library. +- **Framework Agnostic by Design:** MASEval is intentionally unopinionated about agent frameworks, model providers, and system architectures. Simple, standardized interfaces and adapters enable any agent system to be evaluated without modification to the core library. - **Lifecycle Hooks via Callbacks:** Inject custom logic at any point in the evaluation lifecycle (e.g., `on_run_start`, `on_task_start`, `on_agent_step_end`) through a callback system. 
This enables extensibility without modifying core evaluation logic. diff --git a/maseval/core/agent.py b/maseval/core/agent.py index 1c0e006d..5adb4248 100644 --- a/maseval/core/agent.py +++ b/maseval/core/agent.py @@ -1,8 +1,8 @@ from abc import ABC, abstractmethod -from typing import List, Any, Optional, Union, Dict +from typing import List, Any, Optional, Dict from .callback import AgentCallback -from .history import MessageHistory, RoleType +from .history import MessageHistory from .tracing import TraceableMixin from .config import ConfigurableMixin @@ -10,7 +10,7 @@ class AgentAdapter(ABC, TraceableMixin, ConfigurableMixin): """Wraps an agent from any framework to provide a standard interface. - This wrapper provides: + This Adapter provides: - Unified execution interface via `run()` - Callback hooks for monitoring - Message history management via getter/setter @@ -101,35 +101,6 @@ def get_messages(self) -> MessageHistory: """ return self.messages if self.messages is not None else MessageHistory() - def set_message_history(self, history: MessageHistory) -> None: - """Set the message history. - - This is typically called by _run_agent() implementations after executing - the agent, but can also be used to inject or modify history. - - Args: - history: The MessageHistory to set - """ - self.messages = history - - def clear_message_history(self) -> None: - """Clear the message history.""" - self.messages = None - - def append_to_message_history(self, role: Union[RoleType, str], content: Union[str, List[Any]], **kwargs) -> None: - """Append a message to the history. - - If no history exists, creates a new one. - - Args: - role: The message role ("user", "assistant", "system", "tool") - content: The message content (string or list of content parts) - **kwargs: Additional fields (name, metadata, timestamp, etc.) 
- """ - if self.messages is None: - self.messages = MessageHistory() - self.messages.add_message(role, content, **kwargs) # type: ignore - def gather_traces(self) -> dict[str, Any]: """Gather execution traces from this agent. @@ -148,7 +119,7 @@ def gather_traces(self) -> dict[str, Any]: How to use: This method is automatically called by Benchmark during trace collection. - Framework-specific wrappers can extend this to include additional data: + Framework-specific adapters can extend this to include additional data: ```python def gather_traces(self) -> dict[str, Any]: @@ -181,12 +152,12 @@ def gather_config(self) -> dict[str, Any]: - gathered_at: ISO timestamp - name: Agent name - agent_type: Underlying agent framework class name - - wrapper_type: The specific wrapper class (e.g., SmolAgentAdapter) + - adapter_type: The specific adapter class (e.g., SmolAgentAdapter) - callbacks: List of callback class names attached to this agent How to use: This method is automatically called by Benchmark during config collection. 
- Framework-specific wrappers can extend this to include additional data: + Framework-specific adapters can extend this to include additional data: ```python def gather_config(self) -> dict[str, Any]: @@ -200,7 +171,7 @@ def gather_config(self) -> dict[str, Any]: **super().gather_config(), "name": self.name, "agent_type": type(self.agent).__name__, - "wrapper_type": type(self).__name__, + "adapter_type": type(self).__name__, "callbacks": [type(cb).__name__ for cb in self.callbacks], } diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py index 7f7e9f60..4ed0badb 100644 --- a/maseval/core/benchmark.py +++ b/maseval/core/benchmark.py @@ -64,8 +64,8 @@ def setup_environment(self, agent_data, task): def setup_agents(self, agent_data, environment, task, user): agent = MyAgent(model=agent_data["model"]) - wrapper = AgentAdapter(agent, "agent") - return [wrapper], {"agent": wrapper} + agent_adapter = AgentAdapter(agent, "agent") + return [agent_adapter], {"agent": agent_adapter} def run_agents(self, agents, task, environment): return agents[0].run(task.query) @@ -258,10 +258,10 @@ def setup_agents(self, agent_data, environment, task, user): # Create agent (auto-registered when returned) agent = MyAgent(model=model) - wrapper = AgentAdapter(agent, "agent1") + agent_adapter = AgentAdapter(agent, "agent1") # Environment and user are also auto-registered - return [wrapper], {"agent1": wrapper} + return [agent_adapter], {"agent1": agent_adapter} ``` Traces and configs are automatically collected before evaluation via @@ -673,12 +673,12 @@ def setup_agents(self, agent_data, environment, task, user): model=model, managed_agents=[w.agent for w in workers.values()] ) - orchestrator_wrapper = AgentAdapter(orchestrator, "orchestrator") + orchestrator_adapter = AgentAdapter(orchestrator, "orchestrator") # Return orchestrator to run, but all agents for monitoring # All agents auto-registered for tracing - all_agents = {"orchestrator": orchestrator_wrapper, **workers} - 
return [orchestrator_wrapper], all_agents + all_agents = {"orchestrator": orchestrator_adapter, **workers} + return [orchestrator_adapter], all_agents ``` """ pass diff --git a/maseval/core/callbacks/message_tracing.py b/maseval/core/callbacks/message_tracing.py index 498b7269..5bcded95 100644 --- a/maseval/core/callbacks/message_tracing.py +++ b/maseval/core/callbacks/message_tracing.py @@ -30,8 +30,8 @@ class MessageTracingAgentCallback(AgentCallback): tracer = MessageTracingAgentCallback(include_metadata=True, verbose=True) # Use with agent - wrapper = MyAgentAdapter(agent, name="agent1", callbacks=[tracer]) - wrapper.run("What's the weather?") + agent_adapter = MyAgentAdapter(agent, name="agent1", callbacks=[tracer]) + agent_adapter.run("What's the weather?") # Access traced conversations for conversation in tracer.get_all_conversations(): @@ -71,7 +71,7 @@ def on_run_end(self, agent: AgentAdapter, result: Any) -> None: """Called when agent execution completes. Args: - agent: The agent wrapper instance + agent: The agent adapter instance result: The result returned by the agent (usually MessageHistory) """ # Get message history from agent diff --git a/maseval/interface/agents/langgraph.py b/maseval/interface/agents/langgraph.py index 5a15ae4c..f944b2ed 100644 --- a/maseval/interface/agents/langgraph.py +++ b/maseval/interface/agents/langgraph.py @@ -4,7 +4,9 @@ pip install maseval[langgraph] """ -from typing import TYPE_CHECKING, Any +import time +from datetime import datetime +from typing import TYPE_CHECKING, Any, Dict from maseval import AgentAdapter, MessageHistory, User @@ -32,13 +34,13 @@ class LangGraphAgentAdapter(AgentAdapter): Requires langgraph to be installed. - This wrapper converts LangChain/LangGraph message types to MASEval's + This adapter converts LangChain/LangGraph message types to MASEval's OpenAI-compatible MessageHistory format. It preserves tool calls, tool responses, and multi-modal content. 
LangGraph graphs can be stateless or stateful (with checkpointer). This - wrapper supports both modes: - - Stateless: Messages from invoke() result are cached in wrapper + adapter supports both modes: + - Stateless: Messages from invoke() result are cached in adapter - Stateful: Messages fetched from graph state if config/thread_id provided Example: @@ -50,17 +52,17 @@ class LangGraphAgentAdapter(AgentAdapter): graph = StateGraph(...) compiled_graph = graph.compile() - wrapper = LangGraphAgentAdapter(compiled_graph, "agent_name") - result = wrapper.run("What's the weather?") + agent_adapter = LangGraphAgentAdapter(compiled_graph, "agent_name") + result = agent_adapter.run("What's the weather?") # Access message history - for msg in wrapper.get_messages(): + for msg in agent_adapter.get_messages(): print(msg['role'], msg['content']) ``` """ def __init__(self, agent_instance, name: str, callbacks=None, config=None): - """Initialize the LangGraph wrapper. + """Initialize the LangGraph adapter. Args: agent_instance: Compiled LangGraph graph @@ -103,85 +105,6 @@ def get_messages(self) -> MessageHistory: # No messages available return MessageHistory() - def set_message_history(self, history: MessageHistory) -> None: - """Set message history for langgraph. - - For stateless graphs, updates the cached result. - For stateful graphs, this is not fully supported as LangGraph manages state internally. 
- - Args: - history: MASEval MessageHistory to set - """ - _check_langgraph_installed() - from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage - - # Convert MessageHistory to LangChain messages - lc_messages = [] - for msg in history: - role = msg.get("role", "assistant") - content = msg.get("content", "") - - if role == "user": - lc_messages.append(HumanMessage(content=content)) - elif role == "assistant": - lc_messages.append(AIMessage(content=content)) - elif role == "system": - lc_messages.append(SystemMessage(content=content)) - elif role == "tool": - tool_call_id = msg.get("tool_call_id", "") - lc_messages.append(ToolMessage(content=content, tool_call_id=tool_call_id)) - - # Update cached result - self._last_result = {"messages": lc_messages} - - # Also update base class cache - super().set_message_history(history) - - def clear_message_history(self) -> None: - """Clear message history for langgraph. - - Clears the cached result. For stateful graphs, this doesn't clear - the persistent state in the checkpointer. - """ - self._last_result = None - super().clear_message_history() - - def append_to_message_history(self, role: str, content: Any, **kwargs) -> None: - """Append message to history. - - For stateless graphs, this appends to the cached result. - For stateful graphs, messages are managed by LangGraph during invoke(). 
- - Args: - role: Message role - content: Message content (string or list) - **kwargs: Additional message fields - """ - _check_langgraph_installed() - from langchain_core.messages import HumanMessage, AIMessage, SystemMessage - - # Get current messages - current_messages = [] - if self._last_result and "messages" in self._last_result: - current_messages = self._last_result["messages"] - - # Create new message - if role == "user": - new_msg = HumanMessage(content=str(content)) - elif role == "assistant": - new_msg = AIMessage(content=str(content)) - elif role == "system": - new_msg = SystemMessage(content=str(content)) - else: - new_msg = AIMessage(content=str(content)) - - # Append and update cache - current_messages.append(new_msg) - self._last_result = {"messages": current_messages} - - # Also update base class cache - super().append_to_message_history(role, content, **kwargs) - def gather_config(self) -> dict[str, Any]: """Gather configuration from this LangGraph agent. @@ -191,7 +114,7 @@ def gather_config(self) -> dict[str, Any]: - gathered_at: ISO timestamp - name: Agent name - agent_type: CompiledGraph or similar - - wrapper_type: LangGraphAgentAdapter + - adapter_type: LangGraphAgentAdapter - callbacks: List of callback class names - has_checkpointer: Whether the graph has state persistence - config: LangGraph config dict (with sensitive data removed) @@ -239,27 +162,101 @@ def _run_agent(self, query: str) -> Any: _check_langgraph_installed() from langchain_core.messages import HumanMessage - # Initialize the state with the user query - initial_state = {"messages": [HumanMessage(content=query)]} + start_time = time.time() + timestamp = datetime.now().isoformat() + + try: + # Initialize the state with the user query + initial_state = {"messages": [HumanMessage(content=query)]} + + # Invoke the graph (with config if provided) + if self._langgraph_config: + result = self.agent.invoke(initial_state, config=self._langgraph_config) + else: + result = 
self.agent.invoke(initial_state) + + # Cache the result for stateless graphs + self._last_result = result + duration = time.time() - start_time + + # Log successful execution + log_entry: Dict[str, Any] = { + "timestamp": timestamp, + "query": query, + "query_length": len(query), + "duration_seconds": duration, + "status": "success", + } + + # Extract state information if available + if isinstance(result, dict): + log_entry["state_keys"] = list(result.keys()) + messages = result.get("messages", []) + log_entry["message_count"] = len(messages) if messages else 0 + + # Try to extract token usage from messages if available + # (LangChain messages may have usage_metadata) + total_input_tokens = 0 + total_output_tokens = 0 + for msg in messages: + if hasattr(msg, "usage_metadata") and msg.usage_metadata: + # usage_metadata can be dict or object + if isinstance(msg.usage_metadata, dict): + total_input_tokens += msg.usage_metadata.get("input_tokens", 0) + total_output_tokens += msg.usage_metadata.get("output_tokens", 0) + else: + total_input_tokens += getattr(msg.usage_metadata, "input_tokens", 0) + total_output_tokens += getattr(msg.usage_metadata, "output_tokens", 0) + + if total_input_tokens > 0 or total_output_tokens > 0: + log_entry["input_tokens"] = total_input_tokens + log_entry["output_tokens"] = total_output_tokens + log_entry["total_tokens"] = total_input_tokens + total_output_tokens + + # For stateful graphs with checkpointer, get state snapshot metadata + if self._langgraph_config and hasattr(self.agent, "get_state"): + try: + state_snapshot = self.agent.get_state(self._langgraph_config) + if state_snapshot.metadata: + log_entry["checkpoint_metadata"] = { + "source": state_snapshot.metadata.get("source"), + "step": state_snapshot.metadata.get("step"), + } + if state_snapshot.created_at: + log_entry["checkpoint_created_at"] = state_snapshot.created_at + except Exception: + # If get_state fails, just skip metadata + pass + + self.logs.append(log_entry) + + # 
Extract and return the final answer from the graph's result + # LangGraph typically returns dict with 'messages' key, extract the last AI message + messages = result.get("messages", []) + if messages: + last_message = messages[-1] + # Return the content of the last message as the final answer + return getattr(last_message, "content", str(last_message)) + + return None + + except Exception as e: + duration = time.time() - start_time + + # Log failed execution + self.logs.append( + { + "timestamp": timestamp, + "query": query, + "query_length": len(query), + "duration_seconds": duration, + "status": "error", + "error": str(e), + "error_type": type(e).__name__, + } + ) - # Invoke the graph (with config if provided) - if self._langgraph_config: - result = self.agent.invoke(initial_state, config=self._langgraph_config) - else: - result = self.agent.invoke(initial_state) - - # Cache the result for stateless graphs - self._last_result = result - - # Extract and return the final answer from the graph's result - # LangGraph typically returns dict with 'messages' key, extract the last AI message - messages = result.get("messages", []) - if messages: - last_message = messages[-1] - # Return the content of the last message as the final answer - return getattr(last_message, "content", str(last_message)) - - return None + raise def _convert_langchain_messages(self, lc_messages: list) -> MessageHistory: """Convert LangChain messages to MASEval MessageHistory format. diff --git a/maseval/interface/agents/smolagents.py b/maseval/interface/agents/smolagents.py index 1650b468..7cab9a15 100644 --- a/maseval/interface/agents/smolagents.py +++ b/maseval/interface/agents/smolagents.py @@ -4,7 +4,7 @@ pip install maseval[smolagents] """ -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Dict, List from maseval import AgentAdapter, MessageHistory, User @@ -31,7 +31,7 @@ class SmolAgentAdapter(AgentAdapter): Requires smolagents to be installed. 
- This wrapper converts smolagents' internal message format to MASEval's + This adapter converts smolagents' internal message format to MASEval's OpenAI-compatible MessageHistory format. It automatically tracks tool calls, tool responses, and agent reasoning. @@ -41,15 +41,168 @@ class SmolAgentAdapter(AgentAdapter): from smolagents import MultiStepAgent agent = MultiStepAgent(...) - wrapper = SmolAgentAdapter(agent) - result = wrapper.run("What's the weather?") + agent_adapter = SmolAgentAdapter(agent) + result = agent_adapter.run("What's the weather?") # Access message history - for msg in wrapper.get_messages(): + for msg in agent_adapter.get_messages(): print(msg['role'], msg['content']) ``` """ + def __init__(self, agent_instance, name: str, callbacks=None): + """Initialize the Smolagent adapter. + + Note: We don't call super().__init__() to avoid initializing self.logs as a list, + since we override it as a property that dynamically fetches from agent.memory. + """ + self.agent = agent_instance + self.name = name + self.callbacks = callbacks or [] + self.messages = None + + @property + def logs(self) -> List[Dict[str, Any]]: # type: ignore[override] + """Dynamically generate logs from smolagents' internal memory. + + Converts smolagents' ActionStep and PlanningStep objects into log entries + compatible with the AgentAdapter contract, including all available properties. 
+ + Returns: + List of log dictionaries with comprehensive step information + """ + _check_smolagents_installed() + from smolagents.memory import ActionStep, PlanningStep, TaskStep + + logs_list: List[Dict[str, Any]] = [] + + if not hasattr(self.agent, "memory") or not hasattr(self.agent.memory, "steps"): + return logs_list + + for step in self.agent.memory.steps: + if isinstance(step, ActionStep): + log_entry: Dict[str, Any] = { + "step_type": "ActionStep", + "step_number": step.step_number, + "status": "error" if step.error else "success", + } + + # Timing information + if hasattr(step, "timing") and step.timing: + log_entry["start_time"] = step.timing.start_time + log_entry["end_time"] = step.timing.end_time + log_entry["duration_seconds"] = step.timing.duration + + # Token usage information + if hasattr(step, "token_usage") and step.token_usage: + log_entry["input_tokens"] = step.token_usage.input_tokens + log_entry["output_tokens"] = step.token_usage.output_tokens + log_entry["total_tokens"] = step.token_usage.total_tokens + + # Model input messages - convert to MASEval format + if hasattr(step, "model_input_messages") and step.model_input_messages: + log_entry["model_input_messages"] = self._convert_smolagents_messages(step.model_input_messages).to_list() + + # Tool calls (ToolCall objects) + if hasattr(step, "tool_calls") and step.tool_calls: + log_entry["tool_calls"] = [ + { + "id": tc.id, + "name": tc.name, + "arguments": tc.arguments, + } + for tc in step.tool_calls + ] + + # Error information + if step.error: + log_entry["error"] = str(step.error) + log_entry["error_type"] = type(step.error).__name__ + + # Model output message - convert to MASEval format + if hasattr(step, "model_output_message") and step.model_output_message: + converted = self._convert_smolagents_messages([step.model_output_message]) + if len(converted) > 0: + log_entry["model_output_message"] = converted[0] + + # Model output (raw) + if hasattr(step, "model_output") and 
step.model_output is not None: + log_entry["model_output"] = step.model_output + + # Code action (for CodeAgent) + if hasattr(step, "code_action") and step.code_action: + log_entry["code_action"] = step.code_action + + # Observations + if hasattr(step, "observations") and step.observations: + log_entry["observations"] = step.observations + + # Observations images + if hasattr(step, "observations_images") and step.observations_images: + log_entry["observations_images_count"] = len(step.observations_images) + + # Action output + if hasattr(step, "action_output") and step.action_output is not None: + # Convert to string if it's not JSON-serializable + try: + log_entry["action_output"] = step.action_output + except (TypeError, ValueError): + log_entry["action_output"] = str(step.action_output) + + # Is final answer flag + if hasattr(step, "is_final_answer"): + log_entry["is_final_answer"] = step.is_final_answer + + logs_list.append(log_entry) + + elif isinstance(step, PlanningStep): + log_entry = { + "step_type": "PlanningStep", + } + + # Timing information + if hasattr(step, "timing") and step.timing: + log_entry["start_time"] = step.timing.start_time + log_entry["end_time"] = step.timing.end_time + log_entry["duration_seconds"] = step.timing.duration + + # Token usage information + if hasattr(step, "token_usage") and step.token_usage: + log_entry["input_tokens"] = step.token_usage.input_tokens + log_entry["output_tokens"] = step.token_usage.output_tokens + log_entry["total_tokens"] = step.token_usage.total_tokens + + # Model input messages - convert to MASEval format + if hasattr(step, "model_input_messages") and step.model_input_messages: + log_entry["model_input_messages"] = self._convert_smolagents_messages(step.model_input_messages).to_list() + + # Model output message - convert to MASEval format + if hasattr(step, "model_output_message") and step.model_output_message: + converted = self._convert_smolagents_messages([step.model_output_message]) + if 
len(converted) > 0: + log_entry["model_output_message"] = converted[0] + + # Plan + if hasattr(step, "plan") and step.plan: + log_entry["plan"] = step.plan + + logs_list.append(log_entry) + + elif isinstance(step, TaskStep): + # Log task initiation + log_entry = { + "step_type": "TaskStep", + "task": step.task, + } + + # Task images if present + if hasattr(step, "task_images") and step.task_images: + log_entry["task_images_count"] = len(step.task_images) + + logs_list.append(log_entry) + + return logs_list + def gather_traces(self) -> dict: """Gather traces including message history and monitoring data. @@ -137,7 +290,7 @@ def gather_config(self) -> dict[str, Any]: - gathered_at: ISO timestamp - name: Agent name - agent_type: Underlying agent class name - - wrapper_type: SmolAgentAdapter + - adapter_type: SmolAgentAdapter - callbacks: List of callback class names - smolagents_config: Full configuration from agent.to_dict() including: - model: Model configuration with class and parameters @@ -198,61 +351,14 @@ def get_messages(self) -> MessageHistory: # Convert and return return self._convert_smolagents_messages(smol_messages) - def set_message_history(self, history: MessageHistory) -> None: - """Set message history - NOT SUPPORTED by smolagents. - - Args: - history: MASEval MessageHistory to set - - Raises: - NotImplementedError: smolagents doesn't support arbitrary message injection - """ - raise NotImplementedError( - "smolagents doesn't support setting arbitrary message history. " - "The agent's memory is built from execution steps and cannot be directly manipulated. " - "Use clear_message_history() to reset, then run() to generate new conversation." 
- ) - - def clear_message_history(self) -> None: - """Clear message history by resetting smolagents memory.""" - _check_smolagents_installed() - from smolagents.memory import AgentMemory - - # Get system prompt before clearing - system_prompt = "" - if hasattr(self.agent, "memory") and hasattr(self.agent.memory, "system_prompt"): - system_prompt = self.agent.memory.system_prompt - - # Reset memory - self.agent.memory = AgentMemory(system_prompt=system_prompt) - - # Also clear base class cache - super().clear_message_history() - - def append_to_message_history(self, role: str, content: Any, **kwargs) -> None: - """Append message to history - NOT SUPPORTED by smolagents. - - Args: - role: Message role - content: Message content (string or list) - **kwargs: Additional message fields - - Raises: - NotImplementedError: smolagents doesn't support arbitrary message injection - """ - raise NotImplementedError( - "smolagents doesn't support appending arbitrary messages to history. " - "The agent's memory is built from execution steps and cannot be directly manipulated. " - "Use run() to generate conversation messages." 
- ) - def _run_agent(self, query: str) -> str: _check_smolagents_installed() # Run the agent (this updates the agent's internal memory and returns the final answer) + # All execution details are tracked in agent.memory.steps automatically final_answer = self.agent.run(query) - # Return the final answer (traces are captured via get_messages()) + # Return the final answer (traces are captured via get_messages() and gather_traces()) return final_answer def _convert_smolagents_messages(self, smol_messages: list) -> MessageHistory: diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py index 04273dfb..d30989f2 100644 --- a/maseval/interface/inference/google_genai.py +++ b/maseval/interface/inference/google_genai.py @@ -44,7 +44,7 @@ def _extract_text(self, response: Any) -> str: if "candidates" in response and response["candidates"]: return response["candidates"][0].get("content", "") if "output" in response and isinstance(response["output"], list) and response["output"]: - # some wrappers return a list of text chunks + # some implementations return a list of text chunks first = response["output"][0] if isinstance(first, dict): return first.get("content", "") diff --git a/tests/README.md b/tests/README.md index 1746cd3f..269e1e56 100644 --- a/tests/README.md +++ b/tests/README.md @@ -101,7 +101,7 @@ Tests are organized into three directories following a **bottom-up and top-down Examples: - `test_model_adapter.py` - Base `ModelAdapter` class behavior -- `test_agent_wrapper.py` - Base `AgentAdapter` class behavior +- `test_agent_adapter.py` - Base `AgentAdapter` class behavior - `test_benchmark_lifecycle.py` - Core benchmark orchestration ### `test_interface/` @@ -110,7 +110,7 @@ Examples: Examples: -- `test_agent_integration/` - Framework-specific agent wrappers +- `test_agent_integration/` - Framework-specific agent adapters - `test_model_integration/` - Provider-specific model adapters (OpenAI, Google, HuggingFace, 
LiteLLM) ### `test_contract/` @@ -119,7 +119,7 @@ Examples: Contract tests use parametrized tests to verify that all implementations (e.g., different framework adapters) behave identically for key operations: -- `test_agent_wrapper_contract.py` - All `AgentAdapter` implementations return same message format, trigger callbacks uniformly +- `test_agent_adapter_contract.py` - All `AgentAdapter` implementations return same message format, trigger callbacks uniformly - `test_model_adapter_contract.py` - All `ModelAdapter` implementations log calls identically, produce same trace/config structure (65+ parameterized tests) - `test_collection_contract.py` - All components (Agent, Model, Environment, User) follow same tracing/config contracts diff --git a/tests/TESTING_PLAN.md b/tests/TESTING_PLAN.md index 1109bf0b..fc3292af 100644 --- a/tests/TESTING_PLAN.md +++ b/tests/TESTING_PLAN.md @@ -45,7 +45,7 @@ Every benchmark execution follows: **Setup → Run → Evaluate** - `setup_environment()` → creates isolated task environment - `setup_user()` → optional user simulator -- `setup_agents()` → instantiates agent wrappers +- `setup_agents()` → instantiates agent adapters - `run_agents()` → executes multi-agent system - Message collection and `evaluate()` → assessment @@ -110,7 +110,7 @@ All core functionality is fully tested. 
See individual test files in `tests/test - `test_message_history.py` (14 tests) - Message history interface and operations - `test_trace_collection.py` (10 tests) - Trace gathering from all components - `test_config_collection.py` (11 tests) - Configuration collection for reproducibility -- `test_agent_wrapper.py` (8 tests) - Agent wrapper base functionality +- `test_agent_adapter.py` (8 tests) - Agent adapter base functionality - `test_environment.py` (7 tests) - Environment state management and tools - `test_user_simulator.py` (5 tests) - User simulation for collaborative benchmarks - `test_model_adapter.py` (36 tests) - Model adapter comprehensive testing @@ -125,7 +125,7 @@ All core functionality is fully tested. See individual test files in `tests/test All contract tests validate cross-implementation consistency. See individual test files in `tests/test_contract/` for complete contract guarantees: -- `test_agent_wrapper_contract.py` (11 tests) - Framework-agnostic agent wrapper contract +- `test_agent_adapter_contract.py` (11 tests) - Framework-agnostic agent adapter contract - `test_collection_contract.py` (20 tests) - Universal tracing and config contract - `test_model_adapter_contract.py` (16 tests) - Model provider abstraction contract @@ -187,11 +187,11 @@ Test file: `tests/test_core/test_config_collection.py` **Why:** Reproducibility depends on comprehensive config capture. -#### 5. **Agent Wrapper Tests** ✅ FULLY IMPLEMENTED +#### 5. **Agent Adapter Tests** ✅ FULLY IMPLEMENTED **Status:** ✅ **COMPLETE** - 8 tests implemented -Test file: `tests/test_core/test_agent_wrapper.py` +Test file: `tests/test_core/test_agent_adapter.py` **What is tested:** See test file for complete list. Tests cover callback triggering, message history operations (get/set/clear/append), trace collection, and config gathering. 
@@ -314,7 +314,7 @@ Test file: `tests/test_core/test_benchmark_integration.py` (proposed) **Status:** ✅ **COMPLETE** - 11 tests implemented -Test file: `tests/test_contract/test_agent_wrapper_contract.py` +Test file: `tests/test_contract/test_agent_adapter_contract.py` **Purpose:** Validates that ALL AgentAdapter implementations (smolagents, langgraph, dummy) honor the same behavioral contract and behave identically for key operations. This is MASEval's **CORE PROMISE** - framework-agnostic agent abstraction. @@ -393,7 +393,7 @@ Test files: **Smolagents (10 tests):** -- Wrapper/user creation and import guards +- Adapter creation and import guards - Trace gathering with/without monitoring - Trace gathering with planning steps - Message manipulation support (not supported) @@ -401,7 +401,7 @@ Test files: **LangGraph (5 tests):** -- Wrapper import and availability checks +- Adapter import and availability checks - Message manipulation with/without system messages **Why:** Validates framework-specific adapters work correctly with their respective libraries and handle framework-specific features properly. @@ -421,7 +421,7 @@ Test files: **Thread Safety and Concurrency:** -- `test_wrapper_concurrent_runs()` - Multiple threads calling run() simultaneously +- `test_adapter_concurrent_runs()` - Multiple threads calling run() simultaneously - `test_trace_collection_thread_safety()` - Trace accumulation in concurrent execution - `test_callback_thread_safety()` - Callbacks triggered from multiple threads @@ -526,13 +526,13 @@ Each test file should: ### P1 (Should Have - High Value) ✅ ALL COMPLETE -6. ✅ Agent wrapper tests +6. ✅ Agent adapter tests 7. ✅ Environment tests 8. ✅ Callback orchestration tests 9. ✅ Task collection tests 10. ✅ Evaluator tests 11. ✅ Result logger callbacks -12. ✅ Contract tests (agent wrapper, collection, model adapter) +12. 
✅ Contract tests (agent adapter, collection, model adapter) ### P2 (Nice to Have - Completeness) ✅ ALL COMPLETE @@ -561,7 +561,7 @@ Shared fixtures implemented in `tests/conftest.py`: - `dummy_model` - DummyModelAdapter with configurable responses - `dummy_agent` - DummyAgent that tracks calls -- `dummy_agent_wrapper` - DummyAgentAdapter with message history +- `dummy_agent_adapter` - DummyAgentAdapter with message history - `dummy_environment` - DummyEnvironment with state management - `dummy_user` - DummyUser for simulation testing - `dummy_task` - Single Task instance @@ -614,7 +614,7 @@ pytest -x --ff # Stop on first failure, run previous failures first - ✅ **Test Count:** 333 tests implemented across 23 test files - ✅ **Core Coverage:** All P0 (4/5), P1 (7/7), and P2 (6/6) tests complete -- ✅ **Contract Coverage:** All contract tests implemented (agent wrapper, collection, model adapter) +- ✅ **Contract Coverage:** All contract tests implemented (agent adapter, collection, model adapter) - ✅ **Interface Coverage:** All adapter integration tests complete (agents + models) - 🟡 **Runtime:** Not yet measured - 🟡 **Reliability:** Not yet run in CI @@ -639,14 +639,14 @@ pytest -x --ff # Stop on first failure, run previous failures first ### Phase 2 (Week 2): Core Coverage ✅ COMPLETE -- ✅ Implement P1 tests (agent wrapper, environment, callbacks, tasks, evaluator) +- ✅ Implement P1 tests (agent adapter, environment, callbacks, tasks, evaluator) - ✅ Add callback orchestration tests - ✅ Message tracing callback specialized tests - ✅ Automatic registration tests ### Phase 3 (Week 3): Interface Coverage ✅ COMPLETE -- ✅ Contract tests (agent wrapper, collection, model adapter - 47 tests) +- ✅ Contract tests (agent adapter, collection, model adapter - 47 tests) - ✅ Smolagents integration (10 tests) - ✅ LangGraph integration (5 tests) - ✅ Model adapter integrations (22 tests across 4 providers) diff --git a/tests/conftest.py b/tests/conftest.py index 81ef463e..399eb916 
100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -92,19 +92,33 @@ def __str__(self): class DummyAgentAdapter(AgentAdapter): - """Test agent wrapper that populates message history.""" + """Test agent adapter that populates message history.""" def _run_agent(self, query: str) -> str: - # Create message history - history = MessageHistory() - history.add_message(role="user", content=query) + import time + + # Track timing + start_time = time.time() # Run underlying agent response = self.agent.run(query) - history.add_message(role="assistant", content=response) - # Store history - self.set_message_history(history) + # Store history directly + if self.messages is None: + self.messages = MessageHistory() + self.messages.add_message(role="user", content=query) + self.messages.add_message(role="assistant", content=response) + + # Populate logs to fulfill contract + duration = time.time() - start_time + self.logs.append( + { + "query": query, + "duration_seconds": duration, + "status": "success", + "response": response, + } + ) # Return final answer (not the history) return response @@ -199,8 +213,8 @@ def setup_agents( ) -> Tuple[Sequence[AgentAdapter], Dict[str, AgentAdapter]]: self.setup_agents_calls.append((agent_data, environment, task, user)) agent = DummyAgent() - wrapper = DummyAgentAdapter(agent, "test_agent") - return [wrapper], {"test_agent": wrapper} + agent_adapter = DummyAgentAdapter(agent, "test_agent") + return [agent_adapter], {"test_agent": agent_adapter} def setup_evaluators( self, environment: Environment, task: Task, agents: Sequence[AgentAdapter], user: Optional[User] @@ -240,8 +254,8 @@ def dummy_agent(): @pytest.fixture -def dummy_agent_wrapper(dummy_agent): - """Create a dummy agent wrapper.""" +def dummy_agent_adapter(dummy_agent): + """Create a dummy agent adapter.""" return DummyAgentAdapter(dummy_agent, "test_agent") diff --git a/tests/test_contract/test_agent_wrapper_contract.py b/tests/test_contract/test_agent_adapter_contract.py 
similarity index 70% rename from tests/test_contract/test_agent_wrapper_contract.py rename to tests/test_contract/test_agent_adapter_contract.py index 6cd32902..40fa4dbf 100644 --- a/tests/test_contract/test_agent_wrapper_contract.py +++ b/tests/test_contract/test_agent_adapter_contract.py @@ -115,7 +115,7 @@ def agent_node(state: State) -> State: response = mock_llm([{"role": "user", "content": user_msg}]) - # Return LangChain-style message objects so the wrapper conversion works + # Return LangChain-style message objects so the adapter conversion works return {"messages": messages + [AIMessage(content=response)]} # Build graph @@ -130,8 +130,8 @@ def agent_node(state: State) -> State: raise ValueError(f"Unknown framework: {framework}") -def create_wrapper_for_framework(framework: str, agent, callbacks: Optional[List[AgentCallback]] = None): - """Create a framework-specific wrapper instance.""" +def create_adapter_for_framework(framework: str, agent, callbacks: Optional[List[AgentCallback]] = None): + """Create a framework-specific adapter instance.""" # Verify agent is not None and is the expected type for the framework assert agent is not None, f"Agent instance is None for framework: {framework}" @@ -172,7 +172,7 @@ def create_wrapper_for_framework(framework: str, agent, callbacks: Optional[List class TestAgentAdapterContract: """Verify all AgentAdapter implementations honor the same contract.""" - def test_wrapper_run_returns_same_structure(self, framework): + def test_adapter_run_returns_same_structure(self, framework): """Test all frameworks return string result and populate message history. 
Contract: run() must return a string (the final answer) and populate @@ -180,16 +180,16 @@ def test_wrapper_run_returns_same_structure(self, framework): """ mock_llm = MockLLM(responses=["Test response to query"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent) + adapter = create_adapter_for_framework(framework, agent) - result = wrapper.run("Test query") + result = adapter.run("Test query") # All should return string (final answer) assert isinstance(result, str) assert len(result) > 0 # All should populate message history identically - history = wrapper.get_messages() + history = adapter.get_messages() assert len(history) > 0 # Some frameworks (smolagents) prepend a system message; accept either. @@ -198,7 +198,7 @@ def test_wrapper_run_returns_same_structure(self, framework): # Ensure at least one assistant/tool message exists somewhere in the history assert any(msg.get("role") in ["assistant", "tool"] for msg in history) - def test_wrapper_message_format_identical(self, framework): + def test_adapter_message_format_identical(self, framework): """Test all frameworks produce OpenAI-compatible message format. 
Contract: All messages must have 'role' and 'content' keys, matching @@ -206,10 +206,10 @@ def test_wrapper_message_format_identical(self, framework): """ mock_llm = MockLLM(responses=["Response content"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent) + adapter = create_adapter_for_framework(framework, agent) - wrapper.run("Test query") - history = wrapper.get_messages() + adapter.run("Test query") + history = adapter.get_messages() # Verify OpenAI format for msg in history: @@ -219,18 +219,18 @@ def test_wrapper_message_format_identical(self, framework): allowed = {"user", "assistant", "system", "tool"} assert role in allowed or role.startswith("tool"), f"Invalid role: {msg['role']}" - def test_wrapper_callbacks_triggered_uniformly(self, framework): + def test_adapter_callbacks_triggered_uniformly(self, framework): """Test callbacks fire in same order across all frameworks. Contract: on_run_start and on_run_end callbacks must fire in the - correct order (start before run, end after run) for all wrappers. + correct order (start before run, end after run) for all adapters. 
""" callback_tracker = CallbackTracker() mock_llm = MockLLM(responses=["Response"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent, callbacks=[callback_tracker]) + adapter = create_adapter_for_framework(framework, agent, callbacks=[callback_tracker]) - wrapper.run("Test query") + adapter.run("Test query") # All frameworks should trigger same callback sequence assert "on_agent_start" in callback_tracker.events @@ -238,20 +238,20 @@ def test_wrapper_callbacks_triggered_uniformly(self, framework): assert callback_tracker.events[0] == "on_agent_start" assert callback_tracker.events[-1] == "on_agent_end" - def test_wrapper_traces_same_structure(self, framework): + def test_adapter_traces_same_structure(self, framework): """Test gather_traces returns consistent structure across frameworks. - Contract: All wrappers must provide message history in traces, enabling + Contract: All adapters must provide message history in traces, enabling uniform access to execution data regardless of underlying framework. """ mock_llm = MockLLM(responses=["Response"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent) + adapter = create_adapter_for_framework(framework, agent) - wrapper.run("Test query") - traces = wrapper.gather_traces() + adapter.run("Test query") + traces = adapter.gather_traces() - # All should include message history; different wrappers name this key + # All should include message history; different adapters name this key if "message_history" in traces: messages = traces["message_history"] else: @@ -260,24 +260,24 @@ def test_wrapper_traces_same_structure(self, framework): assert isinstance(messages, list) assert len(messages) > 0 - def test_wrapper_config_same_structure(self, framework): + def test_adapter_config_same_structure(self, framework): """Test gather_config returns consistent structure across frameworks. 
- Contract: All wrappers must provide agent name in config, enabling + Contract: All adapters must provide agent name in config, enabling identification and reproducibility tracking. """ mock_llm = MockLLM(responses=["Response"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent) + adapter = create_adapter_for_framework(framework, agent) - config = wrapper.gather_config() + config = adapter.gather_config() # All should include agent name assert "agent_name" in config or "name" in config # All should include some identifying information assert len(config) > 0 - def test_wrapper_get_messages_after_multiple_runs(self, framework): + def test_adapter_get_messages_after_multiple_runs(self, framework): """Test message history accumulation across multiple agent runs. Contract: Message history behavior during multi-turn conversations must @@ -285,24 +285,24 @@ def test_wrapper_get_messages_after_multiple_runs(self, framework): """ mock_llm = MockLLM(responses=["First response", "Second response"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent) + adapter = create_adapter_for_framework(framework, agent) # First run - wrapper.run("First query") - history_1 = wrapper.get_messages() + adapter.run("First query") + history_1 = adapter.get_messages() len_1 = len(history_1) assert len_1 > 0 # Second run (behavior may differ: some accumulate, some reset) - wrapper.run("Second query") - history_2 = wrapper.get_messages() + adapter.run("Second query") + history_2 = adapter.get_messages() len_2 = len(history_2) # At minimum, should have messages from second run assert len_2 > 0 # Note: We don't enforce accumulation vs reset - that's framework-specific - def test_wrapper_empty_query_handling(self, framework): + def test_adapter_empty_query_handling(self, framework): """All frameworks handle empty queries gracefully. 
Note: This test accepts both success and failure for empty queries. @@ -312,21 +312,21 @@ def test_wrapper_empty_query_handling(self, framework): """ mock_llm = MockLLM(responses=["Response to empty"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent) + adapter = create_adapter_for_framework(framework, agent) # Should not crash on empty query try: - result = wrapper.run("") + result = adapter.run("") # If it succeeds, should return something assert result is not None except (ValueError, AssertionError): # It's acceptable to reject empty queries pass - def test_wrapper_on_event_callback(self, framework): + def test_adapter_on_event_callback(self, framework): """Test that standard callback hooks fire consistently across frameworks. - Contract: All wrappers must fire on_run_start and on_run_end callbacks. + Contract: All adapters must fire on_run_start and on_run_end callbacks. The on_event hook is optional for custom events. """ events = [] @@ -343,20 +343,20 @@ def on_run_end(self, agent, result): mock_llm = MockLLM(responses=["Response"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent, callbacks=[EventTracker()]) + adapter = create_adapter_for_framework(framework, agent, callbacks=[EventTracker()]) - wrapper.run("Test query") + adapter.run("Test query") # Verify standard callbacks fired event_types = [e[0] for e in events] assert "on_run_start" in event_types assert "on_run_end" in event_types - # Note: on_event() is a generic hook that wrappers can use to emit custom events. + # Note: on_event() is a generic hook that adapters can use to emit custom events. # The base AgentAdapter doesn't emit any events by default, but the callback - # mechanism should work if wrappers choose to use it. + # mechanism should work if adapters choose to use it. 
- def test_wrapper_callback_lifecycle_order(self, framework): + def test_adapter_callback_lifecycle_order(self, framework): """Test callbacks fire in correct lifecycle order with proper state. Contract: on_run_start fires before execution with initial state, @@ -379,9 +379,9 @@ def on_run_end(self, agent, result): mock_llm = MockLLM(responses=["Test response"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent, callbacks=[LifecycleTracker()]) + adapter = create_adapter_for_framework(framework, agent, callbacks=[LifecycleTracker()]) - result = wrapper.run("Test query") + result = adapter.run("Test query") # Verify callback order assert len(lifecycle_events) == 2 @@ -399,7 +399,7 @@ def on_run_end(self, agent, result): # Verify result is passed to on_run_end assert lifecycle_events[1][2] == result - def test_wrapper_multiple_callbacks(self, framework): + def test_adapter_multiple_callbacks(self, framework): """Test multiple callbacks execute in registration order. Contract: When multiple callbacks are registered, they must execute @@ -423,9 +423,9 @@ def on_run_end(self, agent, result): mock_llm = MockLLM(responses=["Response"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent, callbacks=[FirstCallback(), SecondCallback()]) + agent_adapter = create_adapter_for_framework(framework, agent, callbacks=[FirstCallback(), SecondCallback()]) - wrapper.run("Test query") + agent_adapter.run("Test query") # Verify all callbacks fired assert len(call_order) == 4 @@ -433,35 +433,101 @@ def on_run_end(self, agent, result): # Verify order: all on_run_start before any on_run_end assert call_order == ["first_start", "second_start", "first_end", "second_end"] - def test_wrapper_message_history_after_clear_and_run(self, framework): - """Test message history clear resets state for fresh conversations. 
+ def test_adapter_logs_populated_after_run(self, framework): + """Test all adapters populate self.logs during execution. - Contract: clear_message_history must fully reset history state, and - subsequent run() calls must start with clean history regardless of - framework implementation details. + Contract: All AgentAdapter implementations must populate the self.logs + attribute with execution information. This enables uniform access to + detailed execution traces regardless of the underlying framework. - Note: smolagents maintains a system message after clear. + The logs should contain basic execution information that can be used + for debugging, monitoring, and evaluation purposes. + """ + mock_llm = MockLLM(responses=["Test response"]) + agent = create_agent_for_framework(framework, mock_llm) + adapter = create_adapter_for_framework(framework, agent) + + # Before run, logs should be empty + assert isinstance(adapter.logs, list) + initial_log_count = len(adapter.logs) + + # Run the agent + adapter.run("Test query") + + # After run, logs should be populated + assert len(adapter.logs) > initial_log_count + assert isinstance(adapter.logs, list) + + # Verify logs contain useful information (at least one entry) + # Different frameworks may structure logs differently, but all should have entries + assert len(adapter.logs) > 0 + + def test_adapter_logs_in_gather_traces(self, framework): + """Test that gather_traces includes logs field. + + Contract: The gather_traces() method must include the logs field, + providing a unified way to access execution details across all frameworks. 
+ """ + mock_llm = MockLLM(responses=["Test response"]) + agent = create_agent_for_framework(framework, mock_llm) + adapter = create_adapter_for_framework(framework, agent) + + # Run the agent + adapter.run("Test query") + + # Gather traces + traces = adapter.gather_traces() + + # Verify logs field exists and is populated + assert "logs" in traces + assert isinstance(traces["logs"], list) + assert len(traces["logs"]) > 0 + + def test_adapter_logs_structure_has_basic_info(self, framework): + """Test that logs entries contain basic execution information. + + Contract: While the exact structure of log entries may vary by framework, + all implementations should provide basic execution information in their logs. + This test verifies that log entries are dictionaries containing some form + of execution data. + """ + mock_llm = MockLLM(responses=["Test response"]) + agent = create_agent_for_framework(framework, mock_llm) + adapter = create_adapter_for_framework(framework, agent) + + # Run the agent + adapter.run("Test query") + + # Verify logs contain dict entries with data + logs = adapter.logs + assert len(logs) > 0 + + # Each log entry should be a dictionary + for log_entry in logs: + assert isinstance(log_entry, dict) + # Should have at least one field with information + assert len(log_entry) > 0 + + def test_adapter_logs_accumulate_across_runs(self, framework): + """Test that logs accumulate or reset consistently across multiple run + calls to the agent. + + Contract: Adapter logs should maintain a consistent lifecycle behavior + across runs. 
""" mock_llm = MockLLM(responses=["First response", "Second response"]) agent = create_agent_for_framework(framework, mock_llm) - wrapper = create_wrapper_for_framework(framework, agent) + adapter = create_adapter_for_framework(framework, agent) # First run - wrapper.run("First query") - history_1 = wrapper.get_messages() - assert len(history_1) > 0 - - # Clear and verify empty (or just system message for smolagents) - wrapper.clear_message_history() - history_after_clear = wrapper.get_messages() - expected_after_clear = 1 if framework == "smolagents" else 0 # smolagents keeps system message - assert len(history_after_clear) == expected_after_clear - - # Second run should populate new history - wrapper.run("Second query") - history_2 = wrapper.get_messages() - assert len(history_2) > expected_after_clear # Should have more than just system message - - # History should only contain second run's messages - # (exact count depends on framework, but should have at least one message) - assert any("Second query" in str(msg.get("content", "")) for msg in history_2) + adapter.run("First query") + logs_count_after_first = len(adapter.logs) + assert logs_count_after_first > 0 + + # Second run + adapter.run("Second query") + logs_count_after_second = len(adapter.logs) + + # Logs should either accumulate or stay consistent + # (we accept both behaviors as long as logs are populated) + assert logs_count_after_second > 0 diff --git a/tests/test_contract/test_collection_contract.py b/tests/test_contract/test_collection_contract.py index 1df13085..6e77b8b3 100644 --- a/tests/test_contract/test_collection_contract.py +++ b/tests/test_contract/test_collection_contract.py @@ -124,7 +124,7 @@ def remove_timestamp(data: Dict[str, Any]) -> Dict[str, Any]: def create_agent_for_framework(framework: str): - """Create agent wrapper for specified framework.""" + """Create agent adapter for specified framework.""" if framework == "dummy": agent = DummyAgent() return DummyAgentAdapter(agent, 
"test_agent") @@ -187,7 +187,7 @@ class TestUniversalTracingContract: """Test that ALL components follow the same tracing contract.""" def test_agent_traces_have_base_fields(self): - """Agent wrappers must include base trace fields including name.""" + """agent adapters must include base trace fields including name.""" agent = create_agent_for_framework("dummy") agent.run("Test query") @@ -250,7 +250,7 @@ class TestUniversalConfigContract: """Test that ALL components follow the same configuration contract.""" def test_agent_config_has_base_fields(self): - """Agent wrappers must include base config fields including name.""" + """agent adapters must include base config fields including name.""" agent = create_agent_for_framework("dummy") config = agent.gather_config() @@ -320,7 +320,7 @@ def test_config_never_raises_exceptions(self): @pytest.mark.interface @pytest.mark.parametrize("framework", ["dummy", "smolagents", "langgraph"]) class TestCrossFrameworkTracingConsistency: - """Test that agent wrappers have consistent tracing across frameworks.""" + """Test that agent adapters have consistent tracing across frameworks.""" def test_all_frameworks_return_same_base_structure(self, framework): """All frameworks must return same base trace structure.""" @@ -355,12 +355,12 @@ def test_all_frameworks_return_same_base_config(self, framework): # All agents must have these fields (AgentAdapter contract) assert "agent_type" in config, "Missing 'agent_type' field" - assert "wrapper_type" in config, "Missing 'wrapper_type' field" + assert "adapter_type" in config, "Missing 'adapter_type' field" assert "callbacks" in config, "Missing 'callbacks' field" # Verify types assert isinstance(config["agent_type"], str) - assert isinstance(config["wrapper_type"], str) + assert isinstance(config["adapter_type"], str) assert isinstance(config["callbacks"], list) def test_all_frameworks_have_json_serializable_traces(self, framework): diff --git a/tests/test_core/test_agent_adapter.py 
b/tests/test_core/test_agent_adapter.py new file mode 100644 index 00000000..10161403 --- /dev/null +++ b/tests/test_core/test_agent_adapter.py @@ -0,0 +1,93 @@ +"""Test AgentAdapter functionality. + +These tests verify that AgentAdapter provides the correct interface for +adapting agents from any framework. +""" + +import pytest +from maseval import MessageHistory + + +@pytest.mark.core +class TestAgentAdapter: + """Tests for AgentAdapter interface and behavior.""" + + def test_agent_adapter_run_triggers_callbacks(self, dummy_agent_adapter): + """Test that run() triggers agent callbacks.""" + from maseval import AgentCallback + + # Track callback invocations + callback_calls = [] + + class TrackingCallback(AgentCallback): + def on_run_start(self, agent): + callback_calls.append(("start", agent.name)) + + def on_run_end(self, agent, result): + callback_calls.append(("end", agent.name, result)) + + dummy_agent_adapter.callbacks = [TrackingCallback()] + _ = dummy_agent_adapter.run("Test query") + + assert len(callback_calls) == 2 + assert callback_calls[0] == ("start", "test_agent") + assert callback_calls[1][0] == "end" + assert callback_calls[1][1] == "test_agent" + assert "Response to: Test query" in callback_calls[1][2] + + def test_agent_adapter_get_messages_returns_history(self, dummy_agent_adapter): + """Test that get_messages() returns MessageHistory.""" + # Before run, should return empty history + history = dummy_agent_adapter.get_messages() + assert isinstance(history, MessageHistory) + assert len(history) == 0 + + # After run, should have messages + dummy_agent_adapter.run("Test query") + history = dummy_agent_adapter.get_messages() + assert len(history) == 2 + assert history[0]["role"] == "user" + assert history[1]["role"] == "assistant" + + def test_agent_adapter_gather_traces_includes_messages(self, dummy_agent_adapter): + """Test that gather_traces() includes message history.""" + dummy_agent_adapter.run("Test query") + + traces = 
dummy_agent_adapter.gather_traces() + + assert "type" in traces + assert "gathered_at" in traces + assert "name" in traces + assert "agent_type" in traces + assert "message_count" in traces + assert "messages" in traces + + assert traces["name"] == "test_agent" + assert traces["message_count"] == 2 + assert len(traces["messages"]) == 2 + + def test_agent_adapter_gather_config(self, dummy_agent_adapter): + """Test that gather_config() returns configuration.""" + config = dummy_agent_adapter.gather_config() + + assert "type" in config + assert "gathered_at" in config + assert "name" in config + assert "agent_type" in config + + assert config["name"] == "test_agent" + assert config["type"] == "DummyAgentAdapter" + + def test_agent_adapter_multiple_runs(self, dummy_agent_adapter): + """Test that adapter can be run multiple times and history accumulates.""" + result1 = dummy_agent_adapter.run("Query 1") + assert "Query 1" in result1 + + result2 = dummy_agent_adapter.run("Query 2") + assert "Query 2" in result2 + + # History should have both runs + history = dummy_agent_adapter.get_messages() + assert len(history) == 4 # 2 messages per run + assert history[0]["content"] == "Query 1" + assert history[2]["content"] == "Query 2" diff --git a/tests/test_core/test_agent_wrapper.py b/tests/test_core/test_agent_wrapper.py deleted file mode 100644 index d91d8be2..00000000 --- a/tests/test_core/test_agent_wrapper.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Test AgentAdapter functionality. - -These tests verify that AgentAdapter provides the correct interface for -wrapping agents from any framework. 
-""" - -import pytest -from maseval import MessageHistory - - -@pytest.mark.core -class TestAgentAdapter: - """Tests for AgentAdapter interface and behavior.""" - - def test_agent_wrapper_run_triggers_callbacks(self, dummy_agent_wrapper): - """Test that run() triggers agent callbacks.""" - from maseval import AgentCallback - - # Track callback invocations - callback_calls = [] - - class TrackingCallback(AgentCallback): - def on_run_start(self, agent): - callback_calls.append(("start", agent.name)) - - def on_run_end(self, agent, result): - callback_calls.append(("end", agent.name, result)) - - dummy_agent_wrapper.callbacks = [TrackingCallback()] - _ = dummy_agent_wrapper.run("Test query") - - assert len(callback_calls) == 2 - assert callback_calls[0] == ("start", "test_agent") - assert callback_calls[1][0] == "end" - assert callback_calls[1][1] == "test_agent" - assert "Response to: Test query" in callback_calls[1][2] - - def test_agent_wrapper_get_messages_returns_history(self, dummy_agent_wrapper): - """Test that get_messages() returns MessageHistory.""" - # Before run, should return empty history - history = dummy_agent_wrapper.get_messages() - assert isinstance(history, MessageHistory) - assert len(history) == 0 - - # After run, should have messages - dummy_agent_wrapper.run("Test query") - history = dummy_agent_wrapper.get_messages() - assert len(history) == 2 - assert history[0]["role"] == "user" - assert history[1]["role"] == "assistant" - - def test_agent_wrapper_set_message_history(self, dummy_agent_wrapper): - """Test that message history can be set manually.""" - new_history = MessageHistory() - new_history.add_message("user", "Custom message") - new_history.add_message("assistant", "Custom response") - - dummy_agent_wrapper.set_message_history(new_history) - - retrieved = dummy_agent_wrapper.get_messages() - assert len(retrieved) == 2 - assert retrieved[0]["content"] == "Custom message" - assert retrieved[1]["content"] == "Custom response" - - def 
test_agent_wrapper_clear_message_history(self, dummy_agent_wrapper): - """Test that message history can be cleared.""" - dummy_agent_wrapper.run("Test") - assert len(dummy_agent_wrapper.get_messages()) > 0 - - dummy_agent_wrapper.clear_message_history() - assert len(dummy_agent_wrapper.get_messages()) == 0 - - def test_agent_wrapper_append_to_message_history(self, dummy_agent_wrapper): - """Test that messages can be appended to history.""" - dummy_agent_wrapper.append_to_message_history("user", "First message") - dummy_agent_wrapper.append_to_message_history("assistant", "First response") - - history = dummy_agent_wrapper.get_messages() - assert len(history) == 2 - assert history[0]["content"] == "First message" - assert history[1]["content"] == "First response" - - def test_agent_wrapper_gather_traces_includes_messages(self, dummy_agent_wrapper): - """Test that gather_traces() includes message history.""" - dummy_agent_wrapper.run("Test query") - - traces = dummy_agent_wrapper.gather_traces() - - assert "type" in traces - assert "gathered_at" in traces - assert "name" in traces - assert "agent_type" in traces - assert "message_count" in traces - assert "messages" in traces - - assert traces["name"] == "test_agent" - assert traces["message_count"] == 2 - assert len(traces["messages"]) == 2 - - def test_agent_wrapper_gather_config(self, dummy_agent_wrapper): - """Test that gather_config() returns configuration.""" - config = dummy_agent_wrapper.gather_config() - - assert "type" in config - assert "gathered_at" in config - assert "name" in config - assert "agent_type" in config - - assert config["name"] == "test_agent" - assert config["type"] == "DummyAgentAdapter" - - def test_agent_wrapper_multiple_runs(self, dummy_agent_wrapper): - """Test that wrapper can be run multiple times.""" - result1 = dummy_agent_wrapper.run("Query 1") - assert "Query 1" in result1 - - # Clear history for second run - dummy_agent_wrapper.clear_message_history() - - result2 = 
dummy_agent_wrapper.run("Query 2") - assert "Query 2" in result2 - - # History should only have second run - history = dummy_agent_wrapper.get_messages() - assert len(history) == 2 - assert history[0]["content"] == "Query 2" diff --git a/tests/test_core/test_benchmark/test_automatic_registration.py b/tests/test_core/test_benchmark/test_automatic_registration.py index 6ead667a..e1c55aa7 100644 --- a/tests/test_core/test_benchmark/test_automatic_registration.py +++ b/tests/test_core/test_benchmark/test_automatic_registration.py @@ -88,12 +88,12 @@ def test_duplicate_registration_helpful_message(): # Create and register an agent agent = DummyAgent() - wrapper = DummyAgentAdapter(agent, "my_agent") - benchmark.register("agents", "first_name", wrapper) + agent_adapter = DummyAgentAdapter(agent, "my_agent") + benchmark.register("agents", "first_name", agent_adapter) # Try to register again with different name with pytest.raises(ValueError) as exc_info: - benchmark.register("agents", "second_name", wrapper) + benchmark.register("agents", "second_name", agent_adapter) error_message = str(exc_info.value) assert "already registered as 'agents:first_name'" in error_message diff --git a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py index fa8a0e84..0344872d 100644 --- a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py +++ b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py @@ -318,8 +318,8 @@ def _run_agent(self, query: str) -> str: class TaskFailureBenchmark(DummyBenchmark): def setup_agents(self, agent_data, environment, task, user): agent = FailingAgent() - wrapper = FailingAgentAdapter(agent, "failing_agent") - return [wrapper], {"failing_agent": wrapper} + agent_adapter = FailingAgentAdapter(agent, "failing_agent") + return [agent_adapter], {"failing_agent": agent_adapter} tasks = TaskCollection.from_list([{"query": "Test query", "environment_data": {}}]) benchmark = 
TaskFailureBenchmark( @@ -354,8 +354,8 @@ def _run_agent(self, query: str) -> str: class TaskFailureBenchmark(DummyBenchmark): def setup_agents(self, agent_data, environment, task, user): agent = FailingAgent() - wrapper = FailingAgentAdapter(agent, "failing_agent") - return [wrapper], {"failing_agent": wrapper} + agent_adapter = FailingAgentAdapter(agent, "failing_agent") + return [agent_adapter], {"failing_agent": agent_adapter} tasks = TaskCollection.from_list([{"query": "Test query", "environment_data": {}}]) benchmark = TaskFailureBenchmark( @@ -440,13 +440,13 @@ def __init__(self, *args, **kwargs): def setup_agents(self, agent_data, environment, task, user): if self.task_counter == 1: # Fail second task agent = FailingAgent() - wrapper = FailingAgentAdapter(agent, "failing") + agent_adapter = FailingAgentAdapter(agent, "failing") else: agent = DummyAgent() - wrapper = DummyAgentAdapter(agent, "test_agent") + agent_adapter = DummyAgentAdapter(agent, "test_agent") self.task_counter += 1 - return [wrapper], {wrapper.name: wrapper} + return [agent_adapter], {agent_adapter.name: agent_adapter} tasks = TaskCollection.from_list( [ @@ -611,13 +611,13 @@ def setup_agents(self, agent_data, environment, task, user): # Fail second task on first run only if self.task_counter == 1 and self.fail_on_first_run: agent = FailingAgent() - wrapper = FailingAgentAdapter(agent, "failing") + agent_adapter = FailingAgentAdapter(agent, "failing") else: agent = DummyAgent() - wrapper = DummyAgentAdapter(agent, "test_agent") + agent_adapter = DummyAgentAdapter(agent, "test_agent") self.task_counter += 1 - return [wrapper], {wrapper.name: wrapper} + return [agent_adapter], {agent_adapter.name: agent_adapter} tasks = TaskCollection.from_list( [ diff --git a/tests/test_core/test_benchmark/test_config_collection.py b/tests/test_core/test_benchmark/test_config_collection.py index 15076b3c..e1a4ad3c 100644 --- a/tests/test_core/test_benchmark/test_config_collection.py +++ 
b/tests/test_core/test_benchmark/test_config_collection.py @@ -150,7 +150,7 @@ def test_config_handles_component_errors_gracefully(self): from conftest import DummyBenchmark from maseval import AgentAdapter - class FailingConfigWrapper(AgentAdapter): + class FailingConfigAdapter(AgentAdapter): def _run_agent(self, query: str) -> str: return "success" @@ -163,8 +163,8 @@ def setup_agents(self, agent_data, environment, task, user): from conftest import DummyAgent agent = DummyAgent() - wrapper = FailingConfigWrapper(agent, "failing_agent") - return [wrapper], {"failing_agent": wrapper} # type: ignore[return-value] + agent_adapter = FailingConfigAdapter(agent, "failing_agent") + return [agent_adapter], {"failing_agent": agent_adapter} # type: ignore[return-value] tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}]) benchmark = TestBenchmark(agent_data={"model": "test"}) diff --git a/tests/test_core/test_benchmark/test_trace_collection.py b/tests/test_core/test_benchmark/test_trace_collection.py index 6d89d2e2..2b801a70 100644 --- a/tests/test_core/test_benchmark/test_trace_collection.py +++ b/tests/test_core/test_benchmark/test_trace_collection.py @@ -86,8 +86,8 @@ def setup_agents(self, agent_data, environment, task, user): from conftest import DummyAgent agent = DummyAgent() - wrapper = FailingAgentAdapter(agent, "failing_agent") - return [wrapper], {"failing_agent": wrapper} # type: ignore[return-value] + agent_adapter = FailingAgentAdapter(agent, "failing_agent") + return [agent_adapter], {"failing_agent": agent_adapter} # type: ignore[return-value] tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}]) benchmark = TestBenchmark(agent_data={"model": "test"}) @@ -125,10 +125,10 @@ def setup_agents(self, agent_data, environment, task, user): from conftest import DummyAgent agent = DummyAgent() - wrapper = ModelUsingAgentAdapter(agent, "test_agent", model) + agent_adapter = ModelUsingAgentAdapter(agent, "test_agent", 
model) # Manually register the model self.register("models", "test_model", model) - return [wrapper], {"test_agent": wrapper} # type: ignore[return-value] + return [agent_adapter], {"test_agent": agent_adapter} # type: ignore[return-value] tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}]) benchmark = TestBenchmark(agent_data={"model": "test"}) diff --git a/tests/test_core/test_message_tracing_callback.py b/tests/test_core/test_message_tracing_callback.py index d7f11bb1..bfd6b9ed 100644 --- a/tests/test_core/test_message_tracing_callback.py +++ b/tests/test_core/test_message_tracing_callback.py @@ -14,10 +14,10 @@ from conftest import DummyAgent -class TestAgentAdapter(AgentAdapter): - """Test wrapper implementation that populates message history for testing. +class TracingTestAgentAdapter(AgentAdapter): + """Test adapter implementation that populates message history for testing. - This wrapper simulates realistic agent behavior by creating proper message + This adapter simulates realistic agent behavior by creating proper message histories with user queries, assistant responses, and optional tool calls. 
""" @@ -47,8 +47,8 @@ def _run_agent(self, query: str) -> str: # Normal response without tools history.add_message(role="assistant", content=response) - # Store history so get_messages() can retrieve it - self.set_message_history(history) + # Store history directly + self.messages = history return response @@ -76,10 +76,10 @@ def test_basic_tracing(self): """ callback = MessageTracingAgentCallback() agent = DummyAgent() - wrapper = TestAgentAdapter(agent, name="test_agent", callbacks=[callback]) + agent_adapter = TracingTestAgentAdapter(agent, name="test_agent", callbacks=[callback]) - # Run query - wrapper.run("Test query") + # Run agent - should trigger callbacks + agent_adapter.run("Test query") # Check traced conversation conversations = callback.get_all_conversations() @@ -101,12 +101,11 @@ def test_multiple_conversations(self): """ callback = MessageTracingAgentCallback() agent = DummyAgent() - wrapper = TestAgentAdapter(agent, name="agent1", callbacks=[callback]) + agent_adapter = TracingTestAgentAdapter(agent, name="agent1", callbacks=[callback]) - # Run multiple queries queries = ["Query 1", "Query 2", "Query 3"] for query in queries: - wrapper.run(query) + agent_adapter.run(query) # Check all conversations traced conversations = callback.get_all_conversations() @@ -124,9 +123,9 @@ def test_metadata_included(self): """ callback = MessageTracingAgentCallback(include_metadata=True) agent = DummyAgent() - wrapper = TestAgentAdapter(agent, name="agent", callbacks=[callback]) + agent_adapter = TracingTestAgentAdapter(agent, name="agent", callbacks=[callback]) - wrapper.run("Test query with tool") + agent_adapter.run("Test query with tool") conv = callback.get_all_conversations()[0] assert "metadata" in conv @@ -143,9 +142,9 @@ def test_metadata_excluded(self): """ callback = MessageTracingAgentCallback(include_metadata=False) agent = DummyAgent() - wrapper = TestAgentAdapter(agent, name="agent", callbacks=[callback]) + agent_adapter = 
TracingTestAgentAdapter(agent, name="agent", callbacks=[callback]) - wrapper.run("Test query") + agent_adapter.run("Test query") conv = callback.get_all_conversations()[0] assert "metadata" not in conv @@ -164,15 +163,15 @@ def test_multi_agent_tracing(self): # Create two agents agent1 = DummyAgent() - wrapper1 = TestAgentAdapter(agent1, name="agent1", callbacks=[callback]) + adapter1 = TracingTestAgentAdapter(agent1, name="agent1", callbacks=[callback]) agent2 = DummyAgent() - wrapper2 = TestAgentAdapter(agent2, name="agent2", callbacks=[callback]) + adapter2 = TracingTestAgentAdapter(agent2, name="agent2", callbacks=[callback]) - # Run queries on both - wrapper1.run("Query for agent1") - wrapper2.run("Query for agent2") - wrapper1.run("Another query for agent1") + # Run both agents + adapter1.run("Query for agent1") + adapter2.run("Query for agent2") + adapter1.run("Another query for agent1") # Check all conversations traced conversations = callback.get_all_conversations() @@ -193,11 +192,11 @@ def test_statistics(self): """ callback = MessageTracingAgentCallback() agent = DummyAgent() - wrapper = TestAgentAdapter(agent, name="test_agent", callbacks=[callback]) + agent_adapter = TracingTestAgentAdapter(agent, name="test_agent", callbacks=[callback]) - # Run queries - wrapper.run("Query 1") - wrapper.run("Query 2 with tool") + # Run multiple times + agent_adapter.run("Query 1") + agent_adapter.run("Query 2 with tool") stats = callback.get_statistics() @@ -218,11 +217,11 @@ def test_clear(self): """ callback = MessageTracingAgentCallback() agent = DummyAgent() - wrapper = TestAgentAdapter(agent, name="agent", callbacks=[callback]) + agent_adapter = TracingTestAgentAdapter(agent, name="agent", callbacks=[callback]) # Trace some conversations - wrapper.run("Query 1") - wrapper.run("Query 2") + agent_adapter.run("Query 1") + agent_adapter.run("Query 2") assert len(callback.get_all_conversations()) == 2 # Clear @@ -238,10 +237,10 @@ def test_tool_call_tracing(self): 
""" callback = MessageTracingAgentCallback() agent = DummyAgent() - wrapper = TestAgentAdapter(agent, name="agent", callbacks=[callback]) + agent_adapter = TracingTestAgentAdapter(agent, name="agent", callbacks=[callback]) # Run query that triggers tool call - wrapper.run("Query with tool") + agent_adapter.run("Query with tool") conv = callback.get_all_conversations()[0] @@ -259,19 +258,21 @@ def test_no_history_handling(self): """Test graceful handling when agent returns empty message history. Verifies that callback creates valid conversation records even when - agent wrappers return empty histories (edge case for minimal agents). + agent adapters return empty histories (edge case for minimal agents). """ + callback = MessageTracingAgentCallback() + agent = DummyAgent() - class NoHistoryWrapper(AgentAdapter): - def _run_agent(self, query: str) -> MessageHistory: - # Don't populate history + class NoHistoryAdapter(AgentAdapter): + def get_messages(self): return MessageHistory() - callback = MessageTracingAgentCallback() - agent = DummyAgent() - wrapper = NoHistoryWrapper(agent, name="agent", callbacks=[callback]) + def _run_agent(self, query: str): + return MessageHistory() + + agent_adapter = NoHistoryAdapter(agent, name="agent", callbacks=[callback]) - wrapper.run("Test") + agent_adapter.run("Test") # Should still trace, but with empty messages conversations = callback.get_all_conversations() diff --git a/tests/test_interface/test_agent_integration/test_langgraph_integration.py b/tests/test_interface/test_agent_integration/test_langgraph_integration.py index 32546b25..5d5d646a 100644 --- a/tests/test_interface/test_agent_integration/test_langgraph_integration.py +++ b/tests/test_interface/test_agent_integration/test_langgraph_integration.py @@ -13,7 +13,7 @@ pytestmark = [pytest.mark.interface, pytest.mark.langgraph] -def test_langgraph_wrapper_import(): +def test_langgraph_adapter_import(): """Test that LangGraphAgentAdapter can be imported when langgraph is 
installed.""" from maseval.interface.agents.langgraph import LangGraphAgentAdapter, LangGraphUser @@ -37,31 +37,42 @@ def test_check_langgraph_installed_function(): _check_langgraph_installed() -def test_langgraph_wrapper_message_manipulation(): - """Test that LangGraphAgentAdapter supports message history manipulation. +def test_langgraph_adapter_logs_after_run(): + """Test that LangGraphAgentAdapter.logs is populated after run(). - LangGraph supports manually managing message history through: - - append_to_message_history: Add individual messages - - set_message_history: Replace entire history - - clear_message_history: Remove all messages - - get_messages: Retrieve current history - - This is useful for multi-turn conversations and testing scenarios. + This test validates that the manual logging implementation in LangGraphAgentAdapter + captures all relevant execution information including: + - Timing information (timestamp, duration) + - Query information + - Token usage (extracted from message metadata) + - Status (success/error) + - State information (keys, message count) + - Checkpoint metadata (if available) """ from maseval.interface.agents.langgraph import LangGraphAgentAdapter - from maseval import MessageHistory from langgraph.graph import StateGraph, END from typing_extensions import TypedDict from langchain_core.messages import AIMessage + from langchain_core.messages.ai import UsageMetadata + import time - # Create a simple LangGraph agent + # Create a LangGraph agent with token usage metadata class State(TypedDict): messages: list def agent_node(state: State) -> State: messages = state["messages"] - messages.append(AIMessage(content="Test response")) - return {"messages": messages} + # Create AI message with usage metadata (simulates LLM response) + # UsageMetadata is a TypedDict, so we create it properly + response = AIMessage( + content="Test response", + usage_metadata=UsageMetadata( + input_tokens=50, + output_tokens=30, + total_tokens=80, + ), 
+ ) + return {"messages": messages + [response]} graph = StateGraph(State) graph.add_node("agent", agent_node) @@ -69,63 +80,68 @@ def agent_node(state: State) -> State: graph.add_edge("agent", END) compiled = graph.compile() - wrapper = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent") + adapter = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent") - # Test append_to_message_history - wrapper.append_to_message_history("user", "First message") - wrapper.append_to_message_history("assistant", "First response") + # Capture time before run + time_before = time.time() - history = wrapper.get_messages() - assert len(history) == 2 - assert history[0]["role"] == "user" - assert history[0]["content"] == "First message" - assert history[1]["role"] == "assistant" - assert history[1]["content"] == "First response" + # Run the agent + adapter.run("Test query") - # Test clear_message_history - wrapper.clear_message_history() - history = wrapper.get_messages() - assert len(history) == 0 + # Capture time after run + time_after = time.time() - # Test set_message_history - new_history = MessageHistory() - new_history.add_message("user", "Set message 1") - new_history.add_message("assistant", "Set response 1") - new_history.add_message("user", "Set message 2") + # Access logs + logs = adapter.logs - wrapper.set_message_history(new_history) - history = wrapper.get_messages() - assert len(history) == 3 - assert history[0]["content"] == "Set message 1" - assert history[1]["content"] == "Set response 1" - assert history[2]["content"] == "Set message 2" + # Verify logs structure + assert isinstance(logs, list) + assert len(logs) >= 1 # At least one log entry - # Verify history persists across multiple retrievals - history_again = wrapper.get_messages() - assert len(history_again) == 3 - assert history_again[0]["content"] == "Set message 1" + # Get the most recent log entry + log_entry = logs[-1] + # Verify required fields + assert "timestamp" in log_entry + 
assert "query" in log_entry + assert "duration_seconds" in log_entry + assert "status" in log_entry -def test_langgraph_wrapper_message_manipulation_with_system_message(): - """Test message manipulation with system messages. + # Verify field values + assert log_entry["query"] == "Test query" + assert log_entry["status"] == "success" + assert log_entry["duration_seconds"] > 0 + assert log_entry["duration_seconds"] < (time_after - time_before) + 0.1 # Reasonable duration - Verifies that system messages are properly converted and handled - when manipulating message history in LangGraph. - """ + # Verify state information + assert "state_keys" in log_entry + assert "messages" in log_entry["state_keys"] + assert "message_count" in log_entry + assert log_entry["message_count"] >= 1 + + # Verify token usage is captured from message metadata + assert "input_tokens" in log_entry + assert "output_tokens" in log_entry + assert "total_tokens" in log_entry + assert log_entry["input_tokens"] == 50 + assert log_entry["output_tokens"] == 30 + assert log_entry["total_tokens"] == 80 + + +def test_langgraph_adapter_logs_multiple_runs(): + """Test that logs accumulate across multiple runs.""" from maseval.interface.agents.langgraph import LangGraphAgentAdapter - from maseval import MessageHistory from langgraph.graph import StateGraph, END from typing_extensions import TypedDict from langchain_core.messages import AIMessage - # Create a simple LangGraph agent class State(TypedDict): messages: list def agent_node(state: State) -> State: messages = state["messages"] - messages.append(AIMessage(content="Response")) - return {"messages": messages} + response = AIMessage(content="Response") + return {"messages": messages + [response]} graph = StateGraph(State) graph.add_node("agent", agent_node) @@ -133,19 +149,94 @@ def agent_node(state: State) -> State: graph.add_edge("agent", END) compiled = graph.compile() - wrapper = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent") + 
adapter = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent") + + # First run + adapter.run("Query 1") + logs_after_first = adapter.logs + assert len(logs_after_first) == 1 + assert logs_after_first[0]["query"] == "Query 1" + + # Second run + adapter.run("Query 2") + logs_after_second = adapter.logs + assert len(logs_after_second) == 2 + assert logs_after_second[0]["query"] == "Query 1" + assert logs_after_second[1]["query"] == "Query 2" + + +def test_langgraph_adapter_logs_error_handling(): + """Test that logs capture error information when agent execution fails.""" + from maseval.interface.agents.langgraph import LangGraphAgentAdapter + from langgraph.graph import StateGraph, END + from typing_extensions import TypedDict + + class State(TypedDict): + messages: list + + def failing_node(state: State) -> State: + raise ValueError("Intentional test error") + + graph = StateGraph(State) + graph.add_node("agent", failing_node) + graph.set_entry_point("agent") + graph.add_edge("agent", END) + compiled = graph.compile() + + adapter = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent") + + # Run should raise an error + try: + adapter.run("Test query") + assert False, "Expected ValueError to be raised" + except ValueError as e: + assert "Intentional test error" in str(e) + + # Verify error is logged + logs = adapter.logs + assert len(logs) == 1 + + log_entry = logs[0] + assert log_entry["status"] == "error" + assert "error" in log_entry + assert "error_type" in log_entry + assert log_entry["error_type"] == "ValueError" + assert "Intentional test error" in log_entry["error"] + assert log_entry["query"] == "Test query" + assert log_entry["duration_seconds"] >= 0 + + +def test_langgraph_adapter_logs_without_token_metadata(): + """Test that logs work correctly when messages don't have usage metadata.""" + from maseval.interface.agents.langgraph import LangGraphAgentAdapter + from langgraph.graph import StateGraph, END + from typing_extensions 
import TypedDict + from langchain_core.messages import AIMessage + + class State(TypedDict): + messages: list + + def agent_node(state: State) -> State: + messages = state["messages"] + # Create response without usage metadata + response = AIMessage(content="Test response") + return {"messages": messages + [response]} + + graph = StateGraph(State) + graph.add_node("agent", agent_node) + graph.set_entry_point("agent") + graph.add_edge("agent", END) + compiled = graph.compile() - # Test set_message_history with system message - new_history = MessageHistory() - new_history.add_message("system", "You are a helpful assistant") - new_history.add_message("user", "Hello") - new_history.add_message("assistant", "Hi there") + adapter = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent") + adapter.run("Test query") - wrapper.set_message_history(new_history) - history = wrapper.get_messages() + # Verify logs exist but token fields are None or 0 + logs = adapter.logs + assert len(logs) == 1 - assert len(history) == 3 - assert history[0]["role"] == "system" - assert history[0]["content"] == "You are a helpful assistant" - assert history[1]["role"] == "user" - assert history[2]["role"] == "assistant" + log_entry = logs[0] + # Token fields should be present but with default values + assert log_entry.get("input_tokens") in [None, 0] + assert log_entry.get("output_tokens") in [None, 0] + assert log_entry.get("total_tokens") in [None, 0] diff --git a/tests/test_interface/test_agent_integration/test_smolagents_integration.py b/tests/test_interface/test_agent_integration/test_smolagents_integration.py index 2bbdf1e5..3fdb8412 100644 --- a/tests/test_interface/test_agent_integration/test_smolagents_integration.py +++ b/tests/test_interface/test_agent_integration/test_smolagents_integration.py @@ -13,7 +13,7 @@ pytestmark = [pytest.mark.interface, pytest.mark.smolagents] -def test_smolagents_wrapper_import(): +def test_smolagents_adapter_import(): """Test that 
SmolAgentAdapter can be imported when smolagents is installed.""" from maseval.interface.agents.smolagents import SmolAgentAdapter, SmolAgentUser @@ -37,15 +37,15 @@ def test_check_smolagents_installed_function(): _check_smolagents_installed() -def test_smolagents_wrapper_creation(): +def test_smolagents_adapter_creation(): """Test that SmolAgentAdapter can be created.""" from maseval.interface.agents.smolagents import SmolAgentAdapter - # Create wrapper with mock agent - wrapper = SmolAgentAdapter(agent_instance=object(), name="test_agent") + # Create adapter with mock agent + agent_adapter = SmolAgentAdapter(agent_instance=object(), name="test_agent") - assert wrapper.name == "test_agent" - assert wrapper.agent is not None + assert agent_adapter.name == "test_agent" + assert agent_adapter.agent is not None def test_smolagents_user_creation(): @@ -67,7 +67,7 @@ def test_smolagents_user_creation(): assert user.name == "test_user" -def test_smolagents_wrapper_gather_traces_with_monitoring(): +def test_smolagents_adapter_gather_traces_with_monitoring(): """Test that SmolAgentAdapter.gather_traces() captures token and timing data.""" from maseval.interface.agents.smolagents import SmolAgentAdapter from smolagents.memory import ActionStep, AgentMemory @@ -107,11 +107,11 @@ def test_smolagents_wrapper_gather_traces_with_monitoring(): # Mock write_memory_to_messages to return empty list (we're testing gather_traces, not get_messages) mock_agent.write_memory_to_messages = Mock(return_value=[]) - # Create wrapper - wrapper = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent") + # Create adapter + agent_adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent") # Call gather_traces - traces = wrapper.gather_traces() + traces = agent_adapter.gather_traces() # Verify aggregated statistics assert "total_steps" in traces @@ -154,7 +154,7 @@ def test_smolagents_wrapper_gather_traces_with_monitoring(): assert step2_detail["action_output"] == "Output from 
step 2" -def test_smolagents_wrapper_gather_traces_without_monitoring(): +def test_smolagents_adapter_gather_traces_without_monitoring(): """Test that gather_traces works when agent has no monitoring data.""" from maseval.interface.agents.smolagents import SmolAgentAdapter from smolagents.memory import AgentMemory @@ -165,11 +165,11 @@ def test_smolagents_wrapper_gather_traces_without_monitoring(): mock_agent.memory = AgentMemory(system_prompt="Test system prompt") mock_agent.write_memory_to_messages = Mock(return_value=[]) - # Create wrapper - wrapper = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent") + # Create adapter + agent_adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent") # Call gather_traces - traces = wrapper.gather_traces() + traces = agent_adapter.gather_traces() # Verify aggregated statistics show zero usage assert "total_steps" in traces @@ -191,7 +191,7 @@ def test_smolagents_wrapper_gather_traces_without_monitoring(): assert len(traces["steps_detail"]) == 0 -def test_smolagents_wrapper_gather_traces_with_planning_step(): +def test_smolagents_adapter_gather_traces_with_planning_step(): """Test that gather_traces captures PlanningStep data correctly.""" from maseval.interface.agents.smolagents import SmolAgentAdapter from smolagents.memory import PlanningStep, AgentMemory @@ -218,11 +218,11 @@ def test_smolagents_wrapper_gather_traces_with_planning_step(): # Mock write_memory_to_messages mock_agent.write_memory_to_messages = Mock(return_value=[]) - # Create wrapper - wrapper = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent") + # Create adapter + agent_adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent") # Call gather_traces - traces = wrapper.gather_traces() + traces = agent_adapter.gather_traces() # Verify aggregated statistics assert traces["total_steps"] == 1 @@ -245,71 +245,168 @@ def test_smolagents_wrapper_gather_traces_with_planning_step(): assert "observations" not in 
step_detail -def test_smolagents_wrapper_message_manipulation_not_supported(): - """Test that smolagents explicitly raises NotImplementedError for message manipulation. +def test_smolagents_adapter_logs_property(): + """Test that SmolAgentAdapter.logs property returns converted memory steps. - smolagents builds its AgentMemory from execution steps and does not support - arbitrary message injection. The wrapper should raise clear NotImplementedError - for set_message_history and append_to_message_history operations. - - Only clear_message_history is supported (resets memory with system prompt). + This test validates that the logs property correctly extracts all relevant + information from smolagents' internal memory system, including: + - Step types (ActionStep, PlanningStep) + - Timing information (start_time, end_time, duration) + - Token usage (input_tokens, output_tokens, total_tokens) + - Model input/output messages + - Tool calls and observations + - Error information """ from maseval.interface.agents.smolagents import SmolAgentAdapter - from maseval import MessageHistory - from smolagents import CodeAgent - from conftest import FakeSmolagentsModel + from smolagents.memory import ActionStep, PlanningStep, AgentMemory, ToolCall + from smolagents.monitoring import TokenUsage, Timing + from smolagents.models import ChatMessage, MessageRole + from unittest.mock import Mock + import time + + # Create a mock agent with memory + mock_agent = Mock() + mock_agent.memory = AgentMemory(system_prompt="Test system prompt") + + # Add an ActionStep with comprehensive data + start_time = time.time() + step1 = ActionStep( + step_number=1, + timing=Timing(start_time=start_time, end_time=start_time + 0.5), + observations_images=[], + ) + step1.token_usage = TokenUsage(input_tokens=100, output_tokens=50) + step1.observations = "Tool returned: success" + step1.action_output = "Final output from action" + step1.tool_calls = [ToolCall(name="test_tool", arguments={"arg": "value"}, 
id="call_123")] + step1.model_input_messages = [ + ChatMessage(role=MessageRole.USER, content="Execute this task"), + ChatMessage(role=MessageRole.SYSTEM, content="System context"), + ] + mock_agent.memory.steps.append(step1) + + # Add a PlanningStep + step2 = PlanningStep( + timing=Timing(start_time=start_time + 0.5, end_time=start_time + 1.0), + model_input_messages=[ChatMessage(role=MessageRole.USER, content="What should I do?")], + model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Here's the plan"), + plan="Step 1: Do this\nStep 2: Do that", + ) + step2.token_usage = TokenUsage(input_tokens=200, output_tokens=150) + mock_agent.memory.steps.append(step2) + + # Mock write_memory_to_messages + mock_agent.write_memory_to_messages = Mock(return_value=[]) + + # Create adapter + adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent") + + # Access logs property + logs = adapter.logs + + # Verify logs structure + assert isinstance(logs, list) + assert len(logs) == 2 + + # Verify ActionStep log entry + action_log = logs[0] + assert action_log["step_type"] == "ActionStep" + assert action_log["step_number"] == 1 + assert action_log["input_tokens"] == 100 + assert action_log["output_tokens"] == 50 + assert action_log["total_tokens"] == 150 + assert action_log["duration_seconds"] == pytest.approx(0.5, abs=0.01) + assert action_log["observations"] == "Tool returned: success" + assert action_log["action_output"] == "Final output from action" + assert "tool_calls" in action_log + assert len(action_log["tool_calls"]) == 1 + assert action_log["tool_calls"][0]["name"] == "test_tool" + + # Verify model_input_messages are converted + assert "model_input_messages" in action_log + assert isinstance(action_log["model_input_messages"], list) + assert len(action_log["model_input_messages"]) == 2 + assert action_log["model_input_messages"][0]["role"] == "user" + assert action_log["model_input_messages"][0]["content"] == "Execute this task" + assert 
action_log["model_input_messages"][1]["role"] == "system" + + # Verify PlanningStep log entry + planning_log = logs[1] + assert planning_log["step_type"] == "PlanningStep" + assert planning_log["input_tokens"] == 200 + assert planning_log["output_tokens"] == 150 + assert planning_log["total_tokens"] == 350 + assert planning_log["duration_seconds"] == pytest.approx(0.5, abs=0.01) + assert planning_log["plan"] == "Step 1: Do this\nStep 2: Do that" + + # Verify model_input_messages for planning step + assert "model_input_messages" in planning_log + assert len(planning_log["model_input_messages"]) == 1 + assert planning_log["model_input_messages"][0]["content"] == "What should I do?" + + # PlanningStep should not have action-specific fields + assert "action_output" not in planning_log + assert "observations" not in planning_log + assert "tool_calls" not in planning_log + + +def test_smolagents_adapter_logs_with_errors(): + """Test that adapter.logs captures error information from failed steps.""" + from maseval.interface.agents.smolagents import SmolAgentAdapter + from smolagents import AgentError + from smolagents.memory import ActionStep, AgentMemory + from smolagents.monitoring import Timing + from unittest.mock import Mock + import time - # Create a smolagents agent - mock_model = FakeSmolagentsModel(["Test response"]) - agent = CodeAgent(tools=[], model=mock_model, max_steps=1) - wrapper = SmolAgentAdapter(agent_instance=agent, name="test_agent") + # Create a mock agent with memory + mock_agent = Mock() + mock_agent.memory = AgentMemory(system_prompt="Test system prompt") - # Test that append_to_message_history raises NotImplementedError - with pytest.raises(NotImplementedError) as exc_info: - wrapper.append_to_message_history("user", "Manual message") + # Add an ActionStep with an error + start_time = time.time() + step = ActionStep( + step_number=1, + timing=Timing(start_time=start_time, end_time=start_time + 0.2), + observations_images=[], + ) + # Create a 
proper AgentError object with mock logger + mock_logger = Mock() + step.error = AgentError("Tool execution failed: Connection timeout", logger=mock_logger) + mock_agent.memory.steps.append(step) - assert "doesn't support appending" in str(exc_info.value) - assert "memory is built from execution steps" in str(exc_info.value) + # Mock write_memory_to_messages + mock_agent.write_memory_to_messages = Mock(return_value=[]) - # Test that set_message_history raises NotImplementedError - with pytest.raises(NotImplementedError) as exc_info: - new_history = MessageHistory() - new_history.add_message("user", "Test message") - wrapper.set_message_history(new_history) + # Create adapter + adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent") - assert "doesn't support setting" in str(exc_info.value) - assert "memory is built from execution steps" in str(exc_info.value) + # Access logs property + logs = adapter.logs + # Verify error is captured + assert len(logs) == 1 + assert "error" in logs[0] + assert logs[0]["error"] == "Tool execution failed: Connection timeout" -def test_smolagents_wrapper_clear_message_history_supported(): - """Test that smolagents supports clear_message_history. - clear_message_history is the only history manipulation operation - supported by smolagents. It resets the AgentMemory while preserving - the system prompt. 
- """ +def test_smolagents_adapter_logs_empty_when_no_steps(): + """Test that adapter.logs returns empty list when no execution has occurred.""" from maseval.interface.agents.smolagents import SmolAgentAdapter - from smolagents import CodeAgent - from conftest import FakeSmolagentsModel - - # Create a smolagents agent - mock_model = FakeSmolagentsModel(["Test response"]) - agent = CodeAgent(tools=[], model=mock_model, max_steps=1) - wrapper = SmolAgentAdapter(agent_instance=agent, name="test_agent") - - # Run the agent to populate memory - wrapper.run("Test query") - - # Verify memory has content (should have multiple messages after run) - messages_before = wrapper.get_messages() - assert len(messages_before) > 1 # At least system + user messages - - # Clear the memory - wrapper.clear_message_history() - - # Verify memory is reset (only system message remains) - messages_after = wrapper.get_messages() - assert len(messages_after) == 1 - assert messages_after[0]["role"] == "system" - # System prompt content is framework-specific, just verify it exists and has content - assert len(messages_after[0]["content"]) > 0 + from smolagents.memory import AgentMemory + from unittest.mock import Mock + + # Create a mock agent with empty memory + mock_agent = Mock() + mock_agent.memory = AgentMemory(system_prompt="Test system prompt") + mock_agent.write_memory_to_messages = Mock(return_value=[]) + + # Create adapter + adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent") + + # Access logs property + logs = adapter.logs + + # Should be empty + assert isinstance(logs, list) + assert len(logs) == 0