diff --git a/.github/scripts/extract_changelog.py b/.github/scripts/extract_changelog.py
index d090de5f..4918cab2 100644
--- a/.github/scripts/extract_changelog.py
+++ b/.github/scripts/extract_changelog.py
@@ -19,6 +19,7 @@ def extract_section(version: str, changelog_path: Path) -> str:
     if not match:
         print(f"No changelog entry found for version {version}", file=sys.stderr)
         sys.exit(1)
+    assert match is not None
     return match.group(0).strip()
 
 
diff --git a/AGENTS.md b/AGENTS.md
index 75cf950f..7036a867 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -102,12 +102,12 @@ uv remove <package-name>
 
 **Framework Adapter Pattern:**
 
-When implementing wrappers for external frameworks, **always use the framework's native message storage as the source of truth**:
+When implementing adapters for external frameworks, **always use the framework's native message storage as the source of truth**:
 
 **Pattern 1: Persistent State (smolagents)**
 
 ```python
-class MyFrameworkWrapper(AgentAdapter):
+class MyFrameworkAdapter(AgentAdapter):
     def get_messages(self) -> MessageHistory:
         """Dynamically fetch from framework's internal storage."""
         # Get from framework (e.g., agent.memory, agent.messages)
@@ -236,3 +236,7 @@ For lists and dictionaries, use `Dict[...,...]`, `List[...]`, `Sequence[...]` et
 - DO NOT publicly distribute code or data
 - DO NOT publish without explicit permission
 - DO NOT share copyrighted third-party benchmark data
+
+## Changelog
+
+When the task is completed, add an entry describing your changes to `CHANGELOG.md`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8963237d..5cfb2573 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,12 +9,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- The `logs` property inside `SmolAgentAdapter` and `LangGraphAgentAdapter` is now properly filled. (PR: #3)
+
 ### Changed
 
 ### Fixed
 
+- Consistent naming of agent `adapter` over `wrapper` (PR: #3)
+
 ### Removed
 
+- Removed `set_message_history`, `append_to_message_history` and `clear_message_history` from `AgentAdapter` and its subclasses. (PR: #3)
+
 ## [0.1.2] - 2025-11-18
 
 ### Added
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 90d20667..5f83f17b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -23,7 +23,7 @@ The `maseval` package is designed with a strict separation between its core logi
 
 1.  **`maseval/core`**: This is the heart of the library. It contains the essential logic and **must not** have any optional dependencies. It should be fully functional with a minimal installation.
 
-2.  **`maseval/interface`**: This contains adapters and wrappers for other multi-agent frameworks (like `crewai`, `langgraph`, etc.). All dependencies for these integrations are optional.
+2.  **`maseval/interface`**: This contains adapters for other multi-agent frameworks (like `crewai`, `langgraph`, etc.). All dependencies for these integrations are optional.
 
 > [!WARNING]
 > Code in `maseval/core` **must never** import from `maseval/interface`. This separation is critical to keep the core package lightweight and dependency-free. Breaking this rule will cause the library to fail.
@@ -197,11 +197,11 @@ The pipeline automatically performs the following tasks:
 
 ### 6. Implementing Framework Adapters
 
-When creating wrappers for external agent frameworks (in `maseval/interface/agents/`), follow these best practices to ensure consistency and reliability:
+When creating adapters for external agent frameworks (in `maseval/interface/agents/`), follow these best practices to ensure consistency and reliability:
 
 #### Message History Pattern
 
-**Always use the framework's native message storage as the source of truth.** Do not cache converted messages in the wrapper, as this can lead to inconsistencies if the framework's internal state changes.
+**Always use the framework's native message storage as the source of truth.** Do not cache converted messages in the adapter, as this can lead to inconsistencies if the framework's internal state changes.
 
 **Correct Pattern** (SmolAgents example):
 
@@ -256,13 +256,14 @@ When adding support for a new framework:
 - [ ] Add conditional import in `maseval/interface/agents/__init__.py`
 - [ ] Write integration tests in `tests/test_interface/`
 - [ ] Update documentation with usage examples
+- [ ] Provide a `logs` property inside the `AgentAdapter`.
 
 #### Framework-Specific Patterns
 
 **Pattern 1: Persistent State (smolagents)**
 
 ```python
-class MyFrameworkWrapper(AgentAdapter):
+class MyFrameworkAdapter(AgentAdapter):
     def get_messages(self) -> MessageHistory:
         """Dynamically fetch from framework's internal storage."""
         # Get from framework (e.g., agent.memory, agent.messages)
diff --git a/README.md b/README.md
index 399630f4..c7a33371 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Analogous to pytest for testing or MLflow for ML experimentation, MASEval focuse
 
 - **Task-Specific Configurations:** Each benchmark task is a self-contained evaluation unit with its own instructions, environment state, success criteria, and custom evaluation logic. One task might measure success by environment state changes, another by programmatic output validation.
 
-- **Framework Agnostic by Design:** MASEval is intentionally unopinionated about agent frameworks, model providers, and system architectures. Simple, standardized interfaces and wrappers enable any agent system to be evaluated without modification to the core library.
+- **Framework Agnostic by Design:** MASEval is intentionally unopinionated about agent frameworks, model providers, and system architectures. Simple, standardized interfaces and adapters enable any agent system to be evaluated without modification to the core library.
 
 - **Lifecycle Hooks via Callbacks:** Inject custom logic at any point in the evaluation lifecycle (e.g., on_run_start, on_task_start, on_agent_step_end) through a callback system. This enables extensibility without modifying core evaluation logic.
 
diff --git a/docs/guides/config-gathering.md b/docs/guides/config-gathering.md
index cad44e49..b87688c6 100644
--- a/docs/guides/config-gathering.md
+++ b/docs/guides/config-gathering.md
@@ -124,8 +124,8 @@ class MyBenchmark(Benchmark):
     def setup_agents(self, agent_data, environment, task, user):
         model = MyModelAdapter(...)
         agent = MyAgent(model=model)
-        wrapper = AgentAdapter(agent, "agent")
-        return [wrapper], {"agent": wrapper}
+        adapter = AgentAdapter(agent, "agent")
+        return [adapter], {"agent": adapter}
     # ... other methods
 
 # Run benchmark
diff --git a/docs/guides/message-tracing.md b/docs/guides/message-tracing.md
index 3ca5dd54..85e41563 100644
--- a/docs/guides/message-tracing.md
+++ b/docs/guides/message-tracing.md
@@ -16,7 +16,7 @@ MASEval provides message tracing to capture agent conversations during benchmark
 
 ## Core Concepts
 
-**`MessageHistory`**: OpenAI-compatible message storage that all agent wrappers use internally.
+**`MessageHistory`**: OpenAI-compatible message storage that all agent adapters use internally.
 
 **`AgentAdapter.get_messages()`**: Standard method to retrieve conversation history from any wrapped agent.
 
@@ -26,17 +26,17 @@ MASEval provides message tracing to capture agent conversations during benchmark
 
 ### Accessing Message History
 
-Every agent wrapper exposes message history through `get_messages()`:
+Every agent adapter exposes message history through `get_messages()`:
 
 ```python
-from maseval.interface.agents import SmolAgentsWrapper
+from maseval.interface.agents import SmolAgentAdapter
 
 # Create and run your agent
-wrapper = SmolAgentsWrapper(agent, name="researcher")
-result = wrapper.run("What's the capital of France?")
+agent_adapter = SmolAgentAdapter(agent, name="researcher")
+result = agent_adapter.run("What's the capital of France?")
 
 # Get the conversation
-messages = wrapper.get_messages()
+messages = agent_adapter.get_messages()
 
 # Inspect messages
 for msg in messages:
@@ -45,18 +45,21 @@ for msg in messages:
         print(f"  Tools called: {[tc['function']['name'] for tc in msg['tool_calls']]}")
 ```
 
-### Clearing History Between Tasks
+### Fresh Conversations for Multiple Tasks
 
-In benchmarks, you typically want to clear history before each new task:
+In benchmarks, you typically want a fresh agent instance for each task:
 
 ```python
 # In your benchmark loop
 for task in benchmark.tasks:
-    wrapper.clear_message_history()  # Reset for new task
-    result = wrapper.run(task.query)
+    # Create a new adapter instance for each task
+    agent_adapter = YourAgentAdapter(agent_instance=agent, name="task_agent")
+    result = agent_adapter.run(task.query)
     evaluate(result, task.ground_truth)
 ```
 
+This ensures each task starts with a clean slate and avoids conversation history contamination.
+
 ## Using the Tracing Callback
 
 For multi-agent systems or when you need to collect conversations from many runs, use `MessageTracingAgentCallback`:
@@ -68,12 +71,12 @@ from maseval.core.callbacks import MessageTracingAgentCallback
 tracer = MessageTracingAgentCallback()
 
 # Attach to your agent(s)
-wrapper = SmolAgentsWrapper(agent, name="assistant", callbacks=[tracer])
+agent_adapter = SmolAgentAdapter(agent, name="assistant", callbacks=[tracer])
 
 # Run tasks
-wrapper.run("Task 1")
-wrapper.run("Task 2")
-wrapper.run("Task 3")
+agent_adapter.run("Task 1")
+agent_adapter.run("Task 2")
+agent_adapter.run("Task 3")
 
 # Get all conversations
 conversations = tracer.get_all_conversations()
@@ -93,8 +96,8 @@ Share one tracer across multiple agents to collect all conversations:
 tracer = MessageTracingAgentCallback()
 
 # Attach to multiple agents
-agent1 = SmolAgentsWrapper(agent1, name="researcher", callbacks=[tracer])
-agent2 = SmolAgentsWrapper(agent2, name="writer", callbacks=[tracer])
+agent1 = SmolAgentAdapter(agent1, name="researcher", callbacks=[tracer])
+agent2 = SmolAgentAdapter(agent2, name="writer", callbacks=[tracer])
 
 # Run both agents
 agent1.run("Research topic X")
@@ -119,7 +122,7 @@ tracer = MessageTracingAgentCallback()
 
 for batch in task_batches:
     for task in batch:
-        wrapper.run(task.query)
+        agent_adapter.run(task.query)
 
     # Process this batch
     conversations = tracer.get_all_conversations()
@@ -190,9 +193,9 @@ Messages use OpenAI's chat completion format:
 }
 ```
 
-## Custom Agent Wrappers
+## Custom Agent Adapters
 
-If you're implementing a custom wrapper, the framework handles message storage automatically via `get_messages()`. Just ensure your `_run_agent()` method returns a `MessageHistory`:
+If you're implementing a custom adapter, the framework handles message storage automatically via `get_messages()`. Just ensure your `_run_agent()` method returns a `MessageHistory`:
 
 ```python
 from maseval import AgentAdapter, MessageHistory
@@ -211,13 +214,13 @@ class MyAgentAdapter(AgentAdapter):
         return history
 ```
 
-See the [Agent Wrapper guide](../reference/agent.md) for details on implementing custom wrappers.
+See the [AgentAdapter guide](../reference/agent.md) for details on implementing custom adapters.
 
 ## Tips
 
 **For debugging**: Use `verbose=True` to see traces in real-time.
 
-**For benchmarks**: Clear history between tasks with `wrapper.clear_message_history()`.
+**For benchmarks**: Create a new adapter instance for each task to ensure clean conversation history.
 
 **For multi-agent systems**: Use a shared tracer and `get_conversations_by_agent()` to analyze each agent separately.
 
diff --git a/docs/index.md b/docs/index.md
index 709fb37c..38d24493 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -24,7 +24,7 @@ More details in the [Quickstart](getting-started/quickstart.md)
 
 - **Task-Specific Configurations:** Each benchmark task is a self-contained evaluation unit with its own instructions, environment state, success criteria, and custom evaluation logic. One task might measure success by environment state changes, another by programmatic output validation.
 
-- **Framework Agnostic by Design:** MASEval is intentionally unopinionated about agent frameworks, model providers, and system architectures. Simple, standardized interfaces and wrappers enable any agent system to be evaluated without modification to the core library.
+- **Framework Agnostic by Design:** MASEval is intentionally unopinionated about agent frameworks, model providers, and system architectures. Simple, standardized interfaces and adapters enable any agent system to be evaluated without modification to the core library.
 
 - **Lifecycle Hooks via Callbacks:** Inject custom logic at any point in the evaluation lifecycle (e.g., `on_run_start`, `on_task_start`, `on_agent_step_end`) through a callback system. This enables extensibility without modifying core evaluation logic.
 
diff --git a/maseval/core/agent.py b/maseval/core/agent.py
index 1c0e006d..5adb4248 100644
--- a/maseval/core/agent.py
+++ b/maseval/core/agent.py
@@ -1,8 +1,8 @@
 from abc import ABC, abstractmethod
-from typing import List, Any, Optional, Union, Dict
+from typing import List, Any, Optional, Dict
 
 from .callback import AgentCallback
-from .history import MessageHistory, RoleType
+from .history import MessageHistory
 from .tracing import TraceableMixin
 from .config import ConfigurableMixin
 
@@ -10,7 +10,7 @@
 class AgentAdapter(ABC, TraceableMixin, ConfigurableMixin):
     """Wraps an agent from any framework to provide a standard interface.
 
-    This wrapper provides:
+    This adapter provides:
     - Unified execution interface via `run()`
     - Callback hooks for monitoring
     - Message history management via getter/setter
@@ -101,35 +101,6 @@ def get_messages(self) -> MessageHistory:
         """
         return self.messages if self.messages is not None else MessageHistory()
 
-    def set_message_history(self, history: MessageHistory) -> None:
-        """Set the message history.
-
-        This is typically called by _run_agent() implementations after executing
-        the agent, but can also be used to inject or modify history.
-
-        Args:
-            history: The MessageHistory to set
-        """
-        self.messages = history
-
-    def clear_message_history(self) -> None:
-        """Clear the message history."""
-        self.messages = None
-
-    def append_to_message_history(self, role: Union[RoleType, str], content: Union[str, List[Any]], **kwargs) -> None:
-        """Append a message to the history.
-
-        If no history exists, creates a new one.
-
-        Args:
-            role: The message role ("user", "assistant", "system", "tool")
-            content: The message content (string or list of content parts)
-            **kwargs: Additional fields (name, metadata, timestamp, etc.)
-        """
-        if self.messages is None:
-            self.messages = MessageHistory()
-        self.messages.add_message(role, content, **kwargs)  # type: ignore
-
     def gather_traces(self) -> dict[str, Any]:
         """Gather execution traces from this agent.
 
@@ -148,7 +119,7 @@ def gather_traces(self) -> dict[str, Any]:
 
         How to use:
             This method is automatically called by Benchmark during trace collection.
-            Framework-specific wrappers can extend this to include additional data:
+            Framework-specific adapters can extend this to include additional data:
 
             ```python
             def gather_traces(self) -> dict[str, Any]:
@@ -181,12 +152,12 @@ def gather_config(self) -> dict[str, Any]:
             - gathered_at: ISO timestamp
             - name: Agent name
             - agent_type: Underlying agent framework class name
-            - wrapper_type: The specific wrapper class (e.g., SmolAgentAdapter)
+            - adapter_type: The specific adapter class (e.g., SmolAgentAdapter)
             - callbacks: List of callback class names attached to this agent
 
         How to use:
             This method is automatically called by Benchmark during config collection.
-            Framework-specific wrappers can extend this to include additional data:
+            Framework-specific adapters can extend this to include additional data:
 
             ```python
             def gather_config(self) -> dict[str, Any]:
@@ -200,7 +171,7 @@ def gather_config(self) -> dict[str, Any]:
             **super().gather_config(),
             "name": self.name,
             "agent_type": type(self.agent).__name__,
-            "wrapper_type": type(self).__name__,
+            "adapter_type": type(self).__name__,
             "callbacks": [type(cb).__name__ for cb in self.callbacks],
         }
 
diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
index 7f7e9f60..4ed0badb 100644
--- a/maseval/core/benchmark.py
+++ b/maseval/core/benchmark.py
@@ -64,8 +64,8 @@ def setup_environment(self, agent_data, task):
 
                 def setup_agents(self, agent_data, environment, task, user):
                     agent = MyAgent(model=agent_data["model"])
-                    wrapper = AgentAdapter(agent, "agent")
-                    return [wrapper], {"agent": wrapper}
+                    agent_adapter = AgentAdapter(agent, "agent")
+                    return [agent_adapter], {"agent": agent_adapter}
 
                 def run_agents(self, agents, task, environment):
                     return agents[0].run(task.query)
@@ -258,10 +258,10 @@ def setup_agents(self, agent_data, environment, task, user):
 
                 # Create agent (auto-registered when returned)
                 agent = MyAgent(model=model)
-                wrapper = AgentAdapter(agent, "agent1")
+                agent_adapter = AgentAdapter(agent, "agent1")
 
                 # Environment and user are also auto-registered
-                return [wrapper], {"agent1": wrapper}
+                return [agent_adapter], {"agent1": agent_adapter}
             ```
 
             Traces and configs are automatically collected before evaluation via
@@ -673,12 +673,12 @@ def setup_agents(self, agent_data, environment, task, user):
                     model=model,
                     managed_agents=[w.agent for w in workers.values()]
                 )
-                orchestrator_wrapper = AgentAdapter(orchestrator, "orchestrator")
+                orchestrator_adapter = AgentAdapter(orchestrator, "orchestrator")
 
                 # Return orchestrator to run, but all agents for monitoring
                 # All agents auto-registered for tracing
-                all_agents = {"orchestrator": orchestrator_wrapper, **workers}
-                return [orchestrator_wrapper], all_agents
+                all_agents = {"orchestrator": orchestrator_adapter, **workers}
+                return [orchestrator_adapter], all_agents
             ```
         """
         pass
diff --git a/maseval/core/callbacks/message_tracing.py b/maseval/core/callbacks/message_tracing.py
index 498b7269..5bcded95 100644
--- a/maseval/core/callbacks/message_tracing.py
+++ b/maseval/core/callbacks/message_tracing.py
@@ -30,8 +30,8 @@ class MessageTracingAgentCallback(AgentCallback):
         tracer = MessageTracingAgentCallback(include_metadata=True, verbose=True)
 
         # Use with agent
-        wrapper = MyAgentAdapter(agent, name="agent1", callbacks=[tracer])
-        wrapper.run("What's the weather?")
+        agent_adapter = MyAgentAdapter(agent, name="agent1", callbacks=[tracer])
+        agent_adapter.run("What's the weather?")
 
         # Access traced conversations
         for conversation in tracer.get_all_conversations():
@@ -71,7 +71,7 @@ def on_run_end(self, agent: AgentAdapter, result: Any) -> None:
         """Called when agent execution completes.
 
         Args:
-            agent: The agent wrapper instance
+            agent: The agent adapter instance
             result: The result returned by the agent (usually MessageHistory)
         """
         # Get message history from agent
diff --git a/maseval/interface/agents/langgraph.py b/maseval/interface/agents/langgraph.py
index 5a15ae4c..f944b2ed 100644
--- a/maseval/interface/agents/langgraph.py
+++ b/maseval/interface/agents/langgraph.py
@@ -4,7 +4,9 @@
     pip install maseval[langgraph]
 """
 
-from typing import TYPE_CHECKING, Any
+import time
+from datetime import datetime
+from typing import TYPE_CHECKING, Any, Dict
 
 from maseval import AgentAdapter, MessageHistory, User
 
@@ -32,13 +34,13 @@ class LangGraphAgentAdapter(AgentAdapter):
 
     Requires langgraph to be installed.
 
-    This wrapper converts LangChain/LangGraph message types to MASEval's
+    This adapter converts LangChain/LangGraph message types to MASEval's
     OpenAI-compatible MessageHistory format. It preserves tool calls, tool
     responses, and multi-modal content.
 
     LangGraph graphs can be stateless or stateful (with checkpointer). This
-    wrapper supports both modes:
-    - Stateless: Messages from invoke() result are cached in wrapper
+    adapter supports both modes:
+    - Stateless: Messages from invoke() result are cached in the adapter
     - Stateful: Messages fetched from graph state if config/thread_id provided
 
     Example:
@@ -50,17 +52,17 @@ class LangGraphAgentAdapter(AgentAdapter):
         graph = StateGraph(...)
         compiled_graph = graph.compile()
 
-        wrapper = LangGraphAgentAdapter(compiled_graph, "agent_name")
-        result = wrapper.run("What's the weather?")
+        agent_adapter = LangGraphAgentAdapter(compiled_graph, "agent_name")
+        result = agent_adapter.run("What's the weather?")
 
         # Access message history
-        for msg in wrapper.get_messages():
+        for msg in agent_adapter.get_messages():
             print(msg['role'], msg['content'])
         ```
     """
 
     def __init__(self, agent_instance, name: str, callbacks=None, config=None):
-        """Initialize the LangGraph wrapper.
+        """Initialize the LangGraph adapter.
 
         Args:
             agent_instance: Compiled LangGraph graph
@@ -103,85 +105,6 @@ def get_messages(self) -> MessageHistory:
         # No messages available
         return MessageHistory()
 
-    def set_message_history(self, history: MessageHistory) -> None:
-        """Set message history for langgraph.
-
-        For stateless graphs, updates the cached result.
-        For stateful graphs, this is not fully supported as LangGraph manages state internally.
-
-        Args:
-            history: MASEval MessageHistory to set
-        """
-        _check_langgraph_installed()
-        from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage
-
-        # Convert MessageHistory to LangChain messages
-        lc_messages = []
-        for msg in history:
-            role = msg.get("role", "assistant")
-            content = msg.get("content", "")
-
-            if role == "user":
-                lc_messages.append(HumanMessage(content=content))
-            elif role == "assistant":
-                lc_messages.append(AIMessage(content=content))
-            elif role == "system":
-                lc_messages.append(SystemMessage(content=content))
-            elif role == "tool":
-                tool_call_id = msg.get("tool_call_id", "")
-                lc_messages.append(ToolMessage(content=content, tool_call_id=tool_call_id))
-
-        # Update cached result
-        self._last_result = {"messages": lc_messages}
-
-        # Also update base class cache
-        super().set_message_history(history)
-
-    def clear_message_history(self) -> None:
-        """Clear message history for langgraph.
-
-        Clears the cached result. For stateful graphs, this doesn't clear
-        the persistent state in the checkpointer.
-        """
-        self._last_result = None
-        super().clear_message_history()
-
-    def append_to_message_history(self, role: str, content: Any, **kwargs) -> None:
-        """Append message to history.
-
-        For stateless graphs, this appends to the cached result.
-        For stateful graphs, messages are managed by LangGraph during invoke().
-
-        Args:
-            role: Message role
-            content: Message content (string or list)
-            **kwargs: Additional message fields
-        """
-        _check_langgraph_installed()
-        from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
-
-        # Get current messages
-        current_messages = []
-        if self._last_result and "messages" in self._last_result:
-            current_messages = self._last_result["messages"]
-
-        # Create new message
-        if role == "user":
-            new_msg = HumanMessage(content=str(content))
-        elif role == "assistant":
-            new_msg = AIMessage(content=str(content))
-        elif role == "system":
-            new_msg = SystemMessage(content=str(content))
-        else:
-            new_msg = AIMessage(content=str(content))
-
-        # Append and update cache
-        current_messages.append(new_msg)
-        self._last_result = {"messages": current_messages}
-
-        # Also update base class cache
-        super().append_to_message_history(role, content, **kwargs)
-
     def gather_config(self) -> dict[str, Any]:
         """Gather configuration from this LangGraph agent.
 
@@ -191,7 +114,7 @@ def gather_config(self) -> dict[str, Any]:
             - gathered_at: ISO timestamp
             - name: Agent name
             - agent_type: CompiledGraph or similar
-            - wrapper_type: LangGraphAgentAdapter
+            - adapter_type: LangGraphAgentAdapter
             - callbacks: List of callback class names
             - has_checkpointer: Whether the graph has state persistence
             - config: LangGraph config dict (with sensitive data removed)
@@ -239,27 +162,101 @@ def _run_agent(self, query: str) -> Any:
         _check_langgraph_installed()
         from langchain_core.messages import HumanMessage
 
-        # Initialize the state with the user query
-        initial_state = {"messages": [HumanMessage(content=query)]}
+        start_time = time.time()
+        timestamp = datetime.now().isoformat()
+
+        try:
+            # Initialize the state with the user query
+            initial_state = {"messages": [HumanMessage(content=query)]}
+
+            # Invoke the graph (with config if provided)
+            if self._langgraph_config:
+                result = self.agent.invoke(initial_state, config=self._langgraph_config)
+            else:
+                result = self.agent.invoke(initial_state)
+
+            # Cache the result for stateless graphs
+            self._last_result = result
+            duration = time.time() - start_time
+
+            # Log successful execution
+            log_entry: Dict[str, Any] = {
+                "timestamp": timestamp,
+                "query": query,
+                "query_length": len(query),
+                "duration_seconds": duration,
+                "status": "success",
+            }
+
+            # Extract state information if available
+            if isinstance(result, dict):
+                log_entry["state_keys"] = list(result.keys())
+                messages = result.get("messages", [])
+                log_entry["message_count"] = len(messages) if messages else 0
+
+                # Try to extract token usage from messages if available
+                # (LangChain messages may have usage_metadata)
+                total_input_tokens = 0
+                total_output_tokens = 0
+                for msg in messages:
+                    if hasattr(msg, "usage_metadata") and msg.usage_metadata:
+                        # usage_metadata can be dict or object
+                        if isinstance(msg.usage_metadata, dict):
+                            total_input_tokens += msg.usage_metadata.get("input_tokens", 0)
+                            total_output_tokens += msg.usage_metadata.get("output_tokens", 0)
+                        else:
+                            total_input_tokens += getattr(msg.usage_metadata, "input_tokens", 0)
+                            total_output_tokens += getattr(msg.usage_metadata, "output_tokens", 0)
+
+                if total_input_tokens > 0 or total_output_tokens > 0:
+                    log_entry["input_tokens"] = total_input_tokens
+                    log_entry["output_tokens"] = total_output_tokens
+                    log_entry["total_tokens"] = total_input_tokens + total_output_tokens
+
+            # For stateful graphs with checkpointer, get state snapshot metadata
+            if self._langgraph_config and hasattr(self.agent, "get_state"):
+                try:
+                    state_snapshot = self.agent.get_state(self._langgraph_config)
+                    if state_snapshot.metadata:
+                        log_entry["checkpoint_metadata"] = {
+                            "source": state_snapshot.metadata.get("source"),
+                            "step": state_snapshot.metadata.get("step"),
+                        }
+                    if state_snapshot.created_at:
+                        log_entry["checkpoint_created_at"] = state_snapshot.created_at
+                except Exception:
+                    # If get_state fails, just skip metadata
+                    pass
+
+            self.logs.append(log_entry)
+
+            # Extract and return the final answer from the graph's result
+            # LangGraph typically returns dict with 'messages' key, extract the last AI message
+            messages = result.get("messages", [])
+            if messages:
+                last_message = messages[-1]
+                # Return the content of the last message as the final answer
+                return getattr(last_message, "content", str(last_message))
+
+            return None
+
+        except Exception as e:
+            duration = time.time() - start_time
+
+            # Log failed execution
+            self.logs.append(
+                {
+                    "timestamp": timestamp,
+                    "query": query,
+                    "query_length": len(query),
+                    "duration_seconds": duration,
+                    "status": "error",
+                    "error": str(e),
+                    "error_type": type(e).__name__,
+                }
+            )
 
-        # Invoke the graph (with config if provided)
-        if self._langgraph_config:
-            result = self.agent.invoke(initial_state, config=self._langgraph_config)
-        else:
-            result = self.agent.invoke(initial_state)
-
-        # Cache the result for stateless graphs
-        self._last_result = result
-
-        # Extract and return the final answer from the graph's result
-        # LangGraph typically returns dict with 'messages' key, extract the last AI message
-        messages = result.get("messages", [])
-        if messages:
-            last_message = messages[-1]
-            # Return the content of the last message as the final answer
-            return getattr(last_message, "content", str(last_message))
-
-        return None
+            raise
 
     def _convert_langchain_messages(self, lc_messages: list) -> MessageHistory:
         """Convert LangChain messages to MASEval MessageHistory format.
diff --git a/maseval/interface/agents/smolagents.py b/maseval/interface/agents/smolagents.py
index 1650b468..7cab9a15 100644
--- a/maseval/interface/agents/smolagents.py
+++ b/maseval/interface/agents/smolagents.py
@@ -4,7 +4,7 @@
     pip install maseval[smolagents]
 """
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Dict, List
 
 from maseval import AgentAdapter, MessageHistory, User
 
@@ -31,7 +31,7 @@ class SmolAgentAdapter(AgentAdapter):
 
     Requires smolagents to be installed.
 
-    This wrapper converts smolagents' internal message format to MASEval's
+    This adapter converts smolagents' internal message format to MASEval's
     OpenAI-compatible MessageHistory format. It automatically tracks tool calls,
     tool responses, and agent reasoning.
 
@@ -41,15 +41,168 @@ class SmolAgentAdapter(AgentAdapter):
         from smolagents import MultiStepAgent
 
         agent = MultiStepAgent(...)
-        wrapper = SmolAgentAdapter(agent)
-        result = wrapper.run("What's the weather?")
+        agent_adapter = SmolAgentAdapter(agent)
+        result = agent_adapter.run("What's the weather?")
 
         # Access message history
-        for msg in wrapper.get_messages():
+        for msg in agent_adapter.get_messages():
             print(msg['role'], msg['content'])
         ```
     """
 
+    def __init__(self, agent_instance, name: str, callbacks=None):
+        """Initialize the Smolagent adapter.
+
+        Note: We don't call super().__init__() to avoid initializing self.logs as a list,
+        since we override it as a property that dynamically fetches from agent.memory.
+        """
+        self.agent = agent_instance
+        self.name = name
+        self.callbacks = callbacks or []
+        self.messages = None
+
+    @property
+    def logs(self) -> List[Dict[str, Any]]:  # type: ignore[override]
+        """Dynamically generate logs from smolagents' internal memory.
+
+        Converts smolagents' ActionStep and PlanningStep objects into log entries
+        compatible with the AgentAdapter contract, including all available properties.
+
+        Returns:
+            List of log dictionaries with comprehensive step information
+        """
+        _check_smolagents_installed()
+        from smolagents.memory import ActionStep, PlanningStep, TaskStep
+
+        logs_list: List[Dict[str, Any]] = []
+
+        if not hasattr(self.agent, "memory") or not hasattr(self.agent.memory, "steps"):
+            return logs_list
+
+        for step in self.agent.memory.steps:
+            if isinstance(step, ActionStep):
+                log_entry: Dict[str, Any] = {
+                    "step_type": "ActionStep",
+                    "step_number": step.step_number,
+                    "status": "error" if step.error else "success",
+                }
+
+                # Timing information
+                if hasattr(step, "timing") and step.timing:
+                    log_entry["start_time"] = step.timing.start_time
+                    log_entry["end_time"] = step.timing.end_time
+                    log_entry["duration_seconds"] = step.timing.duration
+
+                # Token usage information
+                if hasattr(step, "token_usage") and step.token_usage:
+                    log_entry["input_tokens"] = step.token_usage.input_tokens
+                    log_entry["output_tokens"] = step.token_usage.output_tokens
+                    log_entry["total_tokens"] = step.token_usage.total_tokens
+
+                # Model input messages - convert to MASEval format
+                if hasattr(step, "model_input_messages") and step.model_input_messages:
+                    log_entry["model_input_messages"] = self._convert_smolagents_messages(step.model_input_messages).to_list()
+
+                # Tool calls (ToolCall objects)
+                if hasattr(step, "tool_calls") and step.tool_calls:
+                    log_entry["tool_calls"] = [
+                        {
+                            "id": tc.id,
+                            "name": tc.name,
+                            "arguments": tc.arguments,
+                        }
+                        for tc in step.tool_calls
+                    ]
+
+                # Error information
+                if step.error:
+                    log_entry["error"] = str(step.error)
+                    log_entry["error_type"] = type(step.error).__name__
+
+                # Model output message - convert to MASEval format
+                if hasattr(step, "model_output_message") and step.model_output_message:
+                    converted = self._convert_smolagents_messages([step.model_output_message])
+                    if len(converted) > 0:
+                        log_entry["model_output_message"] = converted[0]
+
+                # Model output (raw)
+                if hasattr(step, "model_output") and step.model_output is not None:
+                    log_entry["model_output"] = step.model_output
+
+                # Code action (for CodeAgent)
+                if hasattr(step, "code_action") and step.code_action:
+                    log_entry["code_action"] = step.code_action
+
+                # Observations
+                if hasattr(step, "observations") and step.observations:
+                    log_entry["observations"] = step.observations
+
+                # Observations images
+                if hasattr(step, "observations_images") and step.observations_images:
+                    log_entry["observations_images_count"] = len(step.observations_images)
+
+                # Action output
+                if hasattr(step, "action_output") and step.action_output is not None:
+                    # NOTE(review): plain assignment cannot raise, so the str() fallback below never runs; JSON-serializability is not checked here
+                    try:
+                        log_entry["action_output"] = step.action_output
+                    except (TypeError, ValueError):
+                        log_entry["action_output"] = str(step.action_output)
+
+                # Is final answer flag
+                if hasattr(step, "is_final_answer"):
+                    log_entry["is_final_answer"] = step.is_final_answer
+
+                logs_list.append(log_entry)
+
+            elif isinstance(step, PlanningStep):
+                log_entry = {
+                    "step_type": "PlanningStep",
+                }
+
+                # Timing information
+                if hasattr(step, "timing") and step.timing:
+                    log_entry["start_time"] = step.timing.start_time
+                    log_entry["end_time"] = step.timing.end_time
+                    log_entry["duration_seconds"] = step.timing.duration
+
+                # Token usage information
+                if hasattr(step, "token_usage") and step.token_usage:
+                    log_entry["input_tokens"] = step.token_usage.input_tokens
+                    log_entry["output_tokens"] = step.token_usage.output_tokens
+                    log_entry["total_tokens"] = step.token_usage.total_tokens
+
+                # Model input messages - convert to MASEval format
+                if hasattr(step, "model_input_messages") and step.model_input_messages:
+                    log_entry["model_input_messages"] = self._convert_smolagents_messages(step.model_input_messages).to_list()
+
+                # Model output message - convert to MASEval format
+                if hasattr(step, "model_output_message") and step.model_output_message:
+                    converted = self._convert_smolagents_messages([step.model_output_message])
+                    if len(converted) > 0:
+                        log_entry["model_output_message"] = converted[0]
+
+                # Plan
+                if hasattr(step, "plan") and step.plan:
+                    log_entry["plan"] = step.plan
+
+                logs_list.append(log_entry)
+
+            elif isinstance(step, TaskStep):
+                # Log task initiation
+                log_entry = {
+                    "step_type": "TaskStep",
+                    "task": step.task,
+                }
+
+                # Task images if present
+                if hasattr(step, "task_images") and step.task_images:
+                    log_entry["task_images_count"] = len(step.task_images)
+
+                logs_list.append(log_entry)
+
+        return logs_list
+
     def gather_traces(self) -> dict:
         """Gather traces including message history and monitoring data.
 
@@ -137,7 +290,7 @@ def gather_config(self) -> dict[str, Any]:
             - gathered_at: ISO timestamp
             - name: Agent name
             - agent_type: Underlying agent class name
-            - wrapper_type: SmolAgentAdapter
+            - adapter_type: SmolAgentAdapter
             - callbacks: List of callback class names
             - smolagents_config: Full configuration from agent.to_dict() including:
                 - model: Model configuration with class and parameters
@@ -198,61 +351,14 @@ def get_messages(self) -> MessageHistory:
         # Convert and return
         return self._convert_smolagents_messages(smol_messages)
 
-    def set_message_history(self, history: MessageHistory) -> None:
-        """Set message history - NOT SUPPORTED by smolagents.
-
-        Args:
-            history: MASEval MessageHistory to set
-
-        Raises:
-            NotImplementedError: smolagents doesn't support arbitrary message injection
-        """
-        raise NotImplementedError(
-            "smolagents doesn't support setting arbitrary message history. "
-            "The agent's memory is built from execution steps and cannot be directly manipulated. "
-            "Use clear_message_history() to reset, then run() to generate new conversation."
-        )
-
-    def clear_message_history(self) -> None:
-        """Clear message history by resetting smolagents memory."""
-        _check_smolagents_installed()
-        from smolagents.memory import AgentMemory
-
-        # Get system prompt before clearing
-        system_prompt = ""
-        if hasattr(self.agent, "memory") and hasattr(self.agent.memory, "system_prompt"):
-            system_prompt = self.agent.memory.system_prompt
-
-        # Reset memory
-        self.agent.memory = AgentMemory(system_prompt=system_prompt)
-
-        # Also clear base class cache
-        super().clear_message_history()
-
-    def append_to_message_history(self, role: str, content: Any, **kwargs) -> None:
-        """Append message to history - NOT SUPPORTED by smolagents.
-
-        Args:
-            role: Message role
-            content: Message content (string or list)
-            **kwargs: Additional message fields
-
-        Raises:
-            NotImplementedError: smolagents doesn't support arbitrary message injection
-        """
-        raise NotImplementedError(
-            "smolagents doesn't support appending arbitrary messages to history. "
-            "The agent's memory is built from execution steps and cannot be directly manipulated. "
-            "Use run() to generate conversation messages."
-        )
-
     def _run_agent(self, query: str) -> str:
         _check_smolagents_installed()
 
         # Run the agent (this updates the agent's internal memory and returns the final answer)
+        # All execution details are tracked in agent.memory.steps automatically
         final_answer = self.agent.run(query)
 
-        # Return the final answer (traces are captured via get_messages())
+        # Return the final answer (traces are captured via get_messages() and gather_traces())
         return final_answer
 
     def _convert_smolagents_messages(self, smol_messages: list) -> MessageHistory:
diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py
index 04273dfb..d30989f2 100644
--- a/maseval/interface/inference/google_genai.py
+++ b/maseval/interface/inference/google_genai.py
@@ -44,7 +44,7 @@ def _extract_text(self, response: Any) -> str:
             if "candidates" in response and response["candidates"]:
                 return response["candidates"][0].get("content", "")
             if "output" in response and isinstance(response["output"], list) and response["output"]:
-                # some wrappers return a list of text chunks
+                # some implementations return a list of text chunks
                 first = response["output"][0]
                 if isinstance(first, dict):
                     return first.get("content", "")
diff --git a/tests/README.md b/tests/README.md
index 1746cd3f..269e1e56 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -101,7 +101,7 @@ Tests are organized into three directories following a **bottom-up and top-down
 Examples:
 
 - `test_model_adapter.py` - Base `ModelAdapter` class behavior
-- `test_agent_wrapper.py` - Base `AgentAdapter` class behavior
+- `test_agent_adapter.py` - Base `AgentAdapter` class behavior
 - `test_benchmark_lifecycle.py` - Core benchmark orchestration
 
 ### `test_interface/`
@@ -110,7 +110,7 @@ Examples:
 
 Examples:
 
-- `test_agent_integration/` - Framework-specific agent wrappers
+- `test_agent_integration/` - Framework-specific agent adapters
 - `test_model_integration/` - Provider-specific model adapters (OpenAI, Google, HuggingFace, LiteLLM)
 
 ### `test_contract/`
@@ -119,7 +119,7 @@ Examples:
 
 Contract tests use parametrized tests to verify that all implementations (e.g., different framework adapters) behave identically for key operations:
 
-- `test_agent_wrapper_contract.py` - All `AgentAdapter` implementations return same message format, trigger callbacks uniformly
+- `test_agent_adapter_contract.py` - All `AgentAdapter` implementations return same message format, trigger callbacks uniformly
 - `test_model_adapter_contract.py` - All `ModelAdapter` implementations log calls identically, produce same trace/config structure (65+ parameterized tests)
 - `test_collection_contract.py` - All components (Agent, Model, Environment, User) follow same tracing/config contracts
 
diff --git a/tests/TESTING_PLAN.md b/tests/TESTING_PLAN.md
index 1109bf0b..fc3292af 100644
--- a/tests/TESTING_PLAN.md
+++ b/tests/TESTING_PLAN.md
@@ -45,7 +45,7 @@ Every benchmark execution follows: **Setup → Run → Evaluate**
 
 - `setup_environment()` → creates isolated task environment
 - `setup_user()` → optional user simulator
-- `setup_agents()` → instantiates agent wrappers
+- `setup_agents()` → instantiates agent adapters
 - `run_agents()` → executes multi-agent system
 - Message collection and `evaluate()` → assessment
 
@@ -110,7 +110,7 @@ All core functionality is fully tested. See individual test files in `tests/test
 - `test_message_history.py` (14 tests) - Message history interface and operations
 - `test_trace_collection.py` (10 tests) - Trace gathering from all components
 - `test_config_collection.py` (11 tests) - Configuration collection for reproducibility
-- `test_agent_wrapper.py` (8 tests) - Agent wrapper base functionality
+- `test_agent_adapter.py` (8 tests) - Agent adapter base functionality
 - `test_environment.py` (7 tests) - Environment state management and tools
 - `test_user_simulator.py` (5 tests) - User simulation for collaborative benchmarks
 - `test_model_adapter.py` (36 tests) - Model adapter comprehensive testing
@@ -125,7 +125,7 @@ All core functionality is fully tested. See individual test files in `tests/test
 
 All contract tests validate cross-implementation consistency. See individual test files in `tests/test_contract/` for complete contract guarantees:
 
-- `test_agent_wrapper_contract.py` (11 tests) - Framework-agnostic agent wrapper contract
+- `test_agent_adapter_contract.py` (11 tests) - Framework-agnostic agent adapter contract
 - `test_collection_contract.py` (20 tests) - Universal tracing and config contract
 - `test_model_adapter_contract.py` (16 tests) - Model provider abstraction contract
 
@@ -187,11 +187,11 @@ Test file: `tests/test_core/test_config_collection.py`
 
 **Why:** Reproducibility depends on comprehensive config capture.
 
-#### 5. **Agent Wrapper Tests** ✅ FULLY IMPLEMENTED
+#### 5. **Agent Adapter Tests** ✅ FULLY IMPLEMENTED
 
 **Status:** ✅ **COMPLETE** - 8 tests implemented
 
-Test file: `tests/test_core/test_agent_wrapper.py`
+Test file: `tests/test_core/test_agent_adapter.py`
 
 **What is tested:** See test file for complete list. Tests cover callback triggering, message history operations (get/set/clear/append), trace collection, and config gathering.
 
@@ -314,7 +314,7 @@ Test file: `tests/test_core/test_benchmark_integration.py` (proposed)
 
 **Status:** ✅ **COMPLETE** - 11 tests implemented
 
-Test file: `tests/test_contract/test_agent_wrapper_contract.py`
+Test file: `tests/test_contract/test_agent_adapter_contract.py`
 
 **Purpose:** Validates that ALL AgentAdapter implementations (smolagents, langgraph, dummy) honor the same behavioral contract and behave identically for key operations. This is MASEval's **CORE PROMISE** - framework-agnostic agent abstraction.
 
@@ -393,7 +393,7 @@ Test files:
 
 **Smolagents (10 tests):**
 
-- Wrapper/user creation and import guards
+- Adapter creation and import guards
 - Trace gathering with/without monitoring
 - Trace gathering with planning steps
 - Message manipulation support (not supported)
@@ -401,7 +401,7 @@ Test files:
 
 **LangGraph (5 tests):**
 
-- Wrapper import and availability checks
+- Adapter import and availability checks
 - Message manipulation with/without system messages
 
 **Why:** Validates framework-specific adapters work correctly with their respective libraries and handle framework-specific features properly.
@@ -421,7 +421,7 @@ Test files:
 
 **Thread Safety and Concurrency:**
 
-- `test_wrapper_concurrent_runs()` - Multiple threads calling run() simultaneously
+- `test_adapter_concurrent_runs()` - Multiple threads calling run() simultaneously
 - `test_trace_collection_thread_safety()` - Trace accumulation in concurrent execution
 - `test_callback_thread_safety()` - Callbacks triggered from multiple threads
 
@@ -526,13 +526,13 @@ Each test file should:
 
 ### P1 (Should Have - High Value) ✅ ALL COMPLETE
 
-6. ✅ Agent wrapper tests
+6. ✅ Agent adapter tests
 7. ✅ Environment tests
 8. ✅ Callback orchestration tests
 9. ✅ Task collection tests
 10. ✅ Evaluator tests
 11. ✅ Result logger callbacks
-12. ✅ Contract tests (agent wrapper, collection, model adapter)
+12. ✅ Contract tests (agent adapter, collection, model adapter)
 
 ### P2 (Nice to Have - Completeness) ✅ ALL COMPLETE
 
@@ -561,7 +561,7 @@ Shared fixtures implemented in `tests/conftest.py`:
 
 - `dummy_model` - DummyModelAdapter with configurable responses
 - `dummy_agent` - DummyAgent that tracks calls
-- `dummy_agent_wrapper` - DummyAgentAdapter with message history
+- `dummy_agent_adapter` - DummyAgentAdapter with message history
 - `dummy_environment` - DummyEnvironment with state management
 - `dummy_user` - DummyUser for simulation testing
 - `dummy_task` - Single Task instance
@@ -614,7 +614,7 @@ pytest -x --ff  # Stop on first failure, run previous failures first
 
 - ✅ **Test Count:** 333 tests implemented across 23 test files
 - ✅ **Core Coverage:** All P0 (4/5), P1 (7/7), and P2 (6/6) tests complete
-- ✅ **Contract Coverage:** All contract tests implemented (agent wrapper, collection, model adapter)
+- ✅ **Contract Coverage:** All contract tests implemented (agent adapter, collection, model adapter)
 - ✅ **Interface Coverage:** All adapter integration tests complete (agents + models)
 - 🟡 **Runtime:** Not yet measured
 - 🟡 **Reliability:** Not yet run in CI
@@ -639,14 +639,14 @@ pytest -x --ff  # Stop on first failure, run previous failures first
 
 ### Phase 2 (Week 2): Core Coverage ✅ COMPLETE
 
-- ✅ Implement P1 tests (agent wrapper, environment, callbacks, tasks, evaluator)
+- ✅ Implement P1 tests (agent adapter, environment, callbacks, tasks, evaluator)
 - ✅ Add callback orchestration tests
 - ✅ Message tracing callback specialized tests
 - ✅ Automatic registration tests
 
 ### Phase 3 (Week 3): Interface Coverage ✅ COMPLETE
 
-- ✅ Contract tests (agent wrapper, collection, model adapter - 47 tests)
+- ✅ Contract tests (agent adapter, collection, model adapter - 47 tests)
 - ✅ Smolagents integration (10 tests)
 - ✅ LangGraph integration (5 tests)
 - ✅ Model adapter integrations (22 tests across 4 providers)
diff --git a/tests/conftest.py b/tests/conftest.py
index 81ef463e..399eb916 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -92,19 +92,33 @@ def __str__(self):
 
 
 class DummyAgentAdapter(AgentAdapter):
-    """Test agent wrapper that populates message history."""
+    """Test agent adapter that populates message history."""
 
     def _run_agent(self, query: str) -> str:
-        # Create message history
-        history = MessageHistory()
-        history.add_message(role="user", content=query)
+        import time
+
+        # Track timing
+        start_time = time.time()
 
         # Run underlying agent
         response = self.agent.run(query)
-        history.add_message(role="assistant", content=response)
 
-        # Store history
-        self.set_message_history(history)
+        # Store history directly
+        if self.messages is None:
+            self.messages = MessageHistory()
+        self.messages.add_message(role="user", content=query)
+        self.messages.add_message(role="assistant", content=response)
+
+        # Populate logs to fulfill contract
+        duration = time.time() - start_time
+        self.logs.append(
+            {
+                "query": query,
+                "duration_seconds": duration,
+                "status": "success",
+                "response": response,
+            }
+        )
 
         # Return final answer (not the history)
         return response
@@ -199,8 +213,8 @@ def setup_agents(
     ) -> Tuple[Sequence[AgentAdapter], Dict[str, AgentAdapter]]:
         self.setup_agents_calls.append((agent_data, environment, task, user))
         agent = DummyAgent()
-        wrapper = DummyAgentAdapter(agent, "test_agent")
-        return [wrapper], {"test_agent": wrapper}
+        agent_adapter = DummyAgentAdapter(agent, "test_agent")
+        return [agent_adapter], {"test_agent": agent_adapter}
 
     def setup_evaluators(
         self, environment: Environment, task: Task, agents: Sequence[AgentAdapter], user: Optional[User]
@@ -240,8 +254,8 @@ def dummy_agent():
 
 
 @pytest.fixture
-def dummy_agent_wrapper(dummy_agent):
-    """Create a dummy agent wrapper."""
+def dummy_agent_adapter(dummy_agent):
+    """Create a dummy agent adapter."""
     return DummyAgentAdapter(dummy_agent, "test_agent")
 
 
diff --git a/tests/test_contract/test_agent_wrapper_contract.py b/tests/test_contract/test_agent_adapter_contract.py
similarity index 70%
rename from tests/test_contract/test_agent_wrapper_contract.py
rename to tests/test_contract/test_agent_adapter_contract.py
index 6cd32902..40fa4dbf 100644
--- a/tests/test_contract/test_agent_wrapper_contract.py
+++ b/tests/test_contract/test_agent_adapter_contract.py
@@ -115,7 +115,7 @@ def agent_node(state: State) -> State:
 
             response = mock_llm([{"role": "user", "content": user_msg}])
 
-            # Return LangChain-style message objects so the wrapper conversion works
+            # Return LangChain-style message objects so the adapter conversion works
             return {"messages": messages + [AIMessage(content=response)]}
 
         # Build graph
@@ -130,8 +130,8 @@ def agent_node(state: State) -> State:
         raise ValueError(f"Unknown framework: {framework}")
 
 
-def create_wrapper_for_framework(framework: str, agent, callbacks: Optional[List[AgentCallback]] = None):
-    """Create a framework-specific wrapper instance."""
+def create_adapter_for_framework(framework: str, agent, callbacks: Optional[List[AgentCallback]] = None):
+    """Create a framework-specific adapter instance."""
     # Verify agent is not None and is the expected type for the framework
     assert agent is not None, f"Agent instance is None for framework: {framework}"
 
@@ -172,7 +172,7 @@ def create_wrapper_for_framework(framework: str, agent, callbacks: Optional[List
 class TestAgentAdapterContract:
     """Verify all AgentAdapter implementations honor the same contract."""
 
-    def test_wrapper_run_returns_same_structure(self, framework):
+    def test_adapter_run_returns_same_structure(self, framework):
         """Test all frameworks return string result and populate message history.
 
         Contract: run() must return a string (the final answer) and populate
@@ -180,16 +180,16 @@ def test_wrapper_run_returns_same_structure(self, framework):
         """
         mock_llm = MockLLM(responses=["Test response to query"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent)
+        adapter = create_adapter_for_framework(framework, agent)
 
-        result = wrapper.run("Test query")
+        result = adapter.run("Test query")
 
         # All should return string (final answer)
         assert isinstance(result, str)
         assert len(result) > 0
 
         # All should populate message history identically
-        history = wrapper.get_messages()
+        history = adapter.get_messages()
         assert len(history) > 0
 
         # Some frameworks (smolagents) prepend a system message; accept either.
@@ -198,7 +198,7 @@ def test_wrapper_run_returns_same_structure(self, framework):
         # Ensure at least one assistant/tool message exists somewhere in the history
         assert any(msg.get("role") in ["assistant", "tool"] for msg in history)
 
-    def test_wrapper_message_format_identical(self, framework):
+    def test_adapter_message_format_identical(self, framework):
         """Test all frameworks produce OpenAI-compatible message format.
 
         Contract: All messages must have 'role' and 'content' keys, matching
@@ -206,10 +206,10 @@ def test_wrapper_message_format_identical(self, framework):
         """
         mock_llm = MockLLM(responses=["Response content"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent)
+        adapter = create_adapter_for_framework(framework, agent)
 
-        wrapper.run("Test query")
-        history = wrapper.get_messages()
+        adapter.run("Test query")
+        history = adapter.get_messages()
 
         # Verify OpenAI format
         for msg in history:
@@ -219,18 +219,18 @@ def test_wrapper_message_format_identical(self, framework):
             allowed = {"user", "assistant", "system", "tool"}
             assert role in allowed or role.startswith("tool"), f"Invalid role: {msg['role']}"
 
-    def test_wrapper_callbacks_triggered_uniformly(self, framework):
+    def test_adapter_callbacks_triggered_uniformly(self, framework):
         """Test callbacks fire in same order across all frameworks.
 
         Contract: on_run_start and on_run_end callbacks must fire in the
-        correct order (start before run, end after run) for all wrappers.
+        correct order (start before run, end after run) for all adapters.
         """
         callback_tracker = CallbackTracker()
         mock_llm = MockLLM(responses=["Response"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent, callbacks=[callback_tracker])
+        adapter = create_adapter_for_framework(framework, agent, callbacks=[callback_tracker])
 
-        wrapper.run("Test query")
+        adapter.run("Test query")
 
         # All frameworks should trigger same callback sequence
         assert "on_agent_start" in callback_tracker.events
@@ -238,20 +238,20 @@ def test_wrapper_callbacks_triggered_uniformly(self, framework):
         assert callback_tracker.events[0] == "on_agent_start"
         assert callback_tracker.events[-1] == "on_agent_end"
 
-    def test_wrapper_traces_same_structure(self, framework):
+    def test_adapter_traces_same_structure(self, framework):
         """Test gather_traces returns consistent structure across frameworks.
 
-        Contract: All wrappers must provide message history in traces, enabling
+        Contract: All adapters must provide message history in traces, enabling
         uniform access to execution data regardless of underlying framework.
         """
         mock_llm = MockLLM(responses=["Response"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent)
+        adapter = create_adapter_for_framework(framework, agent)
 
-        wrapper.run("Test query")
-        traces = wrapper.gather_traces()
+        adapter.run("Test query")
+        traces = adapter.gather_traces()
 
-        # All should include message history; different wrappers name this key
+        # All should include message history; different adapters name this key
         if "message_history" in traces:
             messages = traces["message_history"]
         else:
@@ -260,24 +260,24 @@ def test_wrapper_traces_same_structure(self, framework):
         assert isinstance(messages, list)
         assert len(messages) > 0
 
-    def test_wrapper_config_same_structure(self, framework):
+    def test_adapter_config_same_structure(self, framework):
         """Test gather_config returns consistent structure across frameworks.
 
-        Contract: All wrappers must provide agent name in config, enabling
+        Contract: All adapters must provide agent name in config, enabling
         identification and reproducibility tracking.
         """
         mock_llm = MockLLM(responses=["Response"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent)
+        adapter = create_adapter_for_framework(framework, agent)
 
-        config = wrapper.gather_config()
+        config = adapter.gather_config()
 
         # All should include agent name
         assert "agent_name" in config or "name" in config
         # All should include some identifying information
         assert len(config) > 0
 
-    def test_wrapper_get_messages_after_multiple_runs(self, framework):
+    def test_adapter_get_messages_after_multiple_runs(self, framework):
         """Test message history accumulation across multiple agent runs.
 
         Contract: Message history behavior during multi-turn conversations must
@@ -285,24 +285,24 @@ def test_wrapper_get_messages_after_multiple_runs(self, framework):
         """
         mock_llm = MockLLM(responses=["First response", "Second response"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent)
+        adapter = create_adapter_for_framework(framework, agent)
 
         # First run
-        wrapper.run("First query")
-        history_1 = wrapper.get_messages()
+        adapter.run("First query")
+        history_1 = adapter.get_messages()
         len_1 = len(history_1)
         assert len_1 > 0
 
         # Second run (behavior may differ: some accumulate, some reset)
-        wrapper.run("Second query")
-        history_2 = wrapper.get_messages()
+        adapter.run("Second query")
+        history_2 = adapter.get_messages()
         len_2 = len(history_2)
 
         # At minimum, should have messages from second run
         assert len_2 > 0
         # Note: We don't enforce accumulation vs reset - that's framework-specific
 
-    def test_wrapper_empty_query_handling(self, framework):
+    def test_adapter_empty_query_handling(self, framework):
         """All frameworks handle empty queries gracefully.
 
         Note: This test accepts both success and failure for empty queries.
@@ -312,21 +312,21 @@ def test_wrapper_empty_query_handling(self, framework):
         """
         mock_llm = MockLLM(responses=["Response to empty"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent)
+        adapter = create_adapter_for_framework(framework, agent)
 
         # Should not crash on empty query
         try:
-            result = wrapper.run("")
+            result = adapter.run("")
             # If it succeeds, should return something
             assert result is not None
         except (ValueError, AssertionError):
             # It's acceptable to reject empty queries
             pass
 
-    def test_wrapper_on_event_callback(self, framework):
+    def test_adapter_on_event_callback(self, framework):
         """Test that standard callback hooks fire consistently across frameworks.
 
-        Contract: All wrappers must fire on_run_start and on_run_end callbacks.
+        Contract: All adapters must fire on_run_start and on_run_end callbacks.
         The on_event hook is optional for custom events.
         """
         events = []
@@ -343,20 +343,20 @@ def on_run_end(self, agent, result):
 
         mock_llm = MockLLM(responses=["Response"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent, callbacks=[EventTracker()])
+        adapter = create_adapter_for_framework(framework, agent, callbacks=[EventTracker()])
 
-        wrapper.run("Test query")
+        adapter.run("Test query")
 
         # Verify standard callbacks fired
         event_types = [e[0] for e in events]
         assert "on_run_start" in event_types
         assert "on_run_end" in event_types
 
-        # Note: on_event() is a generic hook that wrappers can use to emit custom events.
+        # Note: on_event() is a generic hook that adapters can use to emit custom events.
         # The base AgentAdapter doesn't emit any events by default, but the callback
-        # mechanism should work if wrappers choose to use it.
+        # mechanism should work if adapters choose to use it.
 
-    def test_wrapper_callback_lifecycle_order(self, framework):
+    def test_adapter_callback_lifecycle_order(self, framework):
         """Test callbacks fire in correct lifecycle order with proper state.
 
         Contract: on_run_start fires before execution with initial state,
@@ -379,9 +379,9 @@ def on_run_end(self, agent, result):
 
         mock_llm = MockLLM(responses=["Test response"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent, callbacks=[LifecycleTracker()])
+        adapter = create_adapter_for_framework(framework, agent, callbacks=[LifecycleTracker()])
 
-        result = wrapper.run("Test query")
+        result = adapter.run("Test query")
 
         # Verify callback order
         assert len(lifecycle_events) == 2
@@ -399,7 +399,7 @@ def on_run_end(self, agent, result):
         # Verify result is passed to on_run_end
         assert lifecycle_events[1][2] == result
 
-    def test_wrapper_multiple_callbacks(self, framework):
+    def test_adapter_multiple_callbacks(self, framework):
         """Test multiple callbacks execute in registration order.
 
         Contract: When multiple callbacks are registered, they must execute
@@ -423,9 +423,9 @@ def on_run_end(self, agent, result):
 
         mock_llm = MockLLM(responses=["Response"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent, callbacks=[FirstCallback(), SecondCallback()])
+        agent_adapter = create_adapter_for_framework(framework, agent, callbacks=[FirstCallback(), SecondCallback()])
 
-        wrapper.run("Test query")
+        agent_adapter.run("Test query")
 
         # Verify all callbacks fired
         assert len(call_order) == 4
@@ -433,35 +433,101 @@ def on_run_end(self, agent, result):
         # Verify order: all on_run_start before any on_run_end
         assert call_order == ["first_start", "second_start", "first_end", "second_end"]
 
-    def test_wrapper_message_history_after_clear_and_run(self, framework):
-        """Test message history clear resets state for fresh conversations.
+    def test_adapter_logs_populated_after_run(self, framework):
+        """Test all adapters populate self.logs during execution.
 
-        Contract: clear_message_history must fully reset history state, and
-        subsequent run() calls must start with clean history regardless of
-        framework implementation details.
+        Contract: All AgentAdapter implementations must populate the self.logs
+        attribute with execution information. This enables uniform access to
+        detailed execution traces regardless of the underlying framework.
 
-        Note: smolagents maintains a system message after clear.
+        The logs should contain basic execution information that can be used
+        for debugging, monitoring, and evaluation purposes.
+        """
+        mock_llm = MockLLM(responses=["Test response"])
+        agent = create_agent_for_framework(framework, mock_llm)
+        adapter = create_adapter_for_framework(framework, agent)
+
+        # Before run, logs must already be a list; record the baseline entry count
+        assert isinstance(adapter.logs, list)
+        initial_log_count = len(adapter.logs)
+
+        # Run the agent
+        adapter.run("Test query")
+
+        # After run, logs should be populated
+        assert len(adapter.logs) > initial_log_count
+        assert isinstance(adapter.logs, list)
+
+        # Verify logs contain useful information (at least one entry)
+        # Different frameworks may structure logs differently, but all should have entries
+        assert len(adapter.logs) > 0
+
+    def test_adapter_logs_in_gather_traces(self, framework):
+        """Test that gather_traces includes logs field.
+
+        Contract: The gather_traces() method must include the logs field,
+        providing a unified way to access execution details across all frameworks.
+        """
+        mock_llm = MockLLM(responses=["Test response"])
+        agent = create_agent_for_framework(framework, mock_llm)
+        adapter = create_adapter_for_framework(framework, agent)
+
+        # Run the agent
+        adapter.run("Test query")
+
+        # Gather traces
+        traces = adapter.gather_traces()
+
+        # Verify logs field exists and is populated
+        assert "logs" in traces
+        assert isinstance(traces["logs"], list)
+        assert len(traces["logs"]) > 0
+
+    def test_adapter_logs_structure_has_basic_info(self, framework):
+        """Test that logs entries contain basic execution information.
+
+        Contract: While the exact structure of log entries may vary by framework,
+        all implementations should provide basic execution information in their logs.
+        This test verifies that log entries are dictionaries containing some form
+        of execution data.
+        """
+        mock_llm = MockLLM(responses=["Test response"])
+        agent = create_agent_for_framework(framework, mock_llm)
+        adapter = create_adapter_for_framework(framework, agent)
+
+        # Run the agent
+        adapter.run("Test query")
+
+        # Verify logs contain dict entries with data
+        logs = adapter.logs
+        assert len(logs) > 0
+
+        # Each log entry should be a dictionary
+        for log_entry in logs:
+            assert isinstance(log_entry, dict)
+            # Should have at least one field with information
+            assert len(log_entry) > 0
+
+    def test_adapter_logs_accumulate_across_runs(self, framework):
+        """Test that logs accumulate or reset consistently across multiple run
+        calls to the agent.
+
+        Contract: Adapter logs must be non-empty after every run; whether entries
+        accumulate or reset between runs is left to the framework.
         """
         mock_llm = MockLLM(responses=["First response", "Second response"])
         agent = create_agent_for_framework(framework, mock_llm)
-        wrapper = create_wrapper_for_framework(framework, agent)
+        adapter = create_adapter_for_framework(framework, agent)
 
         # First run
-        wrapper.run("First query")
-        history_1 = wrapper.get_messages()
-        assert len(history_1) > 0
-
-        # Clear and verify empty (or just system message for smolagents)
-        wrapper.clear_message_history()
-        history_after_clear = wrapper.get_messages()
-        expected_after_clear = 1 if framework == "smolagents" else 0  # smolagents keeps system message
-        assert len(history_after_clear) == expected_after_clear
-
-        # Second run should populate new history
-        wrapper.run("Second query")
-        history_2 = wrapper.get_messages()
-        assert len(history_2) > expected_after_clear  # Should have more than just system message
-
-        # History should only contain second run's messages
-        # (exact count depends on framework, but should have at least one message)
-        assert any("Second query" in str(msg.get("content", "")) for msg in history_2)
+        adapter.run("First query")
+        logs_count_after_first = len(adapter.logs)
+        assert logs_count_after_first > 0
+
+        # Second run
+        adapter.run("Second query")
+        logs_count_after_second = len(adapter.logs)
+
+        # Logs may either accumulate or be reset between runs
+        # (we accept both behaviors as long as logs are populated)
+        assert logs_count_after_second > 0
diff --git a/tests/test_contract/test_collection_contract.py b/tests/test_contract/test_collection_contract.py
index 1df13085..6e77b8b3 100644
--- a/tests/test_contract/test_collection_contract.py
+++ b/tests/test_contract/test_collection_contract.py
@@ -124,7 +124,7 @@ def remove_timestamp(data: Dict[str, Any]) -> Dict[str, Any]:
 
 
 def create_agent_for_framework(framework: str):
-    """Create agent wrapper for specified framework."""
+    """Create agent adapter for specified framework."""
     if framework == "dummy":
         agent = DummyAgent()
         return DummyAgentAdapter(agent, "test_agent")
@@ -187,7 +187,7 @@ class TestUniversalTracingContract:
     """Test that ALL components follow the same tracing contract."""
 
     def test_agent_traces_have_base_fields(self):
-        """Agent wrappers must include base trace fields including name."""
+        """Agent adapters must include base trace fields including name."""
         agent = create_agent_for_framework("dummy")
         agent.run("Test query")
 
@@ -250,7 +250,7 @@ class TestUniversalConfigContract:
     """Test that ALL components follow the same configuration contract."""
 
     def test_agent_config_has_base_fields(self):
-        """Agent wrappers must include base config fields including name."""
+        """Agent adapters must include base config fields including name."""
         agent = create_agent_for_framework("dummy")
 
         config = agent.gather_config()
@@ -320,7 +320,7 @@ def test_config_never_raises_exceptions(self):
 @pytest.mark.interface
 @pytest.mark.parametrize("framework", ["dummy", "smolagents", "langgraph"])
 class TestCrossFrameworkTracingConsistency:
-    """Test that agent wrappers have consistent tracing across frameworks."""
+    """Test that agent adapters have consistent tracing across frameworks."""
 
     def test_all_frameworks_return_same_base_structure(self, framework):
         """All frameworks must return same base trace structure."""
@@ -355,12 +355,12 @@ def test_all_frameworks_return_same_base_config(self, framework):
 
         # All agents must have these fields (AgentAdapter contract)
         assert "agent_type" in config, "Missing 'agent_type' field"
-        assert "wrapper_type" in config, "Missing 'wrapper_type' field"
+        assert "adapter_type" in config, "Missing 'adapter_type' field"
         assert "callbacks" in config, "Missing 'callbacks' field"
 
         # Verify types
         assert isinstance(config["agent_type"], str)
-        assert isinstance(config["wrapper_type"], str)
+        assert isinstance(config["adapter_type"], str)
         assert isinstance(config["callbacks"], list)
 
     def test_all_frameworks_have_json_serializable_traces(self, framework):
diff --git a/tests/test_core/test_agent_adapter.py b/tests/test_core/test_agent_adapter.py
new file mode 100644
index 00000000..10161403
--- /dev/null
+++ b/tests/test_core/test_agent_adapter.py
@@ -0,0 +1,93 @@
+"""Test AgentAdapter functionality.
+
+These tests verify that AgentAdapter provides the correct interface for
+adapting agents from any framework.
+"""
+
+import pytest
+from maseval import MessageHistory
+
+
+@pytest.mark.core
+class TestAgentAdapter:
+    """Tests for AgentAdapter interface and behavior."""
+
+    def test_agent_adapter_run_triggers_callbacks(self, dummy_agent_adapter):
+        """Test that run() triggers agent callbacks."""
+        from maseval import AgentCallback
+
+        # Track callback invocations
+        callback_calls = []
+
+        class TrackingCallback(AgentCallback):
+            def on_run_start(self, agent):
+                callback_calls.append(("start", agent.name))
+
+            def on_run_end(self, agent, result):
+                callback_calls.append(("end", agent.name, result))
+
+        dummy_agent_adapter.callbacks = [TrackingCallback()]
+        _ = dummy_agent_adapter.run("Test query")
+
+        assert len(callback_calls) == 2
+        assert callback_calls[0] == ("start", "test_agent")
+        assert callback_calls[1][0] == "end"
+        assert callback_calls[1][1] == "test_agent"
+        assert "Response to: Test query" in callback_calls[1][2]
+
+    def test_agent_adapter_get_messages_returns_history(self, dummy_agent_adapter):
+        """Test that get_messages() returns MessageHistory."""
+        # Before run, should return empty history
+        history = dummy_agent_adapter.get_messages()
+        assert isinstance(history, MessageHistory)
+        assert len(history) == 0
+
+        # After run, should have messages
+        dummy_agent_adapter.run("Test query")
+        history = dummy_agent_adapter.get_messages()
+        assert len(history) == 2
+        assert history[0]["role"] == "user"
+        assert history[1]["role"] == "assistant"
+
+    def test_agent_adapter_gather_traces_includes_messages(self, dummy_agent_adapter):
+        """Test that gather_traces() includes message history."""
+        dummy_agent_adapter.run("Test query")
+
+        traces = dummy_agent_adapter.gather_traces()
+
+        assert "type" in traces
+        assert "gathered_at" in traces
+        assert "name" in traces
+        assert "agent_type" in traces
+        assert "message_count" in traces
+        assert "messages" in traces
+
+        assert traces["name"] == "test_agent"
+        assert traces["message_count"] == 2
+        assert len(traces["messages"]) == 2
+
+    def test_agent_adapter_gather_config(self, dummy_agent_adapter):
+        """Test that gather_config() returns configuration."""
+        config = dummy_agent_adapter.gather_config()
+
+        assert "type" in config
+        assert "gathered_at" in config
+        assert "name" in config
+        assert "agent_type" in config
+
+        assert config["name"] == "test_agent"
+        assert config["type"] == "DummyAgentAdapter"
+
+    def test_agent_adapter_multiple_runs(self, dummy_agent_adapter):
+        """Test that adapter can be run multiple times and history accumulates."""
+        result1 = dummy_agent_adapter.run("Query 1")
+        assert "Query 1" in result1
+
+        result2 = dummy_agent_adapter.run("Query 2")
+        assert "Query 2" in result2
+
+        # History should have both runs
+        history = dummy_agent_adapter.get_messages()
+        assert len(history) == 4  # 2 messages per run
+        assert history[0]["content"] == "Query 1"
+        assert history[2]["content"] == "Query 2"
diff --git a/tests/test_core/test_agent_wrapper.py b/tests/test_core/test_agent_wrapper.py
deleted file mode 100644
index d91d8be2..00000000
--- a/tests/test_core/test_agent_wrapper.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""Test AgentAdapter functionality.
-
-These tests verify that AgentAdapter provides the correct interface for
-wrapping agents from any framework.
-"""
-
-import pytest
-from maseval import MessageHistory
-
-
-@pytest.mark.core
-class TestAgentAdapter:
-    """Tests for AgentAdapter interface and behavior."""
-
-    def test_agent_wrapper_run_triggers_callbacks(self, dummy_agent_wrapper):
-        """Test that run() triggers agent callbacks."""
-        from maseval import AgentCallback
-
-        # Track callback invocations
-        callback_calls = []
-
-        class TrackingCallback(AgentCallback):
-            def on_run_start(self, agent):
-                callback_calls.append(("start", agent.name))
-
-            def on_run_end(self, agent, result):
-                callback_calls.append(("end", agent.name, result))
-
-        dummy_agent_wrapper.callbacks = [TrackingCallback()]
-        _ = dummy_agent_wrapper.run("Test query")
-
-        assert len(callback_calls) == 2
-        assert callback_calls[0] == ("start", "test_agent")
-        assert callback_calls[1][0] == "end"
-        assert callback_calls[1][1] == "test_agent"
-        assert "Response to: Test query" in callback_calls[1][2]
-
-    def test_agent_wrapper_get_messages_returns_history(self, dummy_agent_wrapper):
-        """Test that get_messages() returns MessageHistory."""
-        # Before run, should return empty history
-        history = dummy_agent_wrapper.get_messages()
-        assert isinstance(history, MessageHistory)
-        assert len(history) == 0
-
-        # After run, should have messages
-        dummy_agent_wrapper.run("Test query")
-        history = dummy_agent_wrapper.get_messages()
-        assert len(history) == 2
-        assert history[0]["role"] == "user"
-        assert history[1]["role"] == "assistant"
-
-    def test_agent_wrapper_set_message_history(self, dummy_agent_wrapper):
-        """Test that message history can be set manually."""
-        new_history = MessageHistory()
-        new_history.add_message("user", "Custom message")
-        new_history.add_message("assistant", "Custom response")
-
-        dummy_agent_wrapper.set_message_history(new_history)
-
-        retrieved = dummy_agent_wrapper.get_messages()
-        assert len(retrieved) == 2
-        assert retrieved[0]["content"] == "Custom message"
-        assert retrieved[1]["content"] == "Custom response"
-
-    def test_agent_wrapper_clear_message_history(self, dummy_agent_wrapper):
-        """Test that message history can be cleared."""
-        dummy_agent_wrapper.run("Test")
-        assert len(dummy_agent_wrapper.get_messages()) > 0
-
-        dummy_agent_wrapper.clear_message_history()
-        assert len(dummy_agent_wrapper.get_messages()) == 0
-
-    def test_agent_wrapper_append_to_message_history(self, dummy_agent_wrapper):
-        """Test that messages can be appended to history."""
-        dummy_agent_wrapper.append_to_message_history("user", "First message")
-        dummy_agent_wrapper.append_to_message_history("assistant", "First response")
-
-        history = dummy_agent_wrapper.get_messages()
-        assert len(history) == 2
-        assert history[0]["content"] == "First message"
-        assert history[1]["content"] == "First response"
-
-    def test_agent_wrapper_gather_traces_includes_messages(self, dummy_agent_wrapper):
-        """Test that gather_traces() includes message history."""
-        dummy_agent_wrapper.run("Test query")
-
-        traces = dummy_agent_wrapper.gather_traces()
-
-        assert "type" in traces
-        assert "gathered_at" in traces
-        assert "name" in traces
-        assert "agent_type" in traces
-        assert "message_count" in traces
-        assert "messages" in traces
-
-        assert traces["name"] == "test_agent"
-        assert traces["message_count"] == 2
-        assert len(traces["messages"]) == 2
-
-    def test_agent_wrapper_gather_config(self, dummy_agent_wrapper):
-        """Test that gather_config() returns configuration."""
-        config = dummy_agent_wrapper.gather_config()
-
-        assert "type" in config
-        assert "gathered_at" in config
-        assert "name" in config
-        assert "agent_type" in config
-
-        assert config["name"] == "test_agent"
-        assert config["type"] == "DummyAgentAdapter"
-
-    def test_agent_wrapper_multiple_runs(self, dummy_agent_wrapper):
-        """Test that wrapper can be run multiple times."""
-        result1 = dummy_agent_wrapper.run("Query 1")
-        assert "Query 1" in result1
-
-        # Clear history for second run
-        dummy_agent_wrapper.clear_message_history()
-
-        result2 = dummy_agent_wrapper.run("Query 2")
-        assert "Query 2" in result2
-
-        # History should only have second run
-        history = dummy_agent_wrapper.get_messages()
-        assert len(history) == 2
-        assert history[0]["content"] == "Query 2"
diff --git a/tests/test_core/test_benchmark/test_automatic_registration.py b/tests/test_core/test_benchmark/test_automatic_registration.py
index 6ead667a..e1c55aa7 100644
--- a/tests/test_core/test_benchmark/test_automatic_registration.py
+++ b/tests/test_core/test_benchmark/test_automatic_registration.py
@@ -88,12 +88,12 @@ def test_duplicate_registration_helpful_message():
 
     # Create and register an agent
     agent = DummyAgent()
-    wrapper = DummyAgentAdapter(agent, "my_agent")
-    benchmark.register("agents", "first_name", wrapper)
+    agent_adapter = DummyAgentAdapter(agent, "my_agent")
+    benchmark.register("agents", "first_name", agent_adapter)
 
     # Try to register again with different name
     with pytest.raises(ValueError) as exc_info:
-        benchmark.register("agents", "second_name", wrapper)
+        benchmark.register("agents", "second_name", agent_adapter)
 
     error_message = str(exc_info.value)
     assert "already registered as 'agents:first_name'" in error_message
diff --git a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
index fa8a0e84..0344872d 100644
--- a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
+++ b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
@@ -318,8 +318,8 @@ def _run_agent(self, query: str) -> str:
         class TaskFailureBenchmark(DummyBenchmark):
             def setup_agents(self, agent_data, environment, task, user):
                 agent = FailingAgent()
-                wrapper = FailingAgentAdapter(agent, "failing_agent")
-                return [wrapper], {"failing_agent": wrapper}
+                agent_adapter = FailingAgentAdapter(agent, "failing_agent")
+                return [agent_adapter], {"failing_agent": agent_adapter}
 
         tasks = TaskCollection.from_list([{"query": "Test query", "environment_data": {}}])
         benchmark = TaskFailureBenchmark(
@@ -354,8 +354,8 @@ def _run_agent(self, query: str) -> str:
         class TaskFailureBenchmark(DummyBenchmark):
             def setup_agents(self, agent_data, environment, task, user):
                 agent = FailingAgent()
-                wrapper = FailingAgentAdapter(agent, "failing_agent")
-                return [wrapper], {"failing_agent": wrapper}
+                agent_adapter = FailingAgentAdapter(agent, "failing_agent")
+                return [agent_adapter], {"failing_agent": agent_adapter}
 
         tasks = TaskCollection.from_list([{"query": "Test query", "environment_data": {}}])
         benchmark = TaskFailureBenchmark(
@@ -440,13 +440,13 @@ def __init__(self, *args, **kwargs):
             def setup_agents(self, agent_data, environment, task, user):
                 if self.task_counter == 1:  # Fail second task
                     agent = FailingAgent()
-                    wrapper = FailingAgentAdapter(agent, "failing")
+                    agent_adapter = FailingAgentAdapter(agent, "failing")
                 else:
                     agent = DummyAgent()
-                    wrapper = DummyAgentAdapter(agent, "test_agent")
+                    agent_adapter = DummyAgentAdapter(agent, "test_agent")
 
                 self.task_counter += 1
-                return [wrapper], {wrapper.name: wrapper}
+                return [agent_adapter], {agent_adapter.name: agent_adapter}
 
         tasks = TaskCollection.from_list(
             [
@@ -611,13 +611,13 @@ def setup_agents(self, agent_data, environment, task, user):
                 # Fail second task on first run only
                 if self.task_counter == 1 and self.fail_on_first_run:
                     agent = FailingAgent()
-                    wrapper = FailingAgentAdapter(agent, "failing")
+                    agent_adapter = FailingAgentAdapter(agent, "failing")
                 else:
                     agent = DummyAgent()
-                    wrapper = DummyAgentAdapter(agent, "test_agent")
+                    agent_adapter = DummyAgentAdapter(agent, "test_agent")
 
                 self.task_counter += 1
-                return [wrapper], {wrapper.name: wrapper}
+                return [agent_adapter], {agent_adapter.name: agent_adapter}
 
         tasks = TaskCollection.from_list(
             [
diff --git a/tests/test_core/test_benchmark/test_config_collection.py b/tests/test_core/test_benchmark/test_config_collection.py
index 15076b3c..e1a4ad3c 100644
--- a/tests/test_core/test_benchmark/test_config_collection.py
+++ b/tests/test_core/test_benchmark/test_config_collection.py
@@ -150,7 +150,7 @@ def test_config_handles_component_errors_gracefully(self):
         from conftest import DummyBenchmark
         from maseval import AgentAdapter
 
-        class FailingConfigWrapper(AgentAdapter):
+        class FailingConfigAdapter(AgentAdapter):
             def _run_agent(self, query: str) -> str:
                 return "success"
 
@@ -163,8 +163,8 @@ def setup_agents(self, agent_data, environment, task, user):
                 from conftest import DummyAgent
 
                 agent = DummyAgent()
-                wrapper = FailingConfigWrapper(agent, "failing_agent")
-                return [wrapper], {"failing_agent": wrapper}  # type: ignore[return-value]
+                agent_adapter = FailingConfigAdapter(agent, "failing_agent")
+                return [agent_adapter], {"failing_agent": agent_adapter}  # type: ignore[return-value]
 
         tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}])
         benchmark = TestBenchmark(agent_data={"model": "test"})
diff --git a/tests/test_core/test_benchmark/test_trace_collection.py b/tests/test_core/test_benchmark/test_trace_collection.py
index 6d89d2e2..2b801a70 100644
--- a/tests/test_core/test_benchmark/test_trace_collection.py
+++ b/tests/test_core/test_benchmark/test_trace_collection.py
@@ -86,8 +86,8 @@ def setup_agents(self, agent_data, environment, task, user):
                 from conftest import DummyAgent
 
                 agent = DummyAgent()
-                wrapper = FailingAgentAdapter(agent, "failing_agent")
-                return [wrapper], {"failing_agent": wrapper}  # type: ignore[return-value]
+                agent_adapter = FailingAgentAdapter(agent, "failing_agent")
+                return [agent_adapter], {"failing_agent": agent_adapter}  # type: ignore[return-value]
 
         tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}])
         benchmark = TestBenchmark(agent_data={"model": "test"})
@@ -125,10 +125,10 @@ def setup_agents(self, agent_data, environment, task, user):
                 from conftest import DummyAgent
 
                 agent = DummyAgent()
-                wrapper = ModelUsingAgentAdapter(agent, "test_agent", model)
+                agent_adapter = ModelUsingAgentAdapter(agent, "test_agent", model)
                 # Manually register the model
                 self.register("models", "test_model", model)
-                return [wrapper], {"test_agent": wrapper}  # type: ignore[return-value]
+                return [agent_adapter], {"test_agent": agent_adapter}  # type: ignore[return-value]
 
         tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}])
         benchmark = TestBenchmark(agent_data={"model": "test"})
diff --git a/tests/test_core/test_message_tracing_callback.py b/tests/test_core/test_message_tracing_callback.py
index d7f11bb1..bfd6b9ed 100644
--- a/tests/test_core/test_message_tracing_callback.py
+++ b/tests/test_core/test_message_tracing_callback.py
@@ -14,10 +14,10 @@
 from conftest import DummyAgent
 
 
-class TestAgentAdapter(AgentAdapter):
-    """Test wrapper implementation that populates message history for testing.
+class TracingTestAgentAdapter(AgentAdapter):
+    """Test adapter implementation that populates message history for testing.
 
-    This wrapper simulates realistic agent behavior by creating proper message
+    This adapter simulates realistic agent behavior by creating proper message
     histories with user queries, assistant responses, and optional tool calls.
     """
 
@@ -47,8 +47,8 @@ def _run_agent(self, query: str) -> str:
             # Normal response without tools
             history.add_message(role="assistant", content=response)
 
-        # Store history so get_messages() can retrieve it
-        self.set_message_history(history)
+        # Store history directly
+        self.messages = history
 
         return response
 
@@ -76,10 +76,10 @@ def test_basic_tracing(self):
         """
         callback = MessageTracingAgentCallback()
         agent = DummyAgent()
-        wrapper = TestAgentAdapter(agent, name="test_agent", callbacks=[callback])
+        agent_adapter = TracingTestAgentAdapter(agent, name="test_agent", callbacks=[callback])
 
-        # Run query
-        wrapper.run("Test query")
+        # Run agent - should trigger callbacks
+        agent_adapter.run("Test query")
 
         # Check traced conversation
         conversations = callback.get_all_conversations()
@@ -101,12 +101,11 @@ def test_multiple_conversations(self):
         """
         callback = MessageTracingAgentCallback()
         agent = DummyAgent()
-        wrapper = TestAgentAdapter(agent, name="agent1", callbacks=[callback])
+        agent_adapter = TracingTestAgentAdapter(agent, name="agent1", callbacks=[callback])
 
-        # Run multiple queries
         queries = ["Query 1", "Query 2", "Query 3"]
         for query in queries:
-            wrapper.run(query)
+            agent_adapter.run(query)
 
         # Check all conversations traced
         conversations = callback.get_all_conversations()
@@ -124,9 +123,9 @@ def test_metadata_included(self):
         """
         callback = MessageTracingAgentCallback(include_metadata=True)
         agent = DummyAgent()
-        wrapper = TestAgentAdapter(agent, name="agent", callbacks=[callback])
+        agent_adapter = TracingTestAgentAdapter(agent, name="agent", callbacks=[callback])
 
-        wrapper.run("Test query with tool")
+        agent_adapter.run("Test query with tool")
 
         conv = callback.get_all_conversations()[0]
         assert "metadata" in conv
@@ -143,9 +142,9 @@ def test_metadata_excluded(self):
         """
         callback = MessageTracingAgentCallback(include_metadata=False)
         agent = DummyAgent()
-        wrapper = TestAgentAdapter(agent, name="agent", callbacks=[callback])
+        agent_adapter = TracingTestAgentAdapter(agent, name="agent", callbacks=[callback])
 
-        wrapper.run("Test query")
+        agent_adapter.run("Test query")
 
         conv = callback.get_all_conversations()[0]
         assert "metadata" not in conv
@@ -164,15 +163,15 @@ def test_multi_agent_tracing(self):
 
         # Create two agents
         agent1 = DummyAgent()
-        wrapper1 = TestAgentAdapter(agent1, name="agent1", callbacks=[callback])
+        adapter1 = TracingTestAgentAdapter(agent1, name="agent1", callbacks=[callback])
 
         agent2 = DummyAgent()
-        wrapper2 = TestAgentAdapter(agent2, name="agent2", callbacks=[callback])
+        adapter2 = TracingTestAgentAdapter(agent2, name="agent2", callbacks=[callback])
 
-        # Run queries on both
-        wrapper1.run("Query for agent1")
-        wrapper2.run("Query for agent2")
-        wrapper1.run("Another query for agent1")
+        # Run both agents
+        adapter1.run("Query for agent1")
+        adapter2.run("Query for agent2")
+        adapter1.run("Another query for agent1")
 
         # Check all conversations traced
         conversations = callback.get_all_conversations()
@@ -193,11 +192,11 @@ def test_statistics(self):
         """
         callback = MessageTracingAgentCallback()
         agent = DummyAgent()
-        wrapper = TestAgentAdapter(agent, name="test_agent", callbacks=[callback])
+        agent_adapter = TracingTestAgentAdapter(agent, name="test_agent", callbacks=[callback])
 
-        # Run queries
-        wrapper.run("Query 1")
-        wrapper.run("Query 2 with tool")
+        # Run multiple times
+        agent_adapter.run("Query 1")
+        agent_adapter.run("Query 2 with tool")
 
         stats = callback.get_statistics()
 
@@ -218,11 +217,11 @@ def test_clear(self):
         """
         callback = MessageTracingAgentCallback()
         agent = DummyAgent()
-        wrapper = TestAgentAdapter(agent, name="agent", callbacks=[callback])
+        agent_adapter = TracingTestAgentAdapter(agent, name="agent", callbacks=[callback])
 
         # Trace some conversations
-        wrapper.run("Query 1")
-        wrapper.run("Query 2")
+        agent_adapter.run("Query 1")
+        agent_adapter.run("Query 2")
         assert len(callback.get_all_conversations()) == 2
 
         # Clear
@@ -238,10 +237,10 @@ def test_tool_call_tracing(self):
         """
         callback = MessageTracingAgentCallback()
         agent = DummyAgent()
-        wrapper = TestAgentAdapter(agent, name="agent", callbacks=[callback])
+        agent_adapter = TracingTestAgentAdapter(agent, name="agent", callbacks=[callback])
 
         # Run query that triggers tool call
-        wrapper.run("Query with tool")
+        agent_adapter.run("Query with tool")
 
         conv = callback.get_all_conversations()[0]
 
@@ -259,19 +258,21 @@ def test_no_history_handling(self):
         """Test graceful handling when agent returns empty message history.
 
         Verifies that callback creates valid conversation records even when
-        agent wrappers return empty histories (edge case for minimal agents).
+        agent adapters return empty histories (edge case for minimal agents).
         """
+        callback = MessageTracingAgentCallback()
+        agent = DummyAgent()
 
-        class NoHistoryWrapper(AgentAdapter):
-            def _run_agent(self, query: str) -> MessageHistory:
-                # Don't populate history
+        class NoHistoryAdapter(AgentAdapter):
+            def get_messages(self):
                 return MessageHistory()
 
-        callback = MessageTracingAgentCallback()
-        agent = DummyAgent()
-        wrapper = NoHistoryWrapper(agent, name="agent", callbacks=[callback])
+            def _run_agent(self, query: str):
+                return MessageHistory()
+
+        agent_adapter = NoHistoryAdapter(agent, name="agent", callbacks=[callback])
 
-        wrapper.run("Test")
+        agent_adapter.run("Test")
 
         # Should still trace, but with empty messages
         conversations = callback.get_all_conversations()
diff --git a/tests/test_interface/test_agent_integration/test_langgraph_integration.py b/tests/test_interface/test_agent_integration/test_langgraph_integration.py
index 32546b25..5d5d646a 100644
--- a/tests/test_interface/test_agent_integration/test_langgraph_integration.py
+++ b/tests/test_interface/test_agent_integration/test_langgraph_integration.py
@@ -13,7 +13,7 @@
 pytestmark = [pytest.mark.interface, pytest.mark.langgraph]
 
 
-def test_langgraph_wrapper_import():
+def test_langgraph_adapter_import():
     """Test that LangGraphAgentAdapter can be imported when langgraph is installed."""
     from maseval.interface.agents.langgraph import LangGraphAgentAdapter, LangGraphUser
 
@@ -37,31 +37,42 @@ def test_check_langgraph_installed_function():
     _check_langgraph_installed()
 
 
-def test_langgraph_wrapper_message_manipulation():
-    """Test that LangGraphAgentAdapter supports message history manipulation.
+def test_langgraph_adapter_logs_after_run():
+    """Test that LangGraphAgentAdapter.logs is populated after run().
 
-    LangGraph supports manually managing message history through:
-    - append_to_message_history: Add individual messages
-    - set_message_history: Replace entire history
-    - clear_message_history: Remove all messages
-    - get_messages: Retrieve current history
-
-    This is useful for multi-turn conversations and testing scenarios.
+    This test validates that the manual logging implementation in LangGraphAgentAdapter
+    captures all relevant execution information including:
+    - Timing information (timestamp, duration)
+    - Query information
+    - Token usage (extracted from message metadata)
+    - Status (success/error)
+    - State information (keys, message count)
+    - Checkpoint metadata (if available)
     """
     from maseval.interface.agents.langgraph import LangGraphAgentAdapter
-    from maseval import MessageHistory
     from langgraph.graph import StateGraph, END
     from typing_extensions import TypedDict
     from langchain_core.messages import AIMessage
+    from langchain_core.messages.ai import UsageMetadata
+    import time
 
-    # Create a simple LangGraph agent
+    # Create a LangGraph agent with token usage metadata
     class State(TypedDict):
         messages: list
 
     def agent_node(state: State) -> State:
         messages = state["messages"]
-        messages.append(AIMessage(content="Test response"))
-        return {"messages": messages}
+        # Create AI message with usage metadata (simulates LLM response)
+        # UsageMetadata is a TypedDict, so it is built from keyword arguments
+        response = AIMessage(
+            content="Test response",
+            usage_metadata=UsageMetadata(
+                input_tokens=50,
+                output_tokens=30,
+                total_tokens=80,
+            ),
+        )
+        return {"messages": messages + [response]}
 
     graph = StateGraph(State)
     graph.add_node("agent", agent_node)
@@ -69,63 +80,68 @@ def agent_node(state: State) -> State:
     graph.add_edge("agent", END)
     compiled = graph.compile()
 
-    wrapper = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent")
+    adapter = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent")
 
-    # Test append_to_message_history
-    wrapper.append_to_message_history("user", "First message")
-    wrapper.append_to_message_history("assistant", "First response")
+    # Capture time before run
+    time_before = time.time()
 
-    history = wrapper.get_messages()
-    assert len(history) == 2
-    assert history[0]["role"] == "user"
-    assert history[0]["content"] == "First message"
-    assert history[1]["role"] == "assistant"
-    assert history[1]["content"] == "First response"
+    # Run the agent
+    adapter.run("Test query")
 
-    # Test clear_message_history
-    wrapper.clear_message_history()
-    history = wrapper.get_messages()
-    assert len(history) == 0
+    # Capture time after run
+    time_after = time.time()
 
-    # Test set_message_history
-    new_history = MessageHistory()
-    new_history.add_message("user", "Set message 1")
-    new_history.add_message("assistant", "Set response 1")
-    new_history.add_message("user", "Set message 2")
+    # Access logs
+    logs = adapter.logs
 
-    wrapper.set_message_history(new_history)
-    history = wrapper.get_messages()
-    assert len(history) == 3
-    assert history[0]["content"] == "Set message 1"
-    assert history[1]["content"] == "Set response 1"
-    assert history[2]["content"] == "Set message 2"
+    # Verify logs structure
+    assert isinstance(logs, list)
+    assert len(logs) >= 1  # At least one log entry
 
-    # Verify history persists across multiple retrievals
-    history_again = wrapper.get_messages()
-    assert len(history_again) == 3
-    assert history_again[0]["content"] == "Set message 1"
+    # Get the most recent log entry
+    log_entry = logs[-1]
 
+    # Verify required fields
+    assert "timestamp" in log_entry
+    assert "query" in log_entry
+    assert "duration_seconds" in log_entry
+    assert "status" in log_entry
 
-def test_langgraph_wrapper_message_manipulation_with_system_message():
-    """Test message manipulation with system messages.
+    # Verify field values
+    assert log_entry["query"] == "Test query"
+    assert log_entry["status"] == "success"
+    assert log_entry["duration_seconds"] > 0
+    assert log_entry["duration_seconds"] < (time_after - time_before) + 0.1  # Reasonable duration
 
-    Verifies that system messages are properly converted and handled
-    when manipulating message history in LangGraph.
-    """
+    # Verify state information
+    assert "state_keys" in log_entry
+    assert "messages" in log_entry["state_keys"]
+    assert "message_count" in log_entry
+    assert log_entry["message_count"] >= 1
+
+    # Verify token usage is captured from message metadata
+    assert "input_tokens" in log_entry
+    assert "output_tokens" in log_entry
+    assert "total_tokens" in log_entry
+    assert log_entry["input_tokens"] == 50
+    assert log_entry["output_tokens"] == 30
+    assert log_entry["total_tokens"] == 80
+
+
+def test_langgraph_adapter_logs_multiple_runs():
+    """Test that logs accumulate across multiple runs."""
     from maseval.interface.agents.langgraph import LangGraphAgentAdapter
-    from maseval import MessageHistory
     from langgraph.graph import StateGraph, END
     from typing_extensions import TypedDict
     from langchain_core.messages import AIMessage
 
-    # Create a simple LangGraph agent
     class State(TypedDict):
         messages: list
 
     def agent_node(state: State) -> State:
         messages = state["messages"]
-        messages.append(AIMessage(content="Response"))
-        return {"messages": messages}
+        response = AIMessage(content="Response")
+        return {"messages": messages + [response]}
 
     graph = StateGraph(State)
     graph.add_node("agent", agent_node)
@@ -133,19 +149,94 @@ def agent_node(state: State) -> State:
     graph.add_edge("agent", END)
     compiled = graph.compile()
 
-    wrapper = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent")
+    adapter = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent")
+
+    # First run
+    adapter.run("Query 1")
+    logs_after_first = adapter.logs
+    assert len(logs_after_first) == 1
+    assert logs_after_first[0]["query"] == "Query 1"
+
+    # Second run
+    adapter.run("Query 2")
+    logs_after_second = adapter.logs
+    assert len(logs_after_second) == 2
+    assert logs_after_second[0]["query"] == "Query 1"
+    assert logs_after_second[1]["query"] == "Query 2"
+
+
+def test_langgraph_adapter_logs_error_handling():
+    """Test that logs capture error information when agent execution fails.
+
+    The only graph node raises ValueError; run() should raise it and still log one entry.
+    """
+    from maseval.interface.agents.langgraph import LangGraphAgentAdapter
+    from langgraph.graph import StateGraph, END
+    from typing_extensions import TypedDict
+
+    class State(TypedDict):
+        messages: list
+
+    def failing_node(state: State) -> State:
+        raise ValueError("Intentional test error")
+
+    graph = StateGraph(State)
+    graph.add_node("agent", failing_node)
+    graph.set_entry_point("agent")
+    graph.add_edge("agent", END)
+    compiled = graph.compile()
+
+    adapter = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent")
+
+    # Run should raise the node's error (pytest.raises fails the test if nothing is raised)
+    with pytest.raises(ValueError, match="Intentional test error"):
+        adapter.run("Test query")
+
+    # Verify error is logged
+    logs = adapter.logs
+    assert len(logs) == 1
+
+    log_entry = logs[0]
+    assert log_entry["status"] == "error"
+    assert "error" in log_entry
+    assert "error_type" in log_entry
+    assert log_entry["error_type"] == "ValueError"
+    assert "Intentional test error" in log_entry["error"]
+    assert log_entry["query"] == "Test query"
+    assert log_entry["duration_seconds"] >= 0
+
+
+def test_langgraph_adapter_logs_without_token_metadata():
+    """Test that logs work correctly when messages don't have usage metadata."""
+    from maseval.interface.agents.langgraph import LangGraphAgentAdapter
+    from langgraph.graph import StateGraph, END
+    from typing_extensions import TypedDict
+    from langchain_core.messages import AIMessage
+
+    class State(TypedDict):
+        messages: list
+
+    def agent_node(state: State) -> State:
+        messages = state["messages"]
+        # Create response without usage metadata
+        response = AIMessage(content="Test response")
+        return {"messages": messages + [response]}
+
+    graph = StateGraph(State)
+    graph.add_node("agent", agent_node)
+    graph.set_entry_point("agent")
+    graph.add_edge("agent", END)
+    compiled = graph.compile()
 
-    # Test set_message_history with system message
-    new_history = MessageHistory()
-    new_history.add_message("system", "You are a helpful assistant")
-    new_history.add_message("user", "Hello")
-    new_history.add_message("assistant", "Hi there")
+    adapter = LangGraphAgentAdapter(agent_instance=compiled, name="test_agent")
+    adapter.run("Test query")
 
-    wrapper.set_message_history(new_history)
-    history = wrapper.get_messages()
+    # Verify logs exist but token fields are None or 0
+    logs = adapter.logs
+    assert len(logs) == 1
 
-    assert len(history) == 3
-    assert history[0]["role"] == "system"
-    assert history[0]["content"] == "You are a helpful assistant"
-    assert history[1]["role"] == "user"
-    assert history[2]["role"] == "assistant"
+    log_entry = logs[0]
+    # Token fields may be absent or hold an empty default (None or 0)
+    assert log_entry.get("input_tokens") in [None, 0]
+    assert log_entry.get("output_tokens") in [None, 0]
+    assert log_entry.get("total_tokens") in [None, 0]
diff --git a/tests/test_interface/test_agent_integration/test_smolagents_integration.py b/tests/test_interface/test_agent_integration/test_smolagents_integration.py
index 2bbdf1e5..3fdb8412 100644
--- a/tests/test_interface/test_agent_integration/test_smolagents_integration.py
+++ b/tests/test_interface/test_agent_integration/test_smolagents_integration.py
@@ -13,7 +13,7 @@
 pytestmark = [pytest.mark.interface, pytest.mark.smolagents]
 
 
-def test_smolagents_wrapper_import():
+def test_smolagents_adapter_import():
     """Test that SmolAgentAdapter can be imported when smolagents is installed."""
     from maseval.interface.agents.smolagents import SmolAgentAdapter, SmolAgentUser
 
@@ -37,15 +37,15 @@ def test_check_smolagents_installed_function():
     _check_smolagents_installed()
 
 
-def test_smolagents_wrapper_creation():
+def test_smolagents_adapter_creation():
     """Test that SmolAgentAdapter can be created."""
     from maseval.interface.agents.smolagents import SmolAgentAdapter
 
-    # Create wrapper with mock agent
-    wrapper = SmolAgentAdapter(agent_instance=object(), name="test_agent")
+    # Create adapter with mock agent
+    agent_adapter = SmolAgentAdapter(agent_instance=object(), name="test_agent")
 
-    assert wrapper.name == "test_agent"
-    assert wrapper.agent is not None
+    assert agent_adapter.name == "test_agent"
+    assert agent_adapter.agent is not None
 
 
 def test_smolagents_user_creation():
@@ -67,7 +67,7 @@ def test_smolagents_user_creation():
     assert user.name == "test_user"
 
 
-def test_smolagents_wrapper_gather_traces_with_monitoring():
+def test_smolagents_adapter_gather_traces_with_monitoring():
     """Test that SmolAgentAdapter.gather_traces() captures token and timing data."""
     from maseval.interface.agents.smolagents import SmolAgentAdapter
     from smolagents.memory import ActionStep, AgentMemory
@@ -107,11 +107,11 @@ def test_smolagents_wrapper_gather_traces_with_monitoring():
     # Mock write_memory_to_messages to return empty list (we're testing gather_traces, not get_messages)
     mock_agent.write_memory_to_messages = Mock(return_value=[])
 
-    # Create wrapper
-    wrapper = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent")
+    # Create adapter
+    agent_adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent")
 
     # Call gather_traces
-    traces = wrapper.gather_traces()
+    traces = agent_adapter.gather_traces()
 
     # Verify aggregated statistics
     assert "total_steps" in traces
@@ -154,7 +154,7 @@ def test_smolagents_wrapper_gather_traces_with_monitoring():
     assert step2_detail["action_output"] == "Output from step 2"
 
 
-def test_smolagents_wrapper_gather_traces_without_monitoring():
+def test_smolagents_adapter_gather_traces_without_monitoring():
     """Test that gather_traces works when agent has no monitoring data."""
     from maseval.interface.agents.smolagents import SmolAgentAdapter
     from smolagents.memory import AgentMemory
@@ -165,11 +165,11 @@ def test_smolagents_wrapper_gather_traces_without_monitoring():
     mock_agent.memory = AgentMemory(system_prompt="Test system prompt")
     mock_agent.write_memory_to_messages = Mock(return_value=[])
 
-    # Create wrapper
-    wrapper = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent")
+    # Create adapter
+    agent_adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent")
 
     # Call gather_traces
-    traces = wrapper.gather_traces()
+    traces = agent_adapter.gather_traces()
 
     # Verify aggregated statistics show zero usage
     assert "total_steps" in traces
@@ -191,7 +191,7 @@ def test_smolagents_wrapper_gather_traces_without_monitoring():
     assert len(traces["steps_detail"]) == 0
 
 
-def test_smolagents_wrapper_gather_traces_with_planning_step():
+def test_smolagents_adapter_gather_traces_with_planning_step():
     """Test that gather_traces captures PlanningStep data correctly."""
     from maseval.interface.agents.smolagents import SmolAgentAdapter
     from smolagents.memory import PlanningStep, AgentMemory
@@ -218,11 +218,11 @@ def test_smolagents_wrapper_gather_traces_with_planning_step():
     # Mock write_memory_to_messages
     mock_agent.write_memory_to_messages = Mock(return_value=[])
 
-    # Create wrapper
-    wrapper = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent")
+    # Create adapter
+    agent_adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent")
 
     # Call gather_traces
-    traces = wrapper.gather_traces()
+    traces = agent_adapter.gather_traces()
 
     # Verify aggregated statistics
     assert traces["total_steps"] == 1
@@ -245,71 +245,168 @@ def test_smolagents_wrapper_gather_traces_with_planning_step():
     assert "observations" not in step_detail
 
 
-def test_smolagents_wrapper_message_manipulation_not_supported():
-    """Test that smolagents explicitly raises NotImplementedError for message manipulation.
+def test_smolagents_adapter_logs_property():
+    """Test that SmolAgentAdapter.logs property returns converted memory steps.
 
-    smolagents builds its AgentMemory from execution steps and does not support
-    arbitrary message injection. The wrapper should raise clear NotImplementedError
-    for set_message_history and append_to_message_history operations.
-
-    Only clear_message_history is supported (resets memory with system prompt).
+    This test checks the fields that the logs property derives from smolagents'
+    internal memory steps, including:
+    - Step types (ActionStep, PlanningStep)
+    - Timing information (duration_seconds)
+    - Token usage (input_tokens, output_tokens, total_tokens)
+    - Model input messages
+    - Tool calls, observations and action output
+    - Absence of action-only fields on PlanningStep entries
     """
     from maseval.interface.agents.smolagents import SmolAgentAdapter
-    from maseval import MessageHistory
-    from smolagents import CodeAgent
-    from conftest import FakeSmolagentsModel
+    from smolagents.memory import ActionStep, PlanningStep, AgentMemory, ToolCall
+    from smolagents.monitoring import TokenUsage, Timing
+    from smolagents.models import ChatMessage, MessageRole
+    from unittest.mock import Mock
+    import time
+
+    # Create a mock agent with memory
+    mock_agent = Mock()
+    mock_agent.memory = AgentMemory(system_prompt="Test system prompt")
+
+    # Add an ActionStep with comprehensive data
+    start_time = time.time()
+    step1 = ActionStep(
+        step_number=1,
+        timing=Timing(start_time=start_time, end_time=start_time + 0.5),
+        observations_images=[],
+    )
+    step1.token_usage = TokenUsage(input_tokens=100, output_tokens=50)
+    step1.observations = "Tool returned: success"
+    step1.action_output = "Final output from action"
+    step1.tool_calls = [ToolCall(name="test_tool", arguments={"arg": "value"}, id="call_123")]
+    step1.model_input_messages = [
+        ChatMessage(role=MessageRole.USER, content="Execute this task"),
+        ChatMessage(role=MessageRole.SYSTEM, content="System context"),
+    ]
+    mock_agent.memory.steps.append(step1)
+
+    # Add a PlanningStep
+    step2 = PlanningStep(
+        timing=Timing(start_time=start_time + 0.5, end_time=start_time + 1.0),
+        model_input_messages=[ChatMessage(role=MessageRole.USER, content="What should I do?")],
+        model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Here's the plan"),
+        plan="Step 1: Do this\nStep 2: Do that",
+    )
+    step2.token_usage = TokenUsage(input_tokens=200, output_tokens=150)
+    mock_agent.memory.steps.append(step2)
+
+    # Mock write_memory_to_messages
+    mock_agent.write_memory_to_messages = Mock(return_value=[])
+
+    # Create adapter
+    adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent")
+
+    # Access logs property
+    logs = adapter.logs
+
+    # Verify logs structure
+    assert isinstance(logs, list)
+    assert len(logs) == 2
+
+    # Verify ActionStep log entry
+    action_log = logs[0]
+    assert action_log["step_type"] == "ActionStep"
+    assert action_log["step_number"] == 1
+    assert action_log["input_tokens"] == 100
+    assert action_log["output_tokens"] == 50
+    assert action_log["total_tokens"] == 150
+    assert action_log["duration_seconds"] == pytest.approx(0.5, abs=0.01)
+    assert action_log["observations"] == "Tool returned: success"
+    assert action_log["action_output"] == "Final output from action"
+    assert "tool_calls" in action_log
+    assert len(action_log["tool_calls"]) == 1
+    assert action_log["tool_calls"][0]["name"] == "test_tool"
+
+    # Verify model_input_messages are converted
+    assert "model_input_messages" in action_log
+    assert isinstance(action_log["model_input_messages"], list)
+    assert len(action_log["model_input_messages"]) == 2
+    assert action_log["model_input_messages"][0]["role"] == "user"
+    assert action_log["model_input_messages"][0]["content"] == "Execute this task"
+    assert action_log["model_input_messages"][1]["role"] == "system"
+
+    # Verify PlanningStep log entry
+    planning_log = logs[1]
+    assert planning_log["step_type"] == "PlanningStep"
+    assert planning_log["input_tokens"] == 200
+    assert planning_log["output_tokens"] == 150
+    assert planning_log["total_tokens"] == 350
+    assert planning_log["duration_seconds"] == pytest.approx(0.5, abs=0.01)
+    assert planning_log["plan"] == "Step 1: Do this\nStep 2: Do that"
+
+    # Verify model_input_messages for planning step
+    assert "model_input_messages" in planning_log
+    assert len(planning_log["model_input_messages"]) == 1
+    assert planning_log["model_input_messages"][0]["content"] == "What should I do?"
+
+    # PlanningStep should not have action-specific fields
+    assert "action_output" not in planning_log
+    assert "observations" not in planning_log
+    assert "tool_calls" not in planning_log
+
+
+def test_smolagents_adapter_logs_with_errors():
+    """Test that adapter.logs captures error information from failed steps."""
+    from maseval.interface.agents.smolagents import SmolAgentAdapter
+    from smolagents import AgentError
+    from smolagents.memory import ActionStep, AgentMemory
+    from smolagents.monitoring import Timing
+    from unittest.mock import Mock
+    import time
 
-    # Create a smolagents agent
-    mock_model = FakeSmolagentsModel(["Test response"])
-    agent = CodeAgent(tools=[], model=mock_model, max_steps=1)
-    wrapper = SmolAgentAdapter(agent_instance=agent, name="test_agent")
+    # Create a mock agent with memory
+    mock_agent = Mock()
+    mock_agent.memory = AgentMemory(system_prompt="Test system prompt")
 
-    # Test that append_to_message_history raises NotImplementedError
-    with pytest.raises(NotImplementedError) as exc_info:
-        wrapper.append_to_message_history("user", "Manual message")
+    # Add an ActionStep with an error
+    start_time = time.time()
+    step = ActionStep(
+        step_number=1,
+        timing=Timing(start_time=start_time, end_time=start_time + 0.2),
+        observations_images=[],
+    )
+    # Create a proper AgentError object with mock logger
+    mock_logger = Mock()
+    step.error = AgentError("Tool execution failed: Connection timeout", logger=mock_logger)
+    mock_agent.memory.steps.append(step)
 
-    assert "doesn't support appending" in str(exc_info.value)
-    assert "memory is built from execution steps" in str(exc_info.value)
+    # Mock write_memory_to_messages
+    mock_agent.write_memory_to_messages = Mock(return_value=[])
 
-    # Test that set_message_history raises NotImplementedError
-    with pytest.raises(NotImplementedError) as exc_info:
-        new_history = MessageHistory()
-        new_history.add_message("user", "Test message")
-        wrapper.set_message_history(new_history)
+    # Create adapter
+    adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent")
 
-    assert "doesn't support setting" in str(exc_info.value)
-    assert "memory is built from execution steps" in str(exc_info.value)
+    # Access logs property
+    logs = adapter.logs
 
+    # Verify error is captured
+    assert len(logs) == 1
+    assert "error" in logs[0]
+    assert logs[0]["error"] == "Tool execution failed: Connection timeout"
 
-def test_smolagents_wrapper_clear_message_history_supported():
-    """Test that smolagents supports clear_message_history.
 
-    clear_message_history is the only history manipulation operation
-    supported by smolagents. It resets the AgentMemory while preserving
-    the system prompt.
-    """
+def test_smolagents_adapter_logs_empty_when_no_steps():
+    """Test that adapter.logs returns empty list when no execution has occurred."""
     from maseval.interface.agents.smolagents import SmolAgentAdapter
-    from smolagents import CodeAgent
-    from conftest import FakeSmolagentsModel
-
-    # Create a smolagents agent
-    mock_model = FakeSmolagentsModel(["Test response"])
-    agent = CodeAgent(tools=[], model=mock_model, max_steps=1)
-    wrapper = SmolAgentAdapter(agent_instance=agent, name="test_agent")
-
-    # Run the agent to populate memory
-    wrapper.run("Test query")
-
-    # Verify memory has content (should have multiple messages after run)
-    messages_before = wrapper.get_messages()
-    assert len(messages_before) > 1  # At least system + user messages
-
-    # Clear the memory
-    wrapper.clear_message_history()
-
-    # Verify memory is reset (only system message remains)
-    messages_after = wrapper.get_messages()
-    assert len(messages_after) == 1
-    assert messages_after[0]["role"] == "system"
-    # System prompt content is framework-specific, just verify it exists and has content
-    assert len(messages_after[0]["content"]) > 0
+    from smolagents.memory import AgentMemory
+    from unittest.mock import Mock
+
+    # Create a mock agent with empty memory
+    mock_agent = Mock()
+    mock_agent.memory = AgentMemory(system_prompt="Test system prompt")
+    mock_agent.write_memory_to_messages = Mock(return_value=[])
+
+    # Create adapter
+    adapter = SmolAgentAdapter(agent_instance=mock_agent, name="test_agent")
+
+    # Access logs property
+    logs = adapter.logs
+
+    # Should be empty
+    assert isinstance(logs, list)
+    assert len(logs) == 0