removed non-contractual functions from AgentAdapter

cemde · cemde · commit fdc6a51ec42b · 2025-11-21T20:26:42.000+01:00
diff --git a/AGENTS.md b/AGENTS.md
@@ -236,3 +236,7 @@ For lists and dictionaries, use `Dict[...,...]`, `List[...]`, `Sequence[...]` et
 - DO NOT publicly distribute code or data
 - DO NOT publish without explicit permission
 - DO NOT share copyrighted third-party benchmark data
+
+## Changelog
+
+When the task is completed, add your changes to the Changelog (`CHANGELOG.md`).
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Removed
 
+- Removed `set_message_history`, `append_to_message_history` and `clear_message_history` from `AgentAdapter` and its subclasses. (PR: #3)
+
 ## [0.1.2] - 2025-11-18
 
 ### Added
diff --git a/docs/guides/message-tracing.md b/docs/guides/message-tracing.md
@@ -45,18 +45,21 @@ for msg in messages:
         print(f"  Tools called: {[tc['function']['name'] for tc in msg['tool_calls']]}")
 ```
 
-### Clearing History Between Tasks
+### Fresh Conversations for Multiple Tasks
 
-In benchmarks, you typically want to clear history before each new task:
+In benchmarks, you typically want a fresh agent instance for each task:
 
 ```python
 # In your benchmark loop
 for task in benchmark.tasks:
-    agent_adapter.clear_message_history()  # Reset for new task
+    # Create a new adapter instance for each task
+    agent_adapter = YourAgentAdapter(agent_instance=agent, name="task_agent")
     result = agent_adapter.run(task.query)
     evaluate(result, task.ground_truth)
 ```
 
+This ensures each task starts with a clean slate and avoids conversation history contamination.
+
 ## Using the Tracing Callback
 
 For multi-agent systems or when you need to collect conversations from many runs, use `MessageTracingAgentCallback`:
@@ -190,7 +193,7 @@ Messages use OpenAI's chat completion format:
 }
 ```
 
-## Custom agent adapters
+## Custom Agent Adapters
 
 If you're implementing a custom adapter, the framework handles message storage automatically via `get_messages()`. Just ensure your `_run_agent()` method returns a `MessageHistory`:
 
@@ -211,13 +214,13 @@ class MyAgentAdapter(AgentAdapter):
         return history
 ```
 
-See the [agent adapter guide](../reference/agent.md) for details on implementing custom adapters.
+See the [AgentAdapter guide](../reference/agent.md) for details on implementing custom adapters.
 
 ## Tips
 
 **For debugging**: Use `verbose=True` to see traces in real-time.
 
-**For benchmarks**: Clear history between tasks with `agent_adapter.clear_message_history()`.
+**For benchmarks**: Create a new adapter instance for each task to ensure clean conversation history.
 
 **For multi-agent systems**: Use a shared tracer and `get_conversations_by_agent()` to analyze each agent separately.
 
diff --git a/maseval/core/agent.py b/maseval/core/agent.py
@@ -1,8 +1,8 @@
 from abc import ABC, abstractmethod
-from typing import List, Any, Optional, Union, Dict
+from typing import List, Any, Optional, Dict
 
 from .callback import AgentCallback
-from .history import MessageHistory, RoleType
+from .history import MessageHistory
 from .tracing import TraceableMixin
 from .config import ConfigurableMixin
 
@@ -101,35 +101,6 @@ def get_messages(self) -> MessageHistory:
         """
         return self.messages if self.messages is not None else MessageHistory()
 
-    def set_message_history(self, history: MessageHistory) -> None:
-        """Set the message history.
-
-        This is typically called by _run_agent() implementations after executing
-        the agent, but can also be used to inject or modify history.
-
-        Args:
-            history: The MessageHistory to set
-        """
-        self.messages = history
-
-    def clear_message_history(self) -> None:
-        """Clear the message history."""
-        self.messages = None
-
-    def append_to_message_history(self, role: Union[RoleType, str], content: Union[str, List[Any]], **kwargs) -> None:
-        """Append a message to the history.
-
-        If no history exists, creates a new one.
-
-        Args:
-            role: The message role ("user", "assistant", "system", "tool")
-            content: The message content (string or list of content parts)
-            **kwargs: Additional fields (name, metadata, timestamp, etc.)
-        """
-        if self.messages is None:
-            self.messages = MessageHistory()
-        self.messages.add_message(role, content, **kwargs)  # type: ignore
-
     def gather_traces(self) -> dict[str, Any]:
         """Gather execution traces from this agent.
 
diff --git a/maseval/interface/agents/langgraph.py b/maseval/interface/agents/langgraph.py
@@ -105,85 +105,6 @@ def get_messages(self) -> MessageHistory:
         # No messages available
         return MessageHistory()
 
-    def set_message_history(self, history: MessageHistory) -> None:
-        """Set message history for langgraph.
-
-        For stateless graphs, updates the cached result.
-        For stateful graphs, this is not fully supported as LangGraph manages state internally.
-
-        Args:
-            history: MASEval MessageHistory to set
-        """
-        _check_langgraph_installed()
-        from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, ToolMessage
-
-        # Convert MessageHistory to LangChain messages
-        lc_messages = []
-        for msg in history:
-            role = msg.get("role", "assistant")
-            content = msg.get("content", "")
-
-            if role == "user":
-                lc_messages.append(HumanMessage(content=content))
-            elif role == "assistant":
-                lc_messages.append(AIMessage(content=content))
-            elif role == "system":
-                lc_messages.append(SystemMessage(content=content))
-            elif role == "tool":
-                tool_call_id = msg.get("tool_call_id", "")
-                lc_messages.append(ToolMessage(content=content, tool_call_id=tool_call_id))
-
-        # Update cached result
-        self._last_result = {"messages": lc_messages}
-
-        # Also update base class cache
-        super().set_message_history(history)
-
-    def clear_message_history(self) -> None:
-        """Clear message history for langgraph.
-
-        Clears the cached result. For stateful graphs, this doesn't clear
-        the persistent state in the checkpointer.
-        """
-        self._last_result = None
-        super().clear_message_history()
-
-    def append_to_message_history(self, role: str, content: Any, **kwargs) -> None:
-        """Append message to history.
-
-        For stateless graphs, this appends to the cached result.
-        For stateful graphs, messages are managed by LangGraph during invoke().
-
-        Args:
-            role: Message role
-            content: Message content (string or list)
-            **kwargs: Additional message fields
-        """
-        _check_langgraph_installed()
-        from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
-
-        # Get current messages
-        current_messages = []
-        if self._last_result and "messages" in self._last_result:
-            current_messages = self._last_result["messages"]
-
-        # Create new message
-        if role == "user":
-            new_msg = HumanMessage(content=str(content))
-        elif role == "assistant":
-            new_msg = AIMessage(content=str(content))
-        elif role == "system":
-            new_msg = SystemMessage(content=str(content))
-        else:
-            new_msg = AIMessage(content=str(content))
-
-        # Append and update cache
-        current_messages.append(new_msg)
-        self._last_result = {"messages": current_messages}
-
-        # Also update base class cache
-        super().append_to_message_history(role, content, **kwargs)
-
     def gather_config(self) -> dict[str, Any]:
         """Gather configuration from this LangGraph agent.
 
diff --git a/maseval/interface/agents/smolagents.py b/maseval/interface/agents/smolagents.py
@@ -351,54 +351,6 @@ def get_messages(self) -> MessageHistory:
         # Convert and return
         return self._convert_smolagents_messages(smol_messages)
 
-    def set_message_history(self, history: MessageHistory) -> None:
-        """Set message history - NOT SUPPORTED by smolagents.
-
-        Args:
-            history: MASEval MessageHistory to set
-
-        Raises:
-            NotImplementedError: smolagents doesn't support arbitrary message injection
-        """
-        raise NotImplementedError(
-            "smolagents doesn't support setting arbitrary message history. "
-            "The agent's memory is built from execution steps and cannot be directly manipulated. "
-            "Use clear_message_history() to reset, then run() to generate new conversation."
-        )
-
-    def clear_message_history(self) -> None:
-        """Clear message history by resetting smolagents memory."""
-        _check_smolagents_installed()
-        from smolagents.memory import AgentMemory
-
-        # Get system prompt before clearing
-        system_prompt = ""
-        if hasattr(self.agent, "memory") and hasattr(self.agent.memory, "system_prompt"):
-            system_prompt = self.agent.memory.system_prompt
-
-        # Reset memory
-        self.agent.memory = AgentMemory(system_prompt=system_prompt)
-
-        # Also clear base class cache
-        super().clear_message_history()
-
-    def append_to_message_history(self, role: str, content: Any, **kwargs) -> None:
-        """Append message to history - NOT SUPPORTED by smolagents.
-
-        Args:
-            role: Message role
-            content: Message content (string or list)
-            **kwargs: Additional message fields
-
-        Raises:
-            NotImplementedError: smolagents doesn't support arbitrary message injection
-        """
-        raise NotImplementedError(
-            "smolagents doesn't support appending arbitrary messages to history. "
-            "The agent's memory is built from execution steps and cannot be directly manipulated. "
-            "Use run() to generate conversation messages."
-        )
-
     def _run_agent(self, query: str) -> str:
         _check_smolagents_installed()
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -97,19 +97,17 @@ class DummyAgentAdapter(AgentAdapter):
     def _run_agent(self, query: str) -> str:
         import time
 
-        # Create message history
-        history = MessageHistory()
-        history.add_message(role="user", content=query)
-
         # Track timing
         start_time = time.time()
 
         # Run underlying agent
         response = self.agent.run(query)
-        history.add_message(role="assistant", content=response)
 
-        # Store history
-        self.set_message_history(history)
+        # Store history directly
+        if self.messages is None:
+            self.messages = MessageHistory()
+        self.messages.add_message(role="user", content=query)
+        self.messages.add_message(role="assistant", content=response)
 
         # Populate logs to fulfill contract
         duration = time.time() - start_time
diff --git a/tests/test_contract/test_agent_adapter_contract.py b/tests/test_contract/test_agent_adapter_contract.py
@@ -433,37 +433,6 @@ def on_run_end(self, agent, result):
         # Verify order: all on_run_start before any on_run_end
         assert call_order == ["first_start", "second_start", "first_end", "second_end"]
 
-    def test_adapter_message_history_after_clear_and_run(self, framework):
-        """Test that message history is correctly populated after clearing and running.
-
-        This test validates two key contract requirements:
-        1. Clear history should reset the agent's state
-        2. Running the agent after clearing should start with a fresh history
-        """
-        mock_llm = MockLLM(responses=["Test response"])
-        agent = create_agent_for_framework(framework, mock_llm)
-        adapter = create_adapter_for_framework(framework, agent)
-
-        # First run
-        adapter.run("First query")
-        history_1 = adapter.get_messages()
-        assert len(history_1) > 0
-
-        # Clear and verify empty (or just system message for smolagents)
-        adapter.clear_message_history()
-        history_after_clear = adapter.get_messages()
-        expected_after_clear = 1 if framework == "smolagents" else 0  # smolagents keeps system message
-        assert len(history_after_clear) == expected_after_clear
-
-        # Second run should populate new history
-        adapter.run("Second query")
-        history_2 = adapter.get_messages()
-        assert len(history_2) > expected_after_clear  # Should have more than just system message
-
-        # History should only contain second run's messages
-        # (exact count depends on framework, but should have at least one message)
-        assert any("Second query" in str(msg.get("content", "")) for msg in history_2)
-
     def test_adapter_logs_populated_after_run(self, framework):
         """Test all adapters populate self.logs during execution.
 
diff --git a/tests/test_core/test_agent_adapter.py b/tests/test_core/test_agent_adapter.py
@@ -49,37 +49,6 @@ def test_agent_adapter_get_messages_returns_history(self, dummy_agent_adapter):
         assert history[0]["role"] == "user"
         assert history[1]["role"] == "assistant"
 
-    def test_agent_adapter_set_message_history(self, dummy_agent_adapter):
-        """Test that message history can be set manually."""
-        new_history = MessageHistory()
-        new_history.add_message("user", "Custom message")
-        new_history.add_message("assistant", "Custom response")
-
-        dummy_agent_adapter.set_message_history(new_history)
-
-        retrieved = dummy_agent_adapter.get_messages()
-        assert len(retrieved) == 2
-        assert retrieved[0]["content"] == "Custom message"
-        assert retrieved[1]["content"] == "Custom response"
-
-    def test_agent_adapter_clear_message_history(self, dummy_agent_adapter):
-        """Test that message history can be cleared."""
-        dummy_agent_adapter.run("Test")
-        assert len(dummy_agent_adapter.get_messages()) > 0
-
-        dummy_agent_adapter.clear_message_history()
-        assert len(dummy_agent_adapter.get_messages()) == 0
-
-    def test_agent_adapter_append_to_message_history(self, dummy_agent_adapter):
-        """Test that messages can be appended to history."""
-        dummy_agent_adapter.append_to_message_history("user", "First message")
-        dummy_agent_adapter.append_to_message_history("assistant", "First response")
-
-        history = dummy_agent_adapter.get_messages()
-        assert len(history) == 2
-        assert history[0]["content"] == "First message"
-        assert history[1]["content"] == "First response"
-
     def test_agent_adapter_gather_traces_includes_messages(self, dummy_agent_adapter):
         """Test that gather_traces() includes message history."""
         dummy_agent_adapter.run("Test query")
@@ -110,17 +79,15 @@ def test_agent_adapter_gather_config(self, dummy_agent_adapter):
         assert config["type"] == "DummyAgentAdapter"
 
     def test_agent_adapter_multiple_runs(self, dummy_agent_adapter):
-        """Test that adapter can be run multiple times."""
+        """Test that adapter can be run multiple times and history accumulates."""
         result1 = dummy_agent_adapter.run("Query 1")
         assert "Query 1" in result1
 
-        # Clear history for second run
-        dummy_agent_adapter.clear_message_history()
-
         result2 = dummy_agent_adapter.run("Query 2")
         assert "Query 2" in result2
 
-        # History should only have second run
+        # History should have both runs
         history = dummy_agent_adapter.get_messages()
-        assert len(history) == 2
-        assert history[0]["content"] == "Query 2"
+        assert len(history) == 4  # 2 messages per run
+        assert history[0]["content"] == "Query 1"
+        assert history[2]["content"] == "Query 2"
diff --git a/tests/test_core/test_message_tracing_callback.py b/tests/test_core/test_message_tracing_callback.py
@@ -47,8 +47,8 @@ def _run_agent(self, query: str) -> str:
             # Normal response without tools
             history.add_message(role="assistant", content=response)
 
-        # Store history so get_messages() can retrieve it
-        self.set_message_history(history)
+        # Store history directly
+        self.messages = history
 
         return response
 
diff --git a/tests/test_interface/test_agent_integration/test_langgraph_integration.py b/tests/test_interface/test_agent_integration/test_langgraph_integration.py
diff --git a/tests/test_interface/test_agent_integration/test_smolagents_integration.py b/tests/test_interface/test_agent_integration/test_smolagents_integration.py