Agents are harder to test than traditional software:
- Non-deterministic (LLM responses vary)
- External dependencies (Ollama, file system)
- Complex state (conversation history)
- Tool execution with side effects
But testing is crucial:
- Catch bugs before they delete files
- Ensure tools work correctly
- Verify safety checks
- Document expected behavior
        /\
       /  \          Unit Tests (70%)
      /────\         - Individual tools
     /      \        - Context management
    /────────\       - Utility functions
   /  Integ.  \      Integration Tests (20%)
  /────────────\     - Agent loop with mock LLM
 /  E2E Tests   \    - Full stack with real LLM (10%)
/────────────────\
Test individual components in isolation.
# tests/test_tools.py
import pytest
from pathlib import Path
from tools.filesystem import ReadFileTool, WriteFileTool
@pytest.fixture
def temp_dir(tmp_path):
    """Expose pytest's ``tmp_path`` fixture under the name ``temp_dir``."""
    scratch_dir = tmp_path
    return scratch_dir
def test_read_file_success(temp_dir):
    """Reading an existing file should succeed and return its contents."""
    # Arrange: a file with known contents
    source = temp_dir / "test.txt"
    source.write_text("Hello world")
    reader = ReadFileTool()

    # Act
    outcome = reader.execute(path=str(source))

    # Assert: success flag, exact contents, and no error reported
    assert outcome.success
    assert outcome.output == "Hello world"
    assert outcome.error is None
def test_read_file_not_found(temp_dir):
    """Reading a missing file should fail with a "not found" error."""
    missing = temp_dir / "missing.txt"
    outcome = ReadFileTool().execute(path=str(missing))

    assert not outcome.success
    assert "not found" in outcome.error.lower()
def test_write_file(temp_dir):
    """Writing should succeed and leave the given content on disk."""
    destination = temp_dir / "output.txt"
    writer = WriteFileTool()

    outcome = writer.execute(path=str(destination), content="Test content")

    assert outcome.success
    assert destination.read_text() == "Test content"
def test_path_traversal_blocked():
    """Security: Ensure path traversal is blocked"""
    escaping_path = "../../etc/passwd"
    outcome = ReadFileTool().execute(path=escaping_path)

    # The read must be refused, and the error must say so.
    assert not outcome.success
    assert "denied" in outcome.error.lower()


# tests/test_context.py
import pytest
from context import ContextManager, Message
def test_add_message():
    """Adding one user message should store exactly that message."""
    manager = ContextManager()
    manager.add_user_message("Hello")

    assert len(manager.messages) == 1
    stored = manager.messages[0]
    assert stored.role == "user"
    assert stored.content == "Hello"
def test_token_estimation():
    """A 400-character message should be estimated at roughly 100 tokens."""
    manager = ContextManager()
    manager.add_user_message("a" * 400)  # ~100 tokens

    estimate = manager.estimate_tokens()

    # Rough estimate: anywhere from 90 to 110 is accepted
    assert 90 <= estimate <= 110
def test_budget_enforcement():
    """Overfilling the context should leave it at or below 80% of budget."""
    manager = ContextManager(max_tokens=200)

    # Add many large messages
    for _ in range(10):
        manager.add_user_message("x" * 400)  # Each ~100 tokens

    # Should have pruned old messages
    remaining = manager.estimate_tokens()
    assert remaining <= 200 * 0.8
def test_save_load(tmp_path):
    """A saved session should load back with the same messages in order."""
    session_file = tmp_path / "session.json"

    # Create and save
    original = ContextManager()
    original.add_user_message("Hello")
    original.add_assistant_message("Hi there")
    original.save(str(session_file))

    # Load
    restored = ContextManager.load(str(session_file))

    assert len(restored.messages) == 2
    first, second = restored.messages[0], restored.messages[1]
    assert first.content == "Hello"
    assert second.content == "Hi there"
def test_system_prompt_preserved():
ctx = ContextManager(max_tokens=50, system_prompt="You are helpful")
# Fill context to trigger pruning
for i in range(10):
ctx.add_user_message("test" * 20)
# System prompt should still be there
assert ctx.messages[0].role == "system"
assert ctx.messages[0].content == "You are helpful"Test components working together, mock external dependencies.
# tests/mocks.py
class MockLLM:
    """Scripted stand-in for an LLM client, for use in tests.

    Each call to ``generate`` returns the next entry of ``responses``. When
    no responses were supplied, or once they have all been consumed, a fixed
    default assistant message is returned instead.

    Attributes:
        responses: Scripted responses, returned in order.
        call_count: Number of times ``generate`` has been called.
        last_messages: The ``messages`` argument of the most recent call.
    """

    def __init__(self, responses: list = None):
        self.responses = responses or []
        self.call_count = 0
        self.last_messages = None

    def generate(self, messages, tools=None):
        """Record the call and return the next scripted or default response.

        Args:
            messages: Conversation passed by the caller; kept on
                ``last_messages`` so tests can inspect it.
            tools: Unused by the mock.

        Returns:
            The scripted response for this call, or the default response
            when the script is empty or exhausted.
        """
        self.last_messages = messages
        self.call_count += 1

        # Bounds-check the script: indexing past its end used to raise
        # IndexError on the first call after the last scripted response.
        index = self.call_count - 1
        if index < len(self.responses):
            return self.responses[index]

        # Default response
        return {
            "message": {
                "role": "assistant",
                "content": "Mock response",
            }
        }


# tests/test_agent_integration.py
import pytest
from agent import Agent
from tests.mocks import MockLLM
def test_simple_conversation():
    """A plain assistant reply should come back verbatim after one LLM call."""
    # Mock LLM that just echoes
    canned_reply = {
        "message": {
            "role": "assistant",
            "content": "Hello to you too",
        }
    }
    mock_llm = MockLLM(responses=[canned_reply])
    agent = Agent(llm=mock_llm)

    answer = agent.run("Hello")

    assert answer == "Hello to you too"
    assert mock_llm.call_count == 1
def test_tool_calling_loop():
    """A tool request followed by a final answer should take two LLM calls."""
    # First call: request tool
    tool_request = {
        "message": {
            "role": "assistant",
            "content": "",
        },
        "tool_calls": [
            {
                "id": "call_1",
                "function": {
                    "name": "read_file",
                    "arguments": '{"path": "test.txt"}',
                },
            }
        ],
    }
    # Second call: respond with result
    final_answer = {
        "message": {
            "role": "assistant",
            "content": "The file says: mock content",
        }
    }
    # Mock LLM that calls a tool, then responds
    scripted_llm = MockLLM(responses=[tool_request, final_answer])
    agent = Agent(llm=scripted_llm)

    answer = agent.run("What's in test.txt?")

    assert "mock content" in answer
    assert scripted_llm.call_count == 2  # Two LLM calls
def test_max_turns_prevents_infinite_loop():
# Mock LLM that always calls a tool (infinite loop)
mock_llm = MockLLM(responses=[
{
"message": {"role": "assistant", "content": ""},
"tool_calls": [{
"id": "call_1",
"function": {"name": "read_file", "arguments": '{"path": "test.txt"}'}
}]
}
] * 20) # Repeat same response
agent = Agent(llm=mock_llm, max_turns=10)
response = agent.run("Test")
assert "max turns" in response.lower() or "error" in response.lower()
assert mock_llm.call_count <= 10Test with real LLM (slower, use sparingly).
# tests/test_e2e.py
import pytest
from agent import Agent
from llm import OllamaLLM
@pytest.mark.e2e
@pytest.mark.slow
def test_real_llm_simple_task():
    """Test with real Ollama (requires Ollama running)"""
    agent = Agent(llm=OllamaLLM(model="llama3.2:3b"))  # Use smallest model

    answer = agent.run("What is 2 + 2?")

    # Loose assertion (LLM output varies)
    assert "4" in answer
@pytest.mark.e2e
def test_file_operations_e2e(tmp_path):
"""Test agent can actually read/write files"""
llm = OllamaLLM(model="llama3.2:3b")
agent = Agent(llm=llm, safe_dir=tmp_path)
# Ask agent to create a file
response = agent.run("Create a file called hello.txt with the text 'Hello world'")
# Verify file was created
assert (tmp_path / "hello.txt").exists()
assert (tmp_path / "hello.txt").read_text() == "Hello world"Running E2E tests:
# Skip E2E tests by default (too slow)
pytest
# Run only E2E tests
pytest -m e2e
# Run only the E2E and slow tests (overrides the default marker filter)
pytest -m "e2e or slow"

LLMs are random. How to test?
def test_response_structure():
    """Check the reply for key elements rather than exact text.

    ``agent`` is not a parameter; it is looked up in the enclosing module
    scope when the test runs.
    """
    response = agent.run("Write a Python hello world")
    lowered = response.lower()
    # Don't check exact code, check presence of key elements
    assert "print" in lowered
    assert "hello" in lowered
    # or
    assert response.count("def") >= 1  # Has a function definition


# The assignment below was previously fused into the comment above it, which
# left ``llm`` undefined for the ``Agent(llm=llm)`` call.
llm = OllamaLLM(model="llama3.3", temperature=0.0)  # Deterministic
agent = Agent(llm=llm)
response = agent.run("What is 2 + 2?")
assert response == "2 + 2 equals 4."  # More predictable

# Use mock for testing logic, real LLM only for manual validation
mock_llm = MockLLM(responses=[...])  # Known responses
agent = Agent(llm=mock_llm)

# tests/test_safety.py
from tools.shell import ShellTool
def test_dangerous_commands_blocked():
    """Each destructive command should be refused with a "blocked" error."""
    # NOTE(review): if blocking ever regresses, execute() may really run these
    # commands - consider running this test only inside a sandbox.
    shell = ShellTool()
    destructive = (
        "rm -rf /",
        "del /f *",
        "format c:",
        "dd if=/dev/zero of=/dev/sda",
    )
    for command in destructive:
        outcome = shell.execute(command=command)
        assert not outcome.success
        assert "blocked" in outcome.error.lower()
def test_allowed_commands_work():
tool = ShellTool()
result = tool.execute(command="echo hello")
assert result.success
assert "hello" in result.outputdef test_cannot_access_system_files():
tool = ReadFileTool()
system_paths = [
"/etc/passwd",
"C:\\Windows\\System32\\config\\SAM",
"../../../root/.ssh/id_rsa"
]
for path in system_paths:
result = tool.execute(path=path)
assert not result.success
assert "denied" in result.error.lower()tests/
├── conftest.py # Shared fixtures
├── test_tools.py # Unit tests for tools
├── test_context.py # Unit tests for context
├── test_llm.py # Unit tests for LLM interface
├── test_agent_integration.py # Integration tests
├── test_safety.py # Security tests
├── test_e2e.py # End-to-end tests (slow)
└── mocks.py # Mock objects
# tests/conftest.py
import pytest
from pathlib import Path
@pytest.fixture
def temp_dir(tmp_path):
    """Temporary directory for file operations"""
    working_dir = tmp_path
    return working_dir
@pytest.fixture
def mock_llm():
    """Mock LLM instance"""
    # Imported inside the fixture body rather than at module level.
    from tests.mocks import MockLLM

    llm_double = MockLLM()
    return llm_double
@pytest.fixture
def agent(mock_llm, temp_dir):
    """Agent instance with mocked LLM"""
    # Imported inside the fixture body rather than at module level.
    from agent import Agent

    # File access is pointed at the per-test temporary directory.
    instance = Agent(llm=mock_llm, safe_dir=temp_dir)
    return instance


# pytest.ini
[pytest]
# Continuation lines must be indented to belong to the "markers" value.
markers =
    e2e: End-to-end tests with real LLM (slow)
    slow: Slow tests
    unit: Fast unit tests
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
# Default: skip slow tests
addopts = -v --tb=short -m "not e2e and not slow"

# Install pytest-watch
pip install pytest-watch
# Auto-run tests on file changes
ptw -- -m "unit"

Some things need human verification:
Before each release:
- Can create a simple file
- Can read an existing file
- Handles non-existent files gracefully
- Can run a safe shell command
- Blocks dangerous shell commands
- Multi-turn conversation works
- Context doesn't overflow on long conversation
- Session save/load works
- Error messages are clear and helpful
# tests/test_performance.py
import time
import pytest
@pytest.mark.slow
def test_response_time(agent):
    """Ensure agent responds within reasonable time"""
    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    agent.run("What is 2 + 2?")
    duration = time.perf_counter() - started

    assert duration < 10.0  # Should respond within 10 seconds
def test_context_size_scaling(agent):
    """Test performance with large context"""
    # Add many messages
    for i in range(100):
        agent.context.add_user_message(f"Message {i}")

    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    agent.run("Summary?")
    duration = time.perf_counter() - started

    assert duration < 30.0  # Should handle large context


# Install coverage
pip install pytest-cov
# Run tests with coverage
pytest --cov=src --cov-report=html
# View report
open htmlcov/index.html
# Aim for:
# - 80%+ coverage on tools
# - 70%+ on agent logic
# - 60%+ on context management
# Don't obsess over 100%

- Test behavior, not implementation - If you refactor, tests should still pass
- Use fixtures - DRY principle applies to tests too
- Test edge cases - Empty files, long files, special characters
- Mock external dependencies - Don't rely on Ollama being available for unit tests
- Keep tests fast - Slow test suites don't get run
- Test error handling - Test failure paths, not just happy paths
- Document test intent - Use clear names and docstrings
- Set up pytest and basic test structure (Phase 1)
- Write tests as you write code (TDD style)
- Run tests before committing changes
- Add E2E tests once core is stable (Phase 4)
- Use coverage to find untested code
Remember: Tests are documentation of how your agent should behave. Future-you will thank present-you for writing them.