Skip to content

Latest commit

 

History

History
567 lines (431 loc) · 13.3 KB

File metadata and controls

567 lines (431 loc) · 13.3 KB

Testing Guide - Test Structure and Strategies

Why Test an Agent?

Agents are harder to test than traditional software:

  • Non-deterministic (LLM responses vary)
  • External dependencies (Ollama, file system)
  • Complex state (conversation history)
  • Tool execution with side effects

But testing is crucial:

  • Catch bugs before they delete files
  • Ensure tools work correctly
  • Verify safety checks
  • Document expected behavior

Testing Pyramid

        /\
       /  \      E2E Tests (10%)
      /────\     - Full stack with real LLM
     /      \    Integration Tests (20%)
    / Integ. \   - Agent loop with mock LLM
   /──────────\  Unit Tests (70%)
  /            \ - Individual tools
 /  Unit Tests  \- Context management
/────────────────\ - Utility functions

1. Unit Tests

Test individual components in isolation.

Testing Tools

# tests/test_tools.py
import pytest
from pathlib import Path
from tools.filesystem import ReadFileTool, WriteFileTool

@pytest.fixture
def temp_dir(tmp_path):
    """Expose pytest's built-in ``tmp_path`` directory under the name ``temp_dir``."""
    scratch_dir = tmp_path
    return scratch_dir

def test_read_file_success(temp_dir):
    """Reading an existing file yields its content, success, and no error."""
    # Arrange: a file with known content inside the temporary directory
    expected_text = "Hello world"
    target = temp_dir / "test.txt"
    target.write_text(expected_text)

    # Act
    outcome = ReadFileTool().execute(path=str(target))

    # Assert
    assert outcome.success
    assert outcome.output == expected_text
    assert outcome.error is None

def test_read_file_not_found(temp_dir):
    """Reading a path that was never created fails with a "not found" error."""
    absent_path = temp_dir / "missing.txt"
    outcome = ReadFileTool().execute(path=str(absent_path))

    assert not outcome.success
    error_text = outcome.error.lower()
    assert "not found" in error_text

def test_write_file(temp_dir):
    """Writing through the tool succeeds and the content lands on disk."""
    writer = WriteFileTool()
    outcome = writer.execute(
        path=str(temp_dir / "output.txt"),
        content="Test content",
    )

    assert outcome.success
    # Re-read from disk rather than trusting the tool's own report.
    written_file = temp_dir / "output.txt"
    assert written_file.read_text() == "Test content"

def test_path_traversal_blocked():
    """Security: a relative path that climbs out with ``..`` must be refused."""
    escaping_path = "../../etc/passwd"
    outcome = ReadFileTool().execute(path=escaping_path)

    assert not outcome.success
    error_text = outcome.error.lower()
    assert "denied" in error_text

Testing Context Management

# tests/test_context.py
import pytest
from context import ContextManager, Message

def test_add_message():
    """A single user message is stored with the right role and content."""
    manager = ContextManager()
    manager.add_user_message("Hello")

    stored = manager.messages
    assert len(stored) == 1
    assert stored[0].role == "user"
    assert stored[0].content == "Hello"

def test_token_estimation():
    """A 400-character message should be estimated at roughly 100 tokens."""
    manager = ContextManager()
    payload = "a" * 400  # ~100 tokens
    manager.add_user_message(payload)

    estimate = manager.estimate_tokens()
    # Rough estimate: accept anything within 90..110 inclusive.
    assert estimate >= 90 and estimate <= 110

def test_budget_enforcement():
    """After adding far more than the budget, the estimate stays under 80% of it."""
    budget = 200
    manager = ContextManager(max_tokens=budget)

    # Ten large messages, each ~100 tokens, well beyond the budget.
    large_message = "x" * 400
    for _ in range(10):
        manager.add_user_message(large_message)

    # Old messages should have been pruned.
    assert manager.estimate_tokens() <= budget * 0.8

def test_save_load(tmp_path):
    """A saved conversation can be loaded back with its messages in order."""
    session_file = tmp_path / "session.json"

    # Build a two-message conversation and persist it.
    original = ContextManager()
    original.add_user_message("Hello")
    original.add_assistant_message("Hi there")
    original.save(str(session_file))

    # Restore into a fresh manager.
    restored = ContextManager.load(str(session_file))

    assert len(restored.messages) == 2
    first, second = restored.messages[0], restored.messages[1]
    assert first.content == "Hello"
    assert second.content == "Hi there"

def test_system_prompt_preserved():
    """The system prompt stays first in the context after many messages are added."""
    manager = ContextManager(max_tokens=50, system_prompt="You are helpful")

    # Overfill the small context so pruning is triggered.
    filler = "test" * 20
    for _ in range(10):
        manager.add_user_message(filler)

    # The system prompt must still lead the message list.
    head = manager.messages[0]
    assert head.role == "system"
    assert head.content == "You are helpful"

2. Integration Tests

Test components working together while mocking external dependencies.

Mock LLM

# tests/mocks.py
class MockLLM:
    """Scripted stand-in for a real LLM client, for use in tests.

    Attributes:
        responses: Scripted replies, returned one per ``generate`` call in order.
        call_count: Number of times ``generate`` has been called.
        last_messages: The ``messages`` argument of the most recent call.
    """

    def __init__(self, responses: list = None):
        """Create the mock.

        Args:
            responses: Optional list of scripted replies. When omitted or
                empty, every call returns a fixed default reply.
        """
        self.responses = responses or []
        self.call_count = 0
        self.last_messages = None

    def generate(self, messages, tools=None):
        """Record the call and return the next scripted reply.

        Args:
            messages: Conversation passed by the caller; stored as-is in
                ``last_messages`` for later inspection.
            tools: Accepted for interface compatibility; not used by the mock.

        Returns:
            The scripted reply for this call, or the default
            ``"Mock response"`` reply when no replies were scripted.

        Raises:
            IndexError: If replies were scripted but more calls are made
                than replies were provided.
        """
        self.last_messages = messages
        self.call_count += 1

        if not self.responses:
            # Nothing scripted: always answer with the default reply.
            return {
                "message": {
                    "role": "assistant",
                    "content": "Mock response"
                }
            }

        if self.call_count > len(self.responses):
            # Same exception type as plain list indexing would raise, but with
            # a message that points at the test setup instead of the mock.
            raise IndexError(
                f"MockLLM exhausted: generate() call #{self.call_count} "
                f"but only {len(self.responses)} scripted response(s) were provided"
            )

        return self.responses[self.call_count - 1]

Test Agent Loop with Mock

# tests/test_agent_integration.py
import pytest
from agent import Agent
from tests.mocks import MockLLM

def test_simple_conversation():
    """One scripted reply is returned verbatim after exactly one LLM call."""
    # The mock is scripted with a single canned assistant reply.
    canned_reply = {
        "message": {
            "role": "assistant",
            "content": "Hello to you too",
        }
    }
    scripted_llm = MockLLM(responses=[canned_reply])

    answer = Agent(llm=scripted_llm).run("Hello")

    assert answer == "Hello to you too"
    assert scripted_llm.call_count == 1

def test_tool_calling_loop():
    """A scripted tool request followed by a final answer takes two LLM calls."""
    # Turn 1: the model asks for a tool instead of answering.
    tool_request = {
        "message": {"role": "assistant", "content": ""},
        "tool_calls": [
            {
                "id": "call_1",
                "function": {
                    "name": "read_file",
                    "arguments": '{"path": "test.txt"}',
                },
            }
        ],
    }
    # Turn 2: the model gives its final answer.
    final_answer = {
        "message": {
            "role": "assistant",
            "content": "The file says: mock content",
        }
    }
    scripted_llm = MockLLM(responses=[tool_request, final_answer])

    answer = Agent(llm=scripted_llm).run("What's in test.txt?")

    assert "mock content" in answer
    assert scripted_llm.call_count == 2  # one call per scripted turn

def test_max_turns_prevents_infinite_loop():
    """An LLM that always requests a tool must be cut off by ``max_turns``."""
    # The same tool request is served on every call, so the loop never
    # finishes on its own.
    endless_tool_request = {
        "message": {"role": "assistant", "content": ""},
        "tool_calls": [{
            "id": "call_1",
            "function": {"name": "read_file", "arguments": '{"path": "test.txt"}'},
        }],
    }
    scripted_llm = MockLLM(responses=[endless_tool_request for _ in range(20)])

    bounded_agent = Agent(llm=scripted_llm, max_turns=10)
    answer = bounded_agent.run("Test")

    lowered = answer.lower()
    assert "max turns" in lowered or "error" in lowered
    assert scripted_llm.call_count <= 10

3. End-to-End Tests

Test with real LLM (slower, use sparingly).

# tests/test_e2e.py
import pytest
from agent import Agent
from llm import OllamaLLM

@pytest.mark.e2e
@pytest.mark.slow
def test_real_llm_simple_task():
    """End-to-end arithmetic question against real Ollama (requires Ollama running)."""
    real_llm = OllamaLLM(model="llama3.2:3b")  # Use smallest model
    answer = Agent(llm=real_llm).run("What is 2 + 2?")

    # Loose assertion: wording varies, but the digit should appear.
    assert "4" in answer

@pytest.mark.e2e
@pytest.mark.slow
def test_file_operations_e2e(tmp_path):
    """Test agent can actually create a file on disk via a real LLM.

    The agent's file access is configured with ``safe_dir=tmp_path``, and the
    file is looked up there afterwards.
    """
    llm = OllamaLLM(model="llama3.2:3b")
    agent = Agent(llm=llm, safe_dir=tmp_path)

    # Ask agent to create a file; the textual reply is irrelevant here,
    # only the side effect on disk is checked.
    agent.run("Create a file called hello.txt with the text 'Hello world'")

    # Verify file was created
    created = tmp_path / "hello.txt"
    assert created.exists()
    # Loose assertion (LLM output varies): the model may add a trailing
    # newline or similar, so check containment rather than exact equality.
    assert "Hello world" in created.read_text()

Running E2E tests:

# Skip E2E tests by default (too slow) - relies on the -m filter in pytest.ini addopts
pytest

# Run only E2E tests
pytest -m e2e

# Run all tests including slow ones (an empty -m clears the default filter)
pytest -m ""

4. Testing Non-Deterministic Output

LLM output is non-deterministic, so exact-match assertions are fragile. How do you test it?

Strategy 1: Test Structure, Not Content

def test_response_structure():
    """Check that key elements are present instead of matching exact output."""
    # NOTE(review): `agent` is assumed to be an Agent already in scope
    # (e.g. module-level or a fixture) - confirm when adapting this snippet.
    response = agent.run("Write a Python hello world")
    lowered = response.lower()

    # Don't check exact code, check presence of key elements
    assert "print" in lowered
    assert "hello" in lowered

    # Alternative structural check - use it INSTEAD of the asserts above when
    # the prompt asks for a function, since a plain hello world has no `def`:
    # assert response.count("def") >= 1  # Has a function definition

Strategy 2: Use Temperature 0

# Temperature 0 is used here to make the model's answers more repeatable.
llm = OllamaLLM(model="llama3.3", temperature=0.0)  # Deterministic
agent = Agent(llm=llm)

response = agent.run("What is 2 + 2?")
# NOTE(review): an exact-match assertion may still break if the model or its
# version changes - confirm the expected string against the model in use.
assert response == "2 + 2 equals 4."  # More predictable

Strategy 3: Mock LLM for Exact Tests

# Use mock for testing logic, real LLM only for manual validation
# Replace the `...` placeholder with the scripted replies the test needs.
mock_llm = MockLLM(responses=[...])  # Known responses
agent = Agent(llm=mock_llm)

5. Testing Safety Features

Test Command Whitelist

# tests/test_safety.py
from tools.shell import ShellTool

def test_dangerous_commands_blocked():
    """Every destructive command must be rejected with a "blocked" error."""
    shell = ShellTool()

    destructive_commands = [
        "rm -rf /",
        "del /f *",
        "format c:",
        "dd if=/dev/zero of=/dev/sda",
    ]

    for command_line in destructive_commands:
        outcome = shell.execute(command=command_line)
        assert not outcome.success
        error_text = outcome.error.lower()
        assert "blocked" in error_text

def test_allowed_commands_work():
    """A harmless command runs and its output is captured."""
    outcome = ShellTool().execute(command="echo hello")

    assert outcome.success
    assert "hello" in outcome.output

Test Path Restrictions

def test_cannot_access_system_files():
    """Reads of sensitive system locations must be refused with a "denied" error."""
    reader = ReadFileTool()

    forbidden_paths = [
        "/etc/passwd",
        "C:\\Windows\\System32\\config\\SAM",
        "../../../root/.ssh/id_rsa",
    ]

    for candidate in forbidden_paths:
        outcome = reader.execute(path=candidate)
        assert not outcome.success
        error_text = outcome.error.lower()
        assert "denied" in error_text

6. Test Organization

tests/
├── conftest.py          # Shared fixtures
├── test_tools.py        # Unit tests for tools
├── test_context.py      # Unit tests for context
├── test_llm.py          # Unit tests for LLM interface
├── test_agent_integration.py  # Integration tests
├── test_safety.py       # Security tests
├── test_e2e.py          # End-to-end tests (slow)
└── mocks.py             # Mock objects

Shared Fixtures

# tests/conftest.py
import pytest
from pathlib import Path

@pytest.fixture
def temp_dir(tmp_path):
    """Per-test scratch directory for file operations (pytest's ``tmp_path``)."""
    scratch_dir = tmp_path
    return scratch_dir

@pytest.fixture
def mock_llm():
    """Fresh MockLLM with no scripted responses."""
    # Imported inside the fixture, as in the rest of this conftest.
    from tests.mocks import MockLLM

    llm_double = MockLLM()
    return llm_double

@pytest.fixture
def agent(mock_llm, temp_dir):
    """Agent wired to the mock LLM, with ``safe_dir`` set to the temp directory."""
    from agent import Agent

    agent_under_test = Agent(llm=mock_llm, safe_dir=temp_dir)
    return agent_under_test

7. Continuous Testing

pytest Configuration

# pytest.ini
[pytest]
# Markers used by the test suite (applied with @pytest.mark.<name>).
markers =
    e2e: End-to-end tests with real LLM (slow)
    slow: Slow tests
    unit: Fast unit tests

testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*

# Default: skip e2e and slow tests.
# Pass your own -m on the command line (e.g. `pytest -m e2e`) to select them.
addopts = -v --tb=short -m "not e2e and not slow"

Run Tests on Save

# Install pytest-watch
pip install pytest-watch

# Auto-run tests on file changes
# Only tests decorated with @pytest.mark.unit are selected; the examples in
# this guide do not apply that marker, so add it to your unit tests first.
ptw -- -m "unit"

8. Manual Testing Checklist

Some things need human verification:

Before each release:

  • Can create a simple file
  • Can read an existing file
  • Handles non-existent files gracefully
  • Can run a safe shell command
  • Blocks dangerous shell commands
  • Multi-turn conversation works
  • Context doesn't overflow on long conversation
  • Session save/load works
  • Error messages are clear and helpful

9. Performance Testing

# tests/test_performance.py
import time
import pytest

@pytest.mark.slow
def test_response_time(agent):
    """Ensure agent responds within reasonable time.

    Args:
        agent: Agent instance supplied by the ``agent`` fixture.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    agent.run("What is 2 + 2?")
    duration = time.perf_counter() - start

    assert duration < 10.0  # Should respond within 10 seconds

def test_context_size_scaling(agent):
    """Test performance with large context.

    Args:
        agent: Agent instance supplied by the ``agent`` fixture.
    """
    # Add many messages before timing, so only the run itself is measured.
    for i in range(100):
        agent.context.add_user_message(f"Message {i}")

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    agent.run("Summary?")
    duration = time.perf_counter() - start

    assert duration < 30.0  # Should handle large context

10. Test Coverage

# Install coverage
pip install pytest-cov

# Run tests with coverage
# (assumes the code under test lives in src/ - adjust --cov to your layout)
pytest --cov=src --cov-report=html

# View report
# (`open` is the macOS launcher; on Linux use xdg-open instead)
open htmlcov/index.html

# Aim for:
# - 80%+ coverage on tools
# - 70%+ on agent logic
# - 60%+ on context management
# Don't obsess over 100%

Best Practices

  1. Test behavior, not implementation - If you refactor, tests should still pass
  2. Use fixtures - DRY principle applies to tests too
  3. Test edge cases - Empty files, long files, special characters
  4. Mock external dependencies - Don't rely on Ollama being available for unit tests
  5. Keep tests fast - Slow test suites don't get run
  6. Test error handling - Test failure paths, not just happy paths
  7. Document test intent - Use clear names and docstrings

Next Steps

  1. Set up pytest and basic test structure (Phase 1)
  2. Write tests as you write code (TDD style)
  3. Run tests before committing changes
  4. Add E2E tests once core is stable (Phase 4)
  5. Use coverage to find untested code

Remember: Tests are documentation of how your agent should behave. Future-you will thank present-you for writing them.