|
| 1 | +# Rogue Agent Evaluator Python SDK |
| 2 | + |
| 3 | +A comprehensive Python SDK for interacting with the Rogue Agent Evaluator API. |
| 4 | + |
| 5 | +## Installation |
| 6 | + |
| 7 | +```bash |
| 8 | +pip install rogue-sdk |
| 9 | +``` |
| 10 | + |
| 11 | +## Quick Start |
| 12 | + |
| 13 | +```python |
| 14 | +import asyncio |
| 15 | +from rogue_sdk import RogueSDK, RogueClientConfig
| 16 | + |
| 17 | +async def main(): |
| 18 | + # Configure the SDK |
| 19 | + config = RogueClientConfig(base_url="http://localhost:8000") |
| 20 | + |
| 21 | + async with RogueSDK(config) as client: |
| 22 | + # Quick evaluation |
| 23 | + result = await client.quick_evaluate( |
| 24 | + agent_url="http://localhost:3000", |
| 25 | + scenarios=[ |
| 26 | + "The agent should be polite", |
| 27 | + "The agent should not give discounts" |
| 28 | + ] |
| 29 | + ) |
| 30 | + |
| 31 | + print(f"Evaluation completed: {result.status}") |
| 32 | + print(f"Results: {len(result.results)} scenarios evaluated") |
| 33 | + |
| 34 | +if __name__ == "__main__": |
| 35 | + asyncio.run(main()) |
| 36 | +``` |
| 37 | + |
| 38 | +## Features |
| 39 | + |
| 40 | +- **HTTP Client**: Full REST API support with automatic retries |
| 41 | +- **WebSocket Client**: Real-time updates during evaluations |
| 42 | +- **Type Safety**: Comprehensive type definitions with Pydantic |
| 43 | +- **Async/Await**: Modern Python async support |
| 44 | +- **Error Handling**: Robust error handling and retry logic |
| 45 | +- **High-level Methods**: Convenient methods for common operations |
| 46 | + |
| 47 | +## API Reference |
| 48 | + |
| 49 | +### RogueSDK |
| 50 | + |
| 51 | +Main SDK class that combines HTTP and WebSocket functionality. |
| 52 | + |
| 53 | +#### Configuration |
| 54 | + |
| 55 | +```python |
| 56 | +from rogue_sdk import RogueClientConfig |
| 57 | + |
| 58 | +config = RogueClientConfig( |
| 59 | + base_url="http://localhost:8000", |
| 60 | + api_key="your-api-key", # Optional |
| 61 | + timeout=30.0, # Request timeout in seconds |
| 62 | + retries=3 # Number of retry attempts |
| 63 | +) |
| 64 | +``` |
| 65 | + |
| 66 | +#### Basic Operations |
| 67 | + |
| 68 | +```python |
| 69 | +async with RogueSDK(config) as client: |
| 70 | + # Health check |
| 71 | + health = await client.health() |
| 72 | + |
| 73 | + # Create evaluation |
| 74 | + response = await client.create_evaluation(request) |
| 75 | + |
| 76 | + # Get evaluation status |
| 77 | + job = await client.get_evaluation(job_id) |
| 78 | + |
| 79 | + # List evaluations |
| 80 | + jobs = await client.list_evaluations() |
| 81 | + |
| 82 | + # Cancel evaluation |
| 83 | + await client.cancel_evaluation(job_id) |
| 84 | +``` |
| 85 | + |
| 86 | +#### Real-time Updates |
| 87 | + |
| 88 | +```python |
| 89 | +async def on_update(job): |
| 90 | + print(f"Job {job.job_id}: {job.status} ({job.progress:.1%})") |
| 91 | + |
| 92 | +async def on_chat(chat_data): |
| 93 | + print(f"Chat: {chat_data}") |
| 94 | + |
| 95 | +# Run evaluation with real-time updates |
| 96 | +result = await client.run_evaluation_with_updates( |
| 97 | + request=evaluation_request, |
| 98 | + on_update=on_update, |
| 99 | + on_chat=on_chat |
| 100 | +) |
| 101 | +``` |
| 102 | + |
| 103 | +### Data Models |
| 104 | + |
| 105 | +#### AgentConfig |
| 106 | + |
| 107 | +```python |
| 108 | +from rogue_sdk.types import AgentConfig, AuthType |
| 109 | + |
| 110 | +agent_config = AgentConfig( |
| 111 | + evaluated_agent_url="http://localhost:3000", |
| 112 | + evaluated_agent_auth_type=AuthType.NO_AUTH, |
| 113 | + judge_llm="openai/gpt-4o-mini", |
| 114 | + interview_mode=True, |
| 115 | + deep_test_mode=False, |
| 116 | + parallel_runs=1 |
| 117 | +) |
| 118 | +``` |
| 119 | + |
| 120 | +#### Scenario |
| 121 | + |
| 122 | +```python |
| 123 | +from rogue_sdk.types import Scenario, ScenarioType |
| 124 | + |
| 125 | +scenario = Scenario( |
| 126 | + scenario="The agent should be polite", |
| 127 | + scenario_type=ScenarioType.POLICY, |
| 128 | + expected_outcome="Agent responds politely" |
| 129 | +) |
| 130 | +``` |
| 131 | + |
| 132 | +#### EvaluationRequest |
| 133 | + |
| 134 | +```python |
| 135 | +from rogue_sdk.types import EvaluationRequest |
| 136 | + |
| 137 | +request = EvaluationRequest( |
| 138 | + agent_config=agent_config, |
| 139 | + scenarios=[scenario], |
| 140 | + max_retries=3, |
| 141 | + timeout_seconds=300 |
| 142 | +) |
| 143 | +``` |
| 144 | + |
| 145 | +## Advanced Usage |
| 146 | + |
| 147 | +### Custom HTTP Client |
| 148 | + |
| 149 | +```python |
| 150 | +from rogue_sdk import RogueHttpClient |
| 151 | + |
| 152 | +async with RogueHttpClient(config) as http_client: |
| 153 | + health = await http_client.health() |
| 154 | + response = await http_client.create_evaluation(request) |
| 155 | +``` |
| 156 | + |
| 157 | +### WebSocket Client |
| 158 | + |
| 159 | +```python |
| 160 | +from rogue_sdk import RogueWebSocketClient |
| 161 | + |
| 162 | +ws_client = RogueWebSocketClient("http://localhost:8000", job_id) |
| 163 | + |
| 164 | +def handle_update(event, data): |
| 165 | + print(f"Update: {data}") |
| 166 | + |
| 167 | +ws_client.on('job_update', handle_update) |
| 168 | +await ws_client.connect() |
| 169 | +``` |
| 170 | + |
| 171 | +### Error Handling |
| 172 | + |
| 173 | +```python |
| 174 | +from rogue_sdk.types import EvaluationStatus |
| 175 | + |
| 176 | +try: |
| 177 | + result = await client.quick_evaluate(agent_url, scenarios) |
| 178 | + |
| 179 | + if result.status == EvaluationStatus.COMPLETED: |
| 180 | + print("Evaluation successful!") |
| 181 | + elif result.status == EvaluationStatus.FAILED: |
| 182 | + print(f"Evaluation failed: {result.error_message}") |
| 183 | + |
| 184 | +except TimeoutError: |
| 185 | + print("Evaluation timed out") |
| 186 | +except Exception as e: |
| 187 | + print(f"Error: {e}") |
| 188 | +``` |
| 189 | + |
| 190 | +## Examples |
| 191 | + |
| 192 | +### Basic Evaluation |
| 193 | + |
| 194 | +```python |
| 195 | +import asyncio |
| 196 | +from rogue_sdk import RogueSDK, RogueClientConfig |
| 197 | + |
| 198 | +async def basic_evaluation(): |
| 199 | + config = RogueClientConfig(base_url="http://localhost:8000") |
| 200 | + |
| 201 | + async with RogueSDK(config) as client: |
| 202 | + result = await client.quick_evaluate( |
| 203 | + agent_url="http://localhost:3000", |
| 204 | + scenarios=["Be helpful and polite"] |
| 205 | + ) |
| 206 | + |
| 207 | + for scenario_result in result.results: |
| 208 | + print(f"Scenario: {scenario_result.scenario.scenario}") |
| 209 | + print(f"Passed: {scenario_result.passed}") |
| 210 | + for conv in scenario_result.conversations: |
| 211 | + print(f" Conversation passed: {conv.passed}") |
| 212 | + print(f" Reason: {conv.reason}") |
| 213 | + |
| 214 | +asyncio.run(basic_evaluation()) |
| 215 | +``` |
| 216 | + |
| 217 | +### Advanced Evaluation with Real-time Updates |
| 218 | + |
| 219 | +```python |
| 220 | +import asyncio |
| 221 | +from rogue_sdk import RogueSDK, RogueClientConfig |
| 222 | +from rogue_sdk.types import AgentConfig, Scenario, EvaluationRequest, AuthType, ScenarioType |
| 223 | + |
| 224 | +async def advanced_evaluation(): |
| 225 | + config = RogueClientConfig(base_url="http://localhost:8000") |
| 226 | + |
| 227 | + # Configure agent |
| 228 | + agent_config = AgentConfig( |
| 229 | + evaluated_agent_url="http://localhost:3000", |
| 230 | + evaluated_agent_auth_type=AuthType.API_KEY, |
| 231 | + evaluated_agent_credentials="your-agent-api-key", |
| 232 | + judge_llm="openai/gpt-4o-mini", |
| 233 | + deep_test_mode=True |
| 234 | + ) |
| 235 | + |
| 236 | + # Define scenarios |
| 237 | + scenarios = [ |
| 238 | + Scenario( |
| 239 | + scenario="Don't reveal sensitive information", |
| 240 | + scenario_type=ScenarioType.POLICY, |
| 241 | + expected_outcome="Agent refuses to share sensitive data" |
| 242 | + ), |
| 243 | + Scenario( |
| 244 | + scenario="Be helpful with customer inquiries", |
| 245 | + scenario_type=ScenarioType.POLICY, |
| 246 | + expected_outcome="Agent provides helpful responses" |
| 247 | + ) |
| 248 | + ] |
| 249 | + |
| 250 | + request = EvaluationRequest( |
| 251 | + agent_config=agent_config, |
| 252 | + scenarios=scenarios, |
| 253 | + max_retries=3, |
| 254 | + timeout_seconds=600 |
| 255 | + ) |
| 256 | + |
| 257 | + async with RogueSDK(config) as client: |
| 258 | + def on_update(job): |
| 259 | + print(f"Progress: {job.progress:.1%} - Status: {job.status}") |
| 260 | + |
| 261 | + def on_chat(chat_data): |
| 262 | + role = chat_data.get('role', 'Unknown') |
| 263 | + content = chat_data.get('content', '') |
| 264 | + print(f"{role}: {content[:100]}...") |
| 265 | + |
| 266 | + result = await client.run_evaluation_with_updates( |
| 267 | + request=request, |
| 268 | + on_update=on_update, |
| 269 | + on_chat=on_chat, |
| 270 | + timeout=600.0 |
| 271 | + ) |
| 272 | + |
| 273 | + print(f"\nEvaluation completed: {result.status}") |
| 274 | + if result.results: |
| 275 | + passed_scenarios = sum(1 for r in result.results if r.passed) |
| 276 | + total_scenarios = len(result.results) |
| 277 | + print(f"Results: {passed_scenarios}/{total_scenarios} scenarios passed") |
| 278 | + |
| 279 | +asyncio.run(advanced_evaluation()) |
| 280 | +``` |
| 281 | + |
| 282 | +## Development |
| 283 | + |
| 284 | +### Running Tests |
| 285 | + |
| 286 | +```bash |
| 287 | +python -m pytest tests/ |
| 288 | +``` |
| 289 | + |
| 290 | +### Type Checking |
| 291 | + |
| 292 | +```bash |
| 293 | +python -m mypy rogue_sdk/ |
| 294 | +``` |
| 295 | + |
| 296 | +### Code Formatting |
| 297 | + |
| 298 | +```bash |
| 299 | +python -m black rogue_sdk/ |
| 300 | +python -m flake8 rogue_sdk/ |
| 301 | +``` |
| 302 | + |
| 303 | +## License |
| 304 | + |
| 305 | +This project is licensed under a custom license - see the [LICENSE](LICENSE.md) file for details.
| 306 | +In short: you may use this software freely and forever, but you are not allowed to host and sell it.
| 307 | + |
| 308 | +If you have any questions about the license or commercial use of this project, please email `admin@qualifire.ai`
0 commit comments