|
16 | 16 |
|
17 | 17 | from agent_debugger_sdk.core.context import configure_event_pipeline |
18 | 18 | from agent_debugger_sdk.core.events import Checkpoint, Session, TraceEvent |
19 | | -from benchmarks import DEFAULT_SEED_SESSION_IDS, iter_seed_scenarios |
| 19 | +from benchmarks import ( |
| 20 | + DEFAULT_SEED_SESSION_IDS, |
| 21 | + SESSION_ENRICHMENT, |
| 22 | + iter_seed_scenarios, |
| 23 | + validate_session_enrichment, |
| 24 | +) |
20 | 25 | from collector.buffer import get_event_buffer |
21 | 26 | from collector.server import configure_storage |
22 | 27 | from storage import Base, TraceRepository |
23 | 28 | from storage.models import AnomalyAlertModel |
24 | 29 |
|
25 | 30 | DATABASE_URL = os.environ.get("AGENT_DEBUGGER_DB_URL", "sqlite+aiosqlite:///./data/agent_debugger.db") |
26 | 31 |
|
27 | | -# Session enrichment data: realistic values for demo sessions |
28 | | -# Note: failure_count is computed in API layer (services.py) as errors count |
29 | | -# behavior_alert_count is computed in API layer from AnomalyAlertModel records |
30 | | -def validate_session_enrichment(session_id: str, enrichment: dict[str, object]) -> None: |
31 | | - """Validate curated enrichment metrics for demo seed sessions.""" |
32 | | - total_tokens = enrichment.get("total_tokens") |
33 | | - total_cost_usd = enrichment.get("total_cost_usd") |
34 | | - |
35 | | - if not isinstance(total_tokens, int) or total_tokens <= 0: |
36 | | - raise ValueError(f"Seed enrichment for {session_id} must define positive total_tokens") |
37 | | - |
38 | | - if not isinstance(total_cost_usd, (int, float)) or float(total_cost_usd) <= 0: |
39 | | - raise ValueError(f"Seed enrichment for {session_id} must define positive total_cost_usd") |
40 | | - |
41 | | - |
42 | | -SESSION_ENRICHMENT = { |
43 | | - "seed-prompt-injection": { |
44 | | - "total_tokens": 856, |
45 | | - "total_cost_usd": 0.0042, |
46 | | - "retention_tier": "summarized", |
47 | | - "fix_note": "Added input sanitization and prompt boundary checks", |
48 | | - "errors": 0, |
49 | | - "behavior_alerts": 1, |
50 | | - }, |
51 | | - "seed-evidence-grounding": { |
52 | | - "total_tokens": 140, |
53 | | - "total_cost_usd": 0.0021, |
54 | | - "retention_tier": "summarized", |
55 | | - "fix_note": None, |
56 | | - "errors": 0, |
57 | | - "behavior_alerts": 0, |
58 | | - }, |
59 | | - "seed-multi-agent-dialogue": { |
60 | | - "total_tokens": 412, |
61 | | - "total_cost_usd": 0.0038, |
62 | | - "retention_tier": "summarized", |
63 | | - "fix_note": None, |
64 | | - "errors": 0, |
65 | | - "behavior_alerts": 0, |
66 | | - }, |
67 | | - "seed-prompt-policy-shift": { |
68 | | - "total_tokens": 164, |
69 | | - "total_cost_usd": 0.0028, |
70 | | - "retention_tier": "summarized", |
71 | | - "fix_note": "Added policy consistency checks across turns", |
72 | | - "errors": 0, |
73 | | - "behavior_alerts": 1, |
74 | | - }, |
75 | | - "seed-safety-escalation": { |
76 | | - "total_tokens": 1987, |
77 | | - "total_cost_usd": 0.0142, |
78 | | - "retention_tier": "full", |
79 | | - "fix_note": "Added output validation after tool call", |
80 | | - "errors": 1, |
81 | | - "behavior_alerts": 1, |
82 | | - }, |
83 | | - "seed-looping-behavior": { |
84 | | - "total_tokens": 1245, |
85 | | - "total_cost_usd": 0.0089, |
86 | | - "retention_tier": "summarized", |
87 | | - "fix_note": "Added max iteration limit with circuit breaker", |
88 | | - "errors": 0, |
89 | | - "behavior_alerts": 2, |
90 | | - }, |
91 | | - "seed-failure-cluster": { |
92 | | - "total_tokens": 1567, |
93 | | - "total_cost_usd": 0.0112, |
94 | | - "retention_tier": "full", |
95 | | - "fix_note": "Added pre-call validation and error recovery", |
96 | | - "errors": 0, |
97 | | - "behavior_alerts": 1, |
98 | | - }, |
99 | | - "seed-replay-determinism": { |
100 | | - "total_tokens": 289, |
101 | | - "total_cost_usd": 0.0031, |
102 | | - "retention_tier": "summarized", |
103 | | - "fix_note": None, |
104 | | - "errors": 0, |
105 | | - "behavior_alerts": 0, |
106 | | - }, |
107 | | -} |
108 | | - |
109 | 32 |
|
110 | 33 | def validate_session_metrics(total_tokens: int, total_cost_usd: float, *, context: str) -> None: |
111 | 34 | """Validate curated session metrics before persisting demo seed data.""" |
|
0 commit comments