Skip to content

Commit adaa614

Browse files
author
amabito
committed
feat(evaluators): add built-in budget evaluator for per-agent cost tracking
Closes #130

Add BudgetEvaluator -- a deterministic evaluator that tracks cumulative LLM token and cost usage per agent, per channel, per user, with configurable time windows (daily/weekly/monthly/cumulative).

Components:
- BudgetStore protocol + InMemoryBudgetStore (dict + threading.Lock)
- BudgetSnapshot frozen dataclass for atomic state reads
- BudgetEvaluator with scope key building, period key derivation, token extraction, and optional model pricing estimation
- BudgetLimitRule config with scope, per, window, limit_usd, limit_tokens
- 48 tests covering store, config, evaluator, registration

Design:
- In-memory only (no PostgreSQL, no new dependencies)
- Store is "dumb" (accumulate + check), evaluator is "smart" (resolve scope, derive period, extract tokens, check limits)
- record_and_check() is atomic (single lock acquisition)
- Evaluator instances are cached per config (thread-safe by design)
- matched=True only when limit exceeded, confidence=1.0 always
- Utilization ratio in metadata, not confidence
1 parent 1a6f731 commit adaa614

7 files changed

Lines changed: 1015 additions & 0 deletions

File tree

evaluators/builtin/src/agent_control_evaluators/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
- list: List-based value matching
1010
- json: JSON validation
1111
- sql: SQL query validation
12+
- budget: Cumulative LLM token and cost tracking
1213
1314
Naming convention:
1415
- Built-in: "regex", "list", "json", "sql", "budget"
@@ -47,6 +48,13 @@
4748
from agent_control_evaluators.json import JSONEvaluator, JSONEvaluatorConfig
4849
from agent_control_evaluators.list import ListEvaluator, ListEvaluatorConfig
4950
from agent_control_evaluators.regex import RegexEvaluator, RegexEvaluatorConfig
51+
from agent_control_evaluators.budget import (
52+
BudgetEvaluator,
53+
BudgetEvaluatorConfig,
54+
BudgetSnapshot,
55+
BudgetStore,
56+
InMemoryBudgetStore,
57+
)
5058
from agent_control_evaluators.sql import SQLEvaluator, SQLEvaluatorConfig
5159

5260
__all__ = [
@@ -73,4 +81,9 @@
7381
"JSONEvaluatorConfig",
7482
"SQLEvaluator",
7583
"SQLEvaluatorConfig",
84+
"BudgetEvaluator",
85+
"BudgetEvaluatorConfig",
86+
"BudgetSnapshot",
87+
"BudgetStore",
88+
"InMemoryBudgetStore",
7689
]
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
"""Budget evaluator for per-agent LLM cost and token tracking."""
2+
3+
from agent_control_evaluators.budget.config import BudgetEvaluatorConfig
4+
from agent_control_evaluators.budget.evaluator import BudgetEvaluator
5+
from agent_control_evaluators.budget.store import (
6+
BudgetSnapshot,
7+
BudgetStore,
8+
InMemoryBudgetStore,
9+
)
10+
11+
__all__ = [
12+
"BudgetEvaluator",
13+
"BudgetEvaluatorConfig",
14+
"BudgetSnapshot",
15+
"BudgetStore",
16+
"InMemoryBudgetStore",
17+
]
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""Configuration for the budget evaluator."""
2+
3+
from __future__ import annotations
4+
5+
from typing import Any, Literal
6+
7+
from pydantic import Field, field_validator, model_validator
8+
9+
from agent_control_evaluators._base import EvaluatorConfig
10+
11+
12+
class BudgetLimitRule(EvaluatorConfig):
    """A single budget limit rule.

    Each rule defines a ceiling (USD and/or tokens) for a combination
    of scope dimensions and time window. Multiple rules can apply to
    the same step -- the evaluator checks all of them and triggers
    on the first breach.

    Attributes:
        scope: Static scope dimensions that must match for this rule
            to apply. Empty dict = global rule.
        per: If set, the limit is applied independently for each unique
            value of this metadata field (e.g. "user_id" creates per-user
            budgets within the scope).
        window: Time window for accumulation. None = cumulative (no reset).
        limit_usd: Maximum USD spend in the window. None = uncapped.
            Must be a positive, finite number when set.
        limit_tokens: Maximum tokens in the window. None = uncapped.
            Must be positive when set.

    Raises:
        ValueError: From the validators below (reported by pydantic as a
            validation error) when neither limit is set, when a limit is
            not positive, or when ``limit_usd`` is NaN or infinite.
    """

    scope: dict[str, str] = Field(default_factory=dict)
    per: str | None = None
    window: Literal["daily", "weekly", "monthly"] | None = None
    limit_usd: float | None = None
    limit_tokens: int | None = None

    @model_validator(mode="after")
    def at_least_one_limit(self) -> "BudgetLimitRule":
        """Reject a rule that caps neither USD spend nor token count."""
        if self.limit_usd is None and self.limit_tokens is None:
            raise ValueError("At least one of limit_usd or limit_tokens must be set")
        return self

    @field_validator("limit_usd")
    @classmethod
    def validate_limit_usd(cls, v: float | None) -> float | None:
        """Require ``limit_usd`` to be a positive, finite number when set."""
        if v is None:
            return v
        # Checked first so zero, negatives and -inf keep the original message.
        if v <= 0:
            raise ValueError("limit_usd must be positive")
        # NaN compares False against everything (so it slips past `v <= 0`)
        # and +inf can never be exceeded; either would leave a rule that
        # looks capped but has no usable ceiling. `v != v` is True only
        # for NaN.
        if v != v or v == float("inf"):
            raise ValueError("limit_usd must be a finite number")
        return v

    @field_validator("limit_tokens")
    @classmethod
    def validate_limit_tokens(cls, v: int | None) -> int | None:
        """Require ``limit_tokens`` to be a positive integer when set."""
        if v is not None and v <= 0:
            raise ValueError("limit_tokens must be positive")
        return v
56+
57+
58+
class BudgetEvaluatorConfig(EvaluatorConfig):
    """Top-level settings for the budget evaluator.

    Attributes:
        limits: One or more budget limit rules; at least one is required.
            Every rule is checked independently of the others.
        pricing: Optional model pricing table keyed by model name, each
            entry holding per-1K token rates. Used to derive cost_usd
            from token counts when step data does not carry a cost.
        token_path: Dot-notation path to the token usage inside step data
            (e.g. "usage.total_tokens"). When None, the standard fields
            (input_tokens, output_tokens, total_tokens, usage) are used.
        cost_path: Dot-notation path to the cost inside step data.
        model_path: Dot-notation path to the model name, used for the
            pricing lookup.
        metadata_paths: Maps a metadata field name to its dot-notation
            path in step data. Used to extract scope dimensions
            (channel, user_id, etc).
    """

    # Budget rules; the list must not be empty.
    limits: list[BudgetLimitRule] = Field(min_length=1)

    # Optional cost estimation table (model name -> rate name -> rate).
    pricing: dict[str, dict[str, float]] | None = None

    # Optional dot-notation extraction paths into step data.
    token_path: str | None = None
    cost_path: str | None = None
    model_path: str | None = None

    # Metadata field name -> dot-notation path; empty by default.
    metadata_paths: dict[str, str] = Field(default_factory=dict)

0 commit comments

Comments
 (0)