Skip to content

Commit cd473e8

Browse files
author
amabito
committed
feat(budget): address R3 review -- async ABC, TTL prune, defensive guards
Respond to lan17's R3 review on PR #144 with the mechanical items that do not depend on pending config-layer decisions (limit model, budget_id, unknown_model_behavior). Changes: - Migrate BudgetStore from Protocol to async ABC with __init_subclass__ guard that walks the MRO to reject sync overrides at class creation - InMemoryBudgetStore: async wrapper around sync helper, threading.Lock retained for CPU-bound critical section - TTL prune for stale period buckets on rollover, runs before max_buckets capacity check so rollover at capacity reclaims space - Monotonic prune watermark (rejects backwards clock) - _compute_utilization low-side clamp to [0.0, 1.0] (refund semantic) - Defensive guards: NaN/Inf cost and clock coerced to 0.0, negative token counts clamped to 0 - Revert root pyproject.toml (remove unrelated [dependency-groups], restore version 7.3.1) - Remove clear_budget_stores from __all__ (testing utility) - Document token attribution intent (single int -> output-only) Tests: 67 -> 91 (24 new: async migration, TTL prune coverage, adversarial guards, ABC contract enforcement)
1 parent e37d86b commit cd473e8

6 files changed

Lines changed: 1185 additions & 152 deletions

File tree

evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
from agent_control_evaluator_budget.budget.memory_store import InMemoryBudgetStore
66
from agent_control_evaluator_budget.budget.store import BudgetSnapshot, BudgetStore
77

8+
# Note: clear_budget_stores is a testing utility and is intentionally not
9+
# re-exported here. Import it directly from the evaluator submodule in tests:
10+
# from agent_control_evaluator_budget.budget.evaluator import clear_budget_stores
11+
812
__all__ = [
913
"BudgetEvaluator",
1014
"BudgetEvaluatorConfig",

evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/evaluator.py

Lines changed: 86 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,19 @@
33
Deterministic evaluator: confidence is always 1.0, matched is True when
44
any configured limit is exceeded. Utilization ratio and spend breakdown
55
are returned in result metadata, not in confidence.
6+
7+
The evaluator is stateless. Budget state lives in a module-level store
8+
registry, independent of the evaluator instance cache in _factory.py.
9+
This prevents silent state loss on LRU eviction and avoids cross-control
10+
leakage when different controls share the same config.
611
"""
712

813
from __future__ import annotations
914

15+
import json
1016
import logging
1117
import math
18+
import threading
1219
from typing import Any
1320

1421
from agent_control_evaluators._base import Evaluator, EvaluatorMetadata
@@ -17,9 +24,61 @@
1724

1825
from .config import BudgetEvaluatorConfig
1926
from .memory_store import InMemoryBudgetStore
27+
from .store import BudgetStore
2028

2129
logger = logging.getLogger(__name__)
2230

31+
# ---------------------------------------------------------------------------
32+
# Module-level store registry
33+
#
34+
# Decoupled from the evaluator instance cache so that LRU eviction in
35+
# _factory.py does not destroy accumulated budget state. The registry
36+
# is keyed by a stable config hash. Two controls with identical config
37+
# intentionally share a budget pool (same config = same budget).
38+
# ---------------------------------------------------------------------------
39+
40+
# NOTE: The registry is unbounded. In practice a deployment has a finite
41+
# set of budget configs. If dynamic config generation becomes a concern,
42+
# add a max-size cap with LRU eviction here.
43+
_STORE_REGISTRY: dict[str, BudgetStore] = {}
44+
_STORE_REGISTRY_LOCK = threading.Lock()
45+
46+
47+
def _config_key(config: BudgetEvaluatorConfig) -> str:
48+
"""Build a stable key for the store registry from evaluator config.
49+
50+
The limits list is sorted before hashing so that two configs with
51+
semantically identical rules in different order share a store.
52+
"""
53+
config_dict = config.model_dump(mode="json")
54+
config_dict["limits"] = sorted(
55+
config_dict["limits"],
56+
key=lambda r: json.dumps(r, sort_keys=True, default=str),
57+
)
58+
return f"budget:{json.dumps(config_dict, sort_keys=True, default=str)}"
59+
60+
61+
def get_or_create_store(config: BudgetEvaluatorConfig) -> BudgetStore:
62+
"""Get or create a store for the given config, thread-safe."""
63+
key = _config_key(config)
64+
with _STORE_REGISTRY_LOCK:
65+
store = _STORE_REGISTRY.get(key)
66+
if store is None:
67+
store = InMemoryBudgetStore(rules=config.limits)
68+
_STORE_REGISTRY[key] = store
69+
return store
70+
71+
72+
def clear_budget_stores() -> None:
73+
"""Clear all budget stores. Useful for testing."""
74+
with _STORE_REGISTRY_LOCK:
75+
_STORE_REGISTRY.clear()
76+
77+
78+
# ---------------------------------------------------------------------------
79+
# Helpers
80+
# ---------------------------------------------------------------------------
81+
2382

2483
def _extract_by_path(data: Any, path: str) -> Any:
2584
"""Extract a value from nested data using dot-notation path."""
@@ -50,6 +109,10 @@ def _extract_tokens(data: Any, token_path: str | None) -> tuple[int, int]:
50109
if token_path:
51110
val = _extract_by_path(data, token_path)
52111
if isinstance(val, int) and not isinstance(val, bool) and val >= 0:
112+
# When token_path resolves to a single int we cannot distinguish
113+
# input vs output. Attribute the whole count to output because
114+
# output rates are typically higher than input rates in pricing
115+
# tables, so this over-estimates cost rather than under-estimates.
53116
return 0, val
54117
if isinstance(val, dict):
55118
data = val
@@ -78,19 +141,23 @@ def _estimate_cost(
78141
input_tokens: int,
79142
output_tokens: int,
80143
pricing: dict[str, dict[str, float]] | None,
81-
) -> int:
82-
"""Estimate cost in minor units from model pricing table. Returns 0 if unknown."""
144+
) -> float:
145+
"""Estimate cost in cents (USD) from model pricing table.
146+
147+
Returns a float for precision. Rounding happens at snapshot time,
148+
not per call.
149+
"""
83150
if not model or not pricing:
84-
return 0
151+
return 0.0
85152
rates = pricing.get(model)
86153
if not rates:
87-
return 0
154+
return 0.0
88155
input_rate = rates.get("input_per_1k", 0.0)
89156
output_rate = rates.get("output_per_1k", 0.0)
90157
cost = (input_tokens * input_rate + output_tokens * output_rate) / 1000.0
91158
if not math.isfinite(cost) or cost < 0:
92-
return 0
93-
return math.ceil(cost)
159+
return 0.0
160+
return cost
94161

95162

96163
def _extract_metadata(data: Any, metadata_paths: dict[str, str]) -> dict[str, str]:
@@ -103,28 +170,29 @@ def _extract_metadata(data: Any, metadata_paths: dict[str, str]) -> dict[str, st
103170
return result
104171

105172

173+
# ---------------------------------------------------------------------------
174+
# Evaluator
175+
# ---------------------------------------------------------------------------
176+
177+
106178
@register_evaluator
107179
class BudgetEvaluator(Evaluator[BudgetEvaluatorConfig]):
108180
"""Tracks cumulative LLM token and cost usage per scope and time window.
109181
110182
Deterministic evaluator: matched=True when any configured limit is
111183
exceeded, confidence=1.0 always.
112184
113-
The evaluator is stateful -- it accumulates usage in a BudgetStore.
114-
The store is created per evaluator config and is thread-safe.
185+
The evaluator is stateless. Budget state is managed by a module-level
186+
store registry (get_or_create_store), not by the evaluator instance.
115187
"""
116188

117189
metadata = EvaluatorMetadata(
118190
name="budget",
119-
version="2.0.0",
191+
version="3.0.0",
120192
description="Cumulative LLM token and cost budget tracking",
121193
)
122194
config_model = BudgetEvaluatorConfig
123195

124-
def __init__(self, config: BudgetEvaluatorConfig) -> None:
125-
super().__init__(config)
126-
self._store = InMemoryBudgetStore(rules=config.limits)
127-
128196
async def evaluate(self, data: Any) -> EvaluatorResult:
129197
"""Evaluate step data against all configured budget limits."""
130198
if data is None:
@@ -146,7 +214,8 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
146214

147215
step_metadata = _extract_metadata(data, self.config.metadata_paths)
148216

149-
snapshots = self._store.record_and_check(
217+
store = get_or_create_store(self.config)
218+
snapshots = await store.record_and_check(
150219
scope=step_metadata,
151220
input_tokens=input_tokens,
152221
output_tokens=output_tokens,
@@ -156,7 +225,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
156225
breached: list[dict[str, Any]] = []
157226
all_snaps: list[dict[str, Any]] = []
158227

159-
for i, snap in enumerate(snapshots):
228+
for snap in snapshots:
160229
snap_info = {
161230
"spent": snap.spent,
162231
"spent_tokens": snap.spent_tokens,
@@ -180,7 +249,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
180249
"all_snapshots": all_snaps,
181250
"input_tokens": input_tokens,
182251
"output_tokens": output_tokens,
183-
"cost": cost,
252+
"cost": round(cost, 6),
184253
},
185254
)
186255

@@ -193,7 +262,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
193262
"all_snapshots": all_snaps,
194263
"input_tokens": input_tokens,
195264
"output_tokens": output_tokens,
196-
"cost": cost,
265+
"cost": round(cost, 6),
197266
"max_utilization": round(max_util, 4),
198267
},
199268
)

0 commit comments

Comments
 (0)