33Deterministic evaluator: confidence is always 1.0, matched is True when
44any configured limit is exceeded. Utilization ratio and spend breakdown
55are returned in result metadata, not in confidence.
6+
7+ The evaluator is stateless. Budget state lives in a module-level store
8+ registry, independent of the evaluator instance cache in _factory.py.
9+ This prevents silent state loss on LRU eviction and avoids cross-control
10+ leakage when different controls share the same config.
611"""
712
813from __future__ import annotations
914
15+ import json
1016import logging
1117import math
18+ import threading
1219from typing import Any
1320
1421from agent_control_evaluators ._base import Evaluator , EvaluatorMetadata
1724
1825from .config import BudgetEvaluatorConfig
1926from .memory_store import InMemoryBudgetStore
27+ from .store import BudgetStore
2028
2129logger = logging .getLogger (__name__ )
2230
31+ # ---------------------------------------------------------------------------
32+ # Module-level store registry
33+ #
34+ # Decoupled from the evaluator instance cache so that LRU eviction in
35+ # _factory.py does not destroy accumulated budget state. The registry
36+ # is keyed by a stable config hash. Two controls with identical config
37+ # intentionally share a budget pool (same config = same budget).
38+ # ---------------------------------------------------------------------------
39+
40+ # NOTE: The registry is unbounded. In practice a deployment has a finite
41+ # set of budget configs. If dynamic config generation becomes a concern,
42+ # add a max-size cap with LRU eviction here.
43+ _STORE_REGISTRY : dict [str , BudgetStore ] = {}
44+ _STORE_REGISTRY_LOCK = threading .Lock ()
45+
46+
47+ def _config_key (config : BudgetEvaluatorConfig ) -> str :
48+ """Build a stable key for the store registry from evaluator config.
49+
50+ The limits list is sorted before hashing so that two configs with
51+ semantically identical rules in different order share a store.
52+ """
53+ config_dict = config .model_dump (mode = "json" )
54+ config_dict ["limits" ] = sorted (
55+ config_dict ["limits" ],
56+ key = lambda r : json .dumps (r , sort_keys = True , default = str ),
57+ )
58+ return f"budget:{ json .dumps (config_dict , sort_keys = True , default = str )} "
59+
60+
61+ def get_or_create_store (config : BudgetEvaluatorConfig ) -> BudgetStore :
62+ """Get or create a store for the given config, thread-safe."""
63+ key = _config_key (config )
64+ with _STORE_REGISTRY_LOCK :
65+ store = _STORE_REGISTRY .get (key )
66+ if store is None :
67+ store = InMemoryBudgetStore (rules = config .limits )
68+ _STORE_REGISTRY [key ] = store
69+ return store
70+
71+
72+ def clear_budget_stores () -> None :
73+ """Clear all budget stores. Useful for testing."""
74+ with _STORE_REGISTRY_LOCK :
75+ _STORE_REGISTRY .clear ()
76+
77+
78+ # ---------------------------------------------------------------------------
79+ # Helpers
80+ # ---------------------------------------------------------------------------
81+
2382
2483def _extract_by_path (data : Any , path : str ) -> Any :
2584 """Extract a value from nested data using dot-notation path."""
@@ -50,6 +109,10 @@ def _extract_tokens(data: Any, token_path: str | None) -> tuple[int, int]:
50109 if token_path :
51110 val = _extract_by_path (data , token_path )
52111 if isinstance (val , int ) and not isinstance (val , bool ) and val >= 0 :
112+ # When token_path resolves to a single int we cannot distinguish
113+ # input vs output. Attribute the whole count to output because
114+ # output rates are typically higher than input rates in pricing
115+ # tables, so this over-estimates cost rather than under-estimates.
53116 return 0 , val
54117 if isinstance (val , dict ):
55118 data = val
@@ -78,19 +141,23 @@ def _estimate_cost(
78141 input_tokens : int ,
79142 output_tokens : int ,
80143 pricing : dict [str , dict [str , float ]] | None ,
81- ) -> int :
82- """Estimate cost in minor units from model pricing table. Returns 0 if unknown."""
144+ ) -> float :
145+ """Estimate cost in cents (USD) from model pricing table.
146+
147+ Returns a float for precision. Rounding happens at snapshot time,
148+ not per call.
149+ """
83150 if not model or not pricing :
84- return 0
151+ return 0.0
85152 rates = pricing .get (model )
86153 if not rates :
87- return 0
154+ return 0.0
88155 input_rate = rates .get ("input_per_1k" , 0.0 )
89156 output_rate = rates .get ("output_per_1k" , 0.0 )
90157 cost = (input_tokens * input_rate + output_tokens * output_rate ) / 1000.0
91158 if not math .isfinite (cost ) or cost < 0 :
92- return 0
93- return math . ceil ( cost )
159+ return 0.0
160+ return cost
94161
95162
96163def _extract_metadata (data : Any , metadata_paths : dict [str , str ]) -> dict [str , str ]:
@@ -103,28 +170,29 @@ def _extract_metadata(data: Any, metadata_paths: dict[str, str]) -> dict[str, st
103170 return result
104171
105172
173+ # ---------------------------------------------------------------------------
174+ # Evaluator
175+ # ---------------------------------------------------------------------------
176+
177+
106178@register_evaluator
107179class BudgetEvaluator (Evaluator [BudgetEvaluatorConfig ]):
108180 """Tracks cumulative LLM token and cost usage per scope and time window.
109181
110182 Deterministic evaluator: matched=True when any configured limit is
111183 exceeded, confidence=1.0 always.
112184
113- The evaluator is stateful -- it accumulates usage in a BudgetStore.
114- The store is created per evaluator config and is thread-safe .
185+ The evaluator is stateless. Budget state is managed by a module-level
186+ store registry (get_or_create_store), not by the evaluator instance .
115187 """
116188
117189 metadata = EvaluatorMetadata (
118190 name = "budget" ,
119- version = "2 .0.0" ,
191+ version = "3 .0.0" ,
120192 description = "Cumulative LLM token and cost budget tracking" ,
121193 )
122194 config_model = BudgetEvaluatorConfig
123195
124- def __init__ (self , config : BudgetEvaluatorConfig ) -> None :
125- super ().__init__ (config )
126- self ._store = InMemoryBudgetStore (rules = config .limits )
127-
128196 async def evaluate (self , data : Any ) -> EvaluatorResult :
129197 """Evaluate step data against all configured budget limits."""
130198 if data is None :
@@ -146,7 +214,8 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
146214
147215 step_metadata = _extract_metadata (data , self .config .metadata_paths )
148216
149- snapshots = self ._store .record_and_check (
217+ store = get_or_create_store (self .config )
218+ snapshots = await store .record_and_check (
150219 scope = step_metadata ,
151220 input_tokens = input_tokens ,
152221 output_tokens = output_tokens ,
@@ -156,7 +225,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
156225 breached : list [dict [str , Any ]] = []
157226 all_snaps : list [dict [str , Any ]] = []
158227
159- for i , snap in enumerate ( snapshots ) :
228+ for snap in snapshots :
160229 snap_info = {
161230 "spent" : snap .spent ,
162231 "spent_tokens" : snap .spent_tokens ,
@@ -180,7 +249,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
180249 "all_snapshots" : all_snaps ,
181250 "input_tokens" : input_tokens ,
182251 "output_tokens" : output_tokens ,
183- "cost" : cost ,
252+ "cost" : round ( cost , 6 ) ,
184253 },
185254 )
186255
@@ -193,7 +262,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
193262 "all_snapshots" : all_snaps ,
194263 "input_tokens" : input_tokens ,
195264 "output_tokens" : output_tokens ,
196- "cost" : cost ,
265+ "cost" : round ( cost , 6 ) ,
197266 "max_utilization" : round (max_util , 4 ),
198267 },
199268 )
0 commit comments