Skip to content

Commit fa414d7

Browse files
author
amabito
committed
fix(evaluators): budget R10 -- prevent double-counting for shared scope+period
R10 finding: when multiple limit rules share the same (scope_key, period_key), each rule called record_and_check() independently, causing the same tokens and cost to be counted N times in the store. Fix: track recorded (scope_key, period_key) pairs per evaluate() call. First rule records; subsequent rules for the same pair use get_snapshot(). Tests: 2 new tests for same-scope double-count prevention. 63 budget tests, 293 total evaluator tests passing. Review loop: R9 CLEAN, R10 fix, R11 CLEAN -- 3 consecutive clean achieved.
1 parent 96e59ba commit fa414d7

3 files changed

Lines changed: 76 additions & 11 deletions

File tree

evaluators/builtin/src/agent_control_evaluators/budget/evaluator.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -230,9 +230,13 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
230230
# Extract metadata for scope key building
231231
step_metadata = _extract_metadata(data, self.config.metadata_paths)
232232

233-
# Check each limit rule
233+
# Check each limit rule.
234+
# Track which (scope_key, period_key) pairs have already been recorded
235+
# this evaluation to prevent double-counting when multiple rules share
236+
# the same scope and window.
234237
breached_rules: list[dict[str, Any]] = []
235238
all_snapshots: list[dict[str, Any]] = []
239+
recorded_pairs: set[tuple[str, str]] = set()
236240

237241
for rule in self.config.limits:
238242
# Check if rule scope matches step metadata
@@ -241,16 +245,28 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
241245

242246
scope_key = _build_scope_key(rule.scope, rule.per, step_metadata)
243247
period_key = _derive_period_key(rule.window)
244-
245-
snapshot = self._store.record_and_check(
246-
scope_key=scope_key,
247-
period_key=period_key,
248-
input_tokens=input_tokens,
249-
output_tokens=output_tokens,
250-
cost_usd=cost_usd,
251-
limit_usd=rule.limit_usd,
252-
limit_tokens=rule.limit_tokens,
253-
)
248+
pair = (scope_key, period_key)
249+
250+
if pair not in recorded_pairs:
251+
# First rule for this (scope, period): record usage and check.
252+
snapshot = self._store.record_and_check(
253+
scope_key=scope_key,
254+
period_key=period_key,
255+
input_tokens=input_tokens,
256+
output_tokens=output_tokens,
257+
cost_usd=cost_usd,
258+
limit_usd=rule.limit_usd,
259+
limit_tokens=rule.limit_tokens,
260+
)
261+
recorded_pairs.add(pair)
262+
else:
263+
# Subsequent rule for same (scope, period): read without recording.
264+
snapshot = self._store.get_snapshot(
265+
scope_key=scope_key,
266+
period_key=period_key,
267+
limit_usd=rule.limit_usd,
268+
limit_tokens=rule.limit_tokens,
269+
)
254270

255271
snap_info = {
256272
"scope_key": scope_key,

evaluators/builtin/tests/budget/test_budget.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,49 @@ async def test_negative_pricing_does_not_reduce_budget(self) -> None:
495495
snap = ev._store.get_snapshot("__global__", "", limit_usd=0.01)
496496
assert snap.spent_usd == pytest.approx(0.0) # negative rates clamped to 0
497497

498+
@pytest.mark.asyncio
499+
async def test_two_rules_same_scope_no_double_count(self) -> None:
500+
"""Two rules with the same scope+window must not double-record usage.
501+
502+
When limits=[{limit_usd: 1.0}, {limit_tokens: 5000}] are both global
503+
and cumulative, they share scope_key='__global__' and period_key=''.
504+
Recording twice to the same bucket would inflate spend 2x, causing
505+
the tighter limit to trigger at half the configured threshold.
506+
"""
507+
from agent_control_evaluators.budget.evaluator import BudgetEvaluator
508+
config = BudgetEvaluatorConfig(
509+
limits=[
510+
{"limit_usd": 1.0},
511+
{"limit_tokens": 5000},
512+
],
513+
cost_path="cost",
514+
)
515+
ev = BudgetEvaluator(config)
516+
await ev.evaluate({"cost": 0.1, "usage": {"input_tokens": 100, "output_tokens": 100}})
517+
snap = ev._store.get_snapshot("__global__", "", limit_usd=1.0)
518+
assert snap.spent_usd == pytest.approx(0.1), "spent_usd must not be double-counted"
519+
assert snap.spent_tokens == 200, "spent_tokens must not be double-counted"
520+
521+
@pytest.mark.asyncio
522+
async def test_two_rules_same_scope_different_windows_no_double_count(self) -> None:
523+
"""Rules with different windows use different period_keys -- no shared bucket."""
524+
from agent_control_evaluators.budget.evaluator import BudgetEvaluator, _derive_period_key
525+
config = BudgetEvaluatorConfig(
526+
limits=[
527+
{"window": "daily", "limit_usd": 1.0},
528+
{"window": "monthly", "limit_usd": 10.0},
529+
],
530+
cost_path="cost",
531+
)
532+
ev = BudgetEvaluator(config)
533+
await ev.evaluate({"cost": 0.6})
534+
daily_key = _derive_period_key("daily")
535+
monthly_key = _derive_period_key("monthly")
536+
snap_d = ev._store.get_snapshot("__global__", daily_key, limit_usd=1.0)
537+
snap_m = ev._store.get_snapshot("__global__", monthly_key, limit_usd=10.0)
538+
assert snap_d.spent_usd == pytest.approx(0.6)
539+
assert snap_m.spent_usd == pytest.approx(0.6)
540+
498541
@pytest.mark.asyncio
499542
async def test_inf_pricing_does_not_cause_inf_cost(self) -> None:
500543
"""Inf pricing rates must not produce inf cost (permanent false positive)."""

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,9 @@ tag_format = "v{version}"
8080
# feat = minor, fix/perf/refactor = patch, breaking (!) = major
8181
allowed_tags = ["feat", "fix", "perf", "chore", "docs", "style", "refactor", "test", "ci"]
8282
patch_tags = ["fix", "perf", "chore", "refactor"]
83+
84+
[dependency-groups]
85+
dev = [
86+
"pytest>=9.0.2",
87+
"pytest-asyncio>=1.3.0",
88+
]

0 commit comments

Comments
 (0)