fix(evaluators): budget R10 -- prevent double-counting for shared scope+period

amabito · amabito · commit fa414d744afd · 2026-03-21T10:10:22.000+09:00
R10 finding: when multiple limit rules share the same (scope_key, period_key),
each rule called record_and_check() independently, causing the same tokens
and cost to be counted N times in the store.

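Fix: track which (scope_key, period_key) pairs have already been recorded
during each evaluate() call. The first rule for a pair records usage via
record_and_check(); later rules for the same pair only read the stored
totals via get_snapshot(), passing their own limit_usd / limit_tokens.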

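Tests: 2 new tests for double-count prevention -- two rules sharing the
same scope and window (usage recorded once in the shared bucket), and two
rules with the same scope but different windows (each bucket recorded once).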
63 budget tests, 293 total evaluator tests passing.

Review loop: R9 CLEAN, R10 fix, R11 CLEAN -- 3 consecutive clean achieved.
diff --git a/evaluators/builtin/src/agent_control_evaluators/budget/evaluator.py b/evaluators/builtin/src/agent_control_evaluators/budget/evaluator.py
@@ -230,9 +230,13 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
         # Extract metadata for scope key building
         step_metadata = _extract_metadata(data, self.config.metadata_paths)
 
-        # Check each limit rule
+        # Check each limit rule.
+        # Track which (scope_key, period_key) pairs have already been recorded
+        # this evaluation to prevent double-counting when multiple rules share
+        # the same scope and window.
         breached_rules: list[dict[str, Any]] = []
         all_snapshots: list[dict[str, Any]] = []
+        recorded_pairs: set[tuple[str, str]] = set()
 
         for rule in self.config.limits:
             # Check if rule scope matches step metadata
@@ -241,16 +245,28 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
 
             scope_key = _build_scope_key(rule.scope, rule.per, step_metadata)
             period_key = _derive_period_key(rule.window)
-
-            snapshot = self._store.record_and_check(
-                scope_key=scope_key,
-                period_key=period_key,
-                input_tokens=input_tokens,
-                output_tokens=output_tokens,
-                cost_usd=cost_usd,
-                limit_usd=rule.limit_usd,
-                limit_tokens=rule.limit_tokens,
-            )
+            pair = (scope_key, period_key)
+
+            if pair not in recorded_pairs:
+                # First rule for this (scope, period): record usage and check.
+                snapshot = self._store.record_and_check(
+                    scope_key=scope_key,
+                    period_key=period_key,
+                    input_tokens=input_tokens,
+                    output_tokens=output_tokens,
+                    cost_usd=cost_usd,
+                    limit_usd=rule.limit_usd,
+                    limit_tokens=rule.limit_tokens,
+                )
+                recorded_pairs.add(pair)
+            else:
+                # Subsequent rule for same (scope, period): read without recording.
+                snapshot = self._store.get_snapshot(
+                    scope_key=scope_key,
+                    period_key=period_key,
+                    limit_usd=rule.limit_usd,
+                    limit_tokens=rule.limit_tokens,
+                )
 
             snap_info = {
                 "scope_key": scope_key,
diff --git a/evaluators/builtin/tests/budget/test_budget.py b/evaluators/builtin/tests/budget/test_budget.py
@@ -495,6 +495,49 @@ async def test_negative_pricing_does_not_reduce_budget(self) -> None:
         snap = ev._store.get_snapshot("__global__", "", limit_usd=0.01)
         assert snap.spent_usd == pytest.approx(0.0)  # negative rates clamped to 0
 
+    @pytest.mark.asyncio
+    async def test_two_rules_same_scope_no_double_count(self) -> None:
+        """Two rules with the same scope+window must not double-record usage.
+
+        When limits=[{limit_usd: 1.0}, {limit_tokens: 5000}] are both global
+        and cumulative, they share scope_key='__global__' and period_key=''.
+        Recording twice to the same bucket would inflate spend 2x, causing
+        the tighter limit to trigger at half the configured threshold.
+        """
+        from agent_control_evaluators.budget.evaluator import BudgetEvaluator
+        config = BudgetEvaluatorConfig(
+            limits=[
+                {"limit_usd": 1.0},
+                {"limit_tokens": 5000},
+            ],
+            cost_path="cost",
+        )
+        ev = BudgetEvaluator(config)
+        await ev.evaluate({"cost": 0.1, "usage": {"input_tokens": 100, "output_tokens": 100}})
+        snap = ev._store.get_snapshot("__global__", "", limit_usd=1.0)
+        assert snap.spent_usd == pytest.approx(0.1), "spent_usd must not be double-counted"
+        assert snap.spent_tokens == 200, "spent_tokens must not be double-counted"
+
+    @pytest.mark.asyncio
+    async def test_two_rules_same_scope_different_windows_no_double_count(self) -> None:
+        """Rules with different windows use different period_keys -- no shared bucket."""
+        from agent_control_evaluators.budget.evaluator import BudgetEvaluator, _derive_period_key
+        config = BudgetEvaluatorConfig(
+            limits=[
+                {"window": "daily", "limit_usd": 1.0},
+                {"window": "monthly", "limit_usd": 10.0},
+            ],
+            cost_path="cost",
+        )
+        ev = BudgetEvaluator(config)
+        await ev.evaluate({"cost": 0.6})
+        daily_key = _derive_period_key("daily")
+        monthly_key = _derive_period_key("monthly")
+        snap_d = ev._store.get_snapshot("__global__", daily_key, limit_usd=1.0)
+        snap_m = ev._store.get_snapshot("__global__", monthly_key, limit_usd=10.0)
+        assert snap_d.spent_usd == pytest.approx(0.6)
+        assert snap_m.spent_usd == pytest.approx(0.6)
+
     @pytest.mark.asyncio
     async def test_inf_pricing_does_not_cause_inf_cost(self) -> None:
         """Inf pricing rates must not produce inf cost (permanent false positive)."""
diff --git a/pyproject.toml b/pyproject.toml
@@ -80,3 +80,9 @@ tag_format = "v{version}"
 # feat = minor, fix/perf/refactor = patch, breaking (!) = major
 allowed_tags = ["feat", "fix", "perf", "chore", "docs", "style", "refactor", "test", "ci"]
 patch_tags = ["fix", "perf", "chore", "refactor"]
+
+[dependency-groups]
+dev = [
+    "pytest>=9.0.2",
+    "pytest-asyncio>=1.3.0",
+]