feat(budget): address R3 review -- async ABC, TTL prune, defensive guards

amabito · amabito · commit b189e851f071 · 2026-04-10T12:12:27.000+09:00
Respond to lan17's R3 review on PR #144 with the mechanical items that do not depend on pending config-layer decisions (limit model, budget_id, unknown_model_behavior). Changes: - Migrate BudgetStore from Protocol to async ABC with an __init_subclass__ guard that walks the MRO to reject sync overrides at class creation - InMemoryBudgetStore: async wrapper around a sync helper; threading.Lock retained for the CPU-bound critical section - TTL prune for stale period buckets on rollover; runs before the max_buckets capacity check so a rollover at capacity reclaims space - Monotonic prune watermark (a backwards clock does not re-trigger pruning) - _compute_utilization now clamps the low side as well, so the result stays in [0.0, 1.0] under refund semantics - Defensive guards: NaN/Inf cost and clock coerced to 0.0, negative token counts clamped to 0 - Revert root pyproject.toml (remove unrelated [dependency-groups], restore version 7.3.1) - Remove clear_budget_stores from __all__ (testing utility) - Document token attribution intent (single int -> output-only) Tests: 67 -> 91 (24 new: async migration, TTL prune coverage, adversarial guards, ABC contract enforcement)
diff --git a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/__init__.py b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/__init__.py
@@ -1,18 +1,18 @@
 """Budget evaluator for per-agent LLM cost and token tracking."""
 
 from agent_control_evaluator_budget.budget.config import BudgetEvaluatorConfig
-from agent_control_evaluator_budget.budget.evaluator import (
-    BudgetEvaluator,
-    clear_budget_stores,
-)
+from agent_control_evaluator_budget.budget.evaluator import BudgetEvaluator
 from agent_control_evaluator_budget.budget.memory_store import InMemoryBudgetStore
 from agent_control_evaluator_budget.budget.store import BudgetSnapshot, BudgetStore
 
+# Note: clear_budget_stores is a testing utility and is intentionally not
+# re-exported here. Import it directly from the evaluator submodule in tests:
+#   from agent_control_evaluator_budget.budget.evaluator import clear_budget_stores
+
 __all__ = [
     "BudgetEvaluator",
     "BudgetEvaluatorConfig",
     "BudgetSnapshot",
     "BudgetStore",
     "InMemoryBudgetStore",
-    "clear_budget_stores",
 ]
diff --git a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/evaluator.py b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/evaluator.py
@@ -109,6 +109,10 @@ def _extract_tokens(data: Any, token_path: str | None) -> tuple[int, int]:
     if token_path:
         val = _extract_by_path(data, token_path)
         if isinstance(val, int) and not isinstance(val, bool) and val >= 0:
+            # When token_path resolves to a single int we cannot distinguish
+            # input vs output. Attribute the whole count to output because
+            # output rates are typically higher than input rates in pricing
+            # tables, so this over-estimates cost rather than under-estimates.
             return 0, val
         if isinstance(val, dict):
             data = val
@@ -211,7 +215,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult:
         step_metadata = _extract_metadata(data, self.config.metadata_paths)
 
         store = get_or_create_store(self.config)
-        snapshots = store.record_and_check(
+        snapshots = await store.record_and_check(
             scope=step_metadata,
             input_tokens=input_tokens,
             output_tokens=output_tokens,
diff --git a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/memory_store.py b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/memory_store.py
@@ -6,13 +6,14 @@
 
 from __future__ import annotations
 
+import math
 import threading
 import time
 from collections.abc import Callable
 from dataclasses import dataclass
 
 from .config import BudgetLimitRule
-from .store import BudgetSnapshot, round_spent
+from .store import BudgetSnapshot, BudgetStore, round_spent
 
 
 def _sanitize_scope_value(val: str) -> str:
@@ -34,6 +35,20 @@ def _build_scope_key(
     return "|".join(parts) if parts else "__global__"
 
 
+def _parse_period_key(key: str) -> tuple[int, int] | None:
+    """Parse 'P{window}:{index}' into (window_seconds, bucket_index).
+
+    Returns None for empty/cumulative keys and for malformed period keys.
+    """
+    if not key or not key.startswith("P"):
+        return None
+    try:
+        window_part, index_part = key[1:].split(":", 1)
+        return int(window_part), int(index_part)
+    except (ValueError, IndexError):
+        return None
+
+
 def _derive_period_key(window_seconds: int | None, now: float) -> str:
     """Derive a period key from window_seconds and a timestamp.
 
@@ -63,12 +78,17 @@ def _compute_utilization(
     limit: int | None,
     limit_tokens: int | None,
 ) -> float:
-    """Return max(spend_ratio, token_ratio) clamped to [0.0, 1.0]."""
+    """Return max(spend_ratio, token_ratio) clamped to [0.0, 1.0].
+
+    The low-side clamp is load-bearing: under refund semantics the internal
+    `spent` accumulator may go negative, which would otherwise produce a
+    negative ratio and violate the BudgetSnapshot.utilization contract.
+    """
     ratios: list[float] = []
     if limit is not None and limit > 0:
-        ratios.append(min(spent / limit, 1.0))
+        ratios.append(max(0.0, min(spent / limit, 1.0)))
     if limit_tokens is not None and limit_tokens > 0:
-        ratios.append(min(spent_tokens / limit_tokens, 1.0))
+        ratios.append(max(0.0, min(spent_tokens / limit_tokens, 1.0)))
     return max(ratios) if ratios else 0.0
 
 
@@ -85,14 +105,21 @@ def total_tokens(self) -> int:
         return self.input_tokens + self.output_tokens
 
 
-class InMemoryBudgetStore:
+class InMemoryBudgetStore(BudgetStore):
     """Thread-safe in-memory budget store.
 
     Initialized with a list of BudgetLimitRule. Derives period keys
     internally from window_seconds + injected clock.
 
     Cost is accumulated as float for precision. Integer rounding
     happens only at snapshot time for display/reporting.
+
+    TTL prune: on new period rollover per window, buckets older than
+    `current - 1` for that window are dropped. This keeps memory bounded
+    for long-running deployments with windowed rules.
+
+    `max_buckets` remains as a backstop for high-cardinality group_by
+    explosions that TTL cannot protect against.
     """
 
     _DEFAULT_MAX_BUCKETS = 100_000
@@ -109,16 +136,41 @@ def __init__(
         self._lock = threading.Lock()
         self._buckets: dict[tuple[str, str], _Bucket] = {}
         self._max_buckets = max_buckets
+        self._last_pruned_period: dict[int, int] = {}
 
-    def record_and_check(
+    async def record_and_check(
         self,
         scope: dict[str, str],
         input_tokens: int,
         output_tokens: int,
         cost: float,
     ) -> list[BudgetSnapshot]:
         """Atomically record usage and return snapshots for all matching rules."""
+        return self._record_and_check_sync(scope, input_tokens, output_tokens, cost)
+
+    def _record_and_check_sync(
+        self,
+        scope: dict[str, str],
+        input_tokens: int,
+        output_tokens: int,
+        cost: float,
+    ) -> list[BudgetSnapshot]:
+        """Sync implementation of record_and_check.
+
+        NaN/Inf cost is coerced to 0.0 defensively. Once NaN enters a
+        bucket's float accumulator, all subsequent additions produce NaN
+        and `nan >= limit` is always False (IEEE 754), permanently
+        disabling budget enforcement for that bucket.
+        """
+        if not math.isfinite(cost):
+            cost = 0.0
+        # Token counts have no refund semantics; clamp to non-negative
+        # to prevent negative injection from resetting the accumulator.
+        input_tokens = max(0, input_tokens)
+        output_tokens = max(0, output_tokens)
         now = self._clock()
+        if not math.isfinite(now):
+            now = 0.0
         snapshots: list[BudgetSnapshot] = []
         recorded_pairs: set[tuple[str, str]] = set()
 
@@ -152,8 +204,14 @@ def record_and_check(
                     recorded_pairs.add(pair)
                 else:
                     bucket = self._buckets.get(pair)
-                    if bucket is None:
-                        continue
+                    # Defensive: a missing bucket is impossible under current
+                    # invariants (recorded_pairs only contains pairs whose
+                    # bucket was successfully created, and self._lock prevents
+                    # concurrent deletion). If a future refactor breaks that
+                    # invariant, the assertion surfaces it (unless run with -O).
+                    assert bucket is not None, (
+                        f"bucket for {pair!r} was in recorded_pairs but missing from _buckets"
+                    )
 
                 total_tokens = bucket.total_tokens
                 utilization = _compute_utilization(
@@ -219,6 +277,7 @@ def reset(self, scope_key: str | None = None, period_key: str | None = None) ->
         with self._lock:
             if scope_key is None and period_key is None:
                 self._buckets.clear()
+                self._last_pruned_period.clear()
                 return
             keys_to_remove = [
                 k
@@ -230,10 +289,44 @@ def reset(self, scope_key: str | None = None, period_key: str | None = None) ->
                 del self._buckets[k]
 
     def _get_or_create_bucket(self, key: tuple[str, str]) -> _Bucket | None:
-        """Get or create a bucket. Returns None if max_buckets reached."""
+        """Get or create a bucket. Returns None if max_buckets reached.
+
+        On period rollover (new windowed bucket with a forward period index),
+        stale buckets for the same window (bucket_index < current - 1) are
+        pruned BEFORE the max_buckets capacity check, so that a rollover at
+        capacity can free space rather than fail closed. Cross-scope pruning
+        is intentional: all stale same-window buckets are dropped regardless
+        of scope key, since the period has expired globally.
+
+        The watermark `_last_pruned_period[window]` only advances forward;
+        a backwards clock does not trigger spurious prune work.
+
+        Caller must hold self._lock.
+        """
         bucket = self._buckets.get(key)
         if bucket is not None:
             return bucket
+
+        # TTL prune runs BEFORE the max_buckets check so that rollover at
+        # capacity can reclaim space rather than fail closed permanently.
+        parsed = _parse_period_key(key[1])
+        if parsed is not None:
+            window, index = parsed
+            last_pruned = self._last_pruned_period.get(window)
+            # Only advance on forward progress. Backwards clock is a no-op;
+            # the previously established watermark still protects us.
+            if last_pruned is None or index > last_pruned:
+                stale_keys = [
+                    k
+                    for k in self._buckets
+                    if (kp := _parse_period_key(k[1])) is not None
+                    and kp[0] == window
+                    and kp[1] < index - 1
+                ]
+                for k in stale_keys:
+                    del self._buckets[k]
+                self._last_pruned_period[window] = index
+
         if len(self._buckets) >= self._max_buckets:
             return None
         bucket = _Bucket()
diff --git a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/store.py b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/store.py
@@ -1,4 +1,4 @@
-"""BudgetStore protocol -- interface for budget storage backends.
+"""BudgetStore abstract base class -- interface for budget storage backends.
 
 Implementations must provide atomic record-and-check: a single call
 that records usage and returns the current totals. This prevents
@@ -10,9 +10,11 @@
 
 from __future__ import annotations
 
+import inspect
 import math
+from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Protocol, runtime_checkable
+from typing import Any
 
 
 @dataclass(frozen=True)
@@ -49,17 +51,56 @@ def round_spent(value: float) -> int:
     return int(value)
 
 
-@runtime_checkable
-class BudgetStore(Protocol):
-    """Protocol for budget storage backends.
+class BudgetStore(ABC):
+    """Abstract base class for budget storage backends.
 
     The store is initialized with a list of BudgetLimitRule and derives
     period keys internally from window_seconds + current time.
 
     Callers pass only usage data: scope dict, input_tokens, output_tokens, cost.
+
+    Negative `cost` values are permitted and reduce accumulated spend (refund
+    semantics). `round_spent()` floors the displayed snapshot spend to 0 for
+    negative accumulators, but the internal float accumulator may go negative
+    so that a subsequent positive charge cancels correctly. Validation of
+    cost >= 0 is NOT performed at the store boundary; it is the caller's
+    responsibility if strict positive accounting is required.
+
+    Implementations should be safe to call from async contexts.
+    InMemoryBudgetStore wraps a sync critical section under threading.Lock
+    because the work is CPU-bound and brief; distributed backends
+    (Redis/Postgres) should use native async I/O.
+
+    Subclasses must override `record_and_check` with a coroutine function
+    (`async def`). A sync override is rejected at class creation time rather
+    than failing silently at the first `await` site in production.
     """
 
-    def record_and_check(
+    def __init_subclass__(cls, **kwargs: Any) -> None:
+        super().__init_subclass__(**kwargs)
+        # Walk the MRO to find the nearest override of record_and_check.
+        # Checking only cls.__dict__ misses mixin-inherited sync overrides
+        # that satisfy ABC's abstractmethod check but silently break at the
+        # first `await` call site.
+        method = None
+        for base in cls.__mro__:
+            if base is BudgetStore:
+                break
+            if "record_and_check" in base.__dict__:
+                raw = base.__dict__["record_and_check"]
+                # Unwrap staticmethod/classmethod descriptors so that
+                # inspect.iscoroutinefunction sees the underlying function.
+                method = getattr(raw, "__func__", raw)
+                break
+        if method is not None and not inspect.iscoroutinefunction(method):
+            raise TypeError(
+                f"{cls.__name__}.record_and_check must be an async def "
+                "(coroutine function); got a sync function. BudgetStore is "
+                "an async ABC."
+            )
+
+    @abstractmethod
+    async def record_and_check(
         self,
         scope: dict[str, str],
         input_tokens: int,
@@ -77,4 +118,3 @@ def record_and_check(
         Returns:
             List of BudgetSnapshot, one per matching rule.
         """
-        ...
diff --git a/evaluators/contrib/budget/tests/budget/test_budget.py b/evaluators/contrib/budget/tests/budget/test_budget.py
diff --git a/pyproject.toml b/pyproject.toml