diff --git a/evaluators/contrib/budget/README.md b/evaluators/contrib/budget/README.md index 15c35de6..c83de1ab 100644 --- a/evaluators/contrib/budget/README.md +++ b/evaluators/contrib/budget/README.md @@ -99,7 +99,7 @@ ModelPricing(input_per_1k=0.04, output_per_1k=0.16) `input_per_1k` is applied to input tokens. `output_per_1k` is applied to output tokens. -Pricing is required when any rule uses `limit_unit="usd_cents"`. Token-only rules can omit pricing. If an event uses a model that is not in the pricing table and a cost rule exists, `unknown_model_behavior="block"` fails closed. Use `"warn"` to log a warning and treat the cost as 0. +Pricing and `model_path` are required when any rule uses `limit_unit="usd_cents"`. Token-only rules can omit both. If an event uses a model that is not in the pricing table and a cost rule exists, `unknown_model_behavior="block"` fails closed. Use `"warn"` to log a warning and treat the cost as 0. ## Dual Ceiling Pattern diff --git a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/config.py b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/config.py index 2871b099..795044be 100644 --- a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/config.py +++ b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/config.py @@ -79,11 +79,13 @@ class BudgetEvaluatorConfig(EvaluatorConfig): pricing table and a cost-based rule exists. block=fail closed, warn=log warning and treat cost as 0. pricing: Optional model pricing table. Maps model name to ModelPricing. - Used to derive cost in USD from token counts and model name. + Required when any rule uses limit_unit="usd_cents". Used to + derive cost in USD from token counts and model name. token_path: Dot-notation path to extract token usage from step data (e.g. "usage.total_tokens"). If None, looks for standard fields (input_tokens, output_tokens, total_tokens, usage). model_path: Dot-notation path to extract model name (for pricing lookup). + Required when any rule uses limit_unit="usd_cents". metadata_paths: Mapping of metadata field name to dot-notation path in step data. Used to extract scope dimensions (channel, user_id, etc). """ diff --git a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/evaluator.py b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/evaluator.py index a2b43f04..1e980c2c 100644 --- a/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/evaluator.py +++ b/evaluators/contrib/budget/src/agent_control_evaluator_budget/budget/evaluator.py @@ -117,16 +117,23 @@ def _extract_tokens(data: Any, token_path: str | None) -> tuple[int, int]: out = usage.get("output_tokens") if out is None: out = usage.get("completion_tokens") - inp_ok = isinstance(inp, int) and not isinstance(inp, bool) - out_ok = isinstance(out, int) and not isinstance(out, bool) - if inp_ok and out_ok: - return max(0, inp), max(0, out) + input_tokens = _extract_non_negative_int(inp) + output_tokens = _extract_non_negative_int(out) + if input_tokens is not None and output_tokens is not None: + return input_tokens, output_tokens total = usage.get("total_tokens") if isinstance(total, int) and not isinstance(total, bool) and total > 0: return 0, max(0, total) return 0, 0 +def _extract_non_negative_int(value: Any) -> int | None: + """Return a non-negative integer or None for invalid token values.""" + if not isinstance(value, int) or isinstance(value, bool): + return None + return max(0, value) + + def _estimate_cost( model: str | None, input_tokens: int, @@ -196,9 +203,10 @@ async def evaluate(self, data: Any) -> EvaluatorResult: input_tokens, output_tokens = _extract_tokens(data, self.config.token_path) model: str | None = None - model_path_configured = bool(self.config.model_path) - if model_path_configured: - val = _extract_by_path(data, self.config.model_path) + model_path = self.config.model_path + model_path_configured = bool(model_path) + if model_path: + val = _extract_by_path(data, model_path) if val is not None: model = str(val) @@ -220,9 +228,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: ) if has_matching_cost_rule: if model is None: - block_reason = ( - f"Model not found at path '{self.config.model_path}'" - ) + block_reason = f"Model not found at path '{model_path}'" else: block_reason = f"Unknown model: {model}" if self.config.unknown_model_behavior == "block": diff --git a/evaluators/contrib/budget/tests/budget/test_budget.py b/evaluators/contrib/budget/tests/budget/test_budget.py index 10299e7d..43d19952 100644 --- a/evaluators/contrib/budget/tests/budget/test_budget.py +++ b/evaluators/contrib/budget/tests/budget/test_budget.py @@ -384,6 +384,33 @@ def test_extract_tokens_openai(self) -> None: data = {"usage": {"prompt_tokens": 80, "completion_tokens": 40}} assert _extract_tokens(data, None) == (80, 40) + def test_extract_tokens_falls_back_when_normalized_fields_are_none(self) -> None: + # Given: normalized fields present but unset, plus legacy OpenAI fields + data = { + "usage": { + "input_tokens": None, + "output_tokens": None, + "prompt_tokens": 80, + "completion_tokens": 40, + } + } + + # When/Then: fallback still uses the legacy fields + assert _extract_tokens(data, None) == (80, 40) + + def test_extract_tokens_falls_back_per_field(self) -> None: + # Given: one normalized field missing, the other present + data = { + "usage": { + "input_tokens": 100, + "output_tokens": None, + "completion_tokens": 40, + } + } + + # When/Then: fallback applies independently per token side + assert _extract_tokens(data, None) == (100, 40) + def test_extract_tokens_none(self) -> None: # Given: None data / Then: (0, 0) assert _extract_tokens(None, None) == (0, 0)