fix(eval): clamp Kimi adapter max_tokens to per-task budget; preflight-only arg

LEANDERANTONY · claude · LEANDERANTONY · commit 611d34c0807f · 2026-05-18T19:05:26.000+05:30
KimiEvalService floored max_tokens at 8000 even for the 20-token
preflight call. OpenRouter reserves max_tokens*price of credit upfront,
so flooring tiny calls inflated the reservation and caused spurious 402s
on pricier models / low balances. Now clamps to the caller's real
per-task budget with _EVAL_MAX_TOKENS (default 8000) as a CEILING (never
a floor); a missing or zero budget falls back to that ceiling.
Truncation is still counted via finish_reason=="length".

provider_ab_runner gains a `--preflight-only` arg, intended to validate
every slug/key for ~$0.001 and then exit. This commit only adds the
argument; the early-exit wiring is still TODO (tracked in the parked
eval plan). Eval-scoped, not production-wired.

7 hermetic adapter tests green (tests/quality/test_kimi_eval_service.py).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/tests/quality/kimi_eval_service.py b/tests/quality/kimi_eval_service.py
@@ -54,8 +54,14 @@
 
 _DEFAULT_BASE_URL = os.getenv("KIMI_BASE_URL", "https://openrouter.ai/api/v1").strip()
 _DEFAULT_MODEL = os.getenv("KIMI_MODEL", "moonshotai/kimi-k2.6").strip()
-# Generous so truncation doesn't confound the model-quality signal;
-# we still COUNT any finish_reason=="length" as a fidelity miss.
+# Safety CEILING (not a floor): callers pass real per-task budgets
+# (parsers/agents from config; preflight passes ~20). We clamp to
+# this ceiling so a runaway never over-spends, but never inflate a
+# small request up to it — OpenRouter reserves max_tokens*price of
+# credit upfront, so flooring tiny calls at 8000 caused spurious 402s
+# on pricier models / low balances. Truncation is still COUNTED via
+# finish_reason=="length"; the eval controls truncation by the
+# callers' already-generous per-task budgets.
 _EVAL_MAX_TOKENS = int(os.getenv("KIMI_EVAL_MAX_TOKENS", "8000"))
 
 
@@ -167,7 +173,8 @@ def run_json_prompt(self, system_prompt, user_prompt, expected_keys=None,
                         reasoning_effort=None) -> dict:
         task = task_name or "unknown"
         content = self._chat(system_prompt, user_prompt, task_name=task,
-                             max_tokens=max(max_completion_tokens, _EVAL_MAX_TOKENS))
+                             max_tokens=min(max_completion_tokens or _EVAL_MAX_TOKENS,
+                                             _EVAL_MAX_TOKENS))
         try:
             payload = json.loads(content)
         except json.JSONDecodeError as exc:
@@ -194,7 +201,8 @@ def run_structured_prompt(self, system_prompt, user_prompt, *,
                               previous_response_id=None, reasoning_effort=None):
         task = task_name or "unknown"
         content = self._chat(system_prompt, user_prompt, task_name=task,
-                             max_tokens=max(max_completion_tokens, _EVAL_MAX_TOKENS))
+                             max_tokens=min(max_completion_tokens or _EVAL_MAX_TOKENS,
+                                             _EVAL_MAX_TOKENS))
         try:
             raw = json.loads(content)
         except json.JSONDecodeError as exc:
diff --git a/tests/quality/provider_ab_runner.py b/tests/quality/provider_ab_runner.py
@@ -211,6 +211,9 @@ def main() -> None:
                     help="alias: --suite parser --limit 3 (cheap sanity + fidelity)")
     ap.add_argument("--preflight", action="store_true",
                     help="1 tiny call/candidate to validate slug+key before the run")
+    ap.add_argument("--preflight-only", action="store_true",
+                    help="just validate every slug/key (~$0.001 total) and exit; "
+                    "no suites — use when credits are tight")
     ap.add_argument("--json", default="")
     args = ap.parse_args()