feat(eval): Slice 1J'' — reasoning_effort + run_json_prompt usage tracking

LEANDERANTONY · claude · LEANDERANTONY · commit 02e985302db9 · 2026-05-21T04:27:38.000+05:30
Two eval-adapter extensions needed for Slice 1K's 5-candidate slate:

1. **reasoning_effort thread-through**. `OpenRouterEvalService` and
   `KimiEvalService` both accepted `reasoning_effort` as a kwarg for
   signature compatibility with `OpenAIService` but never forwarded
   it into the underlying `chat.completions.create` call. The Slice
   1K slate has three reasoning-class candidates (gpt-5.4@medium,
   gpt-5.4-mini@medium, o4-mini@high) that need the effort signal to
   behave at the intended cost/latency tier — without forwarding it
   they'd all run at their default effort, distorting the
   comparison.

   Forwarded conditionally (only when truthy). Runs against Anthropic
   slugs in non-thinking mode and DeepSeek v4 leave the kwarg unset;
   those slugs would return HTTP 400 if it were passed.

2. **Per-call usage accumulation in `run_json_prompt`** (OpenRouter
   adapter). Caught during the Slice 1K smoke run: every cost column
   read $0.0000 because the single-shot path never incremented the
   `_usage` counters — only `run_tool_loop` did. Mirrored the
   accumulator into the JSON-prompt path so the assistant / parser /
   structuring suites also surface accurate per-call cost.

Plus pricing-table additions in `tests/quality/provider_pricing.py`
for the two Slice 1K newcomers: `openai/o4-mini`
($1.10 / $4.40 per Mtok — substituted for the non-existent
`openai/gpt-5.1-mini`) and `anthropic/claude-haiku-4.5`
($1.00 / $5.00 per Mtok).

Verification: 22 / 22 openrouter-adapter unit tests pass. Live
preflight against all 5 Slice 1K slugs returned valid JSON.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/tests/quality/kimi_eval_service.py b/tests/quality/kimi_eval_service.py
@@ -142,17 +142,28 @@ def _get_client(self):
         return self._client
 
     def _chat(self, system_prompt: str, user_prompt: str, *, task_name: str,
-              max_tokens: int) -> str:
+              max_tokens: int, reasoning_effort: Any = None) -> str:
         if not self.is_available():
             raise AgentExecutionError("Kimi eval adapter not configured (KIMI_API_KEY).")
         started = time.perf_counter()
+        # Slice 1J'': reasoning_effort is valid only on reasoning-class
+        # slugs (OpenAI o-series / gpt-5.x). Other slugs 400 if it's
+        # set, so forward conditionally. The original adapter was built
+        # for Kimi K2 (non-thinking) — fine to leave the param off — but
+        # Phase B's runner now points this same adapter at o4-mini and
+        # gpt-5.4 reasoning slugs, and they need the effort signal to
+        # behave at the intended cost/latency tier.
+        extra_kwargs: dict[str, Any] = {}
+        if reasoning_effort:
+            extra_kwargs["reasoning_effort"] = reasoning_effort
         resp = self._get_client().chat.completions.create(
             model=self.default_model,
             messages=[{"role": "system", "content": system_prompt},
                       {"role": "user", "content": user_prompt}],
             response_format={"type": "json_object"},
             max_tokens=max_tokens,
             temperature=0,
+            **extra_kwargs,
         )
         usage = getattr(resp, "usage", None)
         pt = getattr(usage, "prompt_tokens", 0) or 0
@@ -185,7 +196,8 @@ def run_json_prompt(self, system_prompt, user_prompt, expected_keys=None,
         task = task_name or "unknown"
         content = self._chat(system_prompt, user_prompt, task_name=task,
                              max_tokens=min(max_completion_tokens or _EVAL_MAX_TOKENS,
-                                             _EVAL_MAX_TOKENS))
+                                             _EVAL_MAX_TOKENS),
+                             reasoning_effort=reasoning_effort)
         try:
             payload = _parse_provider_json(content)
         except ValueError as exc:
@@ -213,7 +225,8 @@ def run_structured_prompt(self, system_prompt, user_prompt, *,
         task = task_name or "unknown"
         content = self._chat(system_prompt, user_prompt, task_name=task,
                              max_tokens=min(max_completion_tokens or _EVAL_MAX_TOKENS,
-                                             _EVAL_MAX_TOKENS))
+                                             _EVAL_MAX_TOKENS),
+                             reasoning_effort=reasoning_effort)
         try:
             raw = _parse_provider_json(content)
         except ValueError as exc:
diff --git a/tests/quality/openrouter_eval_service.py b/tests/quality/openrouter_eval_service.py
@@ -312,6 +312,15 @@ def run_tool_loop(
         ]
         tool_trace: list[dict] = []
 
+        # Slice 1J'': reasoning_effort is only valid on reasoning-class
+        # models (OpenAI o-series and gpt-5.x via OpenRouter). Passing
+        # it to a non-reasoning slug (Sonnet, Haiku, DeepSeek v4) is a
+        # 400. Threaded as an optional kwarg so the call ignores it when
+        # the caller didn't ask. Truthy check rejects "" / None alike.
+        extra_kwargs: dict[str, Any] = {}
+        if reasoning_effort:
+            extra_kwargs["reasoning_effort"] = reasoning_effort
+
         for iteration in range(max_iterations):
             started_at = time.perf_counter()
             try:
@@ -323,6 +332,7 @@ def run_tool_loop(
                     response_format={"type": "json_object"},
                     max_tokens=max_completion_tokens,
                     temperature=0,
+                    **extra_kwargs,
                 )
             except Exception as exc:
                 LOGGER.exception(
@@ -452,6 +462,11 @@ def run_json_prompt(
                 "OpenRouter adapter is not configured (OPENROUTER_API_KEY)."
             )
         resolved_model = (model or self.default_model).strip()
+        # Slice 1J'': forward reasoning_effort only when set — see the
+        # matching comment in run_tool_loop for why this is conditional.
+        extra_kwargs: dict[str, Any] = {}
+        if reasoning_effort:
+            extra_kwargs["reasoning_effort"] = reasoning_effort
         response = self._get_client().chat.completions.create(
             model=resolved_model,
             messages=[
@@ -461,7 +476,20 @@ def run_json_prompt(
             response_format={"type": "json_object"},
             max_tokens=max_completion_tokens,
             temperature=0,
+            **extra_kwargs,
+        )
+        # Slice 1K bugfix: the smoke run showed $0.0000 cost on every
+        # call because this path never accumulated usage. run_tool_loop
+        # tracks it at the matching site — mirror that here so single-
+        # shot prompts (which is what the assistant + parser suites use)
+        # also surface accurate per-call cost.
+        usage = getattr(response, "usage", None)
+        self._usage["request_count"] += 1
+        self._usage["prompt_tokens"] += getattr(usage, "prompt_tokens", 0) or 0
+        self._usage["completion_tokens"] += (
+            getattr(usage, "completion_tokens", 0) or 0
         )
+        self._usage["total_tokens"] += getattr(usage, "total_tokens", 0) or 0
         content = response.choices[0].message.content or ""
         try:
             payload = _parse_provider_json(content)
@@ -506,6 +534,7 @@ def run_structured_prompt(
             max_completion_tokens=max_completion_tokens,
             task_name=task_name,
             model=model,
+            reasoning_effort=reasoning_effort,
         )
         try:
             return response_model.model_validate(payload)
diff --git a/tests/quality/provider_pricing.py b/tests/quality/provider_pricing.py
@@ -40,9 +40,16 @@
     # OpenRouter catalogue — the 4-candidate shortlist plus the
     # adjacent models we might compare to (Opus for judge runs, etc).
     "anthropic/claude-sonnet-4.5": (3.00, 15.00),
+    "anthropic/claude-haiku-4.5": (1.00, 5.00),  # Slice 1K assistant eval
     "anthropic/claude-opus-4.7": (15.00, 75.00),
     "google/gemini-3.1-pro-preview": (2.00, 12.00),
     "deepseek/deepseek-v4-pro": (0.50, 2.00),
+    # OpenAI o-series reasoning model (Slice 1K substituted o4-mini in
+    # for the non-existent gpt-5.1-mini). o4-mini is priced at the
+    # cheap-mini tier; reasoning tokens are billed as output tokens, so
+    # the eval cost on this slug is dominated by completion_tokens *
+    # output_rate when reasoning_effort=high is set.
+    "openai/o4-mini": (1.10, 4.40),
 
     # Slugs from the broader provider_ab_runner candidate slate.
     # Kept current so an apples-to-apples comparison stays possible