Skip to content

Commit 02e9853

Browse files
LEANDERANTONY and claude committed
feat(eval): Slice 1J'' — reasoning_effort + run_json_prompt usage tracking
Two eval-adapter extensions needed for Slice 1K's 5-candidate slate:

1. **reasoning_effort thread-through**. `OpenRouterEvalService` and `KimiEvalService` both accepted `reasoning_effort` as a kwarg for signature compatibility with `OpenAIService` but never forwarded it into the underlying `chat.completions.create` call. The Slice 1K slate has three reasoning-class candidates (gpt-5.4@medium, gpt-5.4-mini@medium, o4-mini@high) that need the effort signal to behave at the intended cost/latency tier — without forwarding it they'd all run at their default effort, distorting the comparison. Forwarded conditionally (only when truthy). Anthropic slugs in non-thinking mode and DeepSeek v4 leave the kwarg unset and would 400 if it were passed.

2. **Per-call usage accumulation in `run_json_prompt`**. Caught during the Slice 1K smoke run: every cost column read $0.0000 because the single-shot path was never bumping the `_usage` counters — only `run_tool_loop` was. Mirrored the accumulator into the JSON-prompt path so the assistant / parser / structuring suites also surface accurate per-call cost.

Plus pricing-table additions in `tests/quality/provider_pricing.py` for the two Slice 1K newcomers: `openai/o4-mini` ($1.10 / $4.40 per Mtok — substituted for the non-existent `openai/gpt-5.1-mini`) and `anthropic/claude-haiku-4.5` ($1.00 / $5.00 per Mtok).

Verification: 22 / 22 openrouter-adapter unit tests pass. Live preflight against all 5 Slice 1K slugs returned valid JSON.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a619cd8 commit 02e9853

3 files changed

Lines changed: 52 additions & 3 deletions

File tree

tests/quality/kimi_eval_service.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,17 +142,28 @@ def _get_client(self):
142142
return self._client
143143

144144
def _chat(self, system_prompt: str, user_prompt: str, *, task_name: str,
145-
max_tokens: int) -> str:
145+
max_tokens: int, reasoning_effort: Any = None) -> str:
146146
if not self.is_available():
147147
raise AgentExecutionError("Kimi eval adapter not configured (KIMI_API_KEY).")
148148
started = time.perf_counter()
149+
# Slice 1J'': reasoning_effort is valid only on reasoning-class
150+
# slugs (OpenAI o-series / gpt-5.x). Other slugs 400 if it's
151+
# set, so forward conditionally. The original adapter was built
152+
# for Kimi K2 (non-thinking) — fine to leave the param off — but
153+
# Phase B's runner now points this same adapter at o4-mini and
154+
# gpt-5.4 reasoning slugs, and they need the effort signal to
155+
# behave at the intended cost/latency tier.
156+
extra_kwargs: dict[str, Any] = {}
157+
if reasoning_effort:
158+
extra_kwargs["reasoning_effort"] = reasoning_effort
149159
resp = self._get_client().chat.completions.create(
150160
model=self.default_model,
151161
messages=[{"role": "system", "content": system_prompt},
152162
{"role": "user", "content": user_prompt}],
153163
response_format={"type": "json_object"},
154164
max_tokens=max_tokens,
155165
temperature=0,
166+
**extra_kwargs,
156167
)
157168
usage = getattr(resp, "usage", None)
158169
pt = getattr(usage, "prompt_tokens", 0) or 0
@@ -185,7 +196,8 @@ def run_json_prompt(self, system_prompt, user_prompt, expected_keys=None,
185196
task = task_name or "unknown"
186197
content = self._chat(system_prompt, user_prompt, task_name=task,
187198
max_tokens=min(max_completion_tokens or _EVAL_MAX_TOKENS,
188-
_EVAL_MAX_TOKENS))
199+
_EVAL_MAX_TOKENS),
200+
reasoning_effort=reasoning_effort)
189201
try:
190202
payload = _parse_provider_json(content)
191203
except ValueError as exc:
@@ -213,7 +225,8 @@ def run_structured_prompt(self, system_prompt, user_prompt, *,
213225
task = task_name or "unknown"
214226
content = self._chat(system_prompt, user_prompt, task_name=task,
215227
max_tokens=min(max_completion_tokens or _EVAL_MAX_TOKENS,
216-
_EVAL_MAX_TOKENS))
228+
_EVAL_MAX_TOKENS),
229+
reasoning_effort=reasoning_effort)
217230
try:
218231
raw = _parse_provider_json(content)
219232
except ValueError as exc:

tests/quality/openrouter_eval_service.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,15 @@ def run_tool_loop(
312312
]
313313
tool_trace: list[dict] = []
314314

315+
# Slice 1J'': reasoning_effort is only valid on reasoning-class
316+
# models (OpenAI o-series and gpt-5.x via OpenRouter). Passing
317+
# it to a non-reasoning slug (Sonnet, Haiku, DeepSeek v4) is a
318+
# 400. Threaded as an optional kwarg so the call ignores it when
319+
# the caller didn't ask. Truthy check rejects "" / None alike.
320+
extra_kwargs: dict[str, Any] = {}
321+
if reasoning_effort:
322+
extra_kwargs["reasoning_effort"] = reasoning_effort
323+
315324
for iteration in range(max_iterations):
316325
started_at = time.perf_counter()
317326
try:
@@ -323,6 +332,7 @@ def run_tool_loop(
323332
response_format={"type": "json_object"},
324333
max_tokens=max_completion_tokens,
325334
temperature=0,
335+
**extra_kwargs,
326336
)
327337
except Exception as exc:
328338
LOGGER.exception(
@@ -452,6 +462,11 @@ def run_json_prompt(
452462
"OpenRouter adapter is not configured (OPENROUTER_API_KEY)."
453463
)
454464
resolved_model = (model or self.default_model).strip()
465+
# Slice 1J'': forward reasoning_effort only when set — see the
466+
# matching comment in run_tool_loop for why this is conditional.
467+
extra_kwargs: dict[str, Any] = {}
468+
if reasoning_effort:
469+
extra_kwargs["reasoning_effort"] = reasoning_effort
455470
response = self._get_client().chat.completions.create(
456471
model=resolved_model,
457472
messages=[
@@ -461,7 +476,20 @@ def run_json_prompt(
461476
response_format={"type": "json_object"},
462477
max_tokens=max_completion_tokens,
463478
temperature=0,
479+
**extra_kwargs,
480+
)
481+
# Slice 1K bugfix: the smoke run showed $0.0000 cost on every
482+
# call because this path never accumulated usage. run_tool_loop
483+
# tracks it at the matching site — mirror that here so single-
484+
# shot prompts (which is what the assistant + parser suites use)
485+
# also surface accurate per-call cost.
486+
usage = getattr(response, "usage", None)
487+
self._usage["request_count"] += 1
488+
self._usage["prompt_tokens"] += getattr(usage, "prompt_tokens", 0) or 0
489+
self._usage["completion_tokens"] += (
490+
getattr(usage, "completion_tokens", 0) or 0
464491
)
492+
self._usage["total_tokens"] += getattr(usage, "total_tokens", 0) or 0
465493
content = response.choices[0].message.content or ""
466494
try:
467495
payload = _parse_provider_json(content)
@@ -506,6 +534,7 @@ def run_structured_prompt(
506534
max_completion_tokens=max_completion_tokens,
507535
task_name=task_name,
508536
model=model,
537+
reasoning_effort=reasoning_effort,
509538
)
510539
try:
511540
return response_model.model_validate(payload)

tests/quality/provider_pricing.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,16 @@
4040
# OpenRouter catalogue — the 4-candidate shortlist plus the
4141
# adjacent models we might compare to (Opus for judge runs, etc).
4242
"anthropic/claude-sonnet-4.5": (3.00, 15.00),
43+
"anthropic/claude-haiku-4.5": (1.00, 5.00), # Slice 1K assistant eval
4344
"anthropic/claude-opus-4.7": (15.00, 75.00),
4445
"google/gemini-3.1-pro-preview": (2.00, 12.00),
4546
"deepseek/deepseek-v4-pro": (0.50, 2.00),
47+
# OpenAI o-series reasoning model (Slice 1K substituted o4-mini in
48+
# for the non-existent gpt-5.1-mini). The o-series is priced at the
49+
# cheap-mini tier; reasoning tokens are billed as output tokens, so
50+
# the eval cost on this slug is dominated by completion_tokens *
51+
# output_rate when reasoning_effort=high is set.
52+
"openai/o4-mini": (1.10, 4.40),
4653

4754
# Slugs from the broader provider_ab_runner candidate slate.
4855
# Kept current so an apples-to-apples comparison stays possible

0 commit comments

Comments
 (0)