Skip to content

Commit f54f189

Browse files
authored
[codex] Reject reasoning-only model responses (#1537)
1 parent 7211661 commit f54f189

7 files changed

Lines changed: 163 additions & 11 deletions

tests/test_client_multimodal_types.py

Lines changed: 73 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
1-
import pytest
21
from types import SimpleNamespace
32

3+
import pytest
4+
45
from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
6+
from verifiers.errors import EmptyModelResponseError
57
from verifiers.types import (
68
AssistantMessage,
79
ImageUrlContentPart,
@@ -18,6 +20,11 @@
1820
from verifiers.utils.response_utils import parse_response_message
1921

2022

23+
class _OpenAIMessage(SimpleNamespace):
    """Minimal stand-in for an OpenAI SDK chat message used by these tests.

    Attributes are whatever keyword arguments are passed to the constructor
    (inherited ``SimpleNamespace`` behavior).
    """

    def model_dump(self) -> dict:
        """Return the message attributes as a new dict.

        A shallow copy is returned instead of ``self.__dict__`` itself so
        that code mutating the dump cannot silently change the message.
        """
        return dict(self.__dict__)
26+
27+
2128
@pytest.mark.asyncio
2229
async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
2330
client = OpenAIChatCompletionsClient(object())
@@ -52,6 +59,53 @@ async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
5259
]
5360

5461

62+
@pytest.mark.asyncio
async def test_openai_chat_rejects_reasoning_only_native_response():
    """A chat completion that carries only reasoning is rejected as empty."""
    reasoning_only_message = _OpenAIMessage(
        content=None,
        reasoning_content="hidden chain",
        tool_calls=None,
    )
    native_response = SimpleNamespace(
        choices=[SimpleNamespace(message=reasoning_only_message)]
    )
    client = OpenAIChatCompletionsClient(object())

    with pytest.raises(EmptyModelResponseError, match="reasoning but no content"):
        await client.raise_from_native_response(native_response)
79+
80+
81+
@pytest.mark.asyncio
async def test_openai_chat_accepts_refusal_with_reasoning_native_response():
    """A refusal alongside reasoning passes validation and becomes the content."""
    refusal_message = _OpenAIMessage(
        content=None,
        refusal="I cannot help with that.",
        reasoning_content="hidden chain",
        tool_calls=None,
    )
    only_choice = SimpleNamespace(finish_reason="stop", message=refusal_message)
    native_response = SimpleNamespace(
        id="chatcmpl_refusal",
        created=0,
        model="gpt-5.2",
        usage=None,
        choices=[only_choice],
    )
    client = OpenAIChatCompletionsClient(object())

    # Validation must not raise for a refusal-bearing message.
    await client.raise_from_native_response(native_response)
    parsed = await client.from_native_response(native_response)

    assert parsed.message.content == "I cannot help with that."
    assert parsed.message.reasoning_content == "hidden chain"
107+
108+
55109
@pytest.mark.asyncio
56110
async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
57111
pytest.importorskip("anthropic")
@@ -228,6 +282,24 @@ async def test_anthropic_from_native_response_always_parses_reasoning():
228282
assert response.message.content == "final answer"
229283

230284

285+
@pytest.mark.asyncio
async def test_anthropic_rejects_reasoning_only_native_response():
    """An Anthropic message containing only a thinking block is rejected."""
    pytest.importorskip("anthropic")
    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient

    thinking_block = SimpleNamespace(type="thinking", thinking="hidden chain")
    token_usage = SimpleNamespace(input_tokens=1, output_tokens=1)
    native_response = SimpleNamespace(
        id="msg_think",
        model="claude-haiku-4-5",
        stop_reason="end_turn",
        usage=token_usage,
        content=[thinking_block],
    )
    client = AnthropicMessagesClient(object())

    with pytest.raises(EmptyModelResponseError, match="reasoning but no content"):
        await client.raise_from_native_response(native_response)
301+
302+
231303
@pytest.mark.asyncio
232304
async def test_anthropic_tool_call_round_trips_thinking_blocks():
233305
pytest.importorskip("anthropic")

tests/test_openai_responses_client.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
OPENAI_RESPONSES_OUTPUT_FIELD,
99
OpenAIResponsesClient,
1010
)
11+
from verifiers.errors import EmptyModelResponseError
1112
from verifiers.types import (
1213
AssistantMessage,
1314
ClientConfig,
@@ -133,6 +134,24 @@ async def test_get_native_response_normalizes_sampling_args_and_tools():
133134
]
134135

135136

137+
@pytest.mark.asyncio
async def test_raise_from_native_response_rejects_reasoning_only_response():
    """A Responses API output holding only a reasoning item is rejected."""
    reasoning_item = {
        "type": "reasoning",
        "id": "rs_1",
        "summary": [{"type": "summary_text", "text": "thinking"}],
        "status": "completed",
    }
    native_response = SimpleNamespace(output=[reasoning_item])
    client = OpenAIResponsesClient(object())

    with pytest.raises(EmptyModelResponseError, match="reasoning but no content"):
        await client.raise_from_native_response(native_response)
153+
154+
136155
@pytest.mark.asyncio
137156
async def test_to_native_tool_omits_strict_when_unset():
138157
client = OpenAIResponsesClient(object())

tests/test_renderer_client.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -355,6 +355,14 @@ async def test_renderer_client_rejects_empty_dict_native_response():
355355
await client.raise_from_native_response({})
356356

357357

358+
@pytest.mark.asyncio
async def test_renderer_client_rejects_reasoning_only_native_response():
    """A renderer response dict with only ``reasoning_content`` is rejected."""
    reasoning_only_payload = {"reasoning_content": "hidden chain"}
    # object.__new__ allocates the instance without running RendererClient.__init__.
    client = object.__new__(RendererClient)

    with pytest.raises(EmptyModelResponseError, match="reasoning but no content"):
        await client.raise_from_native_response(reasoning_only_payload)
364+
365+
358366
@pytest.mark.asyncio
359367
async def test_from_native_response_uses_request_id_and_token_lengths():
360368
"""vLLM's /inference/v1/generate returns ``request_id`` (not ``id``) and

verifiers/clients/anthropic_messages_client.py

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@
2929
)
3030

3131
from verifiers.clients.client import Client
32-
from verifiers.errors import OverlongPromptError
32+
from verifiers.errors import EmptyModelResponseError, OverlongPromptError
3333
from verifiers.types import (
3434
AssistantMessage,
3535
ClientConfig,
@@ -378,7 +378,29 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
378378
)
379379

380380
async def raise_from_native_response(self, response: AnthropicMessage) -> None:
    """Reject Anthropic responses that carry no usable output.

    A response is accepted when it has at least one ``text`` block with
    non-empty text or at least one ``tool_use`` block. Thinking blocks alone
    do not count as output.

    Raises:
        EmptyModelResponseError: if ``response`` is ``None``, or if it has
            neither non-empty text nor a tool call. The message distinguishes
            the reasoning-only case from the fully empty case.
    """
    if response is None:
        raise EmptyModelResponseError("Model returned no response")

    # Single pass over the blocks, recording which kinds of output were seen.
    seen = set()
    for block in getattr(response, "content", []) or []:
        kind = getattr(block, "type", None)
        if kind == "text" and getattr(block, "text", None):
            seen.add("text")
        elif kind == "tool_use":
            seen.add("tool_call")
        elif kind in {"thinking", "redacted_thinking"}:
            seen.add("reasoning")

    if "text" in seen or "tool_call" in seen:
        return

    if "reasoning" in seen:
        raise EmptyModelResponseError(
            "Model returned reasoning but no content and did not call any tools"
        )
    raise EmptyModelResponseError(
        "Model returned no content and did not call any tools"
    )
382404

383405
async def from_native_response(self, response: AnthropicMessage) -> Response:
384406
def parse_content(

verifiers/clients/openai_chat_completions_client.py

Lines changed: 27 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@
4343
ClientConfig,
4444
FinishReason,
4545
Message,
46+
MessageContent,
4647
Messages,
4748
Response,
4849
ResponseMessage,
@@ -119,6 +120,14 @@ def content_to_text(content: Any) -> str:
119120
return ""
120121

121122

123+
def parse_refusal_content(message: Any) -> str | None:
124+
if isinstance(message, Mapping):
125+
refusal = message.get("refusal")
126+
else:
127+
refusal = getattr(message, "refusal", None)
128+
return refusal if isinstance(refusal, str) and refusal else None
129+
130+
122131
DEFAULT_REASONING_FIELDS = [
123132
"reasoning", # vLLM, Together AI, OpenRouter
124133
"reasoning_content", # DeepSeek, Qwen/DashScope, SGLang, Fireworks AI, Kimi/Moonshot
@@ -331,15 +340,29 @@ async def raise_from_native_response(self, response: OpenAIChatResponse) -> None
331340
f"Model returned {len(response.choices)} choices, expected 1"
332341
)
333342
message = response.choices[0].message
334-
has_content = bool(content_to_text(getattr(message, "content", None)))
343+
has_content = bool(
344+
content_to_text(getattr(message, "content", None))
345+
or parse_refusal_content(message)
346+
)
335347
has_tool_calls = bool(getattr(message, "tool_calls", None))
336348
has_reasoning = bool(parse_reasoning_content(message))
337-
if not (has_content or has_tool_calls or has_reasoning):
349+
if not (has_content or has_tool_calls):
350+
if has_reasoning:
351+
raise EmptyModelResponseError(
352+
"Model returned reasoning but no content and did not call any tools"
353+
)
338354
raise EmptyModelResponseError(
339-
"Model returned no content, reasoning, and did not call any tools"
355+
"Model returned no content and did not call any tools"
340356
)
341357

342358
async def from_native_response(self, response: OpenAIChatResponse) -> Response:
359+
def parse_content(response: OpenAIChatResponse) -> MessageContent | None:
    """Return the first choice's content, or its refusal when content has no text."""
    first_message = response.choices[0].message
    raw_content = first_message.content
    # Content wins whenever it yields text; otherwise fall back to the refusal.
    return (
        raw_content
        if content_to_text(raw_content)
        else parse_refusal_content(first_message)
    )
365+
343366
def parse_single_tool_call(tool_call: Any) -> ToolCall | None:
344367
if isinstance(tool_call, ChatCompletionMessageFunctionToolCall):
345368
return ToolCall(
@@ -511,7 +534,7 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
511534
model=model,
512535
usage=parse_usage(response),
513536
message=ResponseMessage(
514-
content=response.choices[0].message.content,
537+
content=parse_content(response),
515538
reasoning_content=parse_reasoning_content(response.choices[0].message),
516539
finish_reason=parse_finish_reason(response),
517540
is_truncated=parse_is_truncated(response),

verifiers/clients/openai_responses_client.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -293,9 +293,13 @@ async def raise_from_native_response(
293293
):
294294
has_text = True
295295

296-
if not (has_text or has_tool_call or has_reasoning):
296+
if not (has_text or has_tool_call):
297+
if has_reasoning:
298+
raise EmptyModelResponseError(
299+
"Model returned reasoning but no content and did not call any tools"
300+
)
297301
raise EmptyModelResponseError(
298-
"Model returned no content, reasoning, and did not call any tools"
302+
"Model returned no content and did not call any tools"
299303
)
300304

301305
async def from_native_response(

verifiers/clients/renderer_client.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -641,9 +641,13 @@ async def raise_from_native_response(self, response: dict[str, Any]) -> None:
641641
# model having tried to call a tool, so we don't filter by status here.
642642
has_tool_calls = bool(response.get("tool_calls"))
643643
has_reasoning = bool(response.get("reasoning_content"))
644-
if not (has_content or has_tool_calls or has_reasoning):
644+
if not (has_content or has_tool_calls):
645+
if has_reasoning:
646+
raise EmptyModelResponseError(
647+
"Model returned reasoning but no content and did not call any tools"
648+
)
645649
raise EmptyModelResponseError(
646-
"Model returned no content, reasoning, and did not call any tools"
650+
"Model returned no content and did not call any tools"
647651
)
648652

649653
async def from_native_response(self, response: dict[str, Any]) -> Response:

0 commit comments

Comments
 (0)