[codex] Reject reasoning-only model responses (#1537)

willccbb · web-flow · commit f54f1897c7b2 · 2026-06-04T02:30:22.000-07:00
diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py
@@ -1,7 +1,9 @@
-import pytest
 from types import SimpleNamespace
 
+import pytest
+
 from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
+from verifiers.errors import EmptyModelResponseError
 from verifiers.types import (
     AssistantMessage,
     ImageUrlContentPart,
@@ -18,6 +20,11 @@
 from verifiers.utils.response_utils import parse_response_message
 
 
+class _OpenAIMessage(SimpleNamespace):
+    def model_dump(self):
+        return dict(self.__dict__)  # fresh dict, as pydantic's model_dump returns
+
+
 @pytest.mark.asyncio
 async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
     client = OpenAIChatCompletionsClient(object())
@@ -52,6 +59,53 @@ async def test_openai_to_native_prompt_with_typed_multimodal_content_parts():
     ]
 
 
+@pytest.mark.asyncio
+async def test_openai_chat_rejects_reasoning_only_native_response():
+    client = OpenAIChatCompletionsClient(object())
+    native_response = SimpleNamespace(
+        choices=[
+            SimpleNamespace(
+                message=_OpenAIMessage(
+                    content=None,
+                    reasoning_content="hidden chain",
+                    tool_calls=None,
+                )
+            )
+        ]
+    )
+
+    with pytest.raises(EmptyModelResponseError, match="reasoning but no content"):
+        await client.raise_from_native_response(native_response)
+
+
+@pytest.mark.asyncio
+async def test_openai_chat_accepts_refusal_with_reasoning_native_response():
+    client = OpenAIChatCompletionsClient(object())
+    native_response = SimpleNamespace(
+        id="chatcmpl_refusal",
+        created=0,
+        model="gpt-5.2",
+        usage=None,
+        choices=[
+            SimpleNamespace(
+                finish_reason="stop",
+                message=_OpenAIMessage(
+                    content=None,
+                    refusal="I cannot help with that.",
+                    reasoning_content="hidden chain",
+                    tool_calls=None,
+                ),
+            )
+        ],
+    )
+
+    await client.raise_from_native_response(native_response)
+    response = await client.from_native_response(native_response)
+
+    assert response.message.content == "I cannot help with that."
+    assert response.message.reasoning_content == "hidden chain"
+
+
 @pytest.mark.asyncio
 async def test_anthropic_to_native_prompt_with_typed_multimodal_content_parts():
     pytest.importorskip("anthropic")
@@ -228,6 +282,24 @@ async def test_anthropic_from_native_response_always_parses_reasoning():
     assert response.message.content == "final answer"
 
 
+@pytest.mark.asyncio
+async def test_anthropic_rejects_reasoning_only_native_response():
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    client = AnthropicMessagesClient(object())
+    native_response = SimpleNamespace(
+        id="msg_think",
+        model="claude-haiku-4-5",
+        stop_reason="end_turn",
+        usage=SimpleNamespace(input_tokens=1, output_tokens=1),
+        content=[SimpleNamespace(type="thinking", thinking="hidden chain")],
+    )
+
+    with pytest.raises(EmptyModelResponseError, match="reasoning but no content"):
+        await client.raise_from_native_response(native_response)
+
+
 @pytest.mark.asyncio
 async def test_anthropic_tool_call_round_trips_thinking_blocks():
     pytest.importorskip("anthropic")
diff --git a/tests/test_openai_responses_client.py b/tests/test_openai_responses_client.py
@@ -8,6 +8,7 @@
     OPENAI_RESPONSES_OUTPUT_FIELD,
     OpenAIResponsesClient,
 )
+from verifiers.errors import EmptyModelResponseError
 from verifiers.types import (
     AssistantMessage,
     ClientConfig,
@@ -133,6 +134,24 @@ async def test_get_native_response_normalizes_sampling_args_and_tools():
     ]
 
 
+@pytest.mark.asyncio
+async def test_raise_from_native_response_rejects_reasoning_only_response():
+    native_response = SimpleNamespace(
+        output=[
+            {
+                "type": "reasoning",
+                "id": "rs_1",
+                "summary": [{"type": "summary_text", "text": "thinking"}],
+                "status": "completed",
+            }
+        ]
+    )
+    client = OpenAIResponsesClient(object())
+
+    with pytest.raises(EmptyModelResponseError, match="reasoning but no content"):
+        await client.raise_from_native_response(native_response)
+
+
 @pytest.mark.asyncio
 async def test_to_native_tool_omits_strict_when_unset():
     client = OpenAIResponsesClient(object())
diff --git a/tests/test_renderer_client.py b/tests/test_renderer_client.py
@@ -355,6 +355,14 @@ async def test_renderer_client_rejects_empty_dict_native_response():
         await client.raise_from_native_response({})
 
 
+@pytest.mark.asyncio
+async def test_renderer_client_rejects_reasoning_only_native_response():
+    client = object.__new__(RendererClient)
+
+    with pytest.raises(EmptyModelResponseError, match="reasoning but no content"):
+        await client.raise_from_native_response({"reasoning_content": "hidden chain"})
+
+
 @pytest.mark.asyncio
 async def test_from_native_response_uses_request_id_and_token_lengths():
     """vLLM's /inference/v1/generate returns ``request_id`` (not ``id``) and
diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
@@ -29,7 +29,7 @@
 )
 
 from verifiers.clients.client import Client
-from verifiers.errors import OverlongPromptError
+from verifiers.errors import EmptyModelResponseError, OverlongPromptError
 from verifiers.types import (
     AssistantMessage,
     ClientConfig,
@@ -378,7 +378,29 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
             )
 
     async def raise_from_native_response(self, response: AnthropicMessage) -> None:
-        pass
+        if response is None:
+            raise EmptyModelResponseError("Model returned no response")
+
+        has_text = False
+        has_tool_call = False
+        has_reasoning = False
+        for content_block in getattr(response, "content", []) or []:
+            block_type = getattr(content_block, "type", None)
+            if block_type == "text" and getattr(content_block, "text", None):
+                has_text = True
+            elif block_type == "tool_use":
+                has_tool_call = True
+            elif block_type in {"thinking", "redacted_thinking"}:
+                has_reasoning = True
+
+        if not (has_text or has_tool_call):
+            if has_reasoning:
+                raise EmptyModelResponseError(
+                    "Model returned reasoning but no content and did not call any tools"
+                )
+            raise EmptyModelResponseError(
+                "Model returned no content and did not call any tools"
+            )
 
     async def from_native_response(self, response: AnthropicMessage) -> Response:
         def parse_content(
diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
@@ -43,6 +43,7 @@
     ClientConfig,
     FinishReason,
     Message,
+    MessageContent,
     Messages,
     Response,
     ResponseMessage,
@@ -119,6 +120,14 @@ def content_to_text(content: Any) -> str:
     return ""
 
 
+def parse_refusal_content(message: Any) -> str | None:
+    if isinstance(message, Mapping):
+        refusal = message.get("refusal")
+    else:
+        refusal = getattr(message, "refusal", None)
+    return refusal if isinstance(refusal, str) and refusal else None
+
+
 DEFAULT_REASONING_FIELDS = [
     "reasoning",  # vLLM, Together AI, OpenRouter
     "reasoning_content",  # DeepSeek, Qwen/DashScope, SGLang, Fireworks AI, Kimi/Moonshot
@@ -331,15 +340,29 @@ async def raise_from_native_response(self, response: OpenAIChatResponse) -> None
                 f"Model returned {len(response.choices)} choices, expected 1"
             )
         message = response.choices[0].message
-        has_content = bool(content_to_text(getattr(message, "content", None)))
+        has_content = bool(
+            content_to_text(getattr(message, "content", None))
+            or parse_refusal_content(message)
+        )
         has_tool_calls = bool(getattr(message, "tool_calls", None))
         has_reasoning = bool(parse_reasoning_content(message))
-        if not (has_content or has_tool_calls or has_reasoning):
+        if not (has_content or has_tool_calls):
+            if has_reasoning:
+                raise EmptyModelResponseError(
+                    "Model returned reasoning but no content and did not call any tools"
+                )
             raise EmptyModelResponseError(
-                "Model returned no content, reasoning, and did not call any tools"
+                "Model returned no content and did not call any tools"
             )
 
     async def from_native_response(self, response: OpenAIChatResponse) -> Response:
+        def parse_content(response: OpenAIChatResponse) -> MessageContent | None:
+            message = response.choices[0].message
+            content = message.content
+            if content_to_text(content):
+                return content
+            return parse_refusal_content(message)
+
         def parse_single_tool_call(tool_call: Any) -> ToolCall | None:
             if isinstance(tool_call, ChatCompletionMessageFunctionToolCall):
                 return ToolCall(
@@ -511,7 +534,7 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None:
             model=model,
             usage=parse_usage(response),
             message=ResponseMessage(
-                content=response.choices[0].message.content,
+                content=parse_content(response),
                 reasoning_content=parse_reasoning_content(response.choices[0].message),
                 finish_reason=parse_finish_reason(response),
                 is_truncated=parse_is_truncated(response),
diff --git a/verifiers/clients/openai_responses_client.py b/verifiers/clients/openai_responses_client.py
@@ -293,9 +293,13 @@ async def raise_from_native_response(
                     ):
                         has_text = True
 
-        if not (has_text or has_tool_call or has_reasoning):
+        if not (has_text or has_tool_call):
+            if has_reasoning:
+                raise EmptyModelResponseError(
+                    "Model returned reasoning but no content and did not call any tools"
+                )
             raise EmptyModelResponseError(
-                "Model returned no content, reasoning, and did not call any tools"
+                "Model returned no content and did not call any tools"
             )
 
     async def from_native_response(
diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py
@@ -641,9 +641,13 @@ async def raise_from_native_response(self, response: dict[str, Any]) -> None:
         # model having tried to call a tool, so we don't filter by status here.
         has_tool_calls = bool(response.get("tool_calls"))
         has_reasoning = bool(response.get("reasoning_content"))
-        if not (has_content or has_tool_calls or has_reasoning):
+        if not (has_content or has_tool_calls):
+            if has_reasoning:
+                raise EmptyModelResponseError(
+                    "Model returned reasoning but no content and did not call any tools"
+                )
             raise EmptyModelResponseError(
-                "Model returned no content, reasoning, and did not call any tools"
+                "Model returned no content and did not call any tools"
             )
 
     async def from_native_response(self, response: dict[str, Any]) -> Response: