Skip to content

Commit 8f2c2ab

Browse files
authored
fix(backends): populate mot._thinking from vLLM 'reasoning' wire key in LiteLLMBackend (#1169)
* fix(backends): populate mot._thinking from vLLM 'reasoning' key in LiteLLMBackend

  vLLM's reasoning parser surfaces the trace under the wire key "reasoning", not "reasoning_content". LiteLLM's normalisation layer only remaps this in the openai/ provider path (gpt_transformation.py) and only from v1.83 onward. Mellea's LiteLLMBackend.processing() was relying on hasattr checks for reasoning_content, which missed the raw "reasoning" field on both:
  - non-streaming Message objects (no __init__ fallback in LiteLLM)
  - any provider path where LiteLLM hasn't done the remap

  Replace both hasattr guards with a dual-key probe: .get("reasoning_content") or .get("reasoning")

  Both litellm.Message and litellm.Delta extend SafeAttributeModel/OpenAIObject and support .get(), so this works across streaming and non-streaming paths. Priority is given to reasoning_content so that providers LiteLLM has already normalised behave identically to before.

  Fixes #1070.

  Assisted-by: Claude Code
  Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>

* fix(backends): address review findings on mot._thinking vLLM fix

  - Replace `or` short-circuit with explicit `is None` guard in both the non-streaming and streaming branches of LiteLLMBackend.processing(). The `or` pattern silently fell through to "reasoning" when "reasoning_content" was an empty string, which is a valid intermediate streaming delta chunk.
  - Convert all tests to async def + await to match the project's pytest-asyncio AUTO mode convention and avoid the deprecated asyncio.get_event_loop() pattern on Python 3.12+.
  - Add two new test cases asserting that reasoning_content takes priority when both wire keys are present simultaneously (non-streaming and streaming paths).

  Assisted-by: Claude Code
  Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>

* test(backends): address litellm thinking review findings

  - Remove stale review-artefact parenthetical from streaming branch comment
  - Relax _fresh_mot() to leave _thinking=None so production None-coercion path is exercised on first chunk
  - Add empty-string reasoning_content tests (non-streaming + streaming) to lock in is-None guard semantics: empty-string does not fall back to the reasoning key

  Assisted-by: Claude Code
  Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>

---------

Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
1 parent 5a8ec8f commit 8f2c2ab

2 files changed

Lines changed: 237 additions & 10 deletions

File tree

mellea/backends/litellm.py

Lines changed: 14 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -435,11 +435,14 @@ async def processing(
435435

436436
message = choice.message
437437

438-
# Sometimes a message doesn't actually have this field.
439-
if hasattr(message, "reasoning_content"):
440-
thinking_chunk = message.reasoning_content
441-
if thinking_chunk is not None:
442-
mot._thinking += thinking_chunk
438+
# vLLM exposes the reasoning trace under "reasoning" (not "reasoning_content").
439+
# Some OpenAI-compatible servers (e.g. vLLM, SGLang) use this key; older LiteLLM
440+
# versions do not remap it. Use is-None guard so an empty-string chunk isn't lost.
441+
thinking_chunk = message.get("reasoning_content")
442+
if thinking_chunk is None:
443+
thinking_chunk = message.get("reasoning")
444+
if thinking_chunk is not None:
445+
mot._thinking += thinking_chunk
443446

444447
content_chunk = message.content
445448
if content_chunk is not None:
@@ -453,11 +456,12 @@ async def processing(
453456
elif isinstance(chunk, litellm.ModelResponseStream): # type: ignore
454457
message_delta = chunk.choices[0].delta
455458

456-
# Sometimes a delta doesn't actually have this field.
457-
if hasattr(message_delta, "reasoning_content"):
458-
thinking_chunk = message_delta.reasoning_content
459-
if thinking_chunk is not None:
460-
mot._thinking += thinking_chunk
459+
# Same dual-key probe for streaming deltas.
460+
thinking_chunk = message_delta.get("reasoning_content")
461+
if thinking_chunk is None:
462+
thinking_chunk = message_delta.get("reasoning")
463+
if thinking_chunk is not None:
464+
mot._thinking += thinking_chunk
461465

462466
content_chunk = message_delta.content
463467
if content_chunk is not None:
Lines changed: 223 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,223 @@
1+
"""Unit tests for LiteLLMBackend mot._thinking population.
2+
3+
Covers the vLLM case where the wire key is ``"reasoning"`` instead of
4+
``"reasoning_content"``, and the case where LiteLLM has already normalised
5+
it to ``"reasoning_content"`` (so both keys are exercised).
6+
"""
7+
8+
import pytest
9+
10+
pytest.importorskip("litellm", reason="litellm not installed — install mellea[litellm]")
11+
12+
from litellm.types.utils import (
13+
Choices,
14+
Delta,
15+
Message,
16+
ModelResponse,
17+
ModelResponseStream,
18+
StreamingChoices,
19+
)
20+
21+
from mellea.backends.litellm import LiteLLMBackend
22+
from mellea.core import ModelOutputThunk
23+
24+
25+
def _make_non_streaming_chunk(
26+
content: str, reasoning_key: str, reasoning_value: str
27+
) -> ModelResponse:
28+
"""Build a minimal non-streaming ModelResponse with a custom reasoning key."""
29+
msg = Message(content=content, role="assistant")
30+
msg[reasoning_key] = reasoning_value
31+
choice = Choices(finish_reason="stop", index=0, message=msg)
32+
return ModelResponse(
33+
id="test",
34+
choices=[choice],
35+
created=0,
36+
model="openai/qwen3",
37+
object="chat.completion",
38+
)
39+
40+
41+
def _make_streaming_chunk(
42+
content: str, reasoning_key: str, reasoning_value: str
43+
) -> ModelResponseStream:
44+
"""Build a minimal streaming delta chunk with a custom reasoning key."""
45+
delta = Delta(content=content)
46+
delta[reasoning_key] = reasoning_value
47+
chunk_choice = StreamingChoices(finish_reason=None, index=0, delta=delta)
48+
return ModelResponseStream(
49+
id="test", choices=[chunk_choice], created=0, model="openai/qwen3"
50+
)
51+
52+
53+
@pytest.fixture()
54+
def backend() -> LiteLLMBackend:
55+
return LiteLLMBackend(model_id="openai/qwen3", base_url="http://localhost:8000/v1")
56+
57+
58+
def _fresh_mot() -> ModelOutputThunk:
59+
mot: ModelOutputThunk = ModelOutputThunk(None)
60+
mot._meta = {}
61+
return mot
62+
63+
64+
# ---------------------------------------------------------------------------
65+
# Non-streaming path
66+
# ---------------------------------------------------------------------------
67+
68+
69+
async def test_processing_non_streaming_reasoning_content_key(backend: LiteLLMBackend):
70+
"""reasoning_content (normalised key) is captured correctly."""
71+
mot = _fresh_mot()
72+
chunk = _make_non_streaming_chunk(
73+
content="Paris",
74+
reasoning_key="reasoning_content",
75+
reasoning_value="France has its capital in Paris.",
76+
)
77+
await backend.processing(mot, chunk)
78+
assert mot._thinking == "France has its capital in Paris."
79+
assert mot._underlying_value == "Paris"
80+
81+
82+
async def test_processing_non_streaming_reasoning_raw_key(backend: LiteLLMBackend):
83+
"""Fallback: vLLM 'reasoning' key (not normalised by older LiteLLM) is captured."""
84+
mot = _fresh_mot()
85+
chunk = _make_non_streaming_chunk(
86+
content="Paris",
87+
reasoning_key="reasoning",
88+
reasoning_value="France has its capital in Paris.",
89+
)
90+
await backend.processing(mot, chunk)
91+
assert mot._thinking == "France has its capital in Paris."
92+
assert mot._underlying_value == "Paris"
93+
94+
95+
async def test_processing_non_streaming_reasoning_content_wins_over_reasoning(
96+
backend: LiteLLMBackend,
97+
):
98+
"""reasoning_content takes priority when both keys are present."""
99+
mot = _fresh_mot()
100+
msg = Message(content="Paris", role="assistant")
101+
msg["reasoning_content"] = "from_reasoning_content"
102+
msg["reasoning"] = "from_reasoning"
103+
choice = Choices(finish_reason="stop", index=0, message=msg)
104+
chunk = ModelResponse(
105+
id="test",
106+
choices=[choice],
107+
created=0,
108+
model="openai/qwen3",
109+
object="chat.completion",
110+
)
111+
await backend.processing(mot, chunk)
112+
assert mot._thinking == "from_reasoning_content"
113+
114+
115+
async def test_processing_non_streaming_no_reasoning(backend: LiteLLMBackend):
116+
"""No reasoning key — thinking stays empty string, content is captured."""
117+
mot = _fresh_mot()
118+
chunk = _make_non_streaming_chunk(
119+
content="Paris",
120+
reasoning_key="unrelated_key",
121+
reasoning_value="should be ignored",
122+
)
123+
await backend.processing(mot, chunk)
124+
assert mot._thinking == ""
125+
assert mot._underlying_value == "Paris"
126+
127+
128+
async def test_processing_non_streaming_empty_reasoning_content_does_not_fall_back(
129+
backend: LiteLLMBackend,
130+
):
131+
"""Empty-string reasoning_content wins — does not fall back to reasoning key.
132+
133+
Validates that the is-None guard (not ``or``) is used: an empty-string
134+
``reasoning_content`` chunk is preserved as-is, not silently replaced by the
135+
fallback ``reasoning`` value.
136+
"""
137+
mot = _fresh_mot()
138+
msg = Message(content="Paris", role="assistant")
139+
msg["reasoning_content"] = ""
140+
msg["reasoning"] = "should not appear"
141+
choice = Choices(finish_reason="stop", index=0, message=msg)
142+
chunk = ModelResponse(
143+
id="test",
144+
choices=[choice],
145+
created=0,
146+
model="openai/qwen3",
147+
object="chat.completion",
148+
)
149+
await backend.processing(mot, chunk)
150+
assert mot._thinking == ""
151+
152+
153+
# ---------------------------------------------------------------------------
154+
# Streaming path
155+
# ---------------------------------------------------------------------------
156+
157+
158+
async def test_processing_streaming_reasoning_content_key(backend: LiteLLMBackend):
159+
"""Streaming: reasoning_content key is accumulated across chunks."""
160+
mot = _fresh_mot()
161+
for text in ("chunk1 ", "chunk2"):
162+
stream_chunk = _make_streaming_chunk(
163+
content="", reasoning_key="reasoning_content", reasoning_value=text
164+
)
165+
await backend.processing(mot, stream_chunk)
166+
assert mot._thinking == "chunk1 chunk2"
167+
168+
169+
async def test_processing_streaming_reasoning_raw_key(backend: LiteLLMBackend):
170+
"""Streaming fallback: vLLM 'reasoning' key is accumulated across chunks."""
171+
mot = _fresh_mot()
172+
for text in ("chunk1 ", "chunk2"):
173+
stream_chunk = _make_streaming_chunk(
174+
content="", reasoning_key="reasoning", reasoning_value=text
175+
)
176+
await backend.processing(mot, stream_chunk)
177+
assert mot._thinking == "chunk1 chunk2"
178+
179+
180+
async def test_processing_streaming_reasoning_content_wins_over_reasoning(
181+
backend: LiteLLMBackend,
182+
):
183+
"""Streaming: reasoning_content takes priority when both keys are present."""
184+
mot = _fresh_mot()
185+
delta = Delta(content="")
186+
delta["reasoning_content"] = "from_reasoning_content"
187+
delta["reasoning"] = "from_reasoning"
188+
chunk_choice = StreamingChoices(finish_reason=None, index=0, delta=delta)
189+
stream_chunk = ModelResponseStream(
190+
id="test", choices=[chunk_choice], created=0, model="openai/qwen3"
191+
)
192+
await backend.processing(mot, stream_chunk)
193+
assert mot._thinking == "from_reasoning_content"
194+
195+
196+
async def test_processing_streaming_no_reasoning(backend: LiteLLMBackend):
197+
"""Streaming: no reasoning key — thinking stays empty string."""
198+
mot = _fresh_mot()
199+
stream_chunk = _make_streaming_chunk(
200+
content="Paris", reasoning_key="unrelated_key", reasoning_value="ignored"
201+
)
202+
await backend.processing(mot, stream_chunk)
203+
assert mot._thinking == ""
204+
assert mot._underlying_value == "Paris"
205+
206+
207+
async def test_processing_streaming_empty_reasoning_content_does_not_fall_back(
208+
backend: LiteLLMBackend,
209+
):
210+
"""Streaming: empty-string reasoning_content wins — does not fall back to reasoning key.
211+
212+
Validates that the is-None guard (not ``or``) is used in the streaming branch too.
213+
"""
214+
mot = _fresh_mot()
215+
delta = Delta(content="")
216+
delta["reasoning_content"] = ""
217+
delta["reasoning"] = "should not appear"
218+
chunk_choice = StreamingChoices(finish_reason=None, index=0, delta=delta)
219+
stream_chunk = ModelResponseStream(
220+
id="test", choices=[chunk_choice], created=0, model="openai/qwen3"
221+
)
222+
await backend.processing(mot, stream_chunk)
223+
assert mot._thinking == ""

0 commit comments

Comments
 (0)