fix(backends): populate mot._thinking from vLLM 'reasoning' wire key in LiteLLMBackend (#1169)

planetf1 · web-flow · commit 8f2c2abf17ec · 2026-05-28T18:36:45.000Z
* fix(backends): populate mot._thinking from vLLM 'reasoning' key in LiteLLMBackend. vLLM's reasoning parser surfaces the trace under the wire key "reasoning", not "reasoning_content". LiteLLM's normalisation layer only remaps this in the openai/ provider path (gpt_transformation.py) and only from v1.83 onward. Mellea's LiteLLMBackend.processing() was relying on hasattr checks for reasoning_content, which missed the raw "reasoning" field on both: - non-streaming Message objects (no __init__ fallback in LiteLLM); - any provider path where LiteLLM hasn't done the remap. Replace both hasattr guards with a dual-key probe: .get("reasoning_content") or .get("reasoning"). Both litellm.Message and litellm.Delta extend SafeAttributeModel/OpenAIObject and support .get(), so this works across streaming and non-streaming paths. Priority is given to reasoning_content so that providers LiteLLM has already normalised behave identically to before. Fixes #1070. Assisted-by: Claude Code Signed-off-by: Nigel Jones <jonesn@uk.ibm.com> * fix(backends): address review findings on mot._thinking vLLM fix: - Replace `or` short-circuit with explicit `is None` guard in both the non-streaming and streaming branches of LiteLLMBackend.processing(). The `or` pattern silently fell through to "reasoning" when "reasoning_content" was an empty string, which is a valid intermediate streaming delta chunk. - Convert all tests to async def + await to match the project's pytest-asyncio AUTO mode convention and avoid the deprecated asyncio.get_event_loop() pattern on Python 3.12+. - Add two new test cases asserting that reasoning_content takes priority when both wire keys are present simultaneously (non-streaming and streaming paths).
Assisted-by: Claude Code Signed-off-by: Nigel Jones <jonesn@uk.ibm.com> * test(backends): address litellm thinking review findings: - Remove stale review-artefact parenthetical from streaming branch comment. - Relax _fresh_mot() to leave _thinking=None so production None-coercion path is exercised on first chunk. - Add empty-string reasoning_content tests (non-streaming + streaming) to lock in is-None guard semantics: an empty string does not fall back to the reasoning key. Assisted-by: Claude Code Signed-off-by: Nigel Jones <jonesn@uk.ibm.com> --------- Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py
@@ -435,11 +435,14 @@ async def processing(
 
             message = choice.message
 
-            # Sometimes a message doesn't actually have this field.
-            if hasattr(message, "reasoning_content"):
-                thinking_chunk = message.reasoning_content
-                if thinking_chunk is not None:
-                    mot._thinking += thinking_chunk
+            # Some OpenAI-compatible servers (e.g. vLLM, SGLang) expose the reasoning trace
+            # as "reasoning", not "reasoning_content"; older LiteLLM does not remap it.
+            # Test `is None` (not `or`) so an empty "reasoning_content" never falls through.
+            thinking_chunk = message.get("reasoning_content")
+            if thinking_chunk is None:
+                thinking_chunk = message.get("reasoning")
+            if thinking_chunk is not None:
+                mot._thinking += thinking_chunk
 
             content_chunk = message.content
             if content_chunk is not None:
@@ -453,11 +456,12 @@ async def processing(
         elif isinstance(chunk, litellm.ModelResponseStream):  # type: ignore
             message_delta = chunk.choices[0].delta
 
-            # Sometimes a delta doesn't actually have this field.
-            if hasattr(message_delta, "reasoning_content"):
-                thinking_chunk = message_delta.reasoning_content
-                if thinking_chunk is not None:
-                    mot._thinking += thinking_chunk
+            # Same dual-key probe for streaming deltas.
+            thinking_chunk = message_delta.get("reasoning_content")
+            if thinking_chunk is None:
+                thinking_chunk = message_delta.get("reasoning")
+            if thinking_chunk is not None:
+                mot._thinking += thinking_chunk
 
             content_chunk = message_delta.content
             if content_chunk is not None:
diff --git a/test/backends/test_litellm_thinking.py b/test/backends/test_litellm_thinking.py
@@ -0,0 +1,223 @@
+"""Unit tests for LiteLLMBackend mot._thinking population.
+
+Covers the vLLM case where the wire key is ``"reasoning"`` instead of
+``"reasoning_content"``, and the case where LiteLLM has already normalised
+it to ``"reasoning_content"`` (so both keys are exercised).
+"""
+
+import pytest
+
+pytest.importorskip("litellm", reason="litellm not installed — install mellea[litellm]")
+
+from litellm.types.utils import (
+    Choices,
+    Delta,
+    Message,
+    ModelResponse,
+    ModelResponseStream,
+    StreamingChoices,
+)
+
+from mellea.backends.litellm import LiteLLMBackend
+from mellea.core import ModelOutputThunk
+
+
+def _make_non_streaming_chunk(
+    content: str, reasoning_key: str, reasoning_value: str
+) -> ModelResponse:
+    """Build a minimal non-streaming ModelResponse with a custom reasoning key."""
+    msg = Message(content=content, role="assistant")
+    msg[reasoning_key] = reasoning_value
+    choice = Choices(finish_reason="stop", index=0, message=msg)
+    return ModelResponse(
+        id="test",
+        choices=[choice],
+        created=0,
+        model="openai/qwen3",
+        object="chat.completion",
+    )
+
+
+def _make_streaming_chunk(
+    content: str, reasoning_key: str, reasoning_value: str
+) -> ModelResponseStream:
+    """Build a minimal streaming delta chunk with a custom reasoning key."""
+    delta = Delta(content=content)
+    delta[reasoning_key] = reasoning_value
+    chunk_choice = StreamingChoices(finish_reason=None, index=0, delta=delta)
+    return ModelResponseStream(
+        id="test", choices=[chunk_choice], created=0, model="openai/qwen3"
+    )
+
+
+@pytest.fixture()
+def backend() -> LiteLLMBackend:
+    return LiteLLMBackend(model_id="openai/qwen3", base_url="http://localhost:8000/v1")
+
+
+def _fresh_mot() -> ModelOutputThunk:
+    mot: ModelOutputThunk = ModelOutputThunk(None)
+    mot._meta = {}
+    return mot
+
+
+# ---------------------------------------------------------------------------
+# Non-streaming path
+# ---------------------------------------------------------------------------
+
+
+async def test_processing_non_streaming_reasoning_content_key(backend: LiteLLMBackend):
+    """reasoning_content (normalised key) is captured correctly."""
+    mot = _fresh_mot()
+    chunk = _make_non_streaming_chunk(
+        content="Paris",
+        reasoning_key="reasoning_content",
+        reasoning_value="France has its capital in Paris.",
+    )
+    await backend.processing(mot, chunk)
+    assert mot._thinking == "France has its capital in Paris."
+    assert mot._underlying_value == "Paris"
+
+
+async def test_processing_non_streaming_reasoning_raw_key(backend: LiteLLMBackend):
+    """Fallback: vLLM 'reasoning' key (not normalised by older LiteLLM) is captured."""
+    mot = _fresh_mot()
+    chunk = _make_non_streaming_chunk(
+        content="Paris",
+        reasoning_key="reasoning",
+        reasoning_value="France has its capital in Paris.",
+    )
+    await backend.processing(mot, chunk)
+    assert mot._thinking == "France has its capital in Paris."
+    assert mot._underlying_value == "Paris"
+
+
+async def test_processing_non_streaming_reasoning_content_wins_over_reasoning(
+    backend: LiteLLMBackend,
+):
+    """reasoning_content takes priority when both keys are present."""
+    mot = _fresh_mot()
+    msg = Message(content="Paris", role="assistant")
+    msg["reasoning_content"] = "from_reasoning_content"
+    msg["reasoning"] = "from_reasoning"
+    choice = Choices(finish_reason="stop", index=0, message=msg)
+    chunk = ModelResponse(
+        id="test",
+        choices=[choice],
+        created=0,
+        model="openai/qwen3",
+        object="chat.completion",
+    )
+    await backend.processing(mot, chunk)
+    assert mot._thinking == "from_reasoning_content"
+
+
+async def test_processing_non_streaming_no_reasoning(backend: LiteLLMBackend):
+    """No reasoning key — thinking stays empty string, content is captured."""
+    mot = _fresh_mot()
+    chunk = _make_non_streaming_chunk(
+        content="Paris",
+        reasoning_key="unrelated_key",
+        reasoning_value="should be ignored",
+    )
+    await backend.processing(mot, chunk)
+    assert mot._thinking == ""
+    assert mot._underlying_value == "Paris"
+
+
+async def test_processing_non_streaming_empty_reasoning_content_does_not_fall_back(
+    backend: LiteLLMBackend,
+):
+    """Empty-string reasoning_content wins — does not fall back to reasoning key.
+
+    Validates that the is-None guard (not ``or``) is used: an empty-string
+    ``reasoning_content`` chunk is preserved as-is, not silently replaced by the
+    fallback ``reasoning`` value.
+    """
+    mot = _fresh_mot()
+    msg = Message(content="Paris", role="assistant")
+    msg["reasoning_content"] = ""
+    msg["reasoning"] = "should not appear"
+    choice = Choices(finish_reason="stop", index=0, message=msg)
+    chunk = ModelResponse(
+        id="test",
+        choices=[choice],
+        created=0,
+        model="openai/qwen3",
+        object="chat.completion",
+    )
+    await backend.processing(mot, chunk)
+    assert mot._thinking == ""
+
+
+# ---------------------------------------------------------------------------
+# Streaming path
+# ---------------------------------------------------------------------------
+
+
+async def test_processing_streaming_reasoning_content_key(backend: LiteLLMBackend):
+    """Streaming: reasoning_content key is accumulated across chunks."""
+    mot = _fresh_mot()
+    for text in ("chunk1 ", "chunk2"):
+        stream_chunk = _make_streaming_chunk(
+            content="", reasoning_key="reasoning_content", reasoning_value=text
+        )
+        await backend.processing(mot, stream_chunk)
+    assert mot._thinking == "chunk1 chunk2"
+
+
+async def test_processing_streaming_reasoning_raw_key(backend: LiteLLMBackend):
+    """Streaming fallback: vLLM 'reasoning' key is accumulated across chunks."""
+    mot = _fresh_mot()
+    for text in ("chunk1 ", "chunk2"):
+        stream_chunk = _make_streaming_chunk(
+            content="", reasoning_key="reasoning", reasoning_value=text
+        )
+        await backend.processing(mot, stream_chunk)
+    assert mot._thinking == "chunk1 chunk2"
+
+
+async def test_processing_streaming_reasoning_content_wins_over_reasoning(
+    backend: LiteLLMBackend,
+):
+    """Streaming: reasoning_content takes priority when both keys are present."""
+    mot = _fresh_mot()
+    delta = Delta(content="")
+    delta["reasoning_content"] = "from_reasoning_content"
+    delta["reasoning"] = "from_reasoning"
+    chunk_choice = StreamingChoices(finish_reason=None, index=0, delta=delta)
+    stream_chunk = ModelResponseStream(
+        id="test", choices=[chunk_choice], created=0, model="openai/qwen3"
+    )
+    await backend.processing(mot, stream_chunk)
+    assert mot._thinking == "from_reasoning_content"
+
+
+async def test_processing_streaming_no_reasoning(backend: LiteLLMBackend):
+    """Streaming: no reasoning key — thinking stays empty string."""
+    mot = _fresh_mot()
+    stream_chunk = _make_streaming_chunk(
+        content="Paris", reasoning_key="unrelated_key", reasoning_value="ignored"
+    )
+    await backend.processing(mot, stream_chunk)
+    assert mot._thinking == ""
+    assert mot._underlying_value == "Paris"
+
+
+async def test_processing_streaming_empty_reasoning_content_does_not_fall_back(
+    backend: LiteLLMBackend,
+):
+    """Streaming: empty-string reasoning_content wins — does not fall back to reasoning key.
+
+    Validates that the is-None guard (not ``or``) is used in the streaming branch too.
+    """
+    mot = _fresh_mot()
+    delta = Delta(content="")
+    delta["reasoning_content"] = ""
+    delta["reasoning"] = "should not appear"
+    chunk_choice = StreamingChoices(finish_reason=None, index=0, delta=delta)
+    stream_chunk = ModelResponseStream(
+        id="test", choices=[chunk_choice], created=0, model="openai/qwen3"
+    )
+    await backend.processing(mot, stream_chunk)
+    assert mot._thinking == ""