[Bugfix] KimiK2ReasoningParser: guard against buffered end-token text in streaming

Keyi Li · claude · Keyi Li · commit 093c83ac6993 · 2026-04-29T18:51:55.000-07:00
When stop sequences set output_text_buffer_length > 0, token IDs can arrive in delta_token_ids before their text is flushed into delta_text. Without a guard, delta_text.find() returns -1 for the not-yet-flushed token text, and slicing delta_text with that index silently corrupts the reasoning/content split. Add text-presence checks before both find() calls in extract_reasoning_streaming: (1) the </think> end-token path (find() at line 224 of the pre-patch file) and (2) the <|tool_calls_section_begin|> section-start path (find() at line 232 of the pre-patch file). Return None (wait for the flush) when the token ID is present but its text is not, matching the fix pattern from PR vllm-project#39044 (BaseThinkingReasoningParser / DeepSeekR1) and PR vllm-project#40352 (Step3ReasoningParser). Fixes vllm-project#41067. Co-authored-by: Claude <noreply@anthropic.com> Signed-off-by: Keyi Li <likey6688@gmail.com>
diff --git a/tests/reasoning/test_kimi_k2_reasoning_parser.py b/tests/reasoning/test_kimi_k2_reasoning_parser.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from unittest.mock import MagicMock
+
 import pytest
 
 from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
@@ -12,6 +14,20 @@
 REASONING_MODEL_NAME = "moonshotai/Kimi-K2.5"
 
 
+@pytest.fixture
+def mock_kimi_k2_tokenizer():  # MagicMock stub; only get_vocab() is configured
+    tokenizer = MagicMock()
+    tokenizer.get_vocab.return_value = {  # token string -> integer ID
+        "<think>": 100,
+        "</think>": 101,
+        "<|tool_calls_section_begin|>": 200,
+        "<|tool_calls_section_end|>": 201,
+        "<|tool_call_begin|>": 202,
+        "<|tool_call_end|>": 203,
+    }
+    return tokenizer
+
+
 @pytest.fixture(scope="module")
 def kimi_k2_tokenizer():
     return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME, trust_remote_code=True)
@@ -153,3 +169,50 @@ def test_streaming_tool_section_ends_reasoning(kimi_k2_tokenizer):
     )
     assert isinstance(result, DeltaMessage)
     assert result.content == "<|tool_calls_section_begin|>"
+
+
+def test_streaming_end_token_id_buffered(mock_kimi_k2_tokenizer):
+    """When stop sequences buffer text, </think> ID arrives before its text.
+
+    The token ID is present in delta_token_ids but the actual string is not
+    yet in delta_text (still buffered). The parser must return None to wait
+    for the next delta, instead of calling find() which returns -1 and
+    silently corrupting the text split.
+    """
+    parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
+    think_id = parser._start_token_id
+    end_think_id = parser._end_token_id
+
+    # Simulate: the </think> token ID has arrived but its text is not yet flushed.
+    # Two token IDs in delta to bypass the single-special-token guard.
+    result = parser.extract_reasoning_streaming(
+        previous_text="some reasoning",
+        current_text="some reasoning extra",
+        delta_text="extra",  # </think> text not yet flushed
+        previous_token_ids=[think_id],
+        current_token_ids=[think_id, end_think_id, 999],
+        delta_token_ids=[end_think_id, 999],  # 999: not in the mock vocab
+    )
+    assert result is None
+
+
+def test_streaming_tool_section_id_buffered(mock_kimi_k2_tokenizer):
+    """When stop sequences buffer text, tool section start ID arrives before its text.
+
+    Same buffering scenario as above but for <|tool_calls_section_begin|>.
+    Without the guard, find() returns -1 and delta_text[:tool_index] silently
+    drops the last character of reasoning.
+    """
+    parser = KimiK2ReasoningParser(mock_kimi_k2_tokenizer)
+    think_id = parser._start_token_id
+    tool_begin_id = parser._tool_section_start_token_id
+
+    result = parser.extract_reasoning_streaming(
+        previous_text="some reasoning",
+        current_text="some reasoning extra",
+        delta_text="extra",  # tool section text not yet flushed
+        previous_token_ids=[think_id],
+        current_token_ids=[think_id, tool_begin_id, 999],
+        delta_token_ids=[tool_begin_id, 999],  # 999: not in the mock vocab
+    )
+    assert result is None
diff --git a/vllm/reasoning/kimi_k2_reasoning_parser.py b/vllm/reasoning/kimi_k2_reasoning_parser.py
@@ -221,6 +221,10 @@ def extract_reasoning_streaming(
             return None
 
         if self._end_token_id in delta_token_ids:
+            if self._end_token not in delta_text:
+                # Token ID arrived before text was flushed (stop-sequence buffering).
+                # Wait for the next delta when the text becomes visible.
+                return None
             end_index = delta_text.find(self._end_token)
             reasoning = delta_text[:end_index]
             content = delta_text[end_index + len(self._end_token) :]
@@ -229,6 +233,9 @@ def extract_reasoning_streaming(
             )
 
         if self._tool_section_start_token_id in delta_token_ids:
+            if self._tool_section_start_token not in delta_text:
+                # Token ID arrived before text was flushed (stop-sequence buffering).
+                return None
             tool_index = delta_text.find(self._tool_section_start_token)
             reasoning = delta_text[:tool_index]
             content = delta_text[tool_index:]