Skip to content

Commit 6ec30ae

Browse files
committed
feat: add custom response handler for reasoning tokens
1 parent 686b31d commit 6ec30ae

4 files changed

Lines changed: 153 additions & 0 deletions

File tree

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ What it adds
1111
- Optional server-side progress updates during benchmarks.
1212
- ShareGPT dataset conversion to GuideLLM-compatible JSONL.
1313
- A JSON summary output format for benchmark reports.
14+
- Custom response handler for accurate TTFT (time to first token) and ITL (inter-token latency) metrics with reasoning tokens (e.g., DeepSeek-R1).
1415

1516
Install
1617
-------
@@ -91,6 +92,21 @@ benchmark-runner benchmark \
9192
--output-dir ./benchmarks
9293
```
9394

95+
Reasoning Tokens Support
96+
-------------------------
97+
For models that output reasoning tokens (e.g., DeepSeek-R1, o1-preview), use the custom
98+
response handler to get accurate TTFT and ITL metrics:
99+
100+
```bash
101+
benchmark-runner benchmark run \
102+
--target http://localhost:8000/v1 \
103+
--backend openai_http \
104+
--backend-kwargs '{"response_handlers": {"chat_completions": "chat_completions_with_reasoning"}}' \
105+
--model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
106+
--data your-dataset \
107+
--max-requests 100
108+
```
109+
94110
Docker
95111
------
96112
This repository includes a Dockerfile used to build a runtime image.

benchmark_runner/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
__version__ = '0.0.0'
22
__git_commit__ = 'HEAD'
33

4+
from . import custom_response_handler # noqa: F401 # Register custom response handler
45
from . import output_summary_json # noqa: F401 # Register output format
benchmark_runner/custom_response_handler.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
"""
2+
Custom response handler that fixes TTFT and ITL calculation for models with reasoning tokens.
3+
4+
This handler extends guidellm's ChatCompletionsResponseHandler to properly handle
5+
both regular content tokens and reasoning_content tokens, ensuring accurate timing metrics.
6+
7+
Usage:
8+
To use this handler, pass it via backend_kwargs when running benchmarks:
9+
10+
benchmark-runner benchmark run \\
11+
--target http://localhost:8000/v1 \\
12+
--backend openai_http \\
13+
--backend-kwargs '{"response_handlers": {"chat_completions": "chat_completions_with_reasoning"}}' \\
14+
--model your-model-name \\
15+
--data your-dataset
16+
17+
Or in a scenario config file:
18+
{
19+
"backend_kwargs": {
20+
"response_handlers": {
21+
"chat_completions": "chat_completions_with_reasoning"
22+
}
23+
}
24+
}
25+
"""
26+
27+
from guidellm.backends.response_handlers import (
28+
ChatCompletionsResponseHandler,
29+
GenerationResponseHandlerFactory,
30+
)
31+
32+
33+
@GenerationResponseHandlerFactory.register("chat_completions_with_reasoning")
class ChatCompletionsWithReasoningResponseHandler(ChatCompletionsResponseHandler):
    """
    Chat-completions response handler that also counts reasoning tokens.

    Extends guidellm's ``ChatCompletionsResponseHandler`` so that streaming
    chunks carrying ``delta.reasoning_content`` (emitted by reasoning models
    such as DeepSeek-R1) are recorded alongside regular ``delta.content``.
    Because TTFT (time to first token) and ITL (inter-token latency) are
    derived from when token chunks are observed, counting reasoning chunks
    keeps those timing metrics accurate for such models.

    Registered with ``GenerationResponseHandlerFactory`` under the name
    ``"chat_completions_with_reasoning"`` so it can be selected through
    ``backend_kwargs`` (see the module docstring for CLI and scenario-file
    usage examples).

    Key differences from the base handler:

    - Tracks both ``delta.content`` and ``delta.reasoning_content`` in
      streaming responses.
    - A chunk counts as a token arrival when EITHER field is present (not
      merely truthy), so the first-token timestamp is set even for an
      empty-string first chunk.
    """

    def __json__(self):
        """
        Return the registered factory name as this instance's JSON form.

        NOTE(review): ``__json__`` is not a standard-library protocol;
        presumably a project-specific JSON encoder looks this method up —
        confirm against the serializer that dumps backend kwargs.

        :return: The registered name of this handler.
        """
        return "chat_completions_with_reasoning"

    @classmethod
    def __class_json__(cls):
        """
        Return the registered factory name when the class itself (rather
        than an instance) is serialized.

        NOTE(review): like ``__json__``, this is a project convention, not
        a stdlib protocol — confirm the encoder that invokes it.

        :return: The registered name of this handler.
        """
        return "chat_completions_with_reasoning"

    def add_streaming_line(self, line: str) -> int | None:
        """
        Process one SSE line from a chat-completions streaming response.

        Unlike the base handler, a chunk counts as a token if either
        ``delta.content`` or ``delta.reasoning_content`` is present, so
        TTFT and ITL reflect reasoning output as well as final content.

        :param line: Raw SSE line from the streaming response.
        :return: 1 if a token was extracted from the line, 0 if the line
            was ignored, None if the line signals end of stream
            (``extract_line_data`` returned None).
        """
        # extract_line_data (inherited) returns None for the terminal
        # marker and another falsy value for lines with no payload.
        if not (data := self.extract_line_data(line)):
            return None if data is None else 0

        # Capture the response id from the first chunk that carries one.
        if "id" in data and self.streaming_response_id is None:
            self.streaming_response_id = data["id"]

        updated = False
        choices, usage = self.extract_choices_and_usage(data)
        choice: dict[str, dict] = choices[0] if choices else {}

        # Support both regular content and reasoning_content tokens.
        # This ensures TTFT and ITL are calculated correctly for models
        # that emit reasoning tokens before regular content.
        if choices:
            delta = choice.get("delta", {})
            content = delta.get("content")
            reasoning_content = delta.get("reasoning_content")

            # Check field PRESENCE (is not None), not truthiness: the first
            # chunk often carries content == "" and must still count toward
            # the first-token timestamp.
            if content is not None or reasoning_content is not None:
                # Append whichever text is present; regular content wins
                # when both fields appear in the same chunk.
                self.streaming_texts.append(content or reasoning_content or "")
                updated = True

        if usage:
            self.streaming_usage = usage

        return 1 if updated else 0

benchmark_runner/main.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from guidellm.benchmark.entrypoints import benchmark_generative_text
2727
from benchmark_runner.progress import ServerBenchmarkerProgress
2828
from benchmark_runner.sharegpt_adapter import prepare_datasets
29+
from guidellm.backends.response_handlers import GenerationResponseHandlerFactory
2930

3031
try:
3132
import uvloop
@@ -472,6 +473,26 @@ def run(**kwargs): # noqa: C901
472473
errs[0]["msg"], ctx=click.get_current_context(), param_hint=param_name
473474
) from err
474475

476+
# Convert string handler names to actual handler classes
477+
if args.backend_kwargs and "response_handlers" in args.backend_kwargs:
478+
handlers = args.backend_kwargs["response_handlers"]
479+
if isinstance(handlers, dict):
480+
for key, value in handlers.items():
481+
if isinstance(value, str):
482+
# Look up the handler class from the factory registry
483+
handler_class = (
484+
GenerationResponseHandlerFactory.get_registered_object(value)
485+
)
486+
if handler_class:
487+
handlers[key] = handler_class
488+
else:
489+
registry = GenerationResponseHandlerFactory.registry or {}
490+
available = ", ".join(registry.keys())
491+
raise ValueError(
492+
f"Unknown response handler: '{value}'. "
493+
f"Available handlers: {available}"
494+
)
495+
475496
if uvloop is not None:
476497
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
477498
asyncio.run(

0 commit comments

Comments
 (0)