Skip to content

Commit e92d43f

Browse files
authored
fix(benchmark/multi-round-qa): fix TTFT NoneType crash caused by reasoning models emitting reasoning_content instead of content (#873)
Signed-off-by: Kosseila (CloudThrill) <klouddude@gmail.com>
1 parent 99ab33a commit e92d43f

1 file changed

Lines changed: 26 additions & 8 deletions

File tree

benchmarks/multi-round-qa/multi-round-qa.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ async def _async_launch_request(self, messages, max_tokens, extra_headers=None):
126126
start_time = time.time()
127127
first_token_time = None
128128
words = ""
129+
tokens_out = 0
130+
tokens_prefill = 0
129131

130132
response = await self.client.chat.completions.create(
131133
messages=messages,
@@ -138,20 +140,36 @@ async def _async_launch_request(self, messages, max_tokens, extra_headers=None):
138140
)
139141

140142
async for tok in response:
141-
if not tok.choices:
143+
# 1. Handle Usage (Newer vLLM puts this in the last chunk)
144+
if hasattr(tok, "usage") and tok.usage is not None:
145+
tokens_out = tok.usage.completion_tokens
146+
tokens_prefill = tok.usage.prompt_tokens
147+
148+
# 2. Skip chunks that carry no choices (e.g. the final usage-only chunk)
149+
if not tok.choices or len(tok.choices) == 0:
142150
continue
143-
chunk_message = tok.choices[0].delta.content
144-
if chunk_message is not None:
145-
if first_token_time is None and chunk_message != "":
151+
152+
# 3. Support both 'content' and 'reasoning_content'
153+
delta = tok.choices[0].delta
154+
chunk_message = getattr(delta, "content", None) or getattr(
155+
delta, "reasoning_content", None
156+
)
157+
158+
if chunk_message:
159+
if first_token_time is None:
146160
first_token_time = time.time()
147161
words += chunk_message
148-
tokens_out = tok.usage.completion_tokens
149-
tokens_prefill = tok.usage.prompt_tokens
162+
163+
# 4. Final Math Safety (The Crash Fix)
164+
# If no token was ever received, fall back to start_time: TTFT is then reported as 0 and generation_time as the full request duration, instead of failing on None arithmetic
165+
actual_first_token = (
166+
first_token_time if first_token_time is not None else start_time
167+
)
150168

151169
return Response(
152170
body=words,
153-
ttft=first_token_time - start_time,
154-
generation_time=time.time() - first_token_time,
171+
ttft=actual_first_token - start_time,
172+
generation_time=time.time() - actual_first_token,
155173
prompt_tokens=tokens_prefill,
156174
generation_tokens=tokens_out,
157175
launch_time=start_time,

0 commit comments

Comments
 (0)