@@ -126,6 +126,8 @@ async def _async_launch_request(self, messages, max_tokens, extra_headers=None):
126126 start_time = time .time ()
127127 first_token_time = None
128128 words = ""
129+ tokens_out = 0
130+ tokens_prefill = 0
129131
130132 response = await self .client .chat .completions .create (
131133 messages = messages ,
@@ -138,20 +140,36 @@ async def _async_launch_request(self, messages, max_tokens, extra_headers=None):
138140 )
139141
140142 async for tok in response :
141- if not tok .choices :
143+ # 1. Handle Usage (Newer vLLM puts this in the last chunk)
144+ if hasattr (tok , "usage" ) and tok .usage is not None :
145+ tokens_out = tok .usage .completion_tokens
146+ tokens_prefill = tok .usage .prompt_tokens
147+
148+ # 2. Skip chunks without content (like the final usage chunk)
149+ if not tok .choices or len (tok .choices ) == 0 :
142150 continue
143- chunk_message = tok .choices [0 ].delta .content
144- if chunk_message is not None :
145- if first_token_time is None and chunk_message != "" :
151+
152+ # 3. Support both 'content' and 'reasoning_content'
153+ delta = tok .choices [0 ].delta
154+ chunk_message = getattr (delta , "content" , None ) or getattr (
155+ delta , "reasoning_content" , None
156+ )
157+
158+ if chunk_message :
159+ if first_token_time is None :
146160 first_token_time = time .time ()
147161 words += chunk_message
148- tokens_out = tok .usage .completion_tokens
149- tokens_prefill = tok .usage .prompt_tokens
162+
163+ # 4. Final Math Safety (The Crash Fix)
164+ # If the model failed to return tokens, we set TTFT to 0 to avoid NoneType errors
165+ actual_first_token = (
166+ first_token_time if first_token_time is not None else start_time
167+ )
150168
151169 return Response (
152170 body = words ,
153- ttft = first_token_time - start_time ,
154- generation_time = time .time () - first_token_time ,
171+ ttft = actual_first_token - start_time ,
172+ generation_time = time .time () - actual_first_token ,
155173 prompt_tokens = tokens_prefill ,
156174 generation_tokens = tokens_out ,
157175 launch_time = start_time ,
0 commit comments