From 6d050f613dd7401720d9360199627aaf0c70e525 Mon Sep 17 00:00:00 2001 From: Jhao-Ting Chen Date: Fri, 3 Apr 2026 17:38:20 -0700 Subject: [PATCH 1/2] fix: Use CompletionStreamResponse for streaming completions usage chunk The completion_stream_post_processor incorrectly used ChatCompletionStreamResponse for the final usage-only chunk, causing streaming /v1/completions responses to include "object": "chat.completion.chunk" instead of the expected "object": "text_completion". This breaks OpenAI-compatible clients (e.g., aiperf) that validate the object type field per endpoint. Replace ChatCompletionStreamResponse with CompletionStreamResponse at line 512 to match the type already used for regular streaming chunks (line 492). Signed-off-by: Jhao-Ting Chen --- tensorrt_llm/serve/postprocess_handlers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/serve/postprocess_handlers.py b/tensorrt_llm/serve/postprocess_handlers.py index 75e918fe55d8..ea5a4d26d926 100644 --- a/tensorrt_llm/serve/postprocess_handlers.py +++ b/tensorrt_llm/serve/postprocess_handlers.py @@ -509,9 +509,9 @@ def completion_stream_post_processor(rsp: DetokenizedGenerationResultBase, cached_tokens=rsp.cached_tokens), ) - final_usage_chunk = ChatCompletionStreamResponse(choices=[], - model=args.model, - usage=final_usage) + final_usage_chunk = CompletionStreamResponse(choices=[], + model=args.model, + usage=final_usage) final_usage_data = final_usage_chunk.model_dump_json() res.append(f"data: {final_usage_data}\n\n") args.first_iteration = False From 8a814268889d794b8d5df922561504224fbaaa4b Mon Sep 17 00:00:00 2001 From: Jhao-Ting Chen Date: Fri, 3 Apr 2026 18:11:01 -0700 Subject: [PATCH 2/2] test: Assert object type in streaming completions usage chunks Add assertions to test_completion_stream_options to verify that all streaming chunks (including the final usage-only chunk) return "object": "text_completion" for the /v1/completions endpoint. This guards against regressions where the usage chunk might incorrectly use ChatCompletionStreamResponse. Signed-off-by: Jhao-Ting Chen --- tests/unittest/llmapi/apps/_test_openai_completions.py | 4 ++++ 1 file changed, 4 insertions(+) mode change 100644 => 100755 tests/unittest/llmapi/apps/_test_openai_completions.py diff --git a/tests/unittest/llmapi/apps/_test_openai_completions.py b/tests/unittest/llmapi/apps/_test_openai_completions.py old mode 100644 new mode 100755 index 03f08a5c6b8f..ff411be19ef4 --- a/tests/unittest/llmapi/apps/_test_openai_completions.py +++ b/tests/unittest/llmapi/apps/_test_openai_completions.py @@ -279,11 +279,13 @@ async def test_completion_stream_options(async_client: openai.AsyncOpenAI, False, }) async for chunk in stream: + assert chunk.object == "text_completion" if chunk.choices[0].finish_reason is None: assert chunk.usage is None else: assert chunk.usage is None final_chunk = await stream.__anext__() + assert final_chunk.object == "text_completion" assert final_chunk.usage is not None assert final_chunk.usage.prompt_tokens > 0 assert final_chunk.usage.completion_tokens > 0 @@ -306,6 +308,7 @@ async def test_completion_stream_options(async_client: openai.AsyncOpenAI, True, }) async for chunk in stream: + assert chunk.object == "text_completion" assert chunk.usage is not None assert chunk.usage.prompt_tokens > 0 assert chunk.usage.completion_tokens > 0 @@ -313,6 +316,7 @@ async def test_completion_stream_options(async_client: openai.AsyncOpenAI, chunk.usage.completion_tokens) if chunk.choices[0].finish_reason is not None: final_chunk = await stream.__anext__() + assert final_chunk.object == "text_completion" assert final_chunk.usage is not None assert final_chunk.usage.prompt_tokens > 0 assert final_chunk.usage.completion_tokens > 0