Skip to content

Commit fe9e1a3

Browse files
authored
[https://nvbugs/5955188][fix] Fix harmony parsers for agentic coding use cases (#12045)
Signed-off-by: Dongfeng Yu <dongfengy@nvidia.com>
1 parent 5a7076c commit fe9e1a3

3 files changed

Lines changed: 86 additions & 21 deletions

File tree

tensorrt_llm/serve/harmony_adapter.py

Lines changed: 80 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -202,42 +202,68 @@ def _create_delta_from_parser_state(self) -> dict[str, Any] | None:
202202
self.parser.current_channel):
203203
return {"should_stop": "Repeated message"}
204204

205+
# Check for tool calls first, regardless of channel.
206+
# The model may emit tool calls on either "commentary" or "analysis" channel.
207+
if (self.parser.current_channel in ("commentary", "analysis")
208+
and self.parser.current_recipient
209+
and "functions." in str(self.parser.current_recipient)):
210+
func_name = str(
211+
self.parser.current_recipient).split("functions.")[-1]
212+
self.current_channel_state = "commentary_tool"
213+
214+
# Check if tool is allowed
215+
if self.should_filter_tools and func_name not in self.available_tools:
216+
logger.debug(
217+
f"Request {self.request_id}: tool {func_name} not in available tools"
218+
)
219+
return None
220+
221+
# Get or create tool call
222+
tool_id = self._get_or_create_tool_call(func_name)
223+
224+
# Accumulate arguments
225+
self.tool_calls[tool_id][
226+
"arguments"] += self.parser.last_content_delta
227+
228+
# Create tool call delta - return only the new content delta, not accumulated
229+
return {
230+
"tool_calls": [{
231+
"id": tool_id,
232+
"type": "function",
233+
"function": {
234+
"name": func_name,
235+
"arguments": self.parser.
236+
last_content_delta # Only the new content delta
237+
},
238+
"index": self.tool_calls[tool_id]["index"]
239+
}]
240+
}
241+
205242
if self.parser.current_channel == "analysis":
206243
# Analysis channel -> reasoning (no token wrapping needed)
207244
self.current_channel_state = "analysis"
208245
return {"reasoning": self.parser.last_content_delta}
209246

210247
elif self.parser.current_channel == "commentary":
211-
if self.parser.current_recipient and "functions." in str(
212-
self.parser.current_recipient):
213-
# Tool call in commentary channel
214-
func_name = str(
215-
self.parser.current_recipient).split("functions.")[-1]
248+
if self.parser.current_recipient and self.parser.current_recipient != 'assistant':
249+
# Non-functions tool call (e.g., browser, python)
250+
func_name = str(self.parser.current_recipient)
216251
self.current_channel_state = "commentary_tool"
217252

218-
# Check if tool is allowed
219253
if self.should_filter_tools and func_name not in self.available_tools:
220-
logger.debug(
221-
f"Request {self.request_id}: tool {func_name} not in available tools"
222-
)
223254
return None
224255

225-
# Get or create tool call
226256
tool_id = self._get_or_create_tool_call(func_name)
227-
228-
# Accumulate arguments
229257
self.tool_calls[tool_id][
230258
"arguments"] += self.parser.last_content_delta
231259

232-
# Create tool call delta - return only the new content delta, not accumulated
233260
return {
234261
"tool_calls": [{
235262
"id": tool_id,
236263
"type": "function",
237264
"function": {
238265
"name": func_name,
239-
"arguments": self.parser.
240-
last_content_delta # Only the new content delta
266+
"arguments": self.parser.last_content_delta
241267
},
242268
"index": self.tool_calls[tool_id]["index"]
243269
}]
@@ -903,7 +929,7 @@ def _parse_tool_call_from_harmony_message(
903929
)
904930
return None
905931
elif msg_content_type and "code" in msg_content_type:
906-
function_name = str(msg_recipient)
932+
function_name = str(msg_recipient).split("functions.")[-1]
907933
return {
908934
"id": f"call_{uuid.uuid4().hex[:8]}",
909935
"type": "function",
@@ -1050,7 +1076,20 @@ def harmony_output_to_openai(
10501076
if not _check_channel_valid(generated_channels, msg_channel):
10511077
continue
10521078

1053-
if msg_channel == "analysis":
1079+
# Check for tool calls first, regardless of channel.
1080+
# The model may emit tool calls on either "commentary" or "analysis" channel
1081+
# (other frameworks handle both channels when recipient starts with "functions.")
1082+
if (msg_channel in ("commentary", "analysis") and msg_recipient
1083+
and msg_recipient != 'assistant'
1084+
and str(msg_recipient).startswith("functions.")):
1085+
# Tool call
1086+
tool_call = self._parse_tool_call_from_harmony_message(msg)
1087+
if tool_call and self._is_tool_call_allowed(
1088+
tool_call, external_tools,
1089+
should_filter_external_tools):
1090+
tool_calls.append(tool_call)
1091+
1092+
elif msg_channel == "analysis":
10541093
for content in msg_content:
10551094
if isinstance(content, TextContent):
10561095
analysis_content += content.text
@@ -1061,7 +1100,7 @@ def harmony_output_to_openai(
10611100

10621101
elif msg_channel == "commentary":
10631102
if msg_recipient and msg_recipient != 'assistant':
1064-
# Tool call
1103+
# Non-functions tool call (e.g., browser, python)
10651104
tool_call = self._parse_tool_call_from_harmony_message(
10661105
msg)
10671106
if tool_call and self._is_tool_call_allowed(
@@ -1366,6 +1405,7 @@ def create_openai_streaming_response(
13661405
# Handle reasoning content
13671406
if "reasoning" in harmony_delta:
13681407
delta_message.reasoning = harmony_delta["reasoning"]
1408+
delta_message.reasoning_content = harmony_delta["reasoning"]
13691409
# tool_calls will use default factory (empty list)
13701410

13711411
# Handle regular content
@@ -1570,6 +1610,27 @@ def end_streaming(res):
15701610
try:
15711611
res = []
15721612
if done:
1613+
# Process any remaining tokens before sending final message
1614+
if output.token_ids_diff:
1615+
remaining_responses, _ = harmony_adapter.create_openai_streaming_response(
1616+
request_id=request_id,
1617+
tokens=output.token_ids_diff,
1618+
available_tools=tools_for_parser,
1619+
model_name=model,
1620+
tool_choice=tool_choice)
1621+
if first_iteration and remaining_responses:
1622+
first_delta = DeltaMessage(role="assistant")
1623+
choice = ChatCompletionResponseStreamChoice(
1624+
index=0, delta=first_delta)
1625+
first_response = ChatCompletionStreamResponse(
1626+
model=model,
1627+
choices=[choice],
1628+
)
1629+
response_json = first_response.model_dump_json(
1630+
exclude_none=True)
1631+
res.append(f"data: {response_json}\n\n")
1632+
res.extend(remaining_responses)
1633+
15731634
# Send final message with finish_reason
15741635
final_response = ChatCompletionStreamResponse(
15751636
model=model,
@@ -1708,6 +1769,7 @@ def _create_response_message(parsed_output: dict[str, Any]) -> dict[str, Any]:
17081769
# Add reasoning_content if present
17091770
if "reasoning" in parsed_output:
17101771
message["reasoning"] = parsed_output["reasoning"]
1772+
message["reasoning_content"] = parsed_output["reasoning"]
17111773

17121774
return message
17131775

tensorrt_llm/serve/openai_server.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,8 +1275,6 @@ async def create_streaming_generator(promise: RequestOutput,
12751275
disaggregated_params=disaggregated_params,
12761276
trace_headers=trace_headers,
12771277
)
1278-
postproc_args.request_id = promise.request_id
1279-
12801278
if not self.postproc_worker_enabled:
12811279
postproc_args.num_prompt_tokens = len(promise.prompt_token_ids)
12821280

tensorrt_llm/serve/postprocess_handlers.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -600,12 +600,17 @@ def chat_harmony_post_processor(
600600
@nvtx_range_debug("chat_harmony_streaming_post_processor")
601601
def chat_harmony_streaming_post_processor(
602602
rsp: GenerationResult, args: ChatCompletionPostprocArgs) -> List[str]:
603+
# Read the request ID directly from rsp.id instead of args.request_id.
604+
# Both are the same executor-assigned ID, but args.request_id is set too
605+
# late (after generate_async returns) for the postprocess worker path:
606+
# the worker receives a copy of args before the ID is assigned, so
607+
# args.request_id is always None with num_postprocess_workers > 0.
603608
response = handle_streaming_response(
604609
tools=args.tools,
605610
tool_choice=args.tool_choice,
606611
result=rsp,
607612
model=args.model,
608-
request_id=args.request_id,
613+
request_id=str(rsp.id),
609614
done=rsp._done,
610615
num_prompt_tokens=args.num_prompt_tokens,
611616
first_iteration=args.first_iteration,

0 commit comments

Comments
 (0)