Skip to content

Commit fd518ff

Browse files
committed
fix(agent): filter AIMessage state updates from streaming output
LangGraph's stream_mode="messages" emits both AIMessageChunk (incremental tokens) and AIMessage (final state update) from the agent node. The _stream_fn was accepting both via isinstance(msg, (AIMessage, AIMessageChunk)), causing the full accumulated response to be emitted as a final chunk after all the individual tokens had already been streamed. Clients saw the complete response duplicated at the end of the SSE stream. Filter to only AIMessageChunk so the state update is excluded. Adds a regression test that confirms AIMessage objects are emitted by the graph stream (the duplicate source) and that filtering to AIMessageChunk excludes them. Signed-off-by: Myles Shannon <mshannon@nvidia.com>
1 parent ee7ab31 commit fd518ff

2 files changed

Lines changed: 41 additions & 2 deletions

File tree

packages/nvidia_nat_langchain/src/nat/plugins/langchain/agent/tool_calling_agent/register.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,6 @@ class ToolCallAgentWorkflowConfig(AgentBaseConfig, name="tool_calling_agent"):
101101

102102
@register_function(config_type=ToolCallAgentWorkflowConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN])
103103
async def tool_calling_agent_workflow(config: ToolCallAgentWorkflowConfig, builder: Builder):
104-
from langchain_core.messages import AIMessage
105104
from langchain_core.messages import AIMessageChunk
106105
from langchain_core.messages import trim_messages
107106
from langchain_core.messages.base import BaseMessage
@@ -219,7 +218,7 @@ async def _stream_fn(chat_request_or_message: ChatRequestOrMessage) -> AsyncGene
219218
state,
220219
config={'recursion_limit': (config.max_iterations + 1) * 2},
221220
stream_mode="messages"):
222-
if not isinstance(msg, (AIMessage, AIMessageChunk)):
221+
if not isinstance(msg, AIMessageChunk):
223222
continue
224223
if metadata.get("langgraph_node") != "agent":
225224
continue

packages/nvidia_nat_langchain/tests/agent/test_tool_calling.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,46 @@ async def test_graph_astream_yields_message_chunks(mock_tool_graph):
367367
assert len(combined_content) > 0, "Expected non-empty content from streamed agent messages"
368368

369369

370+
async def test_stream_fn_no_duplicate_content(mock_tool_graph):
371+
"""Regression: streaming must not duplicate the previous assistant message as a final chunk.
372+
373+
When stream=true, _stream_fn uses graph.astream(stream_mode="messages") which emits
374+
both AIMessageChunk (incremental tokens) and AIMessage (state update). Accepting
375+
AIMessage causes the accumulated response to appear twice in the output. The fix
376+
filters to AIMessageChunk only. This test exercises the same graph.astream path and
377+
asserts that the filtering logic in _stream_fn would prevent duplicates.
378+
"""
379+
from langchain_core.messages import AIMessageChunk
380+
381+
prior_reply = "Hi there!"
382+
mock_state = ToolCallAgentGraphState(messages=[
383+
HumanMessage(content="hello"),
384+
AIMessage(content=prior_reply),
385+
HumanMessage(content="what can you do?"),
386+
])
387+
388+
chunk_contents = []
389+
full_contents = []
390+
async for msg, metadata in mock_tool_graph.astream(
391+
mock_state, config={"recursion_limit": 5}, stream_mode="messages"):
392+
if metadata.get("langgraph_node") != "agent":
393+
continue
394+
if isinstance(msg, AIMessageChunk) and isinstance(msg.content, str) and msg.content:
395+
chunk_contents.append(msg.content)
396+
if hasattr(msg, "content") and isinstance(msg.content, str) and msg.content:
397+
full_contents.append(msg.content)
398+
399+
chunk_response = "".join(chunk_contents)
400+
full_response = "".join(full_contents)
401+
402+
assert prior_reply in full_response, (
403+
"AIMessage state update with prior reply should appear in unfiltered stream"
404+
)
405+
assert prior_reply not in chunk_response, (
406+
f"AIMessageChunk-only stream must not contain prior assistant reply: {chunk_response!r}"
407+
)
408+
409+
370410
def test_tool_call_chunk_serialization():
371411
"""Test that ChatResponseChunk with tool_calls in ChoiceDelta serializes to OpenAI-compatible SSE format."""
372412
chunk = ChatResponseChunk(

0 commit comments

Comments (0)