fix: Replace fan-out static edges with LLM-driven handoff routing

jsonbailey · claude · jsonbailey · commit 85ed502d7dc1 · 2026-04-06T10:59:26.000-05:00
Multi-child nodes previously used static add_edge calls, causing LangGraph to fan out to all children in parallel. Replace them with handoff tools (Command(goto=child_key) via @tool + InjectedToolCallId) so the LLM picks exactly one child per turn. Bind tools on handoff nodes with parallel_tool_calls=False to prevent the model from selecting multiple destinations in a single response. Switch WorkflowState.messages to the add_messages reducer (deduplicates by ID) and add recursion_limit=25 as a safety cap. Add a test asserting single-child routing in a 3-node orchestrator graph. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py
@@ -1,6 +1,5 @@
 """LangGraph agent graph runner for LaunchDarkly AI SDK."""
 
-import operator
 import time
 from typing import Annotated, Any, Dict, List, Tuple
 
@@ -25,6 +24,40 @@ def _tool_call_id_from_entry(tc: Any) -> Any:
     return getattr(tc, 'id', None)
 
 
+def _make_handoff_tool(child_key: str, description: str) -> Any:
+    """
+    Create a tool that transfers control to ``child_key``.
+
+    Named ``transfer_to_<child_key>`` (hyphens become underscores); ``description`` is the tool description.
+    Uses the ``@tool`` decorator with ``InjectedState`` + ``InjectedToolCallId`` so LangGraph's ToolNode
+    handles the ``Command(goto=child_key)`` return value correctly.  The tool explicitly creates a
+    ToolMessage in ``Command.update`` to satisfy the LangChain/OpenAI message-chain contract.
+    """
+    from typing import Annotated as _Annotated
+
+    from langchain_core.messages import ToolMessage
+    from langchain_core.tools import tool
+    from langchain_core.tools.base import InjectedToolCallId
+    from langgraph.prebuilt import InjectedState
+    from langgraph.types import Command
+
+    tool_name = f"transfer_to_{child_key.replace('-', '_')}"  # only '-' is rewritten; other characters pass through
+
+    @tool(tool_name, description=description)
+    def handoff(
+        state: _Annotated[Any, InjectedState],  # noqa: ARG001
+        tool_call_id: _Annotated[str, InjectedToolCallId],
+    ) -> Command:
+        tool_message = ToolMessage(
+            content=f'Transferred to {child_key}',
+            name=tool_name,
+            tool_call_id=tool_call_id,
+        )
+        return Command(goto=child_key, update={'messages': [tool_message]})
+
+    return handoff
+
+
 def _coalesce_tool_messages_for_openai(msgs: List[Any]) -> List[Any]:
     """
     Rewind shared LangGraph message state into OpenAI's required shape.
@@ -168,11 +201,12 @@ def _build_graph(self) -> Tuple[Any, Dict[str, str]]:
         """
         from langchain_core.messages import SystemMessage
         from langgraph.graph import END, START, StateGraph
+        from langgraph.graph.message import add_messages
         from langgraph.prebuilt import ToolNode, tools_condition
         from typing_extensions import TypedDict
 
         class WorkflowState(TypedDict):
-            messages: Annotated[List[Any], operator.add]
+            messages: Annotated[List[Any], add_messages]
 
         agent_builder: StateGraph = StateGraph(WorkflowState)
         root_node = self._graph.root()
@@ -184,22 +218,16 @@ class WorkflowState(TypedDict):
         def handle_traversal(node: AgentGraphNode, ctx: dict) -> None:
             node_config = node.get_config()
             node_key = node.get_key()
+            instructions = node_config.instructions if hasattr(node_config, 'instructions') else None
+            outgoing_edges = node.get_edges()
 
+            lc_model = None
             tool_fns: list = []
-            model = None
-            instructions = node_config.instructions if hasattr(node_config, 'instructions') else None
             if node_config.model:
                 # We send an empty tool registry to avoid binding tools to the model.
                 lc_model = create_langchain_model(node_config, tool_registry=None)
 
-                # Retrieve tool definitions to build fn_name_to_config_key map
-                config_dict = node_config.to_dict()
-                model_dict = config_dict.get('model') or {}
-                parameters = dict(model_dict.get('parameters') or {})
-                tool_defs = parameters.get('tools', []) or []
-
                 tool_fns = build_structured_tools(node_config, tools_ref)
-                model = lc_model.bind_tools(tool_fns) if tool_fns else lc_model
 
                 # Map tool name -> LD config key for callback attribution.
                 # build_structured_tools returns StructuredTool instances with tool.name set
@@ -209,6 +237,33 @@ def handle_traversal(node: AgentGraphNode, ctx: dict) -> None:
                     if tool_name:
                         fn_name_to_config_key[tool_name] = tool_name
 
+            # For nodes with multiple children, create a handoff tool per child so the
+            # LLM decides which agent to route to.  Uses Command(goto=child_key) so
+            # LangGraph routes to the target without looping back here.
+            handoff_fns: list = []
+            if lc_model and len(outgoing_edges) > 1:
+                for edge in outgoing_edges:
+                    child_node = self._graph.get_node(edge.target_config)
+                    description = (
+                        (edge.handoff or {}).get('description')
+                        or (
+                            child_node.get_config().instructions[:120]
+                            if child_node and child_node.get_config().instructions
+                            else None
+                        )
+                        or f"Transfer control to {edge.target_config}"
+                    )
+                    handoff_fns.append(_make_handoff_tool(edge.target_config, description))
+
+            all_tools = tool_fns + handoff_fns
+            if lc_model and all_tools:
+                # When handoff tools are present, disable parallel tool calls so the LLM
+                # picks exactly one destination rather than routing to multiple children.
+                bind_kwargs = {'parallel_tool_calls': False} if handoff_fns else {}
+                model = lc_model.bind_tools(all_tools, **bind_kwargs)
+            else:
+                model = lc_model
+
             def make_node_fn(bound_model: Any, node_instructions: Any, nk: str):
                 async def invoke(state: WorkflowState) -> dict:
                     if not bound_model:
@@ -234,30 +289,45 @@ async def invoke(state: WorkflowState) -> dict:
             if node_key == root_key:
                 agent_builder.add_edge(START, node_key)
 
-            outgoing_edges = node.get_edges()
-
             # Collect node info for graph structure log
             tool_names = [str(getattr(t, 'name', None) or getattr(t, '__name__', t)) for t in tool_fns]
             edge_targets = [e.target_config for e in outgoing_edges]
             node_desc = node_key
             if tool_names:
                 node_desc += f"[tools:{','.join(tool_names)}]"
-            node_desc += f"→{','.join(edge_targets)}" if edge_targets else "(terminal)"
+            if handoff_fns:
+                node_desc += f"[handoff:{','.join(edge_targets)}]"
+            elif edge_targets:
+                node_desc += f"→{','.join(edge_targets)}"
+            else:
+                node_desc += "(terminal)"
             graph_structure.append(node_desc)
 
-            if tool_fns:
-                # Pair this node with a ToolNode and loop it back (standard LangGraph pattern).
-                # tools_condition routes to "tools" when the response has tool calls,
-                # and to END otherwise; the path_map redirects those to our named nodes.
+            if all_tools:
+                # ToolNode handles Command returns from handoff tools, routing to the target
+                # node.  For functional tools it returns normal ToolMessages and we loop back.
+                # tools_condition exits to END when no tool is called.
                 tools_node_key = f"{node_key}__tools"
-                after_loop = outgoing_edges[0].target_config if outgoing_edges else END
-                agent_builder.add_node(tools_node_key, ToolNode(tool_fns))
-                agent_builder.add_edge(tools_node_key, node_key)
-                agent_builder.add_conditional_edges(
-                    node_key,
-                    tools_condition,
-                    {"tools": tools_node_key, END: after_loop},
-                )
+                agent_builder.add_node(tools_node_key, ToolNode(all_tools))
+
+                if not handoff_fns:
+                    # No handoff tools: standard loop-back after tool execution.
+                    after_loop = outgoing_edges[0].target_config if outgoing_edges else END
+                    agent_builder.add_edge(tools_node_key, node_key)
+                    agent_builder.add_conditional_edges(
+                        node_key,
+                        tools_condition,
+                        {"tools": tools_node_key, END: after_loop},
+                    )
+                else:
+                    # Handoff tools use Command(goto=child_key) — LangGraph routes to the
+                    # target directly without any extra edge.  The ToolNode does NOT loop
+                    # back here.  tools_condition exits to END when no tool is called.
+                    agent_builder.add_conditional_edges(
+                        node_key,
+                        tools_condition,
+                        {"tools": tools_node_key, END: END},
+                    )
             else:
                 if node.is_terminal():
                     agent_builder.add_edge(node_key, END)
@@ -276,14 +346,6 @@ async def invoke(state: WorkflowState) -> dict:
         )
 
         compiled = agent_builder.compile()
-        # try:
-        #     image_data = compiled.get_graph().draw_mermaid_png()
-        #     out_path = f"{graph_key_str}_langgraph.png"
-        #     with open(out_path, mode='wb') as f:
-        #         f.write(image_data)
-        # except Exception as exc:
-        #     log.debug('LangGraphAgentGraphRunner: could not write graph PNG (%s)', exc)
-
         return compiled, fn_name_to_config_key
 
     async def run(self, input: Any) -> AgentGraphResult:
@@ -310,7 +372,7 @@ async def run(self, input: Any) -> AgentGraphResult:
 
             result = await compiled.ainvoke(  # type: ignore[call-overload]
                 {'messages': [HumanMessage(content=str(input))]},
-                config={'callbacks': [handler]},
+                config={'callbacks': [handler], 'recursion_limit': 25},
             )
 
             duration = (time.perf_counter_ns() - start_ns) // 1_000_000
diff --git a/packages/ai-providers/server-ai-langchain/tests/test_tracking_langgraph.py b/packages/ai-providers/server-ai-langchain/tests/test_tracking_langgraph.py
@@ -490,3 +490,113 @@ def model_factory(node_config, **kwargs):
     path_data = ev['$ld:ai:graph:path'][0][0]
     assert 'root-agent' in path_data['path']
     assert 'child-agent' in path_data['path']
+
+
+def _make_multi_child_graph(mock_ld_client: MagicMock) -> 'AgentGraphDefinition':
+    """Build an enabled 3-node graph (orchestrator → agent-a, orchestrator → agent-b) whose trackers use ``mock_ld_client``."""
+    context = MagicMock()  # one mock context shared by every tracker and the graph definition
+
+    def _node_tracker(key: str) -> LDAIConfigTracker:
+        return LDAIConfigTracker(
+            ld_client=mock_ld_client,
+            variation_key='test-variation',
+            config_key=key,
+            version=1,
+            model_name='gpt-4',
+            provider_name='openai',
+            context=context,
+        )
+
+    graph_tracker = AIGraphTracker(
+        ld_client=mock_ld_client,
+        variation_key='test-variation',
+        graph_key='multi-child-graph',
+        version=1,
+        context=context,
+    )
+
+    configs = {  # keyed by node key; each node gets its own tracker
+        'orchestrator': AIAgentConfig(
+            key='orchestrator',
+            enabled=True,
+            model=ModelConfig(name='gpt-4', parameters={}),
+            provider=ProviderConfig(name='openai'),
+            instructions='Route to the appropriate specialist agent.',
+            tracker=_node_tracker('orchestrator'),
+        ),
+        'agent-a': AIAgentConfig(
+            key='agent-a',
+            enabled=True,
+            model=ModelConfig(name='gpt-4', parameters={}),
+            provider=ProviderConfig(name='openai'),
+            instructions='You handle topic A.',
+            tracker=_node_tracker('agent-a'),
+        ),
+        'agent-b': AIAgentConfig(
+            key='agent-b',
+            enabled=True,
+            model=ModelConfig(name='gpt-4', parameters={}),
+            provider=ProviderConfig(name='openai'),
+            instructions='You handle topic B.',
+            tracker=_node_tracker('agent-b'),
+        ),
+    }
+
+    edges = [  # two outgoing edges from the orchestrator, i.e. a multi-child node
+        Edge(key='orch-to-a', source_config='orchestrator', target_config='agent-a'),
+        Edge(key='orch-to-b', source_config='orchestrator', target_config='agent-b'),
+    ]
+    graph_config = AIAgentGraphConfig(
+        key='multi-child-graph',
+        root_config_key='orchestrator',
+        edges=edges,
+        enabled=True,
+    )
+    nodes = AgentGraphDefinition.build_nodes(graph_config, configs)
+    return AgentGraphDefinition(
+        agent_graph=graph_config,
+        nodes=nodes,
+        context=context,
+        enabled=True,
+        tracker=graph_tracker,
+    )
+
+
+@pytest.mark.asyncio
+async def test_multi_child_routes_via_handoff_not_fan_out():
+    """Orchestrator with two children routes to exactly one child via handoff tool,
+    not a fan-out that invokes both children (agent-b's model must stay uncalled)."""
+    from langchain_core.messages import AIMessage
+
+    mock_ld_client = MagicMock()
+    graph = _make_multi_child_graph(mock_ld_client)
+
+    # Orchestrator calls transfer_to_agent_a (handoff tool name derived from child key 'agent-a')
+    orchestrator_response = AIMessage(
+        content='',
+        tool_calls=[{
+            'name': 'transfer_to_agent_a',
+            'args': {},
+            'id': 'call_handoff_1',
+            'type': 'tool_call',
+        }],
+    )
+    agent_a_response = _make_fake_response('Agent A handled it.')
+    agent_b_model = _mock_model(_make_fake_response('Agent B handled it.'))  # kept so the last assertion can inspect it
+
+    def model_factory(node_config, **kwargs):
+        if node_config.key == 'orchestrator':
+            return _mock_model(orchestrator_response)
+        if node_config.key == 'agent-a':
+            return _mock_model(agent_a_response)
+        return agent_b_model
+
+    with patch('ldai_langchain.langgraph_agent_graph_runner.create_langchain_model',
+               side_effect=model_factory):
+        runner = LangGraphAgentGraphRunner(graph, {})
+        result = await runner.run('hello')
+
+    assert result.metrics.success is True
+    assert 'Agent A' in result.output
+    # Agent B's model must never have been invoked — no fan-out
+    agent_b_model.ainvoke.assert_not_called()