Skip to content

Commit b425394

Browse files
jsonbailey and claude committed
feat: add judge evaluation support to agent graphs
Adds per-node judge evaluation to agent graph execution. Each AIAgentConfig now carries a pre-built Evaluator (mirroring AICompletionConfig) that the provider-specific AgentGraphRunner invokes after each node's model response. Results are tracked via the same AIConfigTracker used for that node's LLM metrics, ensuring evaluation data is correlated correctly.

Key changes:
- New Evaluator class coordinating multiple judges; evaluate() returns an asyncio Task so evaluation fires immediately and is awaited in flush()
- AIAgentConfig and AICompletionConfig carry an eager evaluator (kw_only field)
- LangGraph runner stores per-node eval tasks in _pending_eval_tasks and flushes them via the callback handler's async flush() method
- OpenAI runner fires judge evaluation at handoff and final-segment points
- client._build_evaluator() handles empty/None judge config via Evaluator.noop()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent da0c9c6 commit b425394

20 files changed

Lines changed: 311 additions & 115 deletions

packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,11 @@ def create_agent(self, config: Any, tools: Optional[ToolRegistry] = None) -> Lan
3939
)
4040
return LangChainAgentRunner(agent)
4141

42-
def create_agent_graph(self, graph_def: Any, tools: ToolRegistry) -> Any:
42+
def create_agent_graph(
43+
self,
44+
graph_def: Any,
45+
tools: ToolRegistry,
46+
) -> Any:
4347
"""
4448
CAUTION:
4549
This feature is experimental and should NOT be considered ready for production use.

packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_agent_graph_runner.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""LangGraph agent graph runner for LaunchDarkly AI SDK."""
22

3+
import asyncio
34
import time
4-
from typing import Annotated, Any, Dict, List, Optional, Set, Tuple
5+
from typing import Annotated, Any, Dict, List, Set, Tuple
56

67
from ldai import log
78
from ldai.agent_graph import AgentGraphDefinition, AgentGraphNode
@@ -67,7 +68,11 @@ class LangGraphAgentGraphRunner(AgentGraphRunner):
6768
Requires ``langgraph`` to be installed.
6869
"""
6970

70-
def __init__(self, graph: AgentGraphDefinition, tools: ToolRegistry):
71+
def __init__(
72+
self,
73+
graph: AgentGraphDefinition,
74+
tools: ToolRegistry,
75+
):
7176
"""
7277
Initialize the runner.
7378
@@ -79,6 +84,7 @@ def __init__(self, graph: AgentGraphDefinition, tools: ToolRegistry):
7984
self._compiled: Any = None
8085
self._fn_name_to_config_key: Dict[str, str] = {}
8186
self._node_keys: Set[str] = set()
87+
self._pending_eval_tasks: Dict[str, asyncio.Task] = {}
8288

8389
def _ensure_compiled(self) -> None:
8490
"""Build and cache the compiled graph if not already done."""
@@ -172,6 +178,18 @@ async def invoke(state: WorkflowState) -> dict:
172178
if node_instructions:
173179
msgs = [SystemMessage(content=node_instructions)] + msgs
174180
response = await bound_model.ainvoke(msgs)
181+
182+
node_obj = self._graph.get_node(nk)
183+
if node_obj is not None:
184+
input_text = '\r\n'.join(
185+
m.content if isinstance(m.content, str) else str(m.content)
186+
for m in msgs
187+
) if msgs else ''
188+
output_text = (
189+
response.content if hasattr(response, 'content') else str(response)
190+
)
191+
self._pending_eval_tasks[nk] = node_obj.get_config().evaluator.evaluate(input_text, output_text)
192+
175193
return {'messages': [response]}
176194

177195
invoke.__name__ = nk
@@ -280,6 +298,7 @@ async def run(self, input: Any) -> AgentGraphResult:
280298
:param input: The string prompt to send to the agent graph
281299
:return: AgentGraphResult with the final output and metrics
282300
"""
301+
self._pending_eval_tasks = {}
283302
tracker = self._graph.create_tracker() if self._graph.create_tracker is not None else None
284303
start_ns = time.perf_counter_ns()
285304

@@ -299,7 +318,7 @@ async def run(self, input: Any) -> AgentGraphResult:
299318
output = extract_last_message_content(messages)
300319

301320
# Flush per-node metrics to LD trackers
302-
handler.flush(self._graph)
321+
await handler.flush(self._graph, self._pending_eval_tasks)
303322

304323
# Graph-level metrics
305324
if tracker:

packages/ai-providers/server-ai-langchain/src/ldai_langchain/langgraph_callback_handler.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,13 +188,15 @@ def on_tool_end(
188188
# Flush
189189
# ------------------------------------------------------------------
190190

191-
def flush(self, graph: AgentGraphDefinition) -> None:
191+
async def flush(self, graph: AgentGraphDefinition, eval_tasks=None) -> None:
192192
"""
193193
Emit all collected per-node metrics to the LaunchDarkly trackers.
194194
195195
Call this once after the graph run completes.
196196
197197
:param graph: The AgentGraphDefinition whose nodes hold the LD config trackers.
198+
:param eval_tasks: Optional dict mapping node key to an awaitable that returns
199+
judge evaluation results.
198200
"""
199201
node_trackers: Dict[str, Any] = {}
200202
for node_key in self._path:
@@ -220,3 +222,15 @@ def flush(self, graph: AgentGraphDefinition) -> None:
220222

221223
for tool_key in self._node_tool_calls.get(node_key, []):
222224
config_tracker.track_tool_call(tool_key)
225+
226+
if not eval_tasks:
227+
continue
228+
229+
eval_task = eval_tasks.get(node_key)
230+
if not eval_task:
231+
continue
232+
233+
results = await eval_task
234+
for r in results:
235+
if r.success:
236+
config_tracker.track_judge_result(r)

packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
77

88
from ldai import LDMessage
9+
from ldai.evaluator import Evaluator
910

1011
from ldai_langchain import (
1112
LangChainModelRunner,
@@ -530,6 +531,7 @@ def sync_tool(x: str = '') -> str:
530531
cfg = AIAgentConfig(
531532
key='n',
532533
enabled=True,
534+
evaluator=Evaluator.noop(),
533535
create_tracker=MagicMock(),
534536
model=ModelConfig(
535537
name='gpt-4',
@@ -553,6 +555,7 @@ async def async_tool(x: str = '') -> str:
553555
cfg = AIAgentConfig(
554556
key='n',
555557
enabled=True,
558+
evaluator=Evaluator.noop(),
556559
create_tracker=MagicMock(),
557560
model=ModelConfig(
558561
name='gpt-4',

packages/ai-providers/server-ai-langchain/tests/test_langgraph_agent_graph_runner.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from unittest.mock import AsyncMock, MagicMock, patch
55

66
from ldai.agent_graph import AgentGraphDefinition
7+
from ldai.evaluator import Evaluator
78
from ldai.models import AIAgentGraphConfig, AIAgentConfig, ModelConfig, ProviderConfig
89
from ldai.providers import AgentGraphResult, ToolRegistry
910
from ldai_langchain.langgraph_agent_graph_runner import LangGraphAgentGraphRunner
@@ -20,6 +21,7 @@ def _make_graph(enabled: bool = True) -> AgentGraphDefinition:
2021
model=ModelConfig(name='gpt-4'),
2122
provider=ProviderConfig(name='openai'),
2223
instructions='You are a helpful assistant.',
24+
evaluator=Evaluator.noop(),
2325
)
2426
graph_config = AIAgentGraphConfig(
2527
key='test-graph',

packages/ai-providers/server-ai-langchain/tests/test_langgraph_callback_handler.py

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from ldai.agent_graph import AgentGraphDefinition
1818
from ldai.models import AIAgentConfig, AIAgentGraphConfig, ModelConfig, ProviderConfig
1919
from ldai.tracker import AIGraphTracker, LDAIConfigTracker, TokenUsage
20+
from ldai.evaluator import Evaluator
2021
from ldai_langchain.langgraph_callback_handler import LDMetricsCallbackHandler
2122

2223

@@ -48,6 +49,7 @@ def _make_graph(mock_ld_client: MagicMock, node_key: str = 'root-agent', graph_k
4849
node_config = AIAgentConfig(
4950
key=node_key,
5051
enabled=True,
52+
evaluator=Evaluator.noop(),
5153
model=ModelConfig(name='gpt-4', parameters={}),
5254
provider=ProviderConfig(name='openai'),
5355
instructions='Be helpful.',
@@ -317,7 +319,8 @@ def test_on_tool_end_none_name_ignored():
317319
# flush() tests
318320
# ---------------------------------------------------------------------------
319321

320-
def test_flush_emits_token_events_to_ld_tracker():
322+
@pytest.mark.asyncio
323+
async def test_flush_emits_token_events_to_ld_tracker():
321324
"""flush() calls track_tokens on the node's config tracker."""
322325
mock_ld_client = MagicMock()
323326
graph = _make_graph(mock_ld_client, node_key='root-agent', graph_key='g1')
@@ -327,7 +330,7 @@ def test_flush_emits_token_events_to_ld_tracker():
327330
node_run_id = uuid4()
328331
handler.on_chain_start({}, {}, run_id=node_run_id, name='root-agent')
329332
handler.on_llm_end(_llm_result(15, 10, 5), run_id=uuid4(), parent_run_id=node_run_id)
330-
handler.flush(graph)
333+
await handler.flush(graph)
331334

332335
ev = _events(mock_ld_client)
333336
assert ev['$ld:ai:tokens:total'][0][1] == 15
@@ -336,7 +339,8 @@ def test_flush_emits_token_events_to_ld_tracker():
336339
assert ev['$ld:ai:generation:success'][0][1] == 1
337340

338341

339-
def test_flush_emits_duration():
342+
@pytest.mark.asyncio
343+
async def test_flush_emits_duration():
340344
"""flush() calls track_duration when duration was recorded."""
341345
mock_ld_client = MagicMock()
342346
graph = _make_graph(mock_ld_client)
@@ -346,13 +350,14 @@ def test_flush_emits_duration():
346350
run_id = uuid4()
347351
handler.on_chain_start({}, {}, run_id=run_id, name='root-agent')
348352
handler.on_chain_end({}, run_id=run_id)
349-
handler.flush(graph)
353+
await handler.flush(graph)
350354

351355
ev = _events(mock_ld_client)
352356
assert '$ld:ai:duration:total' in ev
353357

354358

355-
def test_flush_emits_tool_calls():
359+
@pytest.mark.asyncio
360+
async def test_flush_emits_tool_calls():
356361
"""flush() calls track_tool_call for each recorded tool invocation."""
357362
mock_ld_client = MagicMock()
358363
graph = _make_graph(mock_ld_client)
@@ -366,15 +371,16 @@ def test_flush_emits_tool_calls():
366371
tools_run_id = uuid4()
367372
handler.on_chain_start({}, {}, run_id=tools_run_id, name='root-agent__tools')
368373
handler.on_tool_end('r', run_id=uuid4(), parent_run_id=tools_run_id, name='fn_search')
369-
handler.flush(graph)
374+
await handler.flush(graph)
370375

371376
ev = _events(mock_ld_client)
372377
tool_events = ev.get('$ld:ai:tool_call', [])
373378
assert len(tool_events) == 1
374379
assert tool_events[0][0]['toolKey'] == 'search'
375380

376381

377-
def test_flush_includes_graph_key_in_node_events():
382+
@pytest.mark.asyncio
383+
async def test_flush_includes_graph_key_in_node_events():
378384
"""flush() passes graph_key to the node tracker so graphKey appears in events."""
379385
mock_ld_client = MagicMock()
380386
graph = _make_graph(mock_ld_client, graph_key='my-graph')
@@ -384,14 +390,15 @@ def test_flush_includes_graph_key_in_node_events():
384390
node_run_id = uuid4()
385391
handler.on_chain_start({}, {}, run_id=node_run_id, name='root-agent')
386392
handler.on_llm_end(_llm_result(5, 3, 2), run_id=uuid4(), parent_run_id=node_run_id)
387-
handler.flush(graph)
393+
await handler.flush(graph)
388394

389395
ev = _events(mock_ld_client)
390396
token_data = ev['$ld:ai:tokens:total'][0][0]
391397
assert token_data.get('graphKey') == 'my-graph'
392398

393399

394-
def test_flush_with_no_graph_key_on_node_tracker():
400+
@pytest.mark.asyncio
401+
async def test_flush_with_no_graph_key_on_node_tracker():
395402
"""When node tracker has no graph_key, events omit graphKey."""
396403
mock_ld_client = MagicMock()
397404
context = MagicMock()
@@ -408,6 +415,7 @@ def test_flush_with_no_graph_key_on_node_tracker():
408415
node_config = AIAgentConfig(
409416
key='root-agent',
410417
enabled=True,
418+
evaluator=Evaluator.noop(),
411419
model=ModelConfig(name='gpt-4', parameters={}),
412420
provider=ProviderConfig(name='openai'),
413421
instructions='Be helpful.',
@@ -432,29 +440,31 @@ def test_flush_with_no_graph_key_on_node_tracker():
432440
node_run_id = uuid4()
433441
handler.on_chain_start({}, {}, run_id=node_run_id, name='root-agent')
434442
handler.on_llm_end(_llm_result(5, 3, 2), run_id=uuid4(), parent_run_id=node_run_id)
435-
handler.flush(graph)
443+
await handler.flush(graph)
436444

437445
ev = _events(mock_ld_client)
438446
token_data = ev['$ld:ai:tokens:total'][0][0]
439447
assert 'graphKey' not in token_data
440448

441449

442-
def test_flush_skips_nodes_not_in_path():
450+
@pytest.mark.asyncio
451+
async def test_flush_skips_nodes_not_in_path():
443452
"""flush() only emits events for nodes that were actually executed."""
444453
mock_ld_client = MagicMock()
445454
graph = _make_graph(mock_ld_client)
446455
tracker = graph.create_tracker()
447456

448457
# Handler with 'root-agent' in node_keys but never started
449458
handler = LDMetricsCallbackHandler({'root-agent'}, {})
450-
handler.flush(graph)
459+
await handler.flush(graph)
451460

452461
ev = _events(mock_ld_client)
453462
assert '$ld:ai:tokens:total' not in ev
454463
assert '$ld:ai:generation:success' not in ev
455464

456465

457-
def test_flush_skips_node_without_tracker():
466+
@pytest.mark.asyncio
467+
async def test_flush_skips_node_without_tracker():
458468
"""flush() silently skips nodes whose config has no tracker."""
459469
mock_ld_client = MagicMock()
460470
context = MagicMock()
@@ -463,6 +473,7 @@ def test_flush_skips_node_without_tracker():
463473
key='no-track',
464474
enabled=True,
465475
create_tracker=lambda: None,
476+
evaluator=Evaluator.noop(),
466477
model=ModelConfig(name='gpt-4', parameters={}),
467478
provider=ProviderConfig(name='openai'),
468479
instructions='',
@@ -483,7 +494,7 @@ def test_flush_skips_node_without_tracker():
483494
node_run_id = uuid4()
484495
handler.on_chain_start({}, {}, run_id=node_run_id, name='no-track')
485496
handler.on_llm_end(_llm_result(5, 3, 2), run_id=uuid4(), parent_run_id=node_run_id)
486-
handler.flush(graph) # should not raise
497+
await handler.flush(graph) # should not raise
487498

488499
mock_ld_client.track.assert_not_called()
489500

0 commit comments

Comments
 (0)