Skip to content

Commit 430b67e

Browse files
jsonbaileyclaude
andcommitted
feat: Add evaluations support to ManagedAgent.run()
Wire judge evaluations into ManagedAgent.run() via an asyncio.Task, mirroring ManagedModel.run(). Awaiting result.evaluations guarantees that both the evaluation and tracker.track_judge_result() have completed. run() returns as soon as the runner finishes, without waiting for the judges; the evaluations task resolves asynchronously. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 8a049e2 commit 430b67e

2 files changed

Lines changed: 226 additions & 20 deletions

File tree

packages/sdk/server-ai/src/ldai/managed_agent.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
11
"""ManagedAgent — LaunchDarkly managed wrapper for agent invocations."""
22

3-
from typing import Union
3+
import asyncio
4+
from typing import List, Union
45

56
from ldai.models import AIAgentConfig
67
from ldai.providers import AgentRunner
78
from ldai.providers.runner import Runner
8-
from ldai.providers.types import ManagedResult, RunnerResult
9+
from ldai.providers.types import JudgeResult, ManagedResult, RunnerResult
10+
from ldai.tracker import LDAIConfigTracker
911

1012

1113
class ManagedAgent:
1214
"""
1315
LaunchDarkly managed wrapper for AI agent invocations.
1416
15-
Holds an AgentRunner or Runner. Handles tracking automatically via
16-
``create_tracker()``.
17+
Holds an AgentRunner or Runner. Handles tracking and judge evaluation
18+
dispatch automatically via ``create_tracker()``.
1719
Obtain an instance via ``LDAIClient.create_agent()``.
1820
"""
1921

@@ -29,20 +31,46 @@ async def run(self, input: str) -> ManagedResult:
2931
"""
3032
Run the agent with the given input string.
3133
34+
Invokes the runner, tracks metrics, and dispatches judge evaluations
35+
asynchronously. Returns without waiting for the judges; awaiting ``result.evaluations``
36+
guarantees both evaluation and tracking complete.
37+
3238
:param input: The user prompt or input to the agent
33-
:return: ManagedResult containing the agent's output and metric summary
39+
:return: ManagedResult containing the agent's output, metric summary,
40+
and an optional evaluations task
3441
"""
3542
tracker = self._ai_config.create_tracker()
3643
result: RunnerResult = await tracker.track_metrics_of_async(
3744
lambda r: r.metrics,
3845
lambda: self._agent_runner.run(input),
3946
)
47+
48+
evaluations_task = self._track_judge_results(tracker, input, result.content)
49+
4050
return ManagedResult(
4151
content=result.content,
4252
metrics=tracker.get_summary(),
4353
raw=result.raw,
54+
evaluations=evaluations_task,
4455
)
4556

57+
def _track_judge_results(
    self,
    tracker: LDAIConfigTracker,
    input_text: str,
    output_text: str,
) -> asyncio.Task[List[JudgeResult]]:
    """
    Start judge evaluation for one run and track each successful result.

    Calls ``self._ai_config.evaluator.evaluate()`` with the input and output
    text, then wraps what it returns in a new task that awaits it and passes
    every result whose ``success`` flag is truthy to
    ``tracker.track_judge_result()``. Results that did not succeed are still
    returned, but are not tracked.

    Uses ``asyncio.create_task()``, so it must be called while an event loop
    is running.

    :param tracker: Tracker that records the successful judge results
    :param input_text: The input that was given to the agent
    :param output_text: The content the agent produced
    :return: Task resolving to every judge result, successful or not
    """
    pending_evaluation = self._ai_config.evaluator.evaluate(input_text, output_text)

    async def _await_then_track(evaluation: asyncio.Task) -> List[JudgeResult]:
        judge_results = await evaluation
        for judged in judge_results:
            if not judged.success:
                continue
            tracker.track_judge_result(judged)
        return judge_results

    return asyncio.create_task(_await_then_track(pending_evaluation))
73+
4674
def get_agent_runner(self) -> Union[Runner, AgentRunner]:
4775
"""
4876
Return the underlying runner for advanced use.

packages/sdk/server-ai/tests/test_managed_agent.py

Lines changed: 193 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
"""Tests for ManagedAgent."""
22

3+
import asyncio
34
import pytest
5+
from typing import List
46
from unittest.mock import AsyncMock, MagicMock
57

68
from ldai import LDAIClient, ManagedAgent
9+
from ldai.evaluator import Evaluator
710
from ldai.managed_agent import ManagedAgent
811
from ldai.models import AIAgentConfig, AIAgentConfigDefault, ModelConfig, ProviderConfig
9-
from ldai.providers.types import LDAIMetrics, ManagedResult, RunnerResult
10-
from ldai.tracker import LDAIMetricSummary
12+
from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult, RunnerResult
13+
from ldai.tracker import LDAIConfigTracker, LDAIMetricSummary
1114

1215
from ldclient import Config, Context, LDClient
1316
from ldclient.integrations.test_data import TestData
@@ -19,6 +22,23 @@ def _make_summary(success: bool = True) -> LDAIMetricSummary:
1922
return summary
2023

2124

25+
def _make_noop_evaluator_config() -> MagicMock:
    """
    Return a mock AIAgentConfig wired to a mock tracker and a noop evaluator.

    The tracker's ``track_metrics_of_async`` resolves to a successful
    RunnerResult with content ``"Test response"``, and ``get_summary``
    returns a successful summary.
    """
    runner_result = RunnerResult(
        content="Test response",
        raw=None,
        metrics=LDAIMetrics(success=True, usage=None),
    )

    tracker = MagicMock(spec=LDAIConfigTracker)
    tracker.track_metrics_of_async = AsyncMock(return_value=runner_result)
    tracker.get_summary = MagicMock(return_value=_make_summary(True))

    config = MagicMock(spec=AIAgentConfig)
    config.create_tracker = MagicMock(return_value=tracker)
    config.evaluator = Evaluator.noop()
    return config
40+
41+
2242
@pytest.fixture
2343
def td() -> TestData:
2444
td = TestData.data_source()
@@ -60,17 +80,7 @@ class TestManagedAgentRun:
6080
@pytest.mark.asyncio
6181
async def test_run_delegates_to_agent_runner(self):
6282
"""Should delegate run() to the underlying AgentRunner and return ManagedResult."""
63-
mock_config = MagicMock(spec=AIAgentConfig)
64-
mock_tracker = MagicMock()
65-
mock_tracker.track_metrics_of_async = AsyncMock(
66-
return_value=RunnerResult(
67-
content="Test response",
68-
metrics=LDAIMetrics(success=True, usage=None),
69-
raw=None,
70-
)
71-
)
72-
mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
73-
mock_config.create_tracker = MagicMock(return_value=mock_tracker)
83+
mock_config = _make_noop_evaluator_config()
7484
mock_runner = MagicMock()
7585
mock_runner.run = AsyncMock(
7686
return_value=RunnerResult(
@@ -87,13 +97,16 @@ async def test_run_delegates_to_agent_runner(self):
8797
assert result.content == "Test response"
8898
assert result.metrics.success is True
8999
mock_config.create_tracker.assert_called_once()
90-
mock_tracker.track_metrics_of_async.assert_called_once()
100+
mock_config.create_tracker.return_value.track_metrics_of_async.assert_called_once()
101+
# evaluations should be present (from noop evaluator)
102+
if result.evaluations is not None:
103+
await result.evaluations
91104

92105
@pytest.mark.asyncio
93106
async def test_run_uses_create_tracker_for_fresh_tracker(self):
94107
"""Should use create_tracker() factory for a fresh tracker per invocation."""
95108
mock_config = MagicMock(spec=AIAgentConfig)
96-
fresh_tracker = MagicMock()
109+
fresh_tracker = MagicMock(spec=LDAIConfigTracker)
97110
fresh_tracker.track_metrics_of_async = AsyncMock(
98111
return_value=RunnerResult(
99112
content="Fresh tracker response",
@@ -103,6 +116,7 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self):
103116
)
104117
fresh_tracker.get_summary = MagicMock(return_value=_make_summary(True))
105118
mock_config.create_tracker = MagicMock(return_value=fresh_tracker)
119+
mock_config.evaluator = Evaluator.noop()
106120

107121
mock_runner = MagicMock()
108122

@@ -113,6 +127,8 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self):
113127
assert result.content == "Fresh tracker response"
114128
mock_config.create_tracker.assert_called_once()
115129
fresh_tracker.track_metrics_of_async.assert_called_once()
130+
if result.evaluations is not None:
131+
await result.evaluations
116132

117133
def test_get_agent_runner_returns_runner(self):
118134
"""Should return the underlying AgentRunner."""
@@ -129,6 +145,168 @@ def test_get_config_returns_config(self):
129145
assert agent.get_config() is mock_config
130146

131147

148+
class TestManagedAgentEvaluations:
    """Tests for the judge-evaluation chain that ManagedAgent.run() attaches to its result."""

    @staticmethod
    def _build_agent(evaluate_coro):
        """
        Build a ManagedAgent whose evaluator schedules ``evaluate_coro`` as a task.

        Shared by every test in this class that needs a custom evaluator, so
        the mock wiring lives in one place instead of being repeated per test.

        :param evaluate_coro: Coroutine function taking ``(input_text, output_text)``
            and returning a list of JudgeResult
        :return: Tuple of ``(agent, mock_tracker)``
        """
        mock_evaluator = MagicMock(spec=Evaluator)
        mock_evaluator.evaluate = MagicMock(
            side_effect=lambda i, o: asyncio.create_task(evaluate_coro(i, o))
        )

        mock_tracker = MagicMock(spec=LDAIConfigTracker)
        mock_tracker.track_metrics_of_async = AsyncMock(
            return_value=RunnerResult(content="resp", raw=None, metrics=LDAIMetrics(success=True))
        )
        mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
        mock_tracker.track_judge_result = MagicMock()

        mock_config = MagicMock(spec=AIAgentConfig)
        mock_config.create_tracker = MagicMock(return_value=mock_tracker)
        mock_config.evaluator = mock_evaluator

        mock_runner = MagicMock()
        return ManagedAgent(mock_config, mock_runner), mock_tracker

    @pytest.mark.asyncio
    async def test_run_returns_before_evaluations_resolve(self):
        """run() should return before evaluations complete."""
        barrier = asyncio.Event()

        async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]:
            await barrier.wait()
            return []

        agent, _ = self._build_agent(_slow_evaluate)
        result = await agent.run("Hello")

        try:
            assert result is not None
            assert result.evaluations is not None
            assert not result.evaluations.done(), "evaluations task should still be pending"
        finally:
            # Always release the evaluator so a failed assertion does not
            # leave a task blocked on the barrier.
            barrier.set()

        await result.evaluations

    @pytest.mark.asyncio
    async def test_await_evaluations_collects_results(self):
        """await result.evaluations should return the list of JudgeResult instances."""
        judge_result = JudgeResult(
            judge_config_key='judge-key',
            success=True,
            sampled=True,
            metric_key='$ld:ai:judge:relevance',
            score=0.9,
            reasoning='Good agent response',
        )

        async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult]:
            return [judge_result]

        agent, _ = self._build_agent(_evaluate_coro)
        result = await agent.run("Hello")

        results = await result.evaluations  # type: ignore[misc]
        assert results == [judge_result]

    @pytest.mark.asyncio
    async def test_tracking_fires_inside_awaited_chain(self):
        """tracker.track_judge_result() must be called when evaluations are awaited."""
        judge_result = JudgeResult(
            judge_config_key='agent-judge',
            success=True,
            sampled=True,
            metric_key='$ld:ai:judge:relevance',
            score=0.85,
        )

        async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult]:
            return [judge_result]

        agent, mock_tracker = self._build_agent(_evaluate_coro)
        result = await agent.run("Hello")

        # Tracking should NOT have fired yet (before we await evaluations)
        mock_tracker.track_judge_result.assert_not_called()

        # Now await the evaluations task — tracking fires inside the chain
        await result.evaluations  # type: ignore[misc]

        mock_tracker.track_judge_result.assert_called_once_with(judge_result)

    @pytest.mark.asyncio
    async def test_noop_evaluator_returns_empty_list(self):
        """With a noop evaluator, awaiting evaluations should return an empty list."""
        mock_config = _make_noop_evaluator_config()
        mock_runner = MagicMock()
        agent = ManagedAgent(mock_config, mock_runner)
        result = await agent.run("Hello")

        results = await result.evaluations  # type: ignore[misc]
        assert results == []

    @pytest.mark.asyncio
    async def test_tracking_not_called_for_failed_judge_result(self):
        """tracker.track_judge_result() should NOT be called for unsuccessful judge results."""
        failed_result = JudgeResult(
            success=False,
            sampled=True,
            metric_key='$ld:ai:judge:relevance',
            error_message='Judge evaluation failed',
        )

        async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult]:
            return [failed_result]

        agent, mock_tracker = self._build_agent(_evaluate_coro)
        result = await agent.run("Hello")
        await result.evaluations  # type: ignore[misc]

        mock_tracker.track_judge_result.assert_not_called()
308+
309+
132310
class TestLDAIClientCreateAgent:
133311
"""Tests for LDAIClient.create_agent."""
134312

0 commit comments

Comments
 (0)