Skip to content

Commit 1d9c790

Browse files
jsonbailey and claude committed
feat: Add evaluations support to ManagedAgent.run()
Wire judge evaluations into ManagedAgent.run() via an asyncio.Task, mirroring ManagedModel.run(). Awaiting result.evaluations guarantees both evaluation and tracker.track_judge_result() complete. run() returns immediately; the evaluations task resolves asynchronously.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent eb1004c commit 1d9c790

2 files changed

Lines changed: 226 additions & 20 deletions

File tree

packages/sdk/server-ai/src/ldai/managed_agent.py

Lines changed: 33 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,21 @@
11
"""ManagedAgent — LaunchDarkly managed wrapper for agent invocations."""
22

3-
from typing import Union
3+
import asyncio
4+
from typing import List, Union
45

56
from ldai.models import AIAgentConfig
67
from ldai.providers import AgentResult, AgentRunner
78
from ldai.providers.runner import Runner
8-
from ldai.providers.types import ManagedResult, RunnerResult
9+
from ldai.providers.types import JudgeResult, ManagedResult, RunnerResult
10+
from ldai.tracker import LDAIConfigTracker
911

1012

1113
class ManagedAgent:
1214
"""
1315
LaunchDarkly managed wrapper for AI agent invocations.
1416
15-
Holds an AgentRunner or Runner. Handles tracking automatically via
16-
``create_tracker()``.
17+
Holds an AgentRunner or Runner. Handles tracking and judge evaluation
18+
dispatch automatically via ``create_tracker()``.
1719
Obtain an instance via ``LDAIClient.create_agent()``.
1820
"""
1921

@@ -29,8 +31,13 @@ async def run(self, input: str) -> ManagedResult:
2931
"""
3032
Run the agent with the given input string.
3133
34+
Invokes the runner, tracks metrics, and dispatches judge evaluations
35+
asynchronously. Returns immediately; awaiting ``result.evaluations``
36+
guarantees both evaluation and tracking complete.
37+
3238
:param input: The user prompt or input to the agent
33-
:return: ManagedResult containing the agent's output and metric summary
39+
:return: ManagedResult containing the agent's output, metric summary,
40+
and an optional evaluations task
3441
"""
3542
tracker = self._ai_config.create_tracker()
3643
result: Union[RunnerResult, AgentResult] = await tracker.track_metrics_of_async(
@@ -39,12 +46,33 @@ async def run(self, input: str) -> ManagedResult:
3946
)
4047
# Support both RunnerResult (content) and legacy AgentResult (output)
4148
content = result.content if isinstance(result, RunnerResult) else result.output # type: ignore[union-attr]
49+
50+
evaluations_task = self._track_judge_results(tracker, input, content)
51+
4252
return ManagedResult(
4353
content=content,
4454
metrics=tracker.get_summary(),
4555
raw=result.raw,
56+
evaluations=evaluations_task,
4657
)
4758

59+
def _track_judge_results(
60+
self,
61+
tracker: LDAIConfigTracker,
62+
input_text: str,
63+
output_text: str,
64+
) -> asyncio.Task[List[JudgeResult]]:
65+
evaluator_task = self._ai_config.evaluator.evaluate(input_text, output_text)
66+
67+
async def _run_and_track(eval_task: asyncio.Task) -> List[JudgeResult]:
68+
results = await eval_task
69+
for r in results:
70+
if r.success:
71+
tracker.track_judge_result(r)
72+
return results
73+
74+
return asyncio.create_task(_run_and_track(evaluator_task))
75+
4876
def get_agent_runner(self) -> Union[Runner, AgentRunner]:
4977
"""
5078
Return the underlying runner for advanced use.

packages/sdk/server-ai/tests/test_managed_agent.py

Lines changed: 193 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,17 @@
11
"""Tests for ManagedAgent."""
22

3+
import asyncio
34
import pytest
5+
from typing import List
46
from unittest.mock import AsyncMock, MagicMock
57

68
from ldai import LDAIClient, ManagedAgent
9+
from ldai.evaluator import Evaluator
710
from ldai.managed_agent import ManagedAgent
811
from ldai.models import AIAgentConfig, AIAgentConfigDefault, ModelConfig, ProviderConfig
912
from ldai.providers import AgentResult
10-
from ldai.providers.types import LDAIMetrics, ManagedResult
11-
from ldai.tracker import LDAIMetricSummary
13+
from ldai.providers.types import JudgeResult, LDAIMetrics, ManagedResult
14+
from ldai.tracker import LDAIConfigTracker, LDAIMetricSummary
1215

1316
from ldclient import Config, Context, LDClient
1417
from ldclient.integrations.test_data import TestData
@@ -20,6 +23,23 @@ def _make_summary(success: bool = True) -> LDAIMetricSummary:
2023
return summary
2124

2225

26+
def _make_noop_evaluator_config() -> MagicMock:
27+
"""Build a minimal mock AIAgentConfig with a noop evaluator and a mock tracker."""
28+
mock_config = MagicMock(spec=AIAgentConfig)
29+
mock_tracker = MagicMock(spec=LDAIConfigTracker)
30+
mock_tracker.track_metrics_of_async = AsyncMock(
31+
return_value=AgentResult(
32+
output="Test response",
33+
raw=None,
34+
metrics=LDAIMetrics(success=True, usage=None),
35+
)
36+
)
37+
mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
38+
mock_config.create_tracker = MagicMock(return_value=mock_tracker)
39+
mock_config.evaluator = Evaluator.noop()
40+
return mock_config
41+
42+
2343
@pytest.fixture
2444
def td() -> TestData:
2545
td = TestData.data_source()
@@ -61,17 +81,7 @@ class TestManagedAgentRun:
6181
@pytest.mark.asyncio
6282
async def test_run_delegates_to_agent_runner(self):
6383
"""Should delegate run() to the underlying AgentRunner and return ManagedResult."""
64-
mock_config = MagicMock(spec=AIAgentConfig)
65-
mock_tracker = MagicMock()
66-
mock_tracker.track_metrics_of_async = AsyncMock(
67-
return_value=AgentResult(
68-
output="Test response",
69-
raw=None,
70-
metrics=LDAIMetrics(success=True, usage=None),
71-
)
72-
)
73-
mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
74-
mock_config.create_tracker = MagicMock(return_value=mock_tracker)
84+
mock_config = _make_noop_evaluator_config()
7585
mock_runner = MagicMock()
7686
mock_runner.run = AsyncMock(
7787
return_value=AgentResult(
@@ -88,13 +98,16 @@ async def test_run_delegates_to_agent_runner(self):
8898
assert result.content == "Test response"
8999
assert result.metrics.success is True
90100
mock_config.create_tracker.assert_called_once()
91-
mock_tracker.track_metrics_of_async.assert_called_once()
101+
mock_config.create_tracker.return_value.track_metrics_of_async.assert_called_once()
102+
# evaluations should be present (from noop evaluator)
103+
if result.evaluations is not None:
104+
await result.evaluations
92105

93106
@pytest.mark.asyncio
94107
async def test_run_uses_create_tracker_for_fresh_tracker(self):
95108
"""Should use create_tracker() factory for a fresh tracker per invocation."""
96109
mock_config = MagicMock(spec=AIAgentConfig)
97-
fresh_tracker = MagicMock()
110+
fresh_tracker = MagicMock(spec=LDAIConfigTracker)
98111
fresh_tracker.track_metrics_of_async = AsyncMock(
99112
return_value=AgentResult(
100113
output="Fresh tracker response",
@@ -104,6 +117,7 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self):
104117
)
105118
fresh_tracker.get_summary = MagicMock(return_value=_make_summary(True))
106119
mock_config.create_tracker = MagicMock(return_value=fresh_tracker)
120+
mock_config.evaluator = Evaluator.noop()
107121

108122
mock_runner = MagicMock()
109123

@@ -114,6 +128,8 @@ async def test_run_uses_create_tracker_for_fresh_tracker(self):
114128
assert result.content == "Fresh tracker response"
115129
mock_config.create_tracker.assert_called_once()
116130
fresh_tracker.track_metrics_of_async.assert_called_once()
131+
if result.evaluations is not None:
132+
await result.evaluations
117133

118134
def test_get_agent_runner_returns_runner(self):
119135
"""Should return the underlying AgentRunner."""
@@ -130,6 +146,168 @@ def test_get_config_returns_config(self):
130146
assert agent.get_config() is mock_config
131147

132148

149+
class TestManagedAgentEvaluations:
150+
"""Tests for ManagedAgent evaluations chain (PR 12)."""
151+
152+
@pytest.mark.asyncio
153+
async def test_run_returns_before_evaluations_resolve(self):
154+
"""run() should return before evaluations complete."""
155+
barrier = asyncio.Event()
156+
157+
async def _slow_evaluate(input_text: str, output_text: str) -> List[JudgeResult]:
158+
await barrier.wait()
159+
return []
160+
161+
mock_evaluator = MagicMock(spec=Evaluator)
162+
mock_evaluator.evaluate = MagicMock(
163+
side_effect=lambda i, o: asyncio.create_task(_slow_evaluate(i, o))
164+
)
165+
166+
mock_config = MagicMock(spec=AIAgentConfig)
167+
mock_tracker = MagicMock(spec=LDAIConfigTracker)
168+
mock_tracker.track_metrics_of_async = AsyncMock(
169+
return_value=AgentResult(output="resp", raw=None, metrics=LDAIMetrics(success=True))
170+
)
171+
mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
172+
mock_config.create_tracker = MagicMock(return_value=mock_tracker)
173+
mock_config.evaluator = mock_evaluator
174+
175+
mock_runner = MagicMock()
176+
agent = ManagedAgent(mock_config, mock_runner)
177+
result = await agent.run("Hello")
178+
179+
assert result is not None
180+
assert result.evaluations is not None
181+
assert not result.evaluations.done(), "evaluations task should still be pending"
182+
183+
barrier.set()
184+
await result.evaluations
185+
186+
@pytest.mark.asyncio
187+
async def test_await_evaluations_collects_results(self):
188+
"""await result.evaluations should return the list of JudgeResult instances."""
189+
judge_result = JudgeResult(
190+
judge_config_key='judge-key',
191+
success=True,
192+
sampled=True,
193+
metric_key='$ld:ai:judge:relevance',
194+
score=0.9,
195+
reasoning='Good agent response',
196+
)
197+
198+
async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult]:
199+
return [judge_result]
200+
201+
mock_evaluator = MagicMock(spec=Evaluator)
202+
mock_evaluator.evaluate = MagicMock(
203+
side_effect=lambda i, o: asyncio.create_task(_evaluate_coro(i, o))
204+
)
205+
206+
mock_config = MagicMock(spec=AIAgentConfig)
207+
mock_tracker = MagicMock(spec=LDAIConfigTracker)
208+
mock_tracker.track_metrics_of_async = AsyncMock(
209+
return_value=AgentResult(output="resp", raw=None, metrics=LDAIMetrics(success=True))
210+
)
211+
mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
212+
mock_tracker.track_judge_result = MagicMock()
213+
mock_config.create_tracker = MagicMock(return_value=mock_tracker)
214+
mock_config.evaluator = mock_evaluator
215+
216+
mock_runner = MagicMock()
217+
agent = ManagedAgent(mock_config, mock_runner)
218+
result = await agent.run("Hello")
219+
220+
results = await result.evaluations # type: ignore[misc]
221+
assert results == [judge_result]
222+
223+
@pytest.mark.asyncio
224+
async def test_tracking_fires_inside_awaited_chain(self):
225+
"""tracker.track_judge_result() must be called when evaluations are awaited."""
226+
judge_result = JudgeResult(
227+
judge_config_key='agent-judge',
228+
success=True,
229+
sampled=True,
230+
metric_key='$ld:ai:judge:relevance',
231+
score=0.85,
232+
)
233+
234+
async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult]:
235+
return [judge_result]
236+
237+
mock_evaluator = MagicMock(spec=Evaluator)
238+
mock_evaluator.evaluate = MagicMock(
239+
side_effect=lambda i, o: asyncio.create_task(_evaluate_coro(i, o))
240+
)
241+
242+
mock_config = MagicMock(spec=AIAgentConfig)
243+
mock_tracker = MagicMock(spec=LDAIConfigTracker)
244+
mock_tracker.track_metrics_of_async = AsyncMock(
245+
return_value=AgentResult(output="resp", raw=None, metrics=LDAIMetrics(success=True))
246+
)
247+
mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
248+
mock_tracker.track_judge_result = MagicMock()
249+
mock_config.create_tracker = MagicMock(return_value=mock_tracker)
250+
mock_config.evaluator = mock_evaluator
251+
252+
mock_runner = MagicMock()
253+
agent = ManagedAgent(mock_config, mock_runner)
254+
result = await agent.run("Hello")
255+
256+
# Tracking should NOT have fired yet (before we await evaluations)
257+
mock_tracker.track_judge_result.assert_not_called()
258+
259+
# Now await the evaluations task — tracking fires inside the chain
260+
await result.evaluations # type: ignore[misc]
261+
262+
mock_tracker.track_judge_result.assert_called_once_with(judge_result)
263+
264+
@pytest.mark.asyncio
265+
async def test_noop_evaluator_returns_empty_list(self):
266+
"""With a noop evaluator, awaiting evaluations should return an empty list."""
267+
mock_config = _make_noop_evaluator_config()
268+
mock_runner = MagicMock()
269+
agent = ManagedAgent(mock_config, mock_runner)
270+
result = await agent.run("Hello")
271+
272+
results = await result.evaluations # type: ignore[misc]
273+
assert results == []
274+
275+
@pytest.mark.asyncio
276+
async def test_tracking_not_called_for_failed_judge_result(self):
277+
"""tracker.track_judge_result() should NOT be called for unsuccessful judge results."""
278+
failed_result = JudgeResult(
279+
success=False,
280+
sampled=True,
281+
metric_key='$ld:ai:judge:relevance',
282+
error_message='Judge evaluation failed',
283+
)
284+
285+
async def _evaluate_coro(input_text: str, output_text: str) -> List[JudgeResult]:
286+
return [failed_result]
287+
288+
mock_evaluator = MagicMock(spec=Evaluator)
289+
mock_evaluator.evaluate = MagicMock(
290+
side_effect=lambda i, o: asyncio.create_task(_evaluate_coro(i, o))
291+
)
292+
293+
mock_config = MagicMock(spec=AIAgentConfig)
294+
mock_tracker = MagicMock(spec=LDAIConfigTracker)
295+
mock_tracker.track_metrics_of_async = AsyncMock(
296+
return_value=AgentResult(output="resp", raw=None, metrics=LDAIMetrics(success=True))
297+
)
298+
mock_tracker.get_summary = MagicMock(return_value=_make_summary(True))
299+
mock_tracker.track_judge_result = MagicMock()
300+
mock_config.create_tracker = MagicMock(return_value=mock_tracker)
301+
mock_config.evaluator = mock_evaluator
302+
303+
mock_runner = MagicMock()
304+
agent = ManagedAgent(mock_config, mock_runner)
305+
result = await agent.run("Hello")
306+
await result.evaluations # type: ignore[misc]
307+
308+
mock_tracker.track_judge_result.assert_not_called()
309+
310+
133311
class TestLDAIClientCreateAgent:
134312
"""Tests for LDAIClient.create_agent."""
135313

0 commit comments

Comments
 (0)