Skip to content

Commit 45845ed

Browse files
jsonbailey and claude committed
refactor: update Judge to use Runner protocol and RunnerResult
- Judge now accepts Runner instead of ModelRunner
- evaluate() calls runner.run(output_type=...) instead of invoke_structured_model
- response.parsed replaces StructuredResponse.data; None guard added
- evaluate_messages() accepts RunnerResult instead of ModelResponse
- Tests updated to use RunnerResult and mock_runner.run

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 7a52f24 commit 45845ed

3 files changed

Lines changed: 55 additions & 53 deletions

File tree

packages/sdk/server-ai/src/ldai/client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ def create_judge(
329329
if not provider:
330330
return None
331331

332-
return Judge(judge_config, provider) # type: ignore[arg-type]
332+
return Judge(judge_config, provider)
333333
except Exception as error:
334334
return None
335335

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
from ldai import log
99
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
1010
from ldai.models import AIJudgeConfig, LDMessage
11-
from ldai.providers.model_runner import ModelRunner
12-
from ldai.providers.types import JudgeResult, ModelResponse
11+
from ldai.providers.runner import Runner
12+
from ldai.providers.types import JudgeResult, RunnerResult
1313

1414

1515
class Judge:
@@ -23,7 +23,7 @@ class Judge:
2323
def __init__(
2424
self,
2525
ai_config: AIJudgeConfig,
26-
model_runner: ModelRunner,
26+
model_runner: Runner,
2727
):
2828
"""
2929
Initialize the Judge.
@@ -76,10 +76,14 @@ async def evaluate(
7676

7777
response = await tracker.track_metrics_of_async(
7878
lambda result: result.metrics,
79-
lambda: self._model_runner.invoke_structured_model(messages, self._evaluation_response_structure),
79+
lambda: self._model_runner.run(messages, output_type=self._evaluation_response_structure),
8080
)
8181

82-
parsed = self._parse_evaluation_response(response.data)
82+
if response.parsed is None:
83+
log.warning('Judge evaluation did not return structured output')
84+
return judge_result
85+
86+
parsed = self._parse_evaluation_response(response.parsed)
8387

8488
if parsed is None:
8589
log.warning('Judge evaluation did not return the expected evaluation')
@@ -99,19 +103,19 @@ async def evaluate(
99103
async def evaluate_messages(
100104
self,
101105
messages: list[LDMessage],
102-
response: ModelResponse,
106+
response: RunnerResult,
103107
sampling_ratio: float = 1.0,
104108
) -> JudgeResult:
105109
"""
106110
Evaluates an AI response from chat messages and response.
107111
108112
:param messages: Array of messages representing the conversation history
109-
:param response: The AI response to be evaluated
113+
:param response: The runner result to be evaluated
110114
:param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
111115
:return: The result of the judge evaluation.
112116
"""
113117
input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
114-
output_text = response.message.content
118+
output_text = response.content
115119

116120
return await self.evaluate(input_text, output_text, sampling_ratio)
117121

@@ -123,7 +127,7 @@ def get_ai_config(self) -> AIJudgeConfig:
123127
"""
124128
return self._ai_config
125129

126-
def get_model_runner(self) -> ModelRunner:
130+
def get_model_runner(self) -> Runner:
127131
"""
128132
Returns the model runner used by this judge.
129133

packages/sdk/server-ai/tests/test_judge.py

Lines changed: 41 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from ldai.judge import Judge
1010
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
1111
from ldai.models import AIJudgeConfig, AIJudgeConfigDefault, LDMessage, ModelConfig, ProviderConfig
12-
from ldai.providers.types import JudgeResult, LDAIMetrics, StructuredResponse
12+
from ldai.providers.types import JudgeResult, LDAIMetrics, RunnerResult
1313
from ldai.tracker import LDAIConfigTracker
1414

1515

@@ -40,9 +40,9 @@ def client(td: TestData) -> LDClient:
4040

4141
@pytest.fixture
4242
def mock_runner():
43-
"""Create a mock AI provider."""
43+
"""Create a mock AI runner."""
4444
provider = MagicMock()
45-
provider.invoke_structured_model = AsyncMock()
45+
provider.run = AsyncMock()
4646
return provider
4747

4848

@@ -137,7 +137,7 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing(
137137
assert isinstance(result, JudgeResult)
138138
assert result.success is False
139139
assert result.sampled is False
140-
mock_runner.invoke_structured_model.assert_not_called()
140+
mock_runner.run.assert_not_called()
141141

142142
@pytest.mark.asyncio
143143
async def test_evaluate_returns_failure_when_messages_missing(
@@ -151,23 +151,23 @@ async def test_evaluate_returns_failure_when_messages_missing(
151151
assert isinstance(result, JudgeResult)
152152
assert result.success is False
153153
assert result.sampled is False
154-
mock_runner.invoke_structured_model.assert_not_called()
154+
mock_runner.run.assert_not_called()
155155

156156
@pytest.mark.asyncio
157157
async def test_evaluate_success_with_valid_response(
158158
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
159159
):
160160
"""Evaluate should return JudgeResponse with valid evaluation."""
161-
mock_response = StructuredResponse(
162-
data={
161+
mock_response = RunnerResult(
162+
content='',
163+
metrics=LDAIMetrics(success=True),
164+
parsed={
163165
'score': 0.85,
164166
'reasoning': 'The response is highly relevant to the input.'
165167
},
166-
raw_response='{"score": 0.85, "reasoning": "..."}',
167-
metrics=LDAIMetrics(success=True)
168168
)
169169

170-
mock_runner.invoke_structured_model.return_value = mock_response
170+
mock_runner.run.return_value = mock_response
171171
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
172172

173173
judge = Judge(judge_config_with_key, mock_runner)
@@ -187,15 +187,15 @@ async def test_evaluate_success_with_evaluation_response_shape(
187187
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
188188
):
189189
"""Evaluate should accept shape { score, reasoning } and key by metric."""
190-
mock_response = StructuredResponse(
191-
data={
190+
mock_response = RunnerResult(
191+
content='',
192+
metrics=LDAIMetrics(success=True),
193+
parsed={
192194
'score': 0.9,
193195
'reasoning': 'The response is accurate and complete.',
194196
},
195-
raw_response='{"score": 0.9, "reasoning": "..."}',
196-
metrics=LDAIMetrics(success=True),
197197
)
198-
mock_runner.invoke_structured_model.return_value = mock_response
198+
mock_runner.run.return_value = mock_response
199199
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
200200

201201
judge = Judge(judge_config_with_key, mock_runner)
@@ -214,13 +214,13 @@ async def test_evaluate_handles_missing_evaluation_in_response(
214214
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
215215
):
216216
"""Evaluate should handle missing score/reasoning in response."""
217-
mock_response = StructuredResponse(
218-
data={},
219-
raw_response='{}',
220-
metrics=LDAIMetrics(success=True)
217+
mock_response = RunnerResult(
218+
content='',
219+
metrics=LDAIMetrics(success=True),
220+
parsed={},
221221
)
222222

223-
mock_runner.invoke_structured_model.return_value = mock_response
223+
mock_runner.run.return_value = mock_response
224224
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
225225

226226
judge = Judge(judge_config_with_key, mock_runner)
@@ -236,16 +236,16 @@ async def test_evaluate_handles_invalid_score(
236236
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
237237
):
238238
"""Evaluate should handle invalid score values."""
239-
mock_response = StructuredResponse(
240-
data={
239+
mock_response = RunnerResult(
240+
content='',
241+
metrics=LDAIMetrics(success=True),
242+
parsed={
241243
'score': 1.5,
242-
'reasoning': 'Some reasoning'
244+
'reasoning': 'Some reasoning',
243245
},
244-
raw_response='{"score": 1.5, "reasoning": "..."}',
245-
metrics=LDAIMetrics(success=True)
246246
)
247247

248-
mock_runner.invoke_structured_model.return_value = mock_response
248+
mock_runner.run.return_value = mock_response
249249
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
250250

251251
judge = Judge(judge_config_with_key, mock_runner)
@@ -261,13 +261,13 @@ async def test_evaluate_handles_missing_reasoning(
261261
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
262262
):
263263
"""Evaluate should handle missing reasoning."""
264-
mock_response = StructuredResponse(
265-
data={'score': 0.8},
266-
raw_response='{"score": 0.8}',
267-
metrics=LDAIMetrics(success=True)
264+
mock_response = RunnerResult(
265+
content='',
266+
metrics=LDAIMetrics(success=True),
267+
parsed={'score': 0.8},
268268
)
269269

270-
mock_runner.invoke_structured_model.return_value = mock_response
270+
mock_runner.run.return_value = mock_response
271271
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
272272

273273
judge = Judge(judge_config_with_key, mock_runner)
@@ -283,7 +283,7 @@ async def test_evaluate_handles_exception(
283283
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
284284
):
285285
"""Evaluate should handle exceptions gracefully."""
286-
mock_runner.invoke_structured_model.side_effect = Exception("Provider error")
286+
mock_runner.run.side_effect = Exception("Provider error")
287287
tracker.track_metrics_of_async = AsyncMock(side_effect=Exception("Provider error"))
288288

289289
judge = Judge(judge_config_with_key, mock_runner)
@@ -306,7 +306,7 @@ async def test_evaluate_respects_sampling_rate(
306306
assert isinstance(result, JudgeResult)
307307
assert result.sampled is False
308308
assert result.success is False
309-
mock_runner.invoke_structured_model.assert_not_called()
309+
mock_runner.run.assert_not_called()
310310

311311

312312
class TestJudgeEvaluateMessages:
@@ -317,15 +317,13 @@ async def test_evaluate_messages_calls_evaluate(
317317
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
318318
):
319319
"""evaluate_messages should call evaluate with constructed input/output."""
320-
from ldai.providers.types import ModelResponse
321-
322-
mock_response = StructuredResponse(
323-
data={'score': 0.9, 'reasoning': 'Very relevant'},
324-
raw_response='{"score": 0.9, "reasoning": "..."}',
325-
metrics=LDAIMetrics(success=True)
320+
mock_response = RunnerResult(
321+
content='',
322+
metrics=LDAIMetrics(success=True),
323+
parsed={'score': 0.9, 'reasoning': 'Very relevant'},
326324
)
327325

328-
mock_runner.invoke_structured_model.return_value = mock_response
326+
mock_runner.run.return_value = mock_response
329327
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
330328

331329
judge = Judge(judge_config_with_key, mock_runner)
@@ -334,9 +332,9 @@ async def test_evaluate_messages_calls_evaluate(
334332
LDMessage(role='user', content='Question 1'),
335333
LDMessage(role='assistant', content='Answer 1'),
336334
]
337-
chat_response = ModelResponse(
338-
message=LDMessage(role='assistant', content='Answer 2'),
339-
metrics=LDAIMetrics(success=True)
335+
chat_response = RunnerResult(
336+
content='Answer 2',
337+
metrics=LDAIMetrics(success=True),
340338
)
341339

342340
result = await judge.evaluate_messages(messages, chat_response)

0 commit comments

Comments
 (0)