Skip to content

Commit 3396233

Browse files
jsonbailey and claude committed
refactor: update Judge to use Runner protocol and RunnerResult
- Judge now accepts Runner instead of ModelRunner
- evaluate() calls runner.run(output_type=...) instead of invoke_structured_model
- response.parsed replaces StructuredResponse.data; None guard added
- evaluate_messages() accepts RunnerResult instead of ModelResponse
- Tests updated to use RunnerResult and mock_runner.run

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 5d6c677 commit 3396233

3 files changed

Lines changed: 55 additions & 53 deletions

File tree

packages/sdk/server-ai/src/ldai/client.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -339,7 +339,7 @@ def _create_judge_instance(
339339
if not provider:
340340
return None
341341

342-
return Judge(judge_config, provider, sample_rate=sample_rate) # type: ignore[arg-type]
342+
return Judge(judge_config, provider, sample_rate=sample_rate)
343343
except Exception as e:
344344
log.warning('Failed to initialize judge %r: %s', key, e)
345345
return None

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,8 @@
88
from ldai import log
99
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
1010
from ldai.models import AIJudgeConfig, LDMessage
11-
from ldai.providers.model_runner import ModelRunner
12-
from ldai.providers.types import JudgeResult, ModelResponse
11+
from ldai.providers.runner import Runner
12+
from ldai.providers.types import JudgeResult, RunnerResult
1313

1414

1515
class Judge:
@@ -23,7 +23,7 @@ class Judge:
2323
def __init__(
2424
self,
2525
ai_config: AIJudgeConfig,
26-
model_runner: ModelRunner,
26+
model_runner: Runner,
2727
sample_rate: float = 1.0,
2828
):
2929
"""
@@ -82,10 +82,14 @@ async def evaluate(
8282

8383
response = await tracker.track_metrics_of_async(
8484
lambda result: result.metrics,
85-
lambda: self._model_runner.invoke_structured_model(messages, self._evaluation_response_structure),
85+
lambda: self._model_runner.run(messages, output_type=self._evaluation_response_structure),
8686
)
8787

88-
parsed = self._parse_evaluation_response(response.data)
88+
if response.parsed is None:
89+
log.warning('Judge evaluation did not return structured output')
90+
return judge_result
91+
92+
parsed = self._parse_evaluation_response(response.parsed)
8993

9094
if parsed is None:
9195
log.warning('Judge evaluation did not return the expected evaluation')
@@ -105,20 +109,20 @@ async def evaluate(
105109
async def evaluate_messages(
106110
self,
107111
messages: list[LDMessage],
108-
response: ModelResponse,
112+
response: RunnerResult,
109113
sampling_ratio: Optional[float] = None,
110114
) -> JudgeResult:
111115
"""
112116
Evaluates an AI response from chat messages and response.
113117
114118
:param messages: Array of messages representing the conversation history
115-
:param response: The AI response to be evaluated
119+
:param response: The runner result to be evaluated
116120
:param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed.
117121
When ``None`` (the default), falls back to ``self.sample_rate``.
118122
:return: The result of the judge evaluation.
119123
"""
120124
input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
121-
output_text = response.message.content
125+
output_text = response.content
122126

123127
return await self.evaluate(input_text, output_text, sampling_ratio)
124128

@@ -130,7 +134,7 @@ def get_ai_config(self) -> AIJudgeConfig:
130134
"""
131135
return self._ai_config
132136

133-
def get_model_runner(self) -> ModelRunner:
137+
def get_model_runner(self) -> Runner:
134138
"""
135139
Returns the model runner used by this judge.
136140

packages/sdk/server-ai/tests/test_judge.py

Lines changed: 41 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from ldai.judge import Judge
1010
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
1111
from ldai.models import AIJudgeConfig, AIJudgeConfigDefault, LDMessage, ModelConfig, ProviderConfig
12-
from ldai.providers.types import JudgeResult, LDAIMetrics, StructuredResponse
12+
from ldai.providers.types import JudgeResult, LDAIMetrics, RunnerResult
1313
from ldai.tracker import LDAIConfigTracker
1414

1515

@@ -40,9 +40,9 @@ def client(td: TestData) -> LDClient:
4040

4141
@pytest.fixture
4242
def mock_runner():
43-
"""Create a mock AI provider."""
43+
"""Create a mock AI runner."""
4444
provider = MagicMock()
45-
provider.invoke_structured_model = AsyncMock()
45+
provider.run = AsyncMock()
4646
return provider
4747

4848

@@ -151,7 +151,7 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing(
151151
assert isinstance(result, JudgeResult)
152152
assert result.success is False
153153
assert result.sampled is False
154-
mock_runner.invoke_structured_model.assert_not_called()
154+
mock_runner.run.assert_not_called()
155155

156156
@pytest.mark.asyncio
157157
async def test_evaluate_returns_failure_when_messages_missing(
@@ -165,23 +165,23 @@ async def test_evaluate_returns_failure_when_messages_missing(
165165
assert isinstance(result, JudgeResult)
166166
assert result.success is False
167167
assert result.sampled is False
168-
mock_runner.invoke_structured_model.assert_not_called()
168+
mock_runner.run.assert_not_called()
169169

170170
@pytest.mark.asyncio
171171
async def test_evaluate_success_with_valid_response(
172172
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
173173
):
174174
"""Evaluate should return JudgeResponse with valid evaluation."""
175-
mock_response = StructuredResponse(
176-
data={
175+
mock_response = RunnerResult(
176+
content='',
177+
metrics=LDAIMetrics(success=True),
178+
parsed={
177179
'score': 0.85,
178180
'reasoning': 'The response is highly relevant to the input.'
179181
},
180-
raw_response='{"score": 0.85, "reasoning": "..."}',
181-
metrics=LDAIMetrics(success=True)
182182
)
183183

184-
mock_runner.invoke_structured_model.return_value = mock_response
184+
mock_runner.run.return_value = mock_response
185185
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
186186

187187
judge = Judge(judge_config_with_key, mock_runner)
@@ -201,15 +201,15 @@ async def test_evaluate_success_with_evaluation_response_shape(
201201
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
202202
):
203203
"""Evaluate should accept shape { score, reasoning } and key by metric."""
204-
mock_response = StructuredResponse(
205-
data={
204+
mock_response = RunnerResult(
205+
content='',
206+
metrics=LDAIMetrics(success=True),
207+
parsed={
206208
'score': 0.9,
207209
'reasoning': 'The response is accurate and complete.',
208210
},
209-
raw_response='{"score": 0.9, "reasoning": "..."}',
210-
metrics=LDAIMetrics(success=True),
211211
)
212-
mock_runner.invoke_structured_model.return_value = mock_response
212+
mock_runner.run.return_value = mock_response
213213
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
214214

215215
judge = Judge(judge_config_with_key, mock_runner)
@@ -228,13 +228,13 @@ async def test_evaluate_handles_missing_evaluation_in_response(
228228
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
229229
):
230230
"""Evaluate should handle missing score/reasoning in response."""
231-
mock_response = StructuredResponse(
232-
data={},
233-
raw_response='{}',
234-
metrics=LDAIMetrics(success=True)
231+
mock_response = RunnerResult(
232+
content='',
233+
metrics=LDAIMetrics(success=True),
234+
parsed={},
235235
)
236236

237-
mock_runner.invoke_structured_model.return_value = mock_response
237+
mock_runner.run.return_value = mock_response
238238
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
239239

240240
judge = Judge(judge_config_with_key, mock_runner)
@@ -250,16 +250,16 @@ async def test_evaluate_handles_invalid_score(
250250
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
251251
):
252252
"""Evaluate should handle invalid score values."""
253-
mock_response = StructuredResponse(
254-
data={
253+
mock_response = RunnerResult(
254+
content='',
255+
metrics=LDAIMetrics(success=True),
256+
parsed={
255257
'score': 1.5,
256-
'reasoning': 'Some reasoning'
258+
'reasoning': 'Some reasoning',
257259
},
258-
raw_response='{"score": 1.5, "reasoning": "..."}',
259-
metrics=LDAIMetrics(success=True)
260260
)
261261

262-
mock_runner.invoke_structured_model.return_value = mock_response
262+
mock_runner.run.return_value = mock_response
263263
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
264264

265265
judge = Judge(judge_config_with_key, mock_runner)
@@ -275,13 +275,13 @@ async def test_evaluate_handles_missing_reasoning(
275275
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
276276
):
277277
"""Evaluate should handle missing reasoning."""
278-
mock_response = StructuredResponse(
279-
data={'score': 0.8},
280-
raw_response='{"score": 0.8}',
281-
metrics=LDAIMetrics(success=True)
278+
mock_response = RunnerResult(
279+
content='',
280+
metrics=LDAIMetrics(success=True),
281+
parsed={'score': 0.8},
282282
)
283283

284-
mock_runner.invoke_structured_model.return_value = mock_response
284+
mock_runner.run.return_value = mock_response
285285
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
286286

287287
judge = Judge(judge_config_with_key, mock_runner)
@@ -297,7 +297,7 @@ async def test_evaluate_handles_exception(
297297
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
298298
):
299299
"""Evaluate should handle exceptions gracefully."""
300-
mock_runner.invoke_structured_model.side_effect = Exception("Provider error")
300+
mock_runner.run.side_effect = Exception("Provider error")
301301
tracker.track_metrics_of_async = AsyncMock(side_effect=Exception("Provider error"))
302302

303303
judge = Judge(judge_config_with_key, mock_runner)
@@ -320,7 +320,7 @@ async def test_evaluate_respects_sampling_rate(
320320
assert isinstance(result, JudgeResult)
321321
assert result.sampled is False
322322
assert result.success is False
323-
mock_runner.invoke_structured_model.assert_not_called()
323+
mock_runner.run.assert_not_called()
324324

325325
@pytest.mark.asyncio
326326
async def test_evaluate_uses_instance_sample_rate_when_arg_omitted(
@@ -357,15 +357,13 @@ async def test_evaluate_messages_calls_evaluate(
357357
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_runner
358358
):
359359
"""evaluate_messages should call evaluate with constructed input/output."""
360-
from ldai.providers.types import ModelResponse
361-
362-
mock_response = StructuredResponse(
363-
data={'score': 0.9, 'reasoning': 'Very relevant'},
364-
raw_response='{"score": 0.9, "reasoning": "..."}',
365-
metrics=LDAIMetrics(success=True)
360+
mock_response = RunnerResult(
361+
content='',
362+
metrics=LDAIMetrics(success=True),
363+
parsed={'score': 0.9, 'reasoning': 'Very relevant'},
366364
)
367365

368-
mock_runner.invoke_structured_model.return_value = mock_response
366+
mock_runner.run.return_value = mock_response
369367
tracker.track_metrics_of_async = AsyncMock(return_value=mock_response)
370368

371369
judge = Judge(judge_config_with_key, mock_runner)
@@ -374,9 +372,9 @@ async def test_evaluate_messages_calls_evaluate(
374372
LDMessage(role='user', content='Question 1'),
375373
LDMessage(role='assistant', content='Answer 1'),
376374
]
377-
chat_response = ModelResponse(
378-
message=LDMessage(role='assistant', content='Answer 2'),
379-
metrics=LDAIMetrics(success=True)
375+
chat_response = RunnerResult(
376+
content='Answer 2',
377+
metrics=LDAIMetrics(success=True),
380378
)
381379

382380
result = await judge.evaluate_messages(messages, chat_response)

0 commit comments

Comments
 (0)