99from ldai .judge import Judge
1010from ldai .judge .evaluation_schema_builder import EvaluationSchemaBuilder
1111from ldai .models import AIJudgeConfig , AIJudgeConfigDefault , LDMessage , ModelConfig , ProviderConfig
12- from ldai .providers .types import JudgeResult , LDAIMetrics , StructuredResponse
12+ from ldai .providers .types import JudgeResult , LDAIMetrics , RunnerResult
1313from ldai .tracker import LDAIConfigTracker
1414
1515
@@ -40,9 +40,9 @@ def client(td: TestData) -> LDClient:
4040
4141@pytest .fixture
4242def mock_runner ():
43- """Create a mock AI provider ."""
43+ """Create a mock AI runner ."""
4444 provider = MagicMock ()
45- provider .invoke_structured_model = AsyncMock ()
45+ provider .run = AsyncMock ()
4646 return provider
4747
4848
@@ -151,7 +151,7 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing(
151151 assert isinstance (result , JudgeResult )
152152 assert result .success is False
153153 assert result .sampled is False
154- mock_runner .invoke_structured_model .assert_not_called ()
154+ mock_runner .run .assert_not_called ()
155155
156156 @pytest .mark .asyncio
157157 async def test_evaluate_returns_failure_when_messages_missing (
@@ -165,23 +165,23 @@ async def test_evaluate_returns_failure_when_messages_missing(
165165 assert isinstance (result , JudgeResult )
166166 assert result .success is False
167167 assert result .sampled is False
168- mock_runner .invoke_structured_model .assert_not_called ()
168+ mock_runner .run .assert_not_called ()
169169
170170 @pytest .mark .asyncio
171171 async def test_evaluate_success_with_valid_response (
172172 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
173173 ):
174174 """Evaluate should return JudgeResponse with valid evaluation."""
175- mock_response = StructuredResponse (
176- data = {
175+ mock_response = RunnerResult (
176+ content = '' ,
177+ metrics = LDAIMetrics (success = True ),
178+ parsed = {
177179 'score' : 0.85 ,
178180 'reasoning' : 'The response is highly relevant to the input.'
179181 },
180- raw_response = '{"score": 0.85, "reasoning": "..."}' ,
181- metrics = LDAIMetrics (success = True )
182182 )
183183
184- mock_runner .invoke_structured_model .return_value = mock_response
184+ mock_runner .run .return_value = mock_response
185185 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
186186
187187 judge = Judge (judge_config_with_key , mock_runner )
@@ -201,15 +201,15 @@ async def test_evaluate_success_with_evaluation_response_shape(
201201 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
202202 ):
203203 """Evaluate should accept shape { score, reasoning } and key by metric."""
204- mock_response = StructuredResponse (
205- data = {
204+ mock_response = RunnerResult (
205+ content = '' ,
206+ metrics = LDAIMetrics (success = True ),
207+ parsed = {
206208 'score' : 0.9 ,
207209 'reasoning' : 'The response is accurate and complete.' ,
208210 },
209- raw_response = '{"score": 0.9, "reasoning": "..."}' ,
210- metrics = LDAIMetrics (success = True ),
211211 )
212- mock_runner .invoke_structured_model .return_value = mock_response
212+ mock_runner .run .return_value = mock_response
213213 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
214214
215215 judge = Judge (judge_config_with_key , mock_runner )
@@ -228,13 +228,13 @@ async def test_evaluate_handles_missing_evaluation_in_response(
228228 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
229229 ):
230230 """Evaluate should handle missing score/reasoning in response."""
231- mock_response = StructuredResponse (
232- data = {} ,
233- raw_response = '{}' ,
234- metrics = LDAIMetrics ( success = True )
231+ mock_response = RunnerResult (
232+ content = '' ,
233+ metrics = LDAIMetrics ( success = True ) ,
234+ parsed = {},
235235 )
236236
237- mock_runner .invoke_structured_model .return_value = mock_response
237+ mock_runner .run .return_value = mock_response
238238 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
239239
240240 judge = Judge (judge_config_with_key , mock_runner )
@@ -250,16 +250,16 @@ async def test_evaluate_handles_invalid_score(
250250 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
251251 ):
252252 """Evaluate should handle invalid score values."""
253- mock_response = StructuredResponse (
254- data = {
253+ mock_response = RunnerResult (
254+ content = '' ,
255+ metrics = LDAIMetrics (success = True ),
256+ parsed = {
255257 'score' : 1.5 ,
256- 'reasoning' : 'Some reasoning'
258+ 'reasoning' : 'Some reasoning' ,
257259 },
258- raw_response = '{"score": 1.5, "reasoning": "..."}' ,
259- metrics = LDAIMetrics (success = True )
260260 )
261261
262- mock_runner .invoke_structured_model .return_value = mock_response
262+ mock_runner .run .return_value = mock_response
263263 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
264264
265265 judge = Judge (judge_config_with_key , mock_runner )
@@ -275,13 +275,13 @@ async def test_evaluate_handles_missing_reasoning(
275275 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
276276 ):
277277 """Evaluate should handle missing reasoning."""
278- mock_response = StructuredResponse (
279- data = { 'score' : 0.8 } ,
280- raw_response = '{"score": 0.8}' ,
281- metrics = LDAIMetrics ( success = True )
278+ mock_response = RunnerResult (
279+ content = '' ,
280+ metrics = LDAIMetrics ( success = True ) ,
281+ parsed = { 'score' : 0.8 },
282282 )
283283
284- mock_runner .invoke_structured_model .return_value = mock_response
284+ mock_runner .run .return_value = mock_response
285285 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
286286
287287 judge = Judge (judge_config_with_key , mock_runner )
@@ -297,7 +297,7 @@ async def test_evaluate_handles_exception(
297297 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
298298 ):
299299 """Evaluate should handle exceptions gracefully."""
300- mock_runner .invoke_structured_model .side_effect = Exception ("Provider error" )
300+ mock_runner .run .side_effect = Exception ("Provider error" )
301301 tracker .track_metrics_of_async = AsyncMock (side_effect = Exception ("Provider error" ))
302302
303303 judge = Judge (judge_config_with_key , mock_runner )
@@ -320,7 +320,7 @@ async def test_evaluate_respects_sampling_rate(
320320 assert isinstance (result , JudgeResult )
321321 assert result .sampled is False
322322 assert result .success is False
323- mock_runner .invoke_structured_model .assert_not_called ()
323+ mock_runner .run .assert_not_called ()
324324
325325 @pytest .mark .asyncio
326326 async def test_evaluate_uses_instance_sample_rate_when_arg_omitted (
@@ -357,15 +357,13 @@ async def test_evaluate_messages_calls_evaluate(
357357 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
358358 ):
359359 """evaluate_messages should call evaluate with constructed input/output."""
360- from ldai .providers .types import ModelResponse
361-
362- mock_response = StructuredResponse (
363- data = {'score' : 0.9 , 'reasoning' : 'Very relevant' },
364- raw_response = '{"score": 0.9, "reasoning": "..."}' ,
365- metrics = LDAIMetrics (success = True )
360+ mock_response = RunnerResult (
361+ content = '' ,
362+ metrics = LDAIMetrics (success = True ),
363+ parsed = {'score' : 0.9 , 'reasoning' : 'Very relevant' },
366364 )
367365
368- mock_runner .invoke_structured_model .return_value = mock_response
366+ mock_runner .run .return_value = mock_response
369367 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
370368
371369 judge = Judge (judge_config_with_key , mock_runner )
@@ -374,9 +372,9 @@ async def test_evaluate_messages_calls_evaluate(
374372 LDMessage (role = 'user' , content = 'Question 1' ),
375373 LDMessage (role = 'assistant' , content = 'Answer 1' ),
376374 ]
377- chat_response = ModelResponse (
378- message = LDMessage ( role = 'assistant' , content = 'Answer 2' ) ,
379- metrics = LDAIMetrics (success = True )
375+ chat_response = RunnerResult (
376+ content = 'Answer 2' ,
377+ metrics = LDAIMetrics (success = True ),
380378 )
381379
382380 result = await judge .evaluate_messages (messages , chat_response )
0 commit comments