99from ldai .judge import Judge
1010from ldai .judge .evaluation_schema_builder import EvaluationSchemaBuilder
1111from ldai .models import AIJudgeConfig , AIJudgeConfigDefault , LDMessage , ModelConfig , ProviderConfig
12- from ldai .providers .types import JudgeResult , LDAIMetrics , StructuredResponse
12+ from ldai .providers .types import JudgeResult , LDAIMetrics , RunnerResult
1313from ldai .tracker import LDAIConfigTracker
1414
1515
@@ -40,9 +40,9 @@ def client(td: TestData) -> LDClient:
4040
4141@pytest .fixture
4242def mock_runner ():
43- """Create a mock AI provider ."""
43+ """Create a mock AI runner ."""
4444 provider = MagicMock ()
45- provider .invoke_structured_model = AsyncMock ()
45+ provider .run = AsyncMock ()
4646 return provider
4747
4848
@@ -137,7 +137,7 @@ async def test_evaluate_returns_failure_when_evaluation_metric_key_missing(
137137 assert isinstance (result , JudgeResult )
138138 assert result .success is False
139139 assert result .sampled is False
140- mock_runner .invoke_structured_model .assert_not_called ()
140+ mock_runner .run .assert_not_called ()
141141
142142 @pytest .mark .asyncio
143143 async def test_evaluate_returns_failure_when_messages_missing (
@@ -151,23 +151,23 @@ async def test_evaluate_returns_failure_when_messages_missing(
151151 assert isinstance (result , JudgeResult )
152152 assert result .success is False
153153 assert result .sampled is False
154- mock_runner .invoke_structured_model .assert_not_called ()
154+ mock_runner .run .assert_not_called ()
155155
156156 @pytest .mark .asyncio
157157 async def test_evaluate_success_with_valid_response (
158158 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
159159 ):
160160 """Evaluate should return JudgeResponse with valid evaluation."""
161- mock_response = StructuredResponse (
162- data = {
161+ mock_response = RunnerResult (
162+ content = '' ,
163+ metrics = LDAIMetrics (success = True ),
164+ parsed = {
163165 'score' : 0.85 ,
164166 'reasoning' : 'The response is highly relevant to the input.'
165167 },
166- raw_response = '{"score": 0.85, "reasoning": "..."}' ,
167- metrics = LDAIMetrics (success = True )
168168 )
169169
170- mock_runner .invoke_structured_model .return_value = mock_response
170+ mock_runner .run .return_value = mock_response
171171 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
172172
173173 judge = Judge (judge_config_with_key , mock_runner )
@@ -187,15 +187,15 @@ async def test_evaluate_success_with_evaluation_response_shape(
187187 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
188188 ):
189189 """Evaluate should accept shape { score, reasoning } and key by metric."""
190- mock_response = StructuredResponse (
191- data = {
190+ mock_response = RunnerResult (
191+ content = '' ,
192+ metrics = LDAIMetrics (success = True ),
193+ parsed = {
192194 'score' : 0.9 ,
193195 'reasoning' : 'The response is accurate and complete.' ,
194196 },
195- raw_response = '{"score": 0.9, "reasoning": "..."}' ,
196- metrics = LDAIMetrics (success = True ),
197197 )
198- mock_runner .invoke_structured_model .return_value = mock_response
198+ mock_runner .run .return_value = mock_response
199199 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
200200
201201 judge = Judge (judge_config_with_key , mock_runner )
@@ -214,13 +214,13 @@ async def test_evaluate_handles_missing_evaluation_in_response(
214214 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
215215 ):
216216 """Evaluate should handle missing score/reasoning in response."""
217- mock_response = StructuredResponse (
218- data = {} ,
219- raw_response = '{}' ,
220- metrics = LDAIMetrics ( success = True )
217+ mock_response = RunnerResult (
218+ content = '' ,
219+ metrics = LDAIMetrics ( success = True ) ,
220+ parsed = {},
221221 )
222222
223- mock_runner .invoke_structured_model .return_value = mock_response
223+ mock_runner .run .return_value = mock_response
224224 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
225225
226226 judge = Judge (judge_config_with_key , mock_runner )
@@ -236,16 +236,16 @@ async def test_evaluate_handles_invalid_score(
236236 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
237237 ):
238238 """Evaluate should handle invalid score values."""
239- mock_response = StructuredResponse (
240- data = {
239+ mock_response = RunnerResult (
240+ content = '' ,
241+ metrics = LDAIMetrics (success = True ),
242+ parsed = {
241243 'score' : 1.5 ,
242- 'reasoning' : 'Some reasoning'
244+ 'reasoning' : 'Some reasoning' ,
243245 },
244- raw_response = '{"score": 1.5, "reasoning": "..."}' ,
245- metrics = LDAIMetrics (success = True )
246246 )
247247
248- mock_runner .invoke_structured_model .return_value = mock_response
248+ mock_runner .run .return_value = mock_response
249249 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
250250
251251 judge = Judge (judge_config_with_key , mock_runner )
@@ -261,13 +261,13 @@ async def test_evaluate_handles_missing_reasoning(
261261 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
262262 ):
263263 """Evaluate should handle missing reasoning."""
264- mock_response = StructuredResponse (
265- data = { 'score' : 0.8 } ,
266- raw_response = '{"score": 0.8}' ,
267- metrics = LDAIMetrics ( success = True )
264+ mock_response = RunnerResult (
265+ content = '' ,
266+ metrics = LDAIMetrics ( success = True ) ,
267+ parsed = { 'score' : 0.8 },
268268 )
269269
270- mock_runner .invoke_structured_model .return_value = mock_response
270+ mock_runner .run .return_value = mock_response
271271 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
272272
273273 judge = Judge (judge_config_with_key , mock_runner )
@@ -283,7 +283,7 @@ async def test_evaluate_handles_exception(
283283 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
284284 ):
285285 """Evaluate should handle exceptions gracefully."""
286- mock_runner .invoke_structured_model .side_effect = Exception ("Provider error" )
286+ mock_runner .run .side_effect = Exception ("Provider error" )
287287 tracker .track_metrics_of_async = AsyncMock (side_effect = Exception ("Provider error" ))
288288
289289 judge = Judge (judge_config_with_key , mock_runner )
@@ -306,7 +306,7 @@ async def test_evaluate_respects_sampling_rate(
306306 assert isinstance (result , JudgeResult )
307307 assert result .sampled is False
308308 assert result .success is False
309- mock_runner .invoke_structured_model .assert_not_called ()
309+ mock_runner .run .assert_not_called ()
310310
311311
312312class TestJudgeEvaluateMessages :
@@ -317,15 +317,13 @@ async def test_evaluate_messages_calls_evaluate(
317317 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_runner
318318 ):
319319 """evaluate_messages should call evaluate with constructed input/output."""
320- from ldai .providers .types import ModelResponse
321-
322- mock_response = StructuredResponse (
323- data = {'score' : 0.9 , 'reasoning' : 'Very relevant' },
324- raw_response = '{"score": 0.9, "reasoning": "..."}' ,
325- metrics = LDAIMetrics (success = True )
320+ mock_response = RunnerResult (
321+ content = '' ,
322+ metrics = LDAIMetrics (success = True ),
323+ parsed = {'score' : 0.9 , 'reasoning' : 'Very relevant' },
326324 )
327325
328- mock_runner .invoke_structured_model .return_value = mock_response
326+ mock_runner .run .return_value = mock_response
329327 tracker .track_metrics_of_async = AsyncMock (return_value = mock_response )
330328
331329 judge = Judge (judge_config_with_key , mock_runner )
@@ -334,9 +332,9 @@ async def test_evaluate_messages_calls_evaluate(
334332 LDMessage (role = 'user' , content = 'Question 1' ),
335333 LDMessage (role = 'assistant' , content = 'Answer 1' ),
336334 ]
337- chat_response = ModelResponse (
338- message = LDMessage ( role = 'assistant' , content = 'Answer 2' ) ,
339- metrics = LDAIMetrics (success = True )
335+ chat_response = RunnerResult (
336+ content = 'Answer 2' ,
337+ metrics = LDAIMetrics (success = True ),
340338 )
341339
342340 result = await judge .evaluate_messages (messages , chat_response )
0 commit comments