@@ -114,15 +114,6 @@ def test_judge_initializes_with_evaluation_metric_key(
114114 assert eval_schema ['required' ] == ['score' , 'reasoning' ]
115115 assert 'score' in eval_schema ['properties' ] and 'reasoning' in eval_schema ['properties' ]
116116
117- def test_judge_initializes_without_evaluation_metric_key (
118- self , judge_config_without_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_ai_provider
119- ):
120- """Judge should initialize but have None for evaluation_response_structure."""
121- judge = Judge (judge_config_without_key , tracker , mock_ai_provider )
122-
123- assert judge ._ai_config == judge_config_without_key
124- assert judge ._evaluation_response_structure is None
125-
126117
127118class TestJudgeEvaluate :
128119 """Tests for Judge.evaluate() method."""
@@ -158,14 +149,12 @@ async def test_evaluate_success_with_valid_response(
158149 """Evaluate should return JudgeResponse with valid evaluation."""
159150 mock_response = StructuredResponse (
160151 data = {
161- 'evaluations' : {
162- '$ld:ai:judge:relevance' : {
163- 'score' : 0.85 ,
164- 'reasoning' : 'The response is highly relevant to the input.'
165- }
152+ 'evaluation' : {
153+ 'score' : 0.85 ,
154+ 'reasoning' : 'The response is highly relevant to the input.'
166155 }
167156 },
168- raw_response = '{"evaluations": {...}}' ,
157+ raw_response = '{"evaluation": {"score": 0.85, "reasoning": "..."}}' ,
169158 metrics = LDAIMetrics (success = True )
170159 )
171160
@@ -210,46 +199,14 @@ async def test_evaluate_success_with_evaluation_response_shape(
210199 assert result .evals ['$ld:ai:judge:relevance' ].score == 0.9
211200 assert 'accurate' in result .evals ['$ld:ai:judge:relevance' ].reasoning .lower ()
212201
213- @pytest .mark .asyncio
214- async def test_evaluate_success_with_evaluations_backward_compat (
215- self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_ai_provider
216- ):
217- """Evaluate should accept legacy shape { evaluations: { score, reasoning } }."""
218- mock_response = StructuredResponse (
219- data = {
220- 'evaluations' : {
221- 'score' : 0.7 ,
222- 'reasoning' : 'Partially correct.' ,
223- }
224- },
225- raw_response = '{"evaluations": {"score": 0.7, "reasoning": "..."}}' ,
226- metrics = LDAIMetrics (success = True ),
227- )
228- mock_ai_provider .invoke_structured_model .return_value = mock_response
229- tracker .track_metrics_of = AsyncMock (return_value = mock_response )
230-
231- judge = Judge (judge_config_with_key , tracker , mock_ai_provider )
232- result = await judge .evaluate ("input" , "output" )
233-
234- assert result is not None
235- assert result .success is True
236- assert result .evals ['$ld:ai:judge:relevance' ].score == 0.7
237-
238202 @pytest .mark .asyncio
239203 async def test_evaluate_handles_missing_evaluation_in_response (
240204 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_ai_provider
241205 ):
242206 """Evaluate should handle missing evaluation in response."""
243207 mock_response = StructuredResponse (
244- data = {
245- 'evaluations' : {
246- 'wrong-key' : {
247- 'score' : 0.5 ,
248- 'reasoning' : 'Some reasoning'
249- }
250- }
251- },
252- raw_response = '{"evaluations": {...}}' ,
208+ data = {},
209+ raw_response = '{}' ,
253210 metrics = LDAIMetrics (success = True )
254211 )
255212
@@ -271,14 +228,12 @@ async def test_evaluate_handles_invalid_score(
271228 """Evaluate should handle invalid score values."""
272229 mock_response = StructuredResponse (
273230 data = {
274- 'evaluations' : {
275- '$ld:ai:judge:relevance' : {
276- 'score' : 1.5 ,
277- 'reasoning' : 'Some reasoning'
278- }
231+ 'evaluation' : {
232+ 'score' : 1.5 ,
233+ 'reasoning' : 'Some reasoning'
279234 }
280235 },
281- raw_response = '{"evaluations": {...}}' ,
236+ raw_response = '{"evaluation": {"score": 1.5, "reasoning": "..."}}' ,
282237 metrics = LDAIMetrics (success = True )
283238 )
284239
@@ -300,13 +255,11 @@ async def test_evaluate_handles_missing_reasoning(
300255 """Evaluate should handle missing reasoning."""
301256 mock_response = StructuredResponse (
302257 data = {
303- 'evaluations' : {
304- '$ld:ai:judge:relevance' : {
305- 'score' : 0.8 ,
306- }
258+ 'evaluation' : {
259+ 'score' : 0.8 ,
307260 }
308261 },
309- raw_response = '{"evaluations": {...}}' ,
262+ raw_response = '{"evaluation": {"score": 0.8}}' ,
310263 metrics = LDAIMetrics (success = True )
311264 )
312265
@@ -364,14 +317,12 @@ async def test_evaluate_messages_calls_evaluate(
364317
365318 mock_response = StructuredResponse (
366319 data = {
367- 'evaluations' : {
368- '$ld:ai:judge:relevance' : {
369- 'score' : 0.9 ,
370- 'reasoning' : 'Very relevant'
371- }
320+ 'evaluation' : {
321+ 'score' : 0.9 ,
322+ 'reasoning' : 'Very relevant'
372323 }
373324 },
374- raw_response = '{"evaluations": {...}}' ,
325+ raw_response = '{"evaluation": {"score": 0.9, "reasoning": "..."}}' ,
375326 metrics = LDAIMetrics (success = True )
376327 )
377328
0 commit comments