@@ -109,7 +109,10 @@ def test_judge_initializes_with_evaluation_metric_key(
109109 assert judge ._ai_config == judge_config_with_key
110110 assert judge ._evaluation_response_structure is not None
111111 assert judge ._evaluation_response_structure ['title' ] == 'EvaluationResponse'
112- assert '$ld:ai:judge:relevance' in judge ._evaluation_response_structure ['properties' ]['evaluations' ]['required' ]
112+ assert judge ._evaluation_response_structure ['required' ] == ['evaluation' ]
113+ eval_schema = judge ._evaluation_response_structure ['properties' ]['evaluation' ]
114+ assert eval_schema ['required' ] == ['score' , 'reasoning' ]
115+ assert 'score' in eval_schema ['properties' ] and 'reasoning' in eval_schema ['properties' ]
113116
114117 def test_judge_initializes_without_evaluation_metric_key (
115118 self , judge_config_without_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_ai_provider
@@ -180,6 +183,58 @@ async def test_evaluate_success_with_valid_response(
180183 assert result .evals ['$ld:ai:judge:relevance' ].score == 0.85
181184 assert 'relevant' in result .evals ['$ld:ai:judge:relevance' ].reasoning .lower ()
182185
186+ @pytest .mark .asyncio
187+ async def test_evaluate_success_with_evaluation_response_shape (
188+ self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_ai_provider
189+ ):
190+ """Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric."""
191+ mock_response = StructuredResponse (
192+ data = {
193+ 'evaluation' : {
194+ 'score' : 0.9 ,
195+ 'reasoning' : 'The response is accurate and complete.' ,
196+ }
197+ },
198+ raw_response = '{"evaluation": {"score": 0.9, "reasoning": "..."}}' ,
199+ metrics = LDAIMetrics (success = True ),
200+ )
201+ mock_ai_provider .invoke_structured_model .return_value = mock_response
202+ tracker .track_metrics_of = AsyncMock (return_value = mock_response )
203+
204+ judge = Judge (judge_config_with_key , tracker , mock_ai_provider )
205+ result = await judge .evaluate ("What is feature flagging?" , "Feature flagging is..." )
206+
207+ assert result is not None
208+ assert result .success is True
209+ assert '$ld:ai:judge:relevance' in result .evals
210+ assert result .evals ['$ld:ai:judge:relevance' ].score == 0.9
211+ assert 'accurate' in result .evals ['$ld:ai:judge:relevance' ].reasoning .lower ()
212+
213+ @pytest .mark .asyncio
214+ async def test_evaluate_success_with_evaluations_backward_compat (
215+ self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_ai_provider
216+ ):
217+ """Evaluate should accept legacy shape { evaluations: { score, reasoning } }."""
218+ mock_response = StructuredResponse (
219+ data = {
220+ 'evaluations' : {
221+ 'score' : 0.7 ,
222+ 'reasoning' : 'Partially correct.' ,
223+ }
224+ },
225+ raw_response = '{"evaluations": {"score": 0.7, "reasoning": "..."}}' ,
226+ metrics = LDAIMetrics (success = True ),
227+ )
228+ mock_ai_provider .invoke_structured_model .return_value = mock_response
229+ tracker .track_metrics_of = AsyncMock (return_value = mock_response )
230+
231+ judge = Judge (judge_config_with_key , tracker , mock_ai_provider )
232+ result = await judge .evaluate ("input" , "output" )
233+
234+ assert result is not None
235+ assert result .success is True
236+ assert result .evals ['$ld:ai:judge:relevance' ].score == 0.7
237+
183238 @pytest .mark .asyncio
184239 async def test_evaluate_handles_missing_evaluation_in_response (
185240 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_ai_provider
@@ -345,30 +400,21 @@ class TestEvaluationSchemaBuilder:
345400 """Tests for EvaluationSchemaBuilder."""
346401
347402 def test_build_creates_correct_schema (self ):
348- """Schema builder should create correct schema structure ."""
349- schema = EvaluationSchemaBuilder .build ('$ld:ai:judge:relevance' )
350-
403+ """Schema builder should create fixed schema (evaluation with score + reasoning, no key param) ."""
404+ schema = EvaluationSchemaBuilder .build ()
405+
351406 assert schema ['title' ] == 'EvaluationResponse'
352407 assert schema ['type' ] == 'object'
353- assert 'evaluations' in schema ['properties' ]
354- assert '$ld:ai:judge:relevance' in schema ['properties' ]['evaluations' ]['required' ]
355- assert '$ld:ai:judge:relevance' in schema ['properties' ]['evaluations' ]['properties' ]
356-
357- metric_schema = schema ['properties' ]['evaluations' ]['properties' ]['$ld:ai:judge:relevance' ]
358- assert metric_schema ['type' ] == 'object'
359- assert 'score' in metric_schema ['properties' ]
360- assert 'reasoning' in metric_schema ['properties' ]
361- assert metric_schema ['properties' ]['score' ]['type' ] == 'number'
362- assert metric_schema ['properties' ]['score' ]['minimum' ] == 0
363- assert metric_schema ['properties' ]['score' ]['maximum' ] == 1
364-
365- def test_build_key_properties_creates_single_key (self ):
366- """_build_key_properties should create properties for a single key."""
367- properties = EvaluationSchemaBuilder ._build_key_properties ('$ld:ai:judge:relevance' )
368-
369- assert '$ld:ai:judge:relevance' in properties
370- assert len (properties ) == 1
371- assert properties ['$ld:ai:judge:relevance' ]['type' ] == 'object'
408+ assert schema ['required' ] == ['evaluation' ]
409+ assert 'evaluation' in schema ['properties' ]
410+ eval_schema = schema ['properties' ]['evaluation' ]
411+ assert eval_schema ['type' ] == 'object'
412+ assert eval_schema ['required' ] == ['score' , 'reasoning' ]
413+ assert 'score' in eval_schema ['properties' ]
414+ assert 'reasoning' in eval_schema ['properties' ]
415+ assert eval_schema ['properties' ]['score' ]['type' ] == 'number'
416+ assert eval_schema ['properties' ]['score' ]['minimum' ] == 0
417+ assert eval_schema ['properties' ]['score' ]['maximum' ] == 1
372418
373419
374420class TestJudgeConfigSerialization :
0 commit comments