@@ -109,10 +109,9 @@ def test_judge_initializes_with_evaluation_metric_key(
109109 assert judge ._ai_config == judge_config_with_key
110110 assert judge ._evaluation_response_structure is not None
111111 assert judge ._evaluation_response_structure ['title' ] == 'EvaluationResponse'
112- assert judge ._evaluation_response_structure ['required' ] == ['evaluation' ]
113- eval_schema = judge ._evaluation_response_structure ['properties' ]['evaluation' ]
114- assert eval_schema ['required' ] == ['score' , 'reasoning' ]
115- assert 'score' in eval_schema ['properties' ] and 'reasoning' in eval_schema ['properties' ]
112+ assert judge ._evaluation_response_structure ['required' ] == ['score' , 'reasoning' ]
113+ assert 'score' in judge ._evaluation_response_structure ['properties' ]
114+ assert 'reasoning' in judge ._evaluation_response_structure ['properties' ]
116115
117116
118117class TestJudgeEvaluate :
@@ -149,12 +148,10 @@ async def test_evaluate_success_with_valid_response(
149148 """Evaluate should return JudgeResponse with valid evaluation."""
150149 mock_response = StructuredResponse (
151150 data = {
152- 'evaluation' : {
153- 'score' : 0.85 ,
154- 'reasoning' : 'The response is highly relevant to the input.'
155- }
151+ 'score' : 0.85 ,
152+ 'reasoning' : 'The response is highly relevant to the input.'
156153 },
157- raw_response = '{"evaluation": {" score": 0.85, "reasoning": "..."} }' ,
154+ raw_response = '{"score": 0.85, "reasoning": "..."}' ,
158155 metrics = LDAIMetrics (success = True )
159156 )
160157
@@ -176,15 +173,13 @@ async def test_evaluate_success_with_valid_response(
176173 async def test_evaluate_success_with_evaluation_response_shape (
177174 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_ai_provider
178175 ):
179- """Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric."""
176+ """Evaluate should accept shape { score, reasoning } and key by metric."""
180177 mock_response = StructuredResponse (
181178 data = {
182- 'evaluation' : {
183- 'score' : 0.9 ,
184- 'reasoning' : 'The response is accurate and complete.' ,
185- }
179+ 'score' : 0.9 ,
180+ 'reasoning' : 'The response is accurate and complete.' ,
186181 },
187- raw_response = '{"evaluation": {" score": 0.9, "reasoning": "..."} }' ,
182+ raw_response = '{"score": 0.9, "reasoning": "..."}' ,
188183 metrics = LDAIMetrics (success = True ),
189184 )
190185 mock_ai_provider .invoke_structured_model .return_value = mock_response
@@ -203,7 +198,7 @@ async def test_evaluate_success_with_evaluation_response_shape(
203198 async def test_evaluate_handles_missing_evaluation_in_response (
204199 self , judge_config_with_key : AIJudgeConfig , tracker : LDAIConfigTracker , mock_ai_provider
205200 ):
206- """Evaluate should handle missing evaluation in response."""
201+ """Evaluate should handle missing score/reasoning in response."""
207202 mock_response = StructuredResponse (
208203 data = {},
209204 raw_response = '{}' ,
@@ -228,12 +223,10 @@ async def test_evaluate_handles_invalid_score(
228223 """Evaluate should handle invalid score values."""
229224 mock_response = StructuredResponse (
230225 data = {
231- 'evaluation' : {
232- 'score' : 1.5 ,
233- 'reasoning' : 'Some reasoning'
234- }
226+ 'score' : 1.5 ,
227+ 'reasoning' : 'Some reasoning'
235228 },
236- raw_response = '{"evaluation": {" score": 1.5, "reasoning": "..."} }' ,
229+ raw_response = '{"score": 1.5, "reasoning": "..."}' ,
237230 metrics = LDAIMetrics (success = True )
238231 )
239232
@@ -254,12 +247,8 @@ async def test_evaluate_handles_missing_reasoning(
254247 ):
255248 """Evaluate should handle missing reasoning."""
256249 mock_response = StructuredResponse (
257- data = {
258- 'evaluation' : {
259- 'score' : 0.8 ,
260- }
261- },
262- raw_response = '{"evaluation": {"score": 0.8}}' ,
250+ data = {'score' : 0.8 },
251+ raw_response = '{"score": 0.8}' ,
263252 metrics = LDAIMetrics (success = True )
264253 )
265254
@@ -316,13 +305,8 @@ async def test_evaluate_messages_calls_evaluate(
316305 from ldai .providers .types import ChatResponse
317306
318307 mock_response = StructuredResponse (
319- data = {
320- 'evaluation' : {
321- 'score' : 0.9 ,
322- 'reasoning' : 'Very relevant'
323- }
324- },
325- raw_response = '{"evaluation": {"score": 0.9, "reasoning": "..."}}' ,
308+ data = {'score' : 0.9 , 'reasoning' : 'Very relevant' },
309+ raw_response = '{"score": 0.9, "reasoning": "..."}' ,
326310 metrics = LDAIMetrics (success = True )
327311 )
328312
@@ -351,21 +335,17 @@ class TestEvaluationSchemaBuilder:
351335 """Tests for EvaluationSchemaBuilder."""
352336
353337 def test_build_creates_correct_schema (self ):
354- """Schema builder should create fixed schema (evaluation with score + reasoning, no key param)."""
338+ """Schema builder should create fixed schema (top-level score + reasoning, no key param)."""
355339 schema = EvaluationSchemaBuilder .build ()
356340
357341 assert schema ['title' ] == 'EvaluationResponse'
358342 assert schema ['type' ] == 'object'
359- assert schema ['required' ] == ['evaluation' ]
360- assert 'evaluation' in schema ['properties' ]
361- eval_schema = schema ['properties' ]['evaluation' ]
362- assert eval_schema ['type' ] == 'object'
363- assert eval_schema ['required' ] == ['score' , 'reasoning' ]
364- assert 'score' in eval_schema ['properties' ]
365- assert 'reasoning' in eval_schema ['properties' ]
366- assert eval_schema ['properties' ]['score' ]['type' ] == 'number'
367- assert eval_schema ['properties' ]['score' ]['minimum' ] == 0
368- assert eval_schema ['properties' ]['score' ]['maximum' ] == 1
343+ assert schema ['required' ] == ['score' , 'reasoning' ]
344+ assert 'score' in schema ['properties' ]
345+ assert 'reasoning' in schema ['properties' ]
346+ assert schema ['properties' ]['score' ]['type' ] == 'number'
347+ assert schema ['properties' ]['score' ]['minimum' ] == 0
348+ assert schema ['properties' ]['score' ]['maximum' ] == 1
369349
370350
371351class TestJudgeConfigSerialization :
0 commit comments