Simplify the judges' structured output further: return top-level "score" and "reasoning" instead of a nested "evaluation" object

jsonbailey · jsonbailey · commit a303c3d173d9 · 2026-03-16T13:37:45.000-07:00
diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -174,21 +174,20 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
 
     def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
         """
-        Parses the structured evaluation response. Expects {"evaluation": {"score": n, "reasoning": "..."}}.
+        Parses the structured evaluation response. Expects {"score": n, "reasoning": "..."}.
         """
         results: Dict[str, EvalScore] = {}
         metric_key = self._ai_config.evaluation_metric_key
         if not metric_key:
             log.warn('Evaluation metric key is missing')
             return results
 
-        evaluation = data.get('evaluation') if isinstance(data, dict) else None
-        if not isinstance(evaluation, dict):
+        if not isinstance(data, dict):
             log.warn('Invalid response: missing or invalid evaluation')
             return results
 
-        score = evaluation.get('score')
-        reasoning = evaluation.get('reasoning')
+        score = data.get('score')
+        reasoning = data.get('reasoning')
         if not isinstance(score, (int, float)) or score < 0 or score > 1:
             log.warn(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive')
             return results
diff --git a/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py b/packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py
@@ -7,7 +7,7 @@ class EvaluationSchemaBuilder:
     """
     Internal class for building evaluation response schemas.
     Not exported - only used internally by Judge.
-    Schema is a fixed shape: one "evaluation" object with score and reasoning.
+    Schema is a fixed shape: top-level score and reasoning.
     The judge config's evaluation_metric_key is only used when keying the result,
     not in the schema.
     """
@@ -20,7 +20,7 @@ def build() -> Dict[str, Any]:
         evaluation_metric_key.
 
         In practice the model returns JSON like:
-          {"evaluation": {"score": 0.85, "reasoning": "The response is accurate."}}
+          {"score": 0.85, "reasoning": "The response is accurate."}
 
         :return: Schema dictionary for structured output
         """
@@ -29,25 +29,17 @@ def build() -> Dict[str, Any]:
             'description': 'Response containing an evaluation (score and reasoning).',
             'type': 'object',
             'properties': {
-                'evaluation': {
-                    'type': 'object',
-                    'description': 'The evaluation result.',
-                    'properties': {
-                        'score': {
-                            'type': 'number',
-                            'minimum': 0,
-                            'maximum': 1,
-                            'description': 'Score between 0.0 and 1.0.',
-                        },
-                        'reasoning': {
-                            'type': 'string',
-                            'description': 'Reasoning behind the score.',
-                        },
-                    },
-                    'required': ['score', 'reasoning'],
-                    'additionalProperties': False,
+                'score': {
+                    'type': 'number',
+                    'minimum': 0,
+                    'maximum': 1,
+                    'description': 'Score between 0.0 and 1.0.',
+                },
+                'reasoning': {
+                    'type': 'string',
+                    'description': 'Reasoning behind the score.',
                 },
             },
-            'required': ['evaluation'],
+            'required': ['score', 'reasoning'],
             'additionalProperties': False,
         }
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py
@@ -109,10 +109,9 @@ def test_judge_initializes_with_evaluation_metric_key(
         assert judge._ai_config == judge_config_with_key
         assert judge._evaluation_response_structure is not None
         assert judge._evaluation_response_structure['title'] == 'EvaluationResponse'
-        assert judge._evaluation_response_structure['required'] == ['evaluation']
-        eval_schema = judge._evaluation_response_structure['properties']['evaluation']
-        assert eval_schema['required'] == ['score', 'reasoning']
-        assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties']
+        assert judge._evaluation_response_structure['required'] == ['score', 'reasoning']
+        assert 'score' in judge._evaluation_response_structure['properties']
+        assert 'reasoning' in judge._evaluation_response_structure['properties']
 
 
 class TestJudgeEvaluate:
@@ -149,12 +148,10 @@ async def test_evaluate_success_with_valid_response(
         """Evaluate should return JudgeResponse with valid evaluation."""
         mock_response = StructuredResponse(
             data={
-                'evaluation': {
-                    'score': 0.85,
-                    'reasoning': 'The response is highly relevant to the input.'
-                }
+                'score': 0.85,
+                'reasoning': 'The response is highly relevant to the input.'
             },
-            raw_response='{"evaluation": {"score": 0.85, "reasoning": "..."}}',
+            raw_response='{"score": 0.85, "reasoning": "..."}',
             metrics=LDAIMetrics(success=True)
         )
         
@@ -176,15 +173,13 @@ async def test_evaluate_success_with_valid_response(
     async def test_evaluate_success_with_evaluation_response_shape(
         self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
     ):
-        """Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric."""
+        """Evaluate should accept shape { score, reasoning } and key by metric."""
         mock_response = StructuredResponse(
             data={
-                'evaluation': {
-                    'score': 0.9,
-                    'reasoning': 'The response is accurate and complete.',
-                }
+                'score': 0.9,
+                'reasoning': 'The response is accurate and complete.',
             },
-            raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
+            raw_response='{"score": 0.9, "reasoning": "..."}',
             metrics=LDAIMetrics(success=True),
         )
         mock_ai_provider.invoke_structured_model.return_value = mock_response
@@ -203,7 +198,7 @@ async def test_evaluate_success_with_evaluation_response_shape(
     async def test_evaluate_handles_missing_evaluation_in_response(
         self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
     ):
-        """Evaluate should handle missing evaluation in response."""
+        """Evaluate should handle missing score/reasoning in response."""
         mock_response = StructuredResponse(
             data={},
             raw_response='{}',
@@ -228,12 +223,10 @@ async def test_evaluate_handles_invalid_score(
         """Evaluate should handle invalid score values."""
         mock_response = StructuredResponse(
             data={
-                'evaluation': {
-                    'score': 1.5,
-                    'reasoning': 'Some reasoning'
-                }
+                'score': 1.5,
+                'reasoning': 'Some reasoning'
             },
-            raw_response='{"evaluation": {"score": 1.5, "reasoning": "..."}}',
+            raw_response='{"score": 1.5, "reasoning": "..."}',
             metrics=LDAIMetrics(success=True)
         )
         
@@ -254,12 +247,8 @@ async def test_evaluate_handles_missing_reasoning(
     ):
         """Evaluate should handle missing reasoning."""
         mock_response = StructuredResponse(
-            data={
-                'evaluation': {
-                    'score': 0.8,
-                }
-            },
-            raw_response='{"evaluation": {"score": 0.8}}',
+            data={'score': 0.8},
+            raw_response='{"score": 0.8}',
             metrics=LDAIMetrics(success=True)
         )
         
@@ -316,13 +305,8 @@ async def test_evaluate_messages_calls_evaluate(
         from ldai.providers.types import ChatResponse
         
         mock_response = StructuredResponse(
-            data={
-                'evaluation': {
-                    'score': 0.9,
-                    'reasoning': 'Very relevant'
-                }
-            },
-            raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
+            data={'score': 0.9, 'reasoning': 'Very relevant'},
+            raw_response='{"score": 0.9, "reasoning": "..."}',
             metrics=LDAIMetrics(success=True)
         )
         
@@ -351,21 +335,17 @@ class TestEvaluationSchemaBuilder:
     """Tests for EvaluationSchemaBuilder."""
 
     def test_build_creates_correct_schema(self):
-        """Schema builder should create fixed schema (evaluation with score + reasoning, no key param)."""
+        """Schema builder should create fixed schema (top-level score + reasoning, no key param)."""
         schema = EvaluationSchemaBuilder.build()
 
         assert schema['title'] == 'EvaluationResponse'
         assert schema['type'] == 'object'
-        assert schema['required'] == ['evaluation']
-        assert 'evaluation' in schema['properties']
-        eval_schema = schema['properties']['evaluation']
-        assert eval_schema['type'] == 'object'
-        assert eval_schema['required'] == ['score', 'reasoning']
-        assert 'score' in eval_schema['properties']
-        assert 'reasoning' in eval_schema['properties']
-        assert eval_schema['properties']['score']['type'] == 'number'
-        assert eval_schema['properties']['score']['minimum'] == 0
-        assert eval_schema['properties']['score']['maximum'] == 1
+        assert schema['required'] == ['score', 'reasoning']
+        assert 'score' in schema['properties']
+        assert 'reasoning' in schema['properties']
+        assert schema['properties']['score']['type'] == 'number'
+        assert schema['properties']['score']['minimum'] == 0
+        assert schema['properties']['score']['maximum'] == 1
 
 
 class TestJudgeConfigSerialization: