Skip to content

Commit a303c3d

Browse files
committed
simplify the structured output for judges further
1 parent b4e3118 commit a303c3d

3 files changed

Lines changed: 41 additions & 70 deletions

File tree

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -174,21 +174,20 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
174174

175175
def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
176176
"""
177-
Parses the structured evaluation response. Expects {"evaluation": {"score": n, "reasoning": "..."}}.
177+
Parses the structured evaluation response. Expects {"score": n, "reasoning": "..."}.
178178
"""
179179
results: Dict[str, EvalScore] = {}
180180
metric_key = self._ai_config.evaluation_metric_key
181181
if not metric_key:
182182
log.warn('Evaluation metric key is missing')
183183
return results
184184

185-
evaluation = data.get('evaluation') if isinstance(data, dict) else None
186-
if not isinstance(evaluation, dict):
185+
if not isinstance(data, dict):
187186
log.warn('Invalid response: missing or invalid evaluation')
188187
return results
189188

190-
score = evaluation.get('score')
191-
reasoning = evaluation.get('reasoning')
189+
score = data.get('score')
190+
reasoning = data.get('reasoning')
192191
if not isinstance(score, (int, float)) or score < 0 or score > 1:
193192
log.warn(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive')
194193
return results

packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ class EvaluationSchemaBuilder:
77
"""
88
Internal class for building evaluation response schemas.
99
Not exported - only used internally by Judge.
10-
Schema is a fixed shape: one "evaluation" object with score and reasoning.
10+
Schema is a fixed shape: top-level score and reasoning.
1111
The judge config's evaluation_metric_key is only used when keying the result,
1212
not in the schema.
1313
"""
@@ -20,7 +20,7 @@ def build() -> Dict[str, Any]:
2020
evaluation_metric_key.
2121
2222
In practice the model returns JSON like:
23-
{"evaluation": {"score": 0.85, "reasoning": "The response is accurate."}}
23+
{"score": 0.85, "reasoning": "The response is accurate."}
2424
2525
:return: Schema dictionary for structured output
2626
"""
@@ -29,25 +29,17 @@ def build() -> Dict[str, Any]:
2929
'description': 'Response containing an evaluation (score and reasoning).',
3030
'type': 'object',
3131
'properties': {
32-
'evaluation': {
33-
'type': 'object',
34-
'description': 'The evaluation result.',
35-
'properties': {
36-
'score': {
37-
'type': 'number',
38-
'minimum': 0,
39-
'maximum': 1,
40-
'description': 'Score between 0.0 and 1.0.',
41-
},
42-
'reasoning': {
43-
'type': 'string',
44-
'description': 'Reasoning behind the score.',
45-
},
46-
},
47-
'required': ['score', 'reasoning'],
48-
'additionalProperties': False,
32+
'score': {
33+
'type': 'number',
34+
'minimum': 0,
35+
'maximum': 1,
36+
'description': 'Score between 0.0 and 1.0.',
37+
},
38+
'reasoning': {
39+
'type': 'string',
40+
'description': 'Reasoning behind the score.',
4941
},
5042
},
51-
'required': ['evaluation'],
43+
'required': ['score', 'reasoning'],
5244
'additionalProperties': False,
5345
}

packages/sdk/server-ai/tests/test_judge.py

Lines changed: 25 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,9 @@ def test_judge_initializes_with_evaluation_metric_key(
109109
assert judge._ai_config == judge_config_with_key
110110
assert judge._evaluation_response_structure is not None
111111
assert judge._evaluation_response_structure['title'] == 'EvaluationResponse'
112-
assert judge._evaluation_response_structure['required'] == ['evaluation']
113-
eval_schema = judge._evaluation_response_structure['properties']['evaluation']
114-
assert eval_schema['required'] == ['score', 'reasoning']
115-
assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties']
112+
assert judge._evaluation_response_structure['required'] == ['score', 'reasoning']
113+
assert 'score' in judge._evaluation_response_structure['properties']
114+
assert 'reasoning' in judge._evaluation_response_structure['properties']
116115

117116

118117
class TestJudgeEvaluate:
@@ -149,12 +148,10 @@ async def test_evaluate_success_with_valid_response(
149148
"""Evaluate should return JudgeResponse with valid evaluation."""
150149
mock_response = StructuredResponse(
151150
data={
152-
'evaluation': {
153-
'score': 0.85,
154-
'reasoning': 'The response is highly relevant to the input.'
155-
}
151+
'score': 0.85,
152+
'reasoning': 'The response is highly relevant to the input.'
156153
},
157-
raw_response='{"evaluation": {"score": 0.85, "reasoning": "..."}}',
154+
raw_response='{"score": 0.85, "reasoning": "..."}',
158155
metrics=LDAIMetrics(success=True)
159156
)
160157

@@ -176,15 +173,13 @@ async def test_evaluate_success_with_valid_response(
176173
async def test_evaluate_success_with_evaluation_response_shape(
177174
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
178175
):
179-
"""Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric."""
176+
"""Evaluate should accept shape { score, reasoning } and key by metric."""
180177
mock_response = StructuredResponse(
181178
data={
182-
'evaluation': {
183-
'score': 0.9,
184-
'reasoning': 'The response is accurate and complete.',
185-
}
179+
'score': 0.9,
180+
'reasoning': 'The response is accurate and complete.',
186181
},
187-
raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
182+
raw_response='{"score": 0.9, "reasoning": "..."}',
188183
metrics=LDAIMetrics(success=True),
189184
)
190185
mock_ai_provider.invoke_structured_model.return_value = mock_response
@@ -203,7 +198,7 @@ async def test_evaluate_success_with_evaluation_response_shape(
203198
async def test_evaluate_handles_missing_evaluation_in_response(
204199
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
205200
):
206-
"""Evaluate should handle missing evaluation in response."""
201+
"""Evaluate should handle missing score/reasoning in response."""
207202
mock_response = StructuredResponse(
208203
data={},
209204
raw_response='{}',
@@ -228,12 +223,10 @@ async def test_evaluate_handles_invalid_score(
228223
"""Evaluate should handle invalid score values."""
229224
mock_response = StructuredResponse(
230225
data={
231-
'evaluation': {
232-
'score': 1.5,
233-
'reasoning': 'Some reasoning'
234-
}
226+
'score': 1.5,
227+
'reasoning': 'Some reasoning'
235228
},
236-
raw_response='{"evaluation": {"score": 1.5, "reasoning": "..."}}',
229+
raw_response='{"score": 1.5, "reasoning": "..."}',
237230
metrics=LDAIMetrics(success=True)
238231
)
239232

@@ -254,12 +247,8 @@ async def test_evaluate_handles_missing_reasoning(
254247
):
255248
"""Evaluate should handle missing reasoning."""
256249
mock_response = StructuredResponse(
257-
data={
258-
'evaluation': {
259-
'score': 0.8,
260-
}
261-
},
262-
raw_response='{"evaluation": {"score": 0.8}}',
250+
data={'score': 0.8},
251+
raw_response='{"score": 0.8}',
263252
metrics=LDAIMetrics(success=True)
264253
)
265254

@@ -316,13 +305,8 @@ async def test_evaluate_messages_calls_evaluate(
316305
from ldai.providers.types import ChatResponse
317306

318307
mock_response = StructuredResponse(
319-
data={
320-
'evaluation': {
321-
'score': 0.9,
322-
'reasoning': 'Very relevant'
323-
}
324-
},
325-
raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
308+
data={'score': 0.9, 'reasoning': 'Very relevant'},
309+
raw_response='{"score": 0.9, "reasoning": "..."}',
326310
metrics=LDAIMetrics(success=True)
327311
)
328312

@@ -351,21 +335,17 @@ class TestEvaluationSchemaBuilder:
351335
"""Tests for EvaluationSchemaBuilder."""
352336

353337
def test_build_creates_correct_schema(self):
354-
"""Schema builder should create fixed schema (evaluation with score + reasoning, no key param)."""
338+
"""Schema builder should create fixed schema (top-level score + reasoning, no key param)."""
355339
schema = EvaluationSchemaBuilder.build()
356340

357341
assert schema['title'] == 'EvaluationResponse'
358342
assert schema['type'] == 'object'
359-
assert schema['required'] == ['evaluation']
360-
assert 'evaluation' in schema['properties']
361-
eval_schema = schema['properties']['evaluation']
362-
assert eval_schema['type'] == 'object'
363-
assert eval_schema['required'] == ['score', 'reasoning']
364-
assert 'score' in eval_schema['properties']
365-
assert 'reasoning' in eval_schema['properties']
366-
assert eval_schema['properties']['score']['type'] == 'number'
367-
assert eval_schema['properties']['score']['minimum'] == 0
368-
assert eval_schema['properties']['score']['maximum'] == 1
343+
assert schema['required'] == ['score', 'reasoning']
344+
assert 'score' in schema['properties']
345+
assert 'reasoning' in schema['properties']
346+
assert schema['properties']['score']['type'] == 'number'
347+
assert schema['properties']['score']['minimum'] == 0
348+
assert schema['properties']['score']['maximum'] == 1
369349

370350

371351
class TestJudgeConfigSerialization:

0 commit comments

Comments
 (0)