Skip to content

Commit f8c6eba

Browse files
committed
fix: Remove the evaluation metric key from the response schema, which failed on some LLMs
1 parent d9cac0a commit f8c6eba

3 files changed

Lines changed: 108 additions & 111 deletions

File tree

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 8 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def __init__(
3737
self._ai_config = ai_config
3838
self._ai_config_tracker = ai_config_tracker
3939
self._ai_provider = ai_provider
40-
self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key)
40+
self._evaluation_response_structure = EvaluationSchemaBuilder.build()
4141

4242
async def evaluate(
4343
self,
@@ -77,10 +77,9 @@ async def evaluate(
7777
)
7878

7979
success = response.metrics.success
80-
8180
evals = self._parse_evaluation_response(response.data)
8281

83-
if self._ai_config.evaluation_metric_key not in evals:
82+
if not evals:
8483
log.warn('Judge evaluation did not return the expected evaluation')
8584
success = False
8685

@@ -175,47 +174,27 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
175174

176175
def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
177176
"""
178-
Parses the structured evaluation response from the AI provider.
179-
180-
:param data: The structured response data
181-
:return: Dictionary of evaluation scores keyed by metric key
177+
Parses the structured evaluation response. Expects {"evaluation": {"score": n, "reasoning": "..."}}.
182178
"""
183179
results: Dict[str, EvalScore] = {}
184-
185-
if not data.get('evaluations') or not isinstance(data['evaluations'], dict):
186-
log.warn('Invalid response: missing or invalid evaluations object')
187-
return results
188-
189-
evaluations = data['evaluations']
190-
191180
metric_key = self._ai_config.evaluation_metric_key
192181
if not metric_key:
193182
log.warn('Evaluation metric key is missing')
194183
return results
195184

196-
evaluation = evaluations.get(metric_key)
197-
198-
if not evaluation or not isinstance(evaluation, dict):
199-
log.warn(f'Missing evaluation for metric key: {metric_key}')
185+
evaluation = data.get('evaluation') if isinstance(data, dict) else None
186+
if not isinstance(evaluation, dict):
187+
log.warn('Invalid response: missing or invalid evaluation')
200188
return results
201189

202190
score = evaluation.get('score')
203191
reasoning = evaluation.get('reasoning')
204-
205192
if not isinstance(score, (int, float)) or score < 0 or score > 1:
206-
log.warn(
207-
f'Invalid score evaluated for {metric_key}: {score}. '
208-
'Score must be a number between 0 and 1 inclusive'
209-
)
193+
log.warn(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive')
210194
return results
211-
212195
if not isinstance(reasoning, str):
213-
log.warn(
214-
f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
215-
'Reasoning must be a string'
216-
)
196+
log.warn('Invalid reasoning: must be a string')
217197
return results
218198

219199
results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
220-
221200
return results
Lines changed: 31 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,51 @@
1-
"""Internal class for building dynamic evaluation response schemas."""
1+
"""Internal class for building evaluation response schemas."""
22

3-
from typing import Any, Dict, Optional
3+
from typing import Any, Dict
44

55

66
class EvaluationSchemaBuilder:
77
"""
8-
Internal class for building dynamic evaluation response schemas.
8+
Internal class for building evaluation response schemas.
99
Not exported - only used internally by Judge.
10+
Schema is a fixed shape: one "evaluation" object with score and reasoning.
11+
The judge config's evaluation_metric_key is only used when keying the result,
12+
not in the schema.
1013
"""
1114

1215
@staticmethod
13-
def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]:
16+
def build() -> Dict[str, Any]:
1417
"""
15-
Build an evaluation response schema from evaluation metric key.
18+
Build the evaluation response schema. No parameters; the schema is
19+
always the same. The judge keys the parsed result by its config's
20+
evaluation_metric_key.
1621
17-
:param evaluation_metric_key: Evaluation metric key, or None if not available
18-
:return: Schema dictionary for structured output, or None if evaluation_metric_key is None
19-
"""
20-
if not evaluation_metric_key:
21-
return None
22+
In practice the model returns JSON like:
23+
{"evaluation": {"score": 0.85, "reasoning": "The response is accurate."}}
2224
25+
:return: Schema dictionary for structured output
26+
"""
2327
return {
2428
'title': 'EvaluationResponse',
25-
'description': f"Response containing evaluation results for {evaluation_metric_key} metric",
29+
'description': 'Response containing an evaluation (score and reasoning).',
2630
'type': 'object',
2731
'properties': {
28-
'evaluations': {
32+
'evaluation': {
2933
'type': 'object',
30-
'description': (
31-
f"Object containing evaluation results for "
32-
f"{evaluation_metric_key} metric"
33-
),
34-
'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_key),
35-
'required': [evaluation_metric_key],
36-
'additionalProperties': False,
37-
},
38-
},
39-
'required': ['evaluations'],
40-
'additionalProperties': False,
41-
}
42-
43-
@staticmethod
44-
def _build_key_properties(evaluation_metric_key: str) -> Dict[str, Any]:
45-
"""
46-
Build properties for a single evaluation metric key.
47-
48-
:param evaluation_metric_key: Evaluation metric key
49-
:return: Dictionary of properties for the key
50-
"""
51-
return {
52-
evaluation_metric_key: EvaluationSchemaBuilder._build_key_schema(evaluation_metric_key)
53-
}
54-
55-
@staticmethod
56-
def _build_key_schema(key: str) -> Dict[str, Any]:
57-
"""
58-
Build schema for a single evaluation metric key.
59-
60-
:param key: Evaluation metric key
61-
:return: Schema dictionary for the key
62-
"""
63-
return {
64-
'type': 'object',
65-
'properties': {
66-
'score': {
67-
'type': 'number',
68-
'minimum': 0,
69-
'maximum': 1,
70-
'description': f'Score between 0.0 and 1.0 for {key}',
71-
},
72-
'reasoning': {
73-
'type': 'string',
74-
'description': f'Reasoning behind the score for {key}',
34+
'description': 'The evaluation result.',
35+
'properties': {
36+
'score': {
37+
'type': 'number',
38+
'minimum': 0,
39+
'maximum': 1,
40+
'description': 'Score between 0.0 and 1.0.',
41+
},
42+
'reasoning': {
43+
'type': 'string',
44+
'description': 'Reasoning behind the score.',
45+
},
46+
},
47+
'required': ['score', 'reasoning'],
7548
},
7649
},
77-
'required': ['score', 'reasoning'],
78-
'additionalProperties': False,
50+
'required': ['evaluation'],
7951
}

packages/sdk/server-ai/tests/test_judge.py

Lines changed: 69 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,10 @@ def test_judge_initializes_with_evaluation_metric_key(
109109
assert judge._ai_config == judge_config_with_key
110110
assert judge._evaluation_response_structure is not None
111111
assert judge._evaluation_response_structure['title'] == 'EvaluationResponse'
112-
assert '$ld:ai:judge:relevance' in judge._evaluation_response_structure['properties']['evaluations']['required']
112+
assert judge._evaluation_response_structure['required'] == ['evaluation']
113+
eval_schema = judge._evaluation_response_structure['properties']['evaluation']
114+
assert eval_schema['required'] == ['score', 'reasoning']
115+
assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties']
113116

114117
def test_judge_initializes_without_evaluation_metric_key(
115118
self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
@@ -180,6 +183,58 @@ async def test_evaluate_success_with_valid_response(
180183
assert result.evals['$ld:ai:judge:relevance'].score == 0.85
181184
assert 'relevant' in result.evals['$ld:ai:judge:relevance'].reasoning.lower()
182185

186+
@pytest.mark.asyncio
187+
async def test_evaluate_success_with_evaluation_response_shape(
188+
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
189+
):
190+
"""Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric."""
191+
mock_response = StructuredResponse(
192+
data={
193+
'evaluation': {
194+
'score': 0.9,
195+
'reasoning': 'The response is accurate and complete.',
196+
}
197+
},
198+
raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
199+
metrics=LDAIMetrics(success=True),
200+
)
201+
mock_ai_provider.invoke_structured_model.return_value = mock_response
202+
tracker.track_metrics_of = AsyncMock(return_value=mock_response)
203+
204+
judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
205+
result = await judge.evaluate("What is feature flagging?", "Feature flagging is...")
206+
207+
assert result is not None
208+
assert result.success is True
209+
assert '$ld:ai:judge:relevance' in result.evals
210+
assert result.evals['$ld:ai:judge:relevance'].score == 0.9
211+
assert 'accurate' in result.evals['$ld:ai:judge:relevance'].reasoning.lower()
212+
213+
@pytest.mark.asyncio
214+
async def test_evaluate_success_with_evaluations_backward_compat(
215+
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
216+
):
217+
"""Evaluate should accept legacy shape { evaluations: { score, reasoning } }."""
218+
mock_response = StructuredResponse(
219+
data={
220+
'evaluations': {
221+
'score': 0.7,
222+
'reasoning': 'Partially correct.',
223+
}
224+
},
225+
raw_response='{"evaluations": {"score": 0.7, "reasoning": "..."}}',
226+
metrics=LDAIMetrics(success=True),
227+
)
228+
mock_ai_provider.invoke_structured_model.return_value = mock_response
229+
tracker.track_metrics_of = AsyncMock(return_value=mock_response)
230+
231+
judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
232+
result = await judge.evaluate("input", "output")
233+
234+
assert result is not None
235+
assert result.success is True
236+
assert result.evals['$ld:ai:judge:relevance'].score == 0.7
237+
183238
@pytest.mark.asyncio
184239
async def test_evaluate_handles_missing_evaluation_in_response(
185240
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
@@ -345,30 +400,21 @@ class TestEvaluationSchemaBuilder:
345400
"""Tests for EvaluationSchemaBuilder."""
346401

347402
def test_build_creates_correct_schema(self):
348-
"""Schema builder should create correct schema structure."""
349-
schema = EvaluationSchemaBuilder.build('$ld:ai:judge:relevance')
350-
403+
"""Schema builder should create fixed schema (evaluation with score + reasoning, no key param)."""
404+
schema = EvaluationSchemaBuilder.build()
405+
351406
assert schema['title'] == 'EvaluationResponse'
352407
assert schema['type'] == 'object'
353-
assert 'evaluations' in schema['properties']
354-
assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['required']
355-
assert '$ld:ai:judge:relevance' in schema['properties']['evaluations']['properties']
356-
357-
metric_schema = schema['properties']['evaluations']['properties']['$ld:ai:judge:relevance']
358-
assert metric_schema['type'] == 'object'
359-
assert 'score' in metric_schema['properties']
360-
assert 'reasoning' in metric_schema['properties']
361-
assert metric_schema['properties']['score']['type'] == 'number'
362-
assert metric_schema['properties']['score']['minimum'] == 0
363-
assert metric_schema['properties']['score']['maximum'] == 1
364-
365-
def test_build_key_properties_creates_single_key(self):
366-
"""_build_key_properties should create properties for a single key."""
367-
properties = EvaluationSchemaBuilder._build_key_properties('$ld:ai:judge:relevance')
368-
369-
assert '$ld:ai:judge:relevance' in properties
370-
assert len(properties) == 1
371-
assert properties['$ld:ai:judge:relevance']['type'] == 'object'
408+
assert schema['required'] == ['evaluation']
409+
assert 'evaluation' in schema['properties']
410+
eval_schema = schema['properties']['evaluation']
411+
assert eval_schema['type'] == 'object'
412+
assert eval_schema['required'] == ['score', 'reasoning']
413+
assert 'score' in eval_schema['properties']
414+
assert 'reasoning' in eval_schema['properties']
415+
assert eval_schema['properties']['score']['type'] == 'number'
416+
assert eval_schema['properties']['score']['minimum'] == 0
417+
assert eval_schema['properties']['score']['maximum'] == 1
372418

373419

374420
class TestJudgeConfigSerialization:

0 commit comments

Comments
 (0)