Skip to content
37 changes: 8 additions & 29 deletions packages/sdk/server-ai/src/ldai/judge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(
self._ai_config = ai_config
self._ai_config_tracker = ai_config_tracker
self._ai_provider = ai_provider
self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key)
self._evaluation_response_structure = EvaluationSchemaBuilder.build()

async def evaluate(
self,
Expand Down Expand Up @@ -77,10 +77,9 @@ async def evaluate(
)

success = response.metrics.success

evals = self._parse_evaluation_response(response.data)

if self._ai_config.evaluation_metric_key not in evals:
if not evals:
log.warn('Judge evaluation did not return the expected evaluation')
success = False

Expand Down Expand Up @@ -175,47 +174,27 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:

def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
    """
    Parses the structured evaluation response from the AI provider.

    Expects ``{"evaluation": {"score": n, "reasoning": "..."}}``. The response
    itself carries no metric key; the parsed score is keyed by the judge
    config's ``evaluation_metric_key``.

    :param data: The structured response data
    :return: Dictionary holding one EvalScore keyed by the metric key, or an
        empty dictionary when the metric key or the response is invalid
    """
    results: Dict[str, EvalScore] = {}

    # The key must be resolved here: it is used below to key the result and
    # is no longer part of the response payload.
    metric_key = self._ai_config.evaluation_metric_key
    if not metric_key:
        log.warn('Evaluation metric key is missing')
        return results

    evaluation = data.get('evaluation') if isinstance(data, dict) else None
    if not isinstance(evaluation, dict):
        log.warn('Invalid response: missing or invalid evaluation')
        return results

    score = evaluation.get('score')
    reasoning = evaluation.get('reasoning')

    # bool is a subclass of int, so exclude it explicitly; the chained
    # comparison is False for NaN, which "< 0 or > 1" would let through.
    is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
    if not is_number or not 0 <= score <= 1:
        log.warn(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive')
        return results

    if not isinstance(reasoning, str):
        log.warn('Invalid reasoning: must be a string')
        return results

    results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)

    return results
Original file line number Diff line number Diff line change
@@ -1,79 +1,53 @@
"""Internal class for building dynamic evaluation response schemas."""
"""Internal class for building evaluation response schemas."""

from typing import Any, Dict, Optional
from typing import Any, Dict


class EvaluationSchemaBuilder:
    """
    Internal class for building evaluation response schemas.

    Not exported - only used internally by Judge.
    Schema is a fixed shape: one "evaluation" object with score and reasoning.
    The judge config's evaluation_metric_key is only used when keying the result,
    not in the schema.
    """

    @staticmethod
    def build() -> Dict[str, Any]:
        """
        Build the evaluation response schema.

        Takes no parameters; the schema is always the same. The judge keys the
        parsed result by its config's evaluation_metric_key.

        In practice the model returns JSON like:
        {"evaluation": {"score": 0.85, "reasoning": "The response is accurate."}}

        :return: Schema dictionary for structured output (a new dictionary on
            every call)
        """
        return {
            'title': 'EvaluationResponse',
            'description': 'Response containing an evaluation (score and reasoning).',
            'type': 'object',
            'properties': {
                'evaluation': {
                    'type': 'object',
                    'description': 'The evaluation result.',
                    'properties': {
                        'score': {
                            'type': 'number',
                            'minimum': 0,
                            'maximum': 1,
                            'description': 'Score between 0.0 and 1.0.',
                        },
                        'reasoning': {
                            'type': 'string',
                            'description': 'Reasoning behind the score.',
                        },
                    },
                    'required': ['score', 'reasoning'],
                    'additionalProperties': False,
                },
            },
            'required': ['evaluation'],
            'additionalProperties': False,
        }
123 changes: 60 additions & 63 deletions packages/sdk/server-ai/tests/test_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,16 +109,10 @@ def test_judge_initializes_with_evaluation_metric_key(
assert judge._ai_config == judge_config_with_key
assert judge._evaluation_response_structure is not None
assert judge._evaluation_response_structure['title'] == 'EvaluationResponse'
assert '$ld:ai:judge:relevance' in judge._evaluation_response_structure['properties']['evaluations']['required']

def test_judge_initializes_without_evaluation_metric_key(
self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
):
"""Judge should initialize but have None for evaluation_response_structure."""
judge = Judge(judge_config_without_key, tracker, mock_ai_provider)

assert judge._ai_config == judge_config_without_key
assert judge._evaluation_response_structure is None
assert judge._evaluation_response_structure['required'] == ['evaluation']
eval_schema = judge._evaluation_response_structure['properties']['evaluation']
assert eval_schema['required'] == ['score', 'reasoning']
assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties']
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated


class TestJudgeEvaluate:
Expand Down Expand Up @@ -155,14 +149,12 @@ async def test_evaluate_success_with_valid_response(
"""Evaluate should return JudgeResponse with valid evaluation."""
mock_response = StructuredResponse(
data={
'evaluations': {
'$ld:ai:judge:relevance': {
'score': 0.85,
'reasoning': 'The response is highly relevant to the input.'
}
'evaluation': {
'score': 0.85,
'reasoning': 'The response is highly relevant to the input.'
}
},
raw_response='{"evaluations": {...}}',
raw_response='{"evaluation": {"score": 0.85, "reasoning": "..."}}',
metrics=LDAIMetrics(success=True)
)

Expand All @@ -181,20 +173,40 @@ async def test_evaluate_success_with_valid_response(
assert 'relevant' in result.evals['$ld:ai:judge:relevance'].reasoning.lower()

@pytest.mark.asyncio
Comment thread
cursor[bot] marked this conversation as resolved.
async def test_evaluate_handles_missing_evaluation_in_response(
async def test_evaluate_success_with_evaluation_response_shape(
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
):
"""Evaluate should handle missing evaluation in response."""
"""Evaluate should accept shape { evaluation: { score, reasoning } } and key by metric."""
mock_response = StructuredResponse(
data={
'evaluations': {
'wrong-key': {
'score': 0.5,
'reasoning': 'Some reasoning'
}
'evaluation': {
'score': 0.9,
'reasoning': 'The response is accurate and complete.',
}
},
raw_response='{"evaluations": {...}}',
raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
metrics=LDAIMetrics(success=True),
)
mock_ai_provider.invoke_structured_model.return_value = mock_response
tracker.track_metrics_of = AsyncMock(return_value=mock_response)

judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
result = await judge.evaluate("What is feature flagging?", "Feature flagging is...")

assert result is not None
assert result.success is True
assert '$ld:ai:judge:relevance' in result.evals
assert result.evals['$ld:ai:judge:relevance'].score == 0.9
assert 'accurate' in result.evals['$ld:ai:judge:relevance'].reasoning.lower()

@pytest.mark.asyncio
async def test_evaluate_handles_missing_evaluation_in_response(
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
):
"""Evaluate should handle missing evaluation in response."""
mock_response = StructuredResponse(
data={},
raw_response='{}',
metrics=LDAIMetrics(success=True)
)

Expand All @@ -216,14 +228,12 @@ async def test_evaluate_handles_invalid_score(
"""Evaluate should handle invalid score values."""
mock_response = StructuredResponse(
data={
'evaluations': {
'$ld:ai:judge:relevance': {
'score': 1.5,
'reasoning': 'Some reasoning'
}
'evaluation': {
'score': 1.5,
'reasoning': 'Some reasoning'
}
},
raw_response='{"evaluations": {...}}',
raw_response='{"evaluation": {"score": 1.5, "reasoning": "..."}}',
metrics=LDAIMetrics(success=True)
)

Expand All @@ -245,13 +255,11 @@ async def test_evaluate_handles_missing_reasoning(
"""Evaluate should handle missing reasoning."""
mock_response = StructuredResponse(
data={
'evaluations': {
'$ld:ai:judge:relevance': {
'score': 0.8,
}
'evaluation': {
'score': 0.8,
}
},
raw_response='{"evaluations": {...}}',
raw_response='{"evaluation": {"score": 0.8}}',
metrics=LDAIMetrics(success=True)
)

Expand Down Expand Up @@ -309,14 +317,12 @@ async def test_evaluate_messages_calls_evaluate(

mock_response = StructuredResponse(
data={
'evaluations': {
'$ld:ai:judge:relevance': {
'score': 0.9,
'reasoning': 'Very relevant'
}
'evaluation': {
'score': 0.9,
'reasoning': 'Very relevant'
}
},
raw_response='{"evaluations": {...}}',
raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
metrics=LDAIMetrics(success=True)
)

Expand Down Expand Up @@ -345,30 +351,21 @@ class TestEvaluationSchemaBuilder:
"""Tests for EvaluationSchemaBuilder."""

def test_build_creates_correct_schema(self):
    """Schema builder should create fixed schema (evaluation with score + reasoning, no key param)."""
    schema = EvaluationSchemaBuilder.build()

    # Top level: a single required "evaluation" object and nothing else.
    assert schema['title'] == 'EvaluationResponse'
    assert schema['type'] == 'object'
    assert schema['required'] == ['evaluation']
    assert schema['additionalProperties'] is False
    assert 'evaluation' in schema['properties']

    # Nested evaluation: score and reasoning are both mandatory.
    eval_schema = schema['properties']['evaluation']
    assert eval_schema['type'] == 'object'
    assert eval_schema['required'] == ['score', 'reasoning']
    assert eval_schema['additionalProperties'] is False
    assert 'score' in eval_schema['properties']
    assert 'reasoning' in eval_schema['properties']

    # Score is a number bounded to [0, 1]; reasoning is free text.
    assert eval_schema['properties']['score']['type'] == 'number'
    assert eval_schema['properties']['score']['minimum'] == 0
    assert eval_schema['properties']['score']['maximum'] == 1
    assert eval_schema['properties']['reasoning']['type'] == 'string'


class TestJudgeConfigSerialization:
Expand Down
Loading