Skip to content

Commit af4e463

Browse files
authored
feat!: Flatten JudgeResponse and EvalScore into new JudgeResult (#132)
1 parent ccae38a commit af4e463

8 files changed

Lines changed: 117 additions & 178 deletions

File tree

packages/sdk/server-ai/src/ldai/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
AgentRunner,
3737
ToolRegistry,
3838
)
39-
from ldai.providers.types import EvalScore, JudgeResponse
39+
from ldai.providers.types import JudgeResult
4040
from ldai.tracker import AIGraphTracker
4141

4242
__all__ = [
@@ -60,11 +60,10 @@
6060
'ManagedAgent',
6161
'ManagedModel',
6262
'ManagedAgentGraph',
63-
'EvalScore',
6463
'AgentGraphDefinition',
6564
'Judge',
6665
'JudgeConfiguration',
67-
'JudgeResponse',
66+
'JudgeResult',
6867
'LDMessage',
6968
'ModelConfig',
7069
'ProviderConfig',

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 33 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
"""Judge implementation for AI evaluation."""
22

33
import random
4-
from typing import Any, Dict, Optional
4+
from typing import Any, Dict, Optional, Tuple
55

66
import chevron
77

88
from ldai import log
99
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
1010
from ldai.models import AIJudgeConfig, LDMessage
1111
from ldai.providers.model_runner import ModelRunner
12-
from ldai.providers.types import EvalScore, JudgeResponse, ModelResponse
12+
from ldai.providers.types import JudgeResult, ModelResponse
1313
from ldai.tracker import LDAIConfigTracker
1414

1515

@@ -44,29 +44,34 @@ async def evaluate(
4444
input_text: str,
4545
output_text: str,
4646
sampling_rate: float = 1.0,
47-
) -> Optional[JudgeResponse]:
47+
) -> JudgeResult:
4848
"""
4949
Evaluates an AI response using the judge's configuration.
5050
5151
:param input_text: The input prompt or question that was provided to the AI
5252
:param output_text: The AI-generated response to be evaluated
5353
:param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1)
54-
:return: Evaluation results or None if not sampled
54+
:return: Evaluation result; ``sampled=True`` when skipped due to sampling rate
5555
"""
56+
judge_result = JudgeResult(judge_config_key=self._ai_config.key)
57+
5658
try:
5759
if not self._ai_config.evaluation_metric_key:
5860
log.warning(
5961
'Judge configuration is missing required evaluationMetricKey'
6062
)
61-
return None
63+
judge_result.error_message = 'Judge configuration is missing required evaluationMetricKey'
64+
return judge_result
6265

6366
if not self._ai_config.messages:
6467
log.warning('Judge configuration must include messages')
65-
return None
68+
judge_result.error_message = 'Judge configuration must include messages'
69+
return judge_result
6670

6771
if random.random() > sampling_rate:
6872
log.debug(f'Judge evaluation skipped due to sampling rate: {sampling_rate}')
69-
return None
73+
judge_result.sampled = True
74+
return judge_result
7075

7176
messages = self._construct_evaluation_messages(input_text, output_text)
7277
assert self._evaluation_response_structure is not None
@@ -76,39 +81,36 @@ async def evaluate(
7681
lambda result: result.metrics,
7782
)
7883

79-
success = response.metrics.success
80-
evals = self._parse_evaluation_response(response.data)
84+
parsed = self._parse_evaluation_response(response.data)
8185

82-
if not evals:
86+
if parsed is None:
8387
log.warning('Judge evaluation did not return the expected evaluation')
84-
success = False
85-
86-
return JudgeResponse(
87-
judge_config_key=self._ai_config.key,
88-
evals=evals,
89-
success=success,
90-
)
88+
return judge_result
89+
90+
score, reasoning = parsed
91+
judge_result.metric_key = self._ai_config.evaluation_metric_key
92+
judge_result.score = score
93+
judge_result.reasoning = reasoning
94+
judge_result.success = response.metrics.success
95+
return judge_result
9196
except Exception as error:
9297
log.error(f'Judge evaluation failed: {error}')
93-
return JudgeResponse(
94-
evals={},
95-
success=False,
96-
error=str(error) if isinstance(error, Exception) else 'Unknown error',
97-
)
98+
judge_result.error_message = str(error) if isinstance(error, Exception) else 'Unknown error'
99+
return judge_result
98100

99101
async def evaluate_messages(
100102
self,
101103
messages: list[LDMessage],
102104
response: ModelResponse,
103105
sampling_ratio: float = 1.0,
104-
) -> Optional[JudgeResponse]:
106+
) -> JudgeResult:
105107
"""
106108
Evaluates an AI response from chat messages and response.
107109
108110
:param messages: Array of messages representing the conversation history
109111
:param response: The AI response to be evaluated
110112
:param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
111-
:return: Evaluation results or None if not sampled
113+
:return: Evaluation result; ``sampled=True`` when skipped due to sampling rate
112114
"""
113115
input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
114116
output_text = response.message.content
@@ -172,28 +174,23 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
172174
# Use chevron (Mustache) for templating, with no escaping
173175
return chevron.render(content, variables)
174176

175-
def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
177+
def _parse_evaluation_response(self, data: Dict[str, Any]) -> Optional[Tuple[float, str]]:
176178
"""
177179
Parses the structured evaluation response. Expects {"score": n, "reasoning": "..."}.
178-
"""
179-
results: Dict[str, EvalScore] = {}
180-
metric_key = self._ai_config.evaluation_metric_key
181-
if not metric_key:
182-
log.warning('Evaluation metric key is missing')
183-
return results
184180
181+
:return: ``(score, reasoning)`` on success, or ``None`` if the response is invalid.
182+
"""
185183
if not isinstance(data, dict):
186184
log.warning('Invalid response: missing or invalid evaluation')
187-
return results
185+
return None
188186

189187
score = data.get('score')
190188
reasoning = data.get('reasoning')
191189
if not isinstance(score, (int, float)) or score < 0 or score > 1:
192190
log.warning(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive')
193-
return results
191+
return None
194192
if not isinstance(reasoning, str):
195193
log.warning('Invalid reasoning: must be a string')
196-
return results
194+
return None
197195

198-
results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
199-
return results
196+
return (float(score), reasoning)

packages/sdk/server-ai/src/ldai/managed_model.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from ldai.judge import Judge
66
from ldai.models import AICompletionConfig, LDMessage
77
from ldai.providers.model_runner import ModelRunner
8-
from ldai.providers.types import JudgeResponse, ModelResponse
8+
from ldai.providers.types import JudgeResult, ModelResponse
99
from ldai.tracker import LDAIConfigTracker
1010

1111

@@ -66,19 +66,19 @@ def _start_judge_evaluations(
6666
self,
6767
messages: List[LDMessage],
6868
response: ModelResponse,
69-
) -> List[asyncio.Task[Optional[JudgeResponse]]]:
69+
) -> List[asyncio.Task[Optional[JudgeResult]]]:
7070
if not self._ai_config.judge_configuration or not self._ai_config.judge_configuration.judges:
7171
return []
7272

73-
async def evaluate_judge(judge_config: Any) -> Optional[JudgeResponse]:
73+
async def evaluate_judge(judge_config: Any) -> Optional[JudgeResult]:
7474
judge = self._judges.get(judge_config.key)
7575
if not judge:
7676
log.warning(f'Judge configuration is not enabled: {judge_config.key}')
7777
return None
78-
eval_result = await judge.evaluate_messages(messages, response, judge_config.sampling_rate)
79-
if eval_result and eval_result.success:
80-
self._tracker.track_judge_response(eval_result)
81-
return eval_result
78+
judge_result = await judge.evaluate_messages(messages, response, judge_config.sampling_rate)
79+
if judge_result.success:
80+
self._tracker.track_judge_result(judge_result)
81+
return judge_result
8282

8383
return [
8484
asyncio.create_task(evaluate_judge(jc))

packages/sdk/server-ai/src/ldai/models.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
from dataclasses import dataclass, field
33
from typing import Any, Dict, List, Literal, Optional, Union
44

5-
from ldai.tracker import LDAIConfigTracker
6-
75

86
@dataclass
97
class LDMessage:
@@ -182,7 +180,7 @@ class AIConfig:
182180
enabled: bool
183181
model: Optional[ModelConfig] = None
184182
provider: Optional[ProviderConfig] = None
185-
tracker: Optional[LDAIConfigTracker] = None
183+
tracker: Optional[Any] = None
186184

187185
def _base_to_dict(self) -> Dict[str, Any]:
188186
"""

packages/sdk/server-ai/src/ldai/providers/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
from ldai.providers.types import (
77
AgentGraphResult,
88
AgentResult,
9-
EvalScore,
10-
JudgeResponse,
9+
JudgeResult,
1110
LDAIMetrics,
1211
ModelResponse,
1312
StructuredResponse,
@@ -20,8 +19,7 @@
2019
'AgentGraphRunner',
2120
'AgentResult',
2221
'AgentRunner',
23-
'EvalScore',
24-
'JudgeResponse',
22+
'JudgeResult',
2523
'LDAIMetrics',
2624
'ModelResponse',
2725
'ModelRunner',

packages/sdk/server-ai/src/ldai/providers/types.py

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class ModelResponse:
4444
"""
4545
message: LDMessage
4646
metrics: LDAIMetrics
47-
evaluations: Optional[List[JudgeResponse]] = None
47+
evaluations: Optional[List[JudgeResult]] = None
4848

4949

5050
@dataclass
@@ -58,45 +58,36 @@ class StructuredResponse:
5858

5959

6060
@dataclass
61-
class EvalScore:
61+
class JudgeResult:
6262
"""
63-
Score and reasoning for a single evaluation metric.
63+
Result from a judge evaluation.
6464
"""
65-
score: float # Score between 0.0 and 1.0
66-
reasoning: str # Reasoning behind the provided score
65+
judge_config_key: Optional[str] = None
66+
success: bool = False
67+
error_message: Optional[str] = None
68+
sampled: bool = False # True when the judge was skipped due to sampling rate
69+
score: Optional[float] = None
70+
reasoning: Optional[str] = None
71+
metric_key: Optional[str] = None
6772

6873
def to_dict(self) -> Dict[str, Any]:
6974
"""
70-
Render the evaluation score as a dictionary object.
71-
"""
72-
return {
73-
'score': self.score,
74-
'reasoning': self.reasoning,
75-
}
76-
77-
78-
@dataclass
79-
class JudgeResponse:
80-
"""
81-
Response from a judge evaluation containing scores and reasoning for multiple metrics.
82-
"""
83-
evals: Dict[str, EvalScore] # Dictionary where keys are metric names and values contain score and reasoning
84-
success: bool # Whether the evaluation completed successfully
85-
judge_config_key: Optional[str] = None # The key of the judge configuration that was used to generate this response
86-
error: Optional[str] = None # Error message if evaluation failed
87-
88-
def to_dict(self) -> Dict[str, Any]:
89-
"""
90-
Render the judge response as a dictionary object.
75+
Render the judge result as a dictionary object.
9176
"""
9277
result: Dict[str, Any] = {
93-
'evals': {key: eval_score.to_dict() for key, eval_score in self.evals.items()},
9478
'success': self.success,
79+
'sampled': self.sampled,
9580
}
81+
if self.score is not None:
82+
result['score'] = self.score
83+
if self.reasoning is not None:
84+
result['reasoning'] = self.reasoning
85+
if self.metric_key is not None:
86+
result['metricKey'] = self.metric_key
9687
if self.judge_config_key is not None:
9788
result['judgeConfigKey'] = self.judge_config_key
98-
if self.error is not None:
99-
result['error'] = self.error
89+
if self.error_message is not None:
90+
result['errorMessage'] = self.error_message
10091
return result
10192

10293

0 commit comments

Comments
 (0)