-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy path__init__.py
More file actions
201 lines (166 loc) · 7.25 KB
/
__init__.py
File metadata and controls
201 lines (166 loc) · 7.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""Judge implementation for AI evaluation."""
import random
import re
from typing import Any, Dict, Optional
from ldai import log
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
from ldai.models import AIJudgeConfig, LDMessage
from ldai.providers.model_runner import ModelRunner
from ldai.providers.types import EvalScore, JudgeResponse, ModelResponse
from ldai.tracker import LDAIConfigTracker
class Judge:
    """
    Judge implementation that handles evaluation functionality and conversation management.

    According to the AIEval spec, judges are AI Configs with mode: "judge" that evaluate
    other AI Configs using structured output.
    """

    def __init__(
        self,
        ai_config: AIJudgeConfig,
        ai_config_tracker: LDAIConfigTracker,
        model_runner: ModelRunner,
    ):
        """
        Initialize the Judge.

        :param ai_config: The judge AI configuration
        :param ai_config_tracker: The tracker for the judge configuration
        :param model_runner: The model runner to use for evaluation
        """
        self._ai_config = ai_config
        self._ai_config_tracker = ai_config_tracker
        self._model_runner = model_runner
        # Built once here and handed to the model runner on every evaluation.
        self._evaluation_response_structure = EvaluationSchemaBuilder.build()

    async def evaluate(
        self,
        input_text: str,
        output_text: str,
        sampling_rate: float = 1.0,
    ) -> Optional[JudgeResponse]:
        """
        Evaluates an AI response using the judge's configuration.

        :param input_text: The input prompt or question that was provided to the AI
        :param output_text: The AI-generated response to be evaluated
        :param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1)
        :return: Evaluation results; None if the judge configuration is incomplete
            (no evaluation metric key or no messages) or the call was not sampled.
            Any exception raised while evaluating is logged and reported as a
            JudgeResponse with success=False instead of propagating.
        """
        try:
            if not self._ai_config.evaluation_metric_key:
                log.warning(
                    'Judge configuration is missing required evaluationMetricKey'
                )
                return None

            if not self._ai_config.messages:
                log.warning('Judge configuration must include messages')
                return None

            # random.random() yields values in [0.0, 1.0): comparing with `>=` makes a
            # rate of 0 skip every call, while a rate of 1 still evaluates every call.
            if random.random() >= sampling_rate:
                log.debug(f'Judge evaluation skipped due to sampling rate: {sampling_rate}')
                return None

            messages = self._construct_evaluation_messages(input_text, output_text)
            assert self._evaluation_response_structure is not None

            # The tracker wraps the model call; the second callable tells it where to
            # find the metrics on the result.
            response = await self._ai_config_tracker.track_metrics_of_async(
                lambda: self._model_runner.invoke_structured_model(messages, self._evaluation_response_structure),
                lambda result: result.metrics,
            )

            success = response.metrics.success
            evals = self._parse_evaluation_response(response.data)

            # A model call that "succeeded" but produced no usable score is still a
            # failed evaluation.
            if not evals:
                log.warning('Judge evaluation did not return the expected evaluation')
                success = False

            return JudgeResponse(
                judge_config_key=self._ai_config.key,
                evals=evals,
                success=success,
            )
        except Exception as error:
            log.error(f'Judge evaluation failed: {error}')
            return JudgeResponse(
                # getattr keeps this handler from raising if the config itself is
                # what triggered the failure.
                judge_config_key=getattr(self._ai_config, 'key', None),
                evals={},
                success=False,
                error=str(error),
            )

    async def evaluate_messages(
        self,
        messages: list[LDMessage],
        response: ModelResponse,
        sampling_ratio: float = 1.0,
    ) -> Optional[JudgeResponse]:
        """
        Evaluates an AI response from chat messages and response.

        The message contents are joined with CRLF to form the input text, and the
        content of the response message is used as the output text; both are then
        passed to :meth:`evaluate`.

        :param messages: Array of messages representing the conversation history
        :param response: The AI response to be evaluated
        :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
        :return: Evaluation results or None if not sampled
        """
        input_text = '\r\n'.join(msg.content for msg in messages) if messages else ''
        output_text = response.message.content

        return await self.evaluate(input_text, output_text, sampling_ratio)

    def get_ai_config(self) -> AIJudgeConfig:
        """
        Returns the AI Config used by this judge.

        :return: The judge AI configuration
        """
        return self._ai_config

    def get_tracker(self) -> LDAIConfigTracker:
        """
        Returns the tracker associated with this judge.

        :return: The tracker for the judge configuration
        """
        return self._ai_config_tracker

    def get_model_runner(self) -> ModelRunner:
        """
        Returns the model runner used by this judge.

        :return: The model runner
        """
        return self._model_runner

    def _construct_evaluation_messages(self, input_text: str, output_text: str) -> list[LDMessage]:
        """
        Constructs evaluation messages by combining judge's config messages with input/output.

        :param input_text: The input text
        :param output_text: The output text to evaluate
        :return: List of messages for evaluation (empty when the config has no messages)
        """
        if not self._ai_config.messages:
            return []

        # Reserved variables that judge message templates may reference.
        variables = {
            'message_history': input_text,
            'response_to_evaluate': output_text,
        }
        return [
            LDMessage(role=msg.role, content=self._interpolate_message(msg.content, variables))
            for msg in self._ai_config.messages
        ]

    def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
        """
        Use string replacement to prevent context attributes like {{=[ ]=}}
        from influencing judge template parsing.

        Only ``{{name}}`` placeholders made of word characters are considered;
        a placeholder whose name is not in ``variables`` is left unchanged.

        :param content: The message content template
        :param variables: Variables to interpolate
        :return: Interpolated message content
        """
        return re.sub(
            r'\{\{(\w+)\}\}',
            lambda match: variables.get(match.group(1), match.group(0)),
            content,
        )

    def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
        """
        Parses the structured evaluation response. Expects {"score": n, "reasoning": "..."}.

        :param data: The structured data returned by the model
        :return: A single-entry mapping from the configured evaluation metric key to
            its EvalScore, or an empty dict when the metric key is missing or the
            response is malformed (each case is logged as a warning)
        """
        results: Dict[str, EvalScore] = {}

        metric_key = self._ai_config.evaluation_metric_key
        if not metric_key:
            log.warning('Evaluation metric key is missing')
            return results

        if not isinstance(data, dict):
            log.warning('Invalid response: missing or invalid evaluation')
            return results

        score = data.get('score')
        reasoning = data.get('reasoning')

        # bool is a subclass of int, so it has to be excluded explicitly. The chained
        # comparison is False for NaN, so NaN is rejected as out of range too.
        is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
        if not is_number or not 0 <= score <= 1:
            log.warning(f'Invalid score: {score}. Score must be a number between 0 and 1 inclusive')
            return results

        if not isinstance(reasoning, str):
            log.warning('Invalid reasoning: must be a string')
            return results

        results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
        return results