Skip to content

Commit 231ae2e

Browse files
committed
Add Judge and evaluation metric tracking
1 parent 6ee62b4 commit 231ae2e

7 files changed

Lines changed: 523 additions & 28 deletions

File tree

ldai/__init__.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
AICompletionConfigDefault,
1414
AIJudgeConfig,
1515
AIJudgeConfigDefault,
16-
Judge,
1716
JudgeConfiguration,
1817
LDMessage,
1918
ModelConfig,
@@ -25,6 +24,12 @@
2524
LDAIAgentDefaults,
2625
)
2726

27+
# Export judge
28+
from ldai.judge import AIJudge
29+
30+
# Export judge types
31+
from ldai.providers.types import EvalScore, JudgeResponse
32+
2833
__all__ = [
2934
'LDAIClient',
3035
'AIAgentConfig',
@@ -35,8 +40,10 @@
3540
'AICompletionConfigDefault',
3641
'AIJudgeConfig',
3742
'AIJudgeConfigDefault',
38-
'Judge',
43+
'AIJudge',
44+
'EvalScore',
3945
'JudgeConfiguration',
46+
'JudgeResponse',
4047
'LDMessage',
4148
'ModelConfig',
4249
'ProviderConfig',

ldai/client.py

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from ldclient import Context
55
from ldclient.client import LDClient
66

7+
from ldai.judge import AIJudge
78
from ldai.models import (
89
AIAgentConfig,
910
AIAgentConfigDefault,
@@ -13,12 +14,12 @@
1314
AICompletionConfigDefault,
1415
AIJudgeConfig,
1516
AIJudgeConfigDefault,
16-
Judge,
1717
JudgeConfiguration,
1818
LDMessage,
1919
ModelConfig,
2020
ProviderConfig,
2121
)
22+
from ldai.providers.ai_provider_factory import AIProviderFactory, SupportedAIProvider
2223
from ldai.tracker import LDAIConfigTracker
2324

2425

@@ -118,6 +119,79 @@ def judge_config(
118119

119120
return config
120121

122+
async def create_judge(
    self,
    key: str,
    context: Context,
    default_value: AIJudgeConfigDefault,
    variables: Optional[Dict[str, Any]] = None,
    default_ai_provider: Optional[SupportedAIProvider] = None,
) -> Optional[AIJudge]:
    """
    Creates and returns a new Judge instance for AI evaluation.

    This is a coroutine and must be awaited. It never raises: any exception
    raised while resolving the configuration or creating the provider is
    logged and ``None`` is returned instead.

    :param key: The key identifying the AI judge configuration to use
    :param context: Standard Context used when evaluating flags
    :param default_value: A default value representing a standard AI config result
    :param variables: Dictionary of values for instruction interpolation.
        The variables `message_history` and `response_to_evaluate` are reserved for the judge
        and will be ignored. The caller's dictionary is not modified.
    :param default_ai_provider: Optional default AI provider to use.
    :return: Judge instance, or None if the judge config is disabled, has no tracker,
        no provider could be created, or an error occurred

    Example::

        judge = await client.create_judge(
            "relevance-judge",
            context,
            AIJudgeConfigDefault(
                enabled=True,
                model=ModelConfig("gpt-4"),
                provider=ProviderConfig("openai"),
                evaluation_metric_keys=['$ld:ai:judge:relevance'],
                messages=[LDMessage(role='system', content='You are a relevance judge.')]
            ),
            variables={'metric': "relevance"}
        )

        if judge:
            result = await judge.evaluate("User question", "AI response")
            if result and result.evals:
                relevance_eval = result.evals.get('$ld:ai:judge:relevance')
                if relevance_eval:
                    print('Relevance score:', relevance_eval.score)
    """
    # Imported locally so this method does not depend on a module-level logger
    # being defined elsewhere in the file.
    import logging

    log = logging.getLogger(__name__)
    reserved_variables = ('message_history', 'response_to_evaluate')

    self._client.track('$ld:ai:judge:function:createJudge', context, key, 1)

    try:
        # Work on a copy so the caller's dictionary is never mutated.
        extended_variables = dict(variables) if variables else {}

        for name in reserved_variables:
            if name in extended_variables:
                log.warning(
                    'The variable "%s" is reserved for judge evaluation and will be ignored',
                    name,
                )
            # Overwrite reserved variables with their own placeholder so they survive
            # config interpolation and can be filled in later at evaluation time.
            extended_variables[name] = '{{' + name + '}}'

        judge_config = self.judge_config(key, context, default_value, extended_variables)

        if not judge_config.enabled or not judge_config.tracker:
            log.info('Judge configuration "%s" is disabled or has no tracker', key)
            return None

        # Create AI provider for the judge
        provider = await AIProviderFactory.create(judge_config, None, default_ai_provider)
        if not provider:
            return None

        return AIJudge(judge_config, judge_config.tracker, provider, None)
    except Exception as error:
        # Deliberate best-effort boundary: judge creation must not break the caller,
        # but the failure is reported instead of being silently discarded.
        log.error('Failed to create judge "%s": %s', key, error)
        return None
194+
121195
def agent_config(
122196
self,
123197
key: str,
@@ -337,7 +411,7 @@ def __evaluate(
337411
judge_config = variation['judgeConfiguration']
338412
if 'judges' in judge_config and isinstance(judge_config['judges'], list):
339413
judges = [
340-
Judge(
414+
JudgeConfiguration.Judge(
341415
key=judge['key'],
342416
sampling_rate=judge['samplingRate']
343417
)

ldai/judge/__init__.py

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
"""Judge implementation for AI evaluation."""
2+
3+
import random
4+
from typing import Any, Dict, Optional
5+
6+
import chevron
7+
8+
from ldai.models import AIJudgeConfig, LDMessage
9+
from ldai.providers.ai_provider import AIProvider
10+
from ldai.providers.types import ChatResponse, EvalScore, JudgeResponse, StructuredResponse
11+
from ldai.tracker import LDAIConfigTracker
12+
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
13+
14+
15+
class AIJudge:
    """
    Judge implementation that handles evaluation functionality and conversation management.

    According to the AIEval spec, judges are AI Configs with mode: "judge" that evaluate
    other AI Configs using structured output.
    """

    def __init__(
        self,
        ai_config: AIJudgeConfig,
        ai_config_tracker: LDAIConfigTracker,
        ai_provider: AIProvider,
        logger: Optional[Any] = None,
    ):
        """
        Initialize the Judge.

        :param ai_config: The judge AI configuration
        :param ai_config_tracker: The tracker for the judge configuration
        :param ai_provider: The AI provider to use for evaluation
        :param logger: Optional logger for logging
        """
        self._ai_config = ai_config
        self._ai_config_tracker = ai_config_tracker
        self._ai_provider = ai_provider
        self._logger = logger
        # Built once from the configured metric keys and reused for every evaluation.
        self._evaluation_response_structure = EvaluationSchemaBuilder.build(
            ai_config.evaluation_metric_keys
        )

    def _warn(self, message: str) -> None:
        """
        Emit a warning through the optional logger.

        Prefers ``warning`` because ``logging.Logger.warn`` was removed in Python 3.13;
        falls back to ``warn`` for custom loggers that only provide that name.

        :param message: The warning text to log
        """
        if not self._logger:
            return
        emit = getattr(self._logger, 'warning', None) or getattr(self._logger, 'warn', None)
        if callable(emit):
            emit(message)

    async def evaluate(
        self,
        input_text: str,
        output_text: str,
        sampling_rate: float = 1.0,
    ) -> Optional[JudgeResponse]:
        """
        Evaluates an AI response using the judge's configuration.

        :param input_text: The input prompt or question that was provided to the AI
        :param output_text: The AI-generated response to be evaluated
        :param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1)
        :return: Evaluation results; None if the configuration has no metric keys or
            messages, or if the call was not sampled. Failures during evaluation are
            reported as a JudgeResponse with ``success=False`` rather than raised.
        """
        try:
            if not self._ai_config.evaluation_metric_keys:
                self._warn('Judge configuration is missing required evaluationMetricKeys')
                return None

            if not self._ai_config.messages:
                self._warn('Judge configuration must include messages')
                return None

            # random.random() is in [0.0, 1.0): ">=" guarantees a rate of 0 never
            # evaluates while a rate of 1 always does.
            if random.random() >= sampling_rate:
                if self._logger:
                    self._logger.debug(f'Judge evaluation skipped due to sampling rate: {sampling_rate}')
                return None

            messages = self._construct_evaluation_messages(input_text, output_text)

            # Track metrics of the structured model invocation
            response = await self._ai_config_tracker.track_metrics_of(
                lambda result: result.metrics,
                lambda: self._ai_provider.invoke_structured_model(messages, self._evaluation_response_structure)
            )

            success = response.metrics.success

            evals = self._parse_evaluation_response(response.data)

            # A partial result is treated as a failed evaluation, but the scores
            # that did parse are still returned.
            if len(evals) != len(self._ai_config.evaluation_metric_keys):
                self._warn('Judge evaluation did not return all evaluations')
                success = False

            return JudgeResponse(
                evals=evals,
                success=success,
            )
        except Exception as error:
            if self._logger:
                self._logger.error(f'Judge evaluation failed: {error}')
            return JudgeResponse(
                evals={},
                success=False,
                error=str(error),
            )

    async def evaluate_messages(
        self,
        messages: list[LDMessage],
        response: ChatResponse,
        sampling_ratio: float = 1.0,
    ) -> Optional[JudgeResponse]:
        """
        Evaluates an AI response from chat messages and response.

        The contents of all messages are joined with ``\\r\\n`` to form the input text.

        :param messages: Array of messages representing the conversation history
        :param response: The AI response to be evaluated
        :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
        :return: Evaluation results or None if not sampled
        """
        input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
        output_text = response.message.content

        return await self.evaluate(input_text, output_text, sampling_ratio)

    def get_ai_config(self) -> AIJudgeConfig:
        """
        Returns the AI Config used by this judge.

        :return: The judge AI configuration
        """
        return self._ai_config

    def get_tracker(self) -> LDAIConfigTracker:
        """
        Returns the tracker associated with this judge.

        :return: The tracker for the judge configuration
        """
        return self._ai_config_tracker

    def get_provider(self) -> AIProvider:
        """
        Returns the AI provider used by this judge.

        :return: The AI provider
        """
        return self._ai_provider

    def _construct_evaluation_messages(self, input_text: str, output_text: str) -> list[LDMessage]:
        """
        Constructs evaluation messages by combining judge's config messages with input/output.

        :param input_text: The input text
        :param output_text: The output text to evaluate
        :return: List of messages for evaluation (empty if the config has no messages)
        """
        if not self._ai_config.messages:
            return []

        # Only the reserved variables are interpolated here.
        reserved = {
            'message_history': input_text,
            'response_to_evaluate': output_text,
        }
        return [
            LDMessage(role=msg.role, content=self._interpolate_message(msg.content, reserved))
            for msg in self._ai_config.messages
        ]

    def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
        """
        Interpolates message content with variables using Mustache templating, without
        HTML-escaping the substituted values.

        :param content: The message content template
        :param variables: Variables to interpolate
        :return: Interpolated message content
        """
        # Imported locally so the file's top-level import block stays untouched.
        import re

        # chevron HTML-escapes values of plain {{name}} tags (&, ", <, >), which would
        # corrupt prompts and responses handed to the judge. Promote those tags to the
        # unescaped {{{name}}} form for the variables being supplied. Tags that are
        # already triple-braced are left alone by the lookarounds.
        if variables:
            names = '|'.join(re.escape(name) for name in variables)
            plain_tag = r'(?<!\{)\{\{\s*(' + names + r')\s*\}\}(?!\})'
            content = re.sub(plain_tag, r'{{{\1}}}', content)

        return chevron.render(content, variables)

    def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]:
        """
        Parses the structured evaluation response from the AI provider.

        Entries that are missing or malformed are skipped (with a warning), so the
        result may contain fewer keys than the configuration requests.

        :param data: The structured response data
        :return: Dictionary of evaluation scores keyed by metric key
        """
        results: Dict[str, EvalScore] = {}

        evaluations = data.get('evaluations') if isinstance(data, dict) else None
        if not evaluations or not isinstance(evaluations, dict):
            self._warn('Invalid response: missing or invalid evaluations object')
            return results

        for metric_key in self._ai_config.evaluation_metric_keys:
            evaluation = evaluations.get(metric_key)

            if not evaluation or not isinstance(evaluation, dict):
                self._warn(f'Missing evaluation for metric key: {metric_key}')
                continue

            score = evaluation.get('score')
            reasoning = evaluation.get('reasoning')

            # bool is a subclass of int and must not count as a numeric score; the
            # chained comparison also rejects NaN, which compares false to everything.
            is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
            if not is_number or not 0 <= score <= 1:
                self._warn(
                    f'Invalid score evaluated for {metric_key}: {score}. '
                    'Score must be a number between 0 and 1 inclusive'
                )
                continue

            if not isinstance(reasoning, str):
                self._warn(
                    f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
                    'Reasoning must be a string'
                )
                continue

            results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)

        return results
230+
231+

0 commit comments

Comments
 (0)