Skip to content

Commit 8d01693

Browse files
committed
Add support for custom judges via evaluation metric key
1 parent 0d0cecc commit 8d01693

5 files changed

Lines changed: 674 additions & 68 deletions

File tree

packages/sdk/server-ai/src/ldai/client.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -98,14 +98,35 @@ def judge_config(
9898
key, context, default_value.to_dict(), variables
9999
)
100100

101-
# Extract evaluation_metric_keys from the variation
102101
variation = self._client.variation(key, context, default_value.to_dict())
103-
evaluation_metric_keys = variation.get('evaluationMetricKeys', default_value.evaluation_metric_keys or [])
102+
def _extract_evaluation_metric_key(variation: Dict[str, Any], default_value: AIJudgeConfigDefault) -> Optional[str]:
103+
"""
104+
Extract evaluation_metric_key with backward compatibility.
105+
106+
Priority: 1) evaluationMetricKey from variation, 2) evaluation_metric_key from default,
107+
3) first from evaluationMetricKeys in variation, 4) first from evaluation_metric_keys in default
108+
"""
109+
if evaluation_metric_key := variation.get('evaluationMetricKey'):
110+
return evaluation_metric_key
111+
112+
if default_value.evaluation_metric_key:
113+
return default_value.evaluation_metric_key
114+
115+
variation_keys = variation.get('evaluationMetricKeys')
116+
if isinstance(variation_keys, list) and variation_keys:
117+
return variation_keys[0]
118+
119+
if default_value.evaluation_metric_keys:
120+
return default_value.evaluation_metric_keys[0]
121+
122+
return None
123+
124+
evaluation_metric_key = _extract_evaluation_metric_key(variation, default_value)
104125

105126
config = AIJudgeConfig(
106127
key=key,
107128
enabled=bool(enabled),
108-
evaluation_metric_keys=evaluation_metric_keys,
129+
evaluation_metric_key=evaluation_metric_key,
109130
model=model,
110131
messages=messages,
111132
provider=provider,
@@ -142,7 +163,7 @@ async def create_judge(
142163
enabled=True,
143164
model=ModelConfig("gpt-4"),
144165
provider=ProviderConfig("openai"),
145-
evaluation_metric_keys=['$ld:ai:judge:relevance'],
166+
evaluation_metric_key='$ld:ai:judge:relevance',
146167
messages=[LDMessage(role='system', content='You are a relevance judge.')]
147168
),
148169
variables={'metric': "relevance"}
@@ -158,33 +179,27 @@ async def create_judge(
158179
self._client.track('$ld:ai:judge:function:createJudge', context, key, 1)
159180

160181
try:
161-
# Warn if reserved variables are provided
162182
if variables:
163183
if 'message_history' in variables:
164-
# Note: Python doesn't have a logger on the client, but we could add one
165-
pass # Would log warning if logger available
184+
pass
166185
if 'response_to_evaluate' in variables:
167-
pass # Would log warning if logger available
186+
pass
168187

169-
# Overwrite reserved variables to ensure they remain as placeholders for judge evaluation
170188
extended_variables = dict(variables) if variables else {}
171189
extended_variables['message_history'] = '{{message_history}}'
172190
extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'
173191

174192
judge_config = self.judge_config(key, context, default_value, extended_variables)
175193

176194
if not judge_config.enabled or not judge_config.tracker:
177-
# Would log info if logger available
178195
return None
179196

180-
# Create AI provider for the judge
181197
provider = await AIProviderFactory.create(judge_config, default_ai_provider)
182198
if not provider:
183199
return None
184200

185201
return Judge(judge_config, judge_config.tracker, provider)
186202
except Exception as error:
187-
# Would log error if logger available
188203
return None
189204

190205
async def _initialize_judges(
@@ -277,7 +292,6 @@ async def create_chat(
277292
config = self.completion_config(key, context, default_value, variables)
278293

279294
if not config.enabled or not config.tracker:
280-
# Would log info if logger available
281295
return None
282296

283297
provider = await AIProviderFactory.create(config, default_ai_provider)
@@ -331,7 +345,6 @@ def agent_config(
331345
:param variables: Variables for interpolation.
332346
:return: Configured AIAgentConfig instance.
333347
"""
334-
# Track single agent usage
335348
self._client.track(
336349
"$ld:ai:agent:function:single",
337350
context,
@@ -397,7 +410,6 @@ def agent_configs(
397410
:param context: The context to evaluate the agent configurations in.
398411
:return: Dictionary mapping agent keys to their AIAgentConfig configurations.
399412
"""
400-
# Track multiple agents usage
401413
agent_count = len(agent_configs)
402414
self._client.track(
403415
"$ld:ai:agent:function:multiple",
@@ -461,7 +473,6 @@ def __evaluate(
461473
all_variables.update(variables)
462474
all_variables['ldctx'] = context.to_dict()
463475

464-
# Extract messages
465476
messages = None
466477
if 'messages' in variation and isinstance(variation['messages'], list) and all(
467478
isinstance(entry, dict) for entry in variation['messages']
@@ -476,18 +487,15 @@ def __evaluate(
476487
for entry in variation['messages']
477488
]
478489

479-
# Extract instructions
480490
instructions = None
481491
if 'instructions' in variation and isinstance(variation['instructions'], str):
482492
instructions = self.__interpolate_template(variation['instructions'], all_variables)
483493

484-
# Extract provider config
485494
provider_config = None
486495
if 'provider' in variation and isinstance(variation['provider'], dict):
487496
provider = variation['provider']
488497
provider_config = ProviderConfig(provider.get('name', ''))
489498

490-
# Extract model config
491499
model = None
492500
if 'model' in variation and isinstance(variation['model'], dict):
493501
parameters = variation['model'].get('parameters', None)
@@ -498,7 +506,6 @@ def __evaluate(
498506
custom=custom
499507
)
500508

501-
# Create tracker
502509
tracker = LDAIConfigTracker(
503510
self._client,
504511
variation.get('_ldMeta', {}).get('variationKey', ''),
@@ -511,7 +518,6 @@ def __evaluate(
511518

512519
enabled = variation.get('_ldMeta', {}).get('enabled', False)
513520

514-
# Extract judge configuration
515521
judge_configuration = None
516522
if 'judgeConfiguration' in variation and isinstance(variation['judgeConfiguration'], dict):
517523
judge_config = variation['judgeConfiguration']

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@ def __init__(
3838
self._ai_config = ai_config
3939
self._ai_config_tracker = ai_config_tracker
4040
self._ai_provider = ai_provider
41-
self._evaluation_response_structure = EvaluationSchemaBuilder.build(
42-
ai_config.evaluation_metric_keys
43-
)
41+
self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key)
4442

4543
async def evaluate(
4644
self,
@@ -57,9 +55,9 @@ async def evaluate(
5755
:return: Evaluation results or None if not sampled
5856
"""
5957
try:
60-
if not self._ai_config.evaluation_metric_keys or len(self._ai_config.evaluation_metric_keys) == 0:
58+
if not self._ai_config.evaluation_metric_key:
6159
log.warn(
62-
'Judge configuration is missing required evaluationMetricKeys'
60+
'Judge configuration is missing required evaluationMetricKey'
6361
)
6462
return None
6563

@@ -83,8 +81,8 @@ async def evaluate(
8381

8482
evals = self._parse_evaluation_response(response.data)
8583

86-
if len(evals) != len(self._ai_config.evaluation_metric_keys):
87-
log.warn('Judge evaluation did not return all evaluations')
84+
if self._ai_config.evaluation_metric_key not in evals:
85+
log.warn('Judge evaluation did not return the expected evaluation')
8886
success = False
8987

9088
return JudgeResponse(
@@ -191,30 +189,30 @@ def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScor
191189

192190
evaluations = data['evaluations']
193191

194-
for metric_key in self._ai_config.evaluation_metric_keys:
195-
evaluation = evaluations.get(metric_key)
192+
metric_key = self._ai_config.evaluation_metric_key
193+
evaluation = evaluations.get(metric_key)
196194

197-
if not evaluation or not isinstance(evaluation, dict):
198-
log.warn(f'Missing evaluation for metric key: {metric_key}')
199-
continue
195+
if not evaluation or not isinstance(evaluation, dict):
196+
log.warn(f'Missing evaluation for metric key: {metric_key}')
197+
return results
200198

201-
score = evaluation.get('score')
202-
reasoning = evaluation.get('reasoning')
199+
score = evaluation.get('score')
200+
reasoning = evaluation.get('reasoning')
203201

204-
if not isinstance(score, (int, float)) or score < 0 or score > 1:
205-
log.warn(
206-
f'Invalid score evaluated for {metric_key}: {score}. '
207-
'Score must be a number between 0 and 1 inclusive'
208-
)
209-
continue
202+
if not isinstance(score, (int, float)) or score < 0 or score > 1:
203+
log.warn(
204+
f'Invalid score evaluated for {metric_key}: {score}. '
205+
'Score must be a number between 0 and 1 inclusive'
206+
)
207+
return results
210208

211-
if not isinstance(reasoning, str):
212-
log.warn(
213-
f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
214-
'Reasoning must be a string'
215-
)
216-
continue
209+
if not isinstance(reasoning, str):
210+
log.warn(
211+
f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
212+
'Reasoning must be a string'
213+
)
214+
return results
217215

218-
results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
216+
results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
219217

220218
return results

packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Internal class for building dynamic evaluation response schemas."""
22

3-
from typing import Any, Dict
3+
from typing import Any, Dict, Optional
44

55

66
class EvaluationSchemaBuilder:
@@ -10,26 +10,29 @@ class EvaluationSchemaBuilder:
1010
"""
1111

1212
@staticmethod
13-
def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
13+
def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]:
1414
"""
15-
Build an evaluation response schema from evaluation metric keys.
15+
Build an evaluation response schema from an evaluation metric key.
1616
17-
:param evaluation_metric_keys: List of evaluation metric keys
18-
:return: Schema dictionary for structured output
17+
:param evaluation_metric_key: Evaluation metric key, or None if not available
18+
:return: Schema dictionary for structured output, or None if evaluation_metric_key is None
1919
"""
20+
if evaluation_metric_key is None:
21+
return None
22+
2023
return {
2124
'title': 'EvaluationResponse',
22-
'description': f"Response containing evaluation results for {', '.join(evaluation_metric_keys)} metrics",
25+
'description': f"Response containing evaluation results for {evaluation_metric_key} metric",
2326
'type': 'object',
2427
'properties': {
2528
'evaluations': {
2629
'type': 'object',
2730
'description': (
2831
f"Object containing evaluation results for "
29-
f"{', '.join(evaluation_metric_keys)} metrics"
32+
f"{evaluation_metric_key} metric"
3033
),
31-
'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_keys),
32-
'required': evaluation_metric_keys,
34+
'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_key),
35+
'required': [evaluation_metric_key],
3336
'additionalProperties': False,
3437
},
3538
},
@@ -38,17 +41,16 @@ def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
3841
}
3942

4043
@staticmethod
41-
def _build_key_properties(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
44+
def _build_key_properties(evaluation_metric_key: str) -> Dict[str, Any]:
4245
"""
43-
Build properties for each evaluation metric key.
46+
Build properties for a single evaluation metric key.
4447
45-
:param evaluation_metric_keys: List of evaluation metric keys
46-
:return: Dictionary of properties for each key
48+
:param evaluation_metric_key: Evaluation metric key
49+
:return: Dictionary of properties for the key
4750
"""
48-
result: Dict[str, Any] = {}
49-
for key in evaluation_metric_keys:
50-
result[key] = EvaluationSchemaBuilder._build_key_schema(key)
51-
return result
51+
return {
52+
evaluation_metric_key: EvaluationSchemaBuilder._build_key_schema(evaluation_metric_key)
53+
}
5254

5355
@staticmethod
5456
def _build_key_schema(key: str) -> Dict[str, Any]:

packages/sdk/server-ai/src/ldai/models.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -285,16 +285,17 @@ class AIJudgeConfigDefault(AIConfigDefault):
285285
Default Judge-specific AI Config with required evaluation metric key.
286286
"""
287287
messages: Optional[List[LDMessage]] = None
288+
# Deprecated: use evaluation_metric_key instead of evaluation_metric_keys
288289
evaluation_metric_keys: Optional[List[str]] = None
290+
evaluation_metric_key: Optional[str] = None
289291

290292
def to_dict(self) -> dict:
291293
"""
292294
Render the given judge config default as a dictionary object.
293295
"""
294296
result = self._base_to_dict()
295297
result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None
296-
if self.evaluation_metric_keys is not None:
297-
result['evaluationMetricKeys'] = self.evaluation_metric_keys
298+
result['evaluationMetricKey'] = self.evaluation_metric_key
298299
return result
299300

300301

@@ -303,16 +304,18 @@ class AIJudgeConfig(AIConfig):
303304
"""
304305
Judge-specific AI Config with required evaluation metric key.
305306
"""
307+
# Deprecated: use evaluation_metric_key instead of evaluation_metric_keys
306308
evaluation_metric_keys: List[str] = field(default_factory=list)
307309
messages: Optional[List[LDMessage]] = None
310+
evaluation_metric_key: Optional[str] = None
308311

309312
def to_dict(self) -> dict:
310313
"""
311314
Render the given judge config as a dictionary object.
312315
"""
313316
result = self._base_to_dict()
314-
result['evaluationMetricKeys'] = self.evaluation_metric_keys
315317
result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None
318+
result['evaluationMetricKey'] = self.evaluation_metric_key
316319
return result
317320

318321

0 commit comments

Comments
 (0)