Skip to content

Commit a55503b

Browse files
authored
feat: add support for custom judges via evaluation metric key (#86)
**Requirements** - [X] I have added test coverage for new or changed functionality - [x] I have followed the repository's [pull request submission guidelines](../blob/main/CONTRIBUTING.md#submitting-pull-requests) - [X] I have validated my changes against all supported platform versions **Related issues** https://launchdarkly.atlassian.net/browse/REL-11511 See tech spec at https://docs.google.com/document/d/1lzYwQqCcTzN_2zkxJZDfJtgUcEJ4jbpx0KSsJ2bRENw/edit?tab=t.0#heading=h.69bdm7karsxh **Describe the solution you've provided** Updates the SDK to read the AI Config's new `evaluationMetricKey` property. Also adds tests that were missing from the previous implementation, and falls back to the first entry of the original `evaluationMetricKeys` list when the new property is absent. **Describe alternatives you've considered** Provide a clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context about the pull request here. <!-- CURSOR_SUMMARY --> --- > [!NOTE] > Implements single-key judge evaluation with backward compatibility and comprehensive tests. > > - Switches judge configs to use `evaluationMetricKey` (deprecating `evaluationMetricKeys`), updating `AIJudgeConfig(Default)` serialization > - `LDAIClient.__evaluate` now also returns the raw `variation`; `judge_config` extracts `evaluationMetricKey`, falling back to the first entry in `evaluationMetricKeys` > - `Judge` updated to validate and parse a single metric; `EvaluationSchemaBuilder` builds a single-key structured schema; minor cleanup of unused imports/comments > - Adds extensive unit tests for judge behavior, schema building, and client extraction (including consistency of single variation, sampling, error paths) > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit c6d086a. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup> <!-- /CURSOR_SUMMARY -->
2 parents 09344af + c6d086a commit a55503b

5 files changed

Lines changed: 709 additions & 77 deletions

File tree

packages/sdk/server-ai/src/ldai/client.py

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def completion_config(
4242
"""
4343
self._client.track('$ld:ai:config:function:single', context, key, 1)
4444

45-
model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate(
45+
model, provider, messages, instructions, tracker, enabled, judge_configuration, _ = self.__evaluate(
4646
key, context, default_value.to_dict(), variables
4747
)
4848

@@ -96,18 +96,31 @@ def judge_config(
9696
"""
9797
self._client.track('$ld:ai:judge:function:single', context, key, 1)
9898

99-
model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate(
99+
model, provider, messages, instructions, tracker, enabled, judge_configuration, variation = self.__evaluate(
100100
key, context, default_value.to_dict(), variables
101101
)
102102

103-
# Extract evaluation_metric_keys from the variation
104-
variation = self._client.variation(key, context, default_value.to_dict())
105-
evaluation_metric_keys = variation.get('evaluationMetricKeys', default_value.evaluation_metric_keys or [])
103+
def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]:
104+
"""
105+
Extract evaluation_metric_key with backward compatibility.
106+
107+
Priority: 1) evaluationMetricKey from variation, 2) first from evaluationMetricKeys in variation
108+
"""
109+
if evaluation_metric_key := variation.get('evaluationMetricKey'):
110+
return evaluation_metric_key
111+
112+
variation_keys = variation.get('evaluationMetricKeys')
113+
if isinstance(variation_keys, list) and variation_keys:
114+
return variation_keys[0]
115+
116+
return None
117+
118+
evaluation_metric_key = _extract_evaluation_metric_key(variation)
106119

107120
config = AIJudgeConfig(
108121
key=key,
109122
enabled=bool(enabled),
110-
evaluation_metric_keys=evaluation_metric_keys,
123+
evaluation_metric_key=evaluation_metric_key,
111124
model=model,
112125
messages=messages,
113126
provider=provider,
@@ -144,7 +157,7 @@ async def create_judge(
144157
enabled=True,
145158
model=ModelConfig("gpt-4"),
146159
provider=ProviderConfig("openai"),
147-
evaluation_metric_keys=['$ld:ai:judge:relevance'],
160+
evaluation_metric_key='$ld:ai:judge:relevance',
148161
messages=[LDMessage(role='system', content='You are a relevance judge.')]
149162
),
150163
variables={'metric': "relevance"}
@@ -160,33 +173,27 @@ async def create_judge(
160173
self._client.track('$ld:ai:judge:function:createJudge', context, key, 1)
161174

162175
try:
163-
# Warn if reserved variables are provided
164176
if variables:
165177
if 'message_history' in variables:
166-
# Note: Python doesn't have a logger on the client, but we could add one
167-
pass # Would log warning if logger available
178+
pass
168179
if 'response_to_evaluate' in variables:
169-
pass # Would log warning if logger available
180+
pass
170181

171-
# Overwrite reserved variables to ensure they remain as placeholders for judge evaluation
172182
extended_variables = dict(variables) if variables else {}
173183
extended_variables['message_history'] = '{{message_history}}'
174184
extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'
175185

176186
judge_config = self.judge_config(key, context, default_value, extended_variables)
177187

178188
if not judge_config.enabled or not judge_config.tracker:
179-
# Would log info if logger available
180189
return None
181190

182-
# Create AI provider for the judge
183191
provider = await AIProviderFactory.create(judge_config, default_ai_provider)
184192
if not provider:
185193
return None
186194

187195
return Judge(judge_config, judge_config.tracker, provider)
188196
except Exception as error:
189-
# Would log error if logger available
190197
return None
191198

192199
async def _initialize_judges(
@@ -279,7 +286,6 @@ async def create_chat(
279286
config = self.completion_config(key, context, default_value, variables)
280287

281288
if not config.enabled or not config.tracker:
282-
# Would log info if logger available
283289
return None
284290

285291
provider = await AIProviderFactory.create(config, default_ai_provider)
@@ -333,7 +339,6 @@ def agent_config(
333339
:param variables: Variables for interpolation.
334340
:return: Configured AIAgentConfig instance.
335341
"""
336-
# Track single agent usage
337342
self._client.track(
338343
"$ld:ai:agent:function:single",
339344
context,
@@ -399,7 +404,6 @@ def agent_configs(
399404
:param context: The context to evaluate the agent configurations in.
400405
:return: Dictionary mapping agent keys to their AIAgentConfig configurations.
401406
"""
402-
# Track multiple agents usage
403407
agent_count = len(agent_configs)
404408
self._client.track(
405409
"$ld:ai:agent:function:multiple",
@@ -538,7 +542,7 @@ def __evaluate(
538542
variables: Optional[Dict[str, Any]] = None,
539543
) -> Tuple[
540544
Optional[ModelConfig], Optional[ProviderConfig], Optional[List[LDMessage]],
541-
Optional[str], LDAIConfigTracker, bool, Optional[Any]
545+
Optional[str], LDAIConfigTracker, bool, Optional[Any], Dict[str, Any]
542546
]:
543547
"""
544548
Internal method to evaluate a configuration and extract components.
@@ -547,7 +551,7 @@ def __evaluate(
547551
:param context: The evaluation context.
548552
:param default_dict: Default configuration as dictionary.
549553
:param variables: Variables for interpolation.
550-
:return: Tuple of (model, provider, messages, instructions, tracker, enabled).
554+
:return: Tuple of (model, provider, messages, instructions, tracker, enabled, judge_configuration, variation).
551555
"""
552556
variation = self._client.variation(key, context, default_dict)
553557

@@ -556,7 +560,6 @@ def __evaluate(
556560
all_variables.update(variables)
557561
all_variables['ldctx'] = context.to_dict()
558562

559-
# Extract messages
560563
messages = None
561564
if 'messages' in variation and isinstance(variation['messages'], list) and all(
562565
isinstance(entry, dict) for entry in variation['messages']
@@ -571,18 +574,15 @@ def __evaluate(
571574
for entry in variation['messages']
572575
]
573576

574-
# Extract instructions
575577
instructions = None
576578
if 'instructions' in variation and isinstance(variation['instructions'], str):
577579
instructions = self.__interpolate_template(variation['instructions'], all_variables)
578580

579-
# Extract provider config
580581
provider_config = None
581582
if 'provider' in variation and isinstance(variation['provider'], dict):
582583
provider = variation['provider']
583584
provider_config = ProviderConfig(provider.get('name', ''))
584585

585-
# Extract model config
586586
model = None
587587
if 'model' in variation and isinstance(variation['model'], dict):
588588
parameters = variation['model'].get('parameters', None)
@@ -593,7 +593,6 @@ def __evaluate(
593593
custom=custom
594594
)
595595

596-
# Create tracker
597596
tracker = LDAIConfigTracker(
598597
self._client,
599598
variation.get('_ldMeta', {}).get('variationKey', ''),
@@ -606,7 +605,6 @@ def __evaluate(
606605

607606
enabled = variation.get('_ldMeta', {}).get('enabled', False)
608607

609-
# Extract judge configuration
610608
judge_configuration = None
611609
if 'judgeConfiguration' in variation and isinstance(variation['judgeConfiguration'], dict):
612610
judge_config = variation['judgeConfiguration']
@@ -622,7 +620,7 @@ def __evaluate(
622620
if judges:
623621
judge_configuration = JudgeConfiguration(judges=judges)
624622

625-
return model, provider_config, messages, instructions, tracker, enabled, judge_configuration
623+
return model, provider_config, messages, instructions, tracker, enabled, judge_configuration, variation
626624

627625
def __evaluate_agent(
628626
self,
@@ -640,7 +638,7 @@ def __evaluate_agent(
640638
:param variables: Variables for interpolation.
641639
:return: Configured AIAgentConfig instance.
642640
"""
643-
model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate(
641+
model, provider, messages, instructions, tracker, enabled, judge_configuration, _ = self.__evaluate(
644642
key, context, default_value.to_dict(), variables
645643
)
646644

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@
99
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
1010
from ldai.models import AIJudgeConfig, LDMessage
1111
from ldai.providers.ai_provider import AIProvider
12-
from ldai.providers.types import (ChatResponse, EvalScore, JudgeResponse,
13-
StructuredResponse)
12+
from ldai.providers.types import ChatResponse, EvalScore, JudgeResponse
1413
from ldai.tracker import LDAIConfigTracker
1514

1615

@@ -38,9 +37,7 @@ def __init__(
3837
self._ai_config = ai_config
3938
self._ai_config_tracker = ai_config_tracker
4039
self._ai_provider = ai_provider
41-
self._evaluation_response_structure = EvaluationSchemaBuilder.build(
42-
ai_config.evaluation_metric_keys
43-
)
40+
self._evaluation_response_structure = EvaluationSchemaBuilder.build(ai_config.evaluation_metric_key)
4441

4542
async def evaluate(
4643
self,
@@ -57,9 +54,9 @@ async def evaluate(
5754
:return: Evaluation results or None if not sampled
5855
"""
5956
try:
60-
if not self._ai_config.evaluation_metric_keys or len(self._ai_config.evaluation_metric_keys) == 0:
57+
if not self._ai_config.evaluation_metric_key:
6158
log.warn(
62-
'Judge configuration is missing required evaluationMetricKeys'
59+
'Judge configuration is missing required evaluationMetricKey'
6360
)
6461
return None
6562

@@ -72,8 +69,8 @@ async def evaluate(
7269
return None
7370

7471
messages = self._construct_evaluation_messages(input_text, output_text)
72+
assert self._evaluation_response_structure is not None
7573

76-
# Track metrics of the structured model invocation
7774
response = await self._ai_config_tracker.track_metrics_of(
7875
lambda: self._ai_provider.invoke_structured_model(messages, self._evaluation_response_structure),
7976
lambda result: result.metrics,
@@ -83,8 +80,8 @@ async def evaluate(
8380

8481
evals = self._parse_evaluation_response(response.data)
8582

86-
if len(evals) != len(self._ai_config.evaluation_metric_keys):
87-
log.warn('Judge evaluation did not return all evaluations')
83+
if self._ai_config.evaluation_metric_key not in evals:
84+
log.warn('Judge evaluation did not return the expected evaluation')
8885
success = False
8986

9087
return JudgeResponse(
@@ -191,30 +188,34 @@ def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScor
191188

192189
evaluations = data['evaluations']
193190

194-
for metric_key in self._ai_config.evaluation_metric_keys:
195-
evaluation = evaluations.get(metric_key)
191+
metric_key = self._ai_config.evaluation_metric_key
192+
if not metric_key:
193+
log.warn('Evaluation metric key is missing')
194+
return results
196195

197-
if not evaluation or not isinstance(evaluation, dict):
198-
log.warn(f'Missing evaluation for metric key: {metric_key}')
199-
continue
196+
evaluation = evaluations.get(metric_key)
200197

201-
score = evaluation.get('score')
202-
reasoning = evaluation.get('reasoning')
198+
if not evaluation or not isinstance(evaluation, dict):
199+
log.warn(f'Missing evaluation for metric key: {metric_key}')
200+
return results
203201

204-
if not isinstance(score, (int, float)) or score < 0 or score > 1:
205-
log.warn(
206-
f'Invalid score evaluated for {metric_key}: {score}. '
207-
'Score must be a number between 0 and 1 inclusive'
208-
)
209-
continue
202+
score = evaluation.get('score')
203+
reasoning = evaluation.get('reasoning')
210204

211-
if not isinstance(reasoning, str):
212-
log.warn(
213-
f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
214-
'Reasoning must be a string'
215-
)
216-
continue
205+
if not isinstance(score, (int, float)) or score < 0 or score > 1:
206+
log.warn(
207+
f'Invalid score evaluated for {metric_key}: {score}. '
208+
'Score must be a number between 0 and 1 inclusive'
209+
)
210+
return results
211+
212+
if not isinstance(reasoning, str):
213+
log.warn(
214+
f'Invalid reasoning evaluated for {metric_key}: {reasoning}. '
215+
'Reasoning must be a string'
216+
)
217+
return results
217218

218-
results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
219+
results[metric_key] = EvalScore(score=float(score), reasoning=reasoning)
219220

220221
return results

packages/sdk/server-ai/src/ldai/judge/evaluation_schema_builder.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Internal class for building dynamic evaluation response schemas."""
22

3-
from typing import Any, Dict
3+
from typing import Any, Dict, Optional
44

55

66
class EvaluationSchemaBuilder:
@@ -10,26 +10,29 @@ class EvaluationSchemaBuilder:
1010
"""
1111

1212
@staticmethod
13-
def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
13+
def build(evaluation_metric_key: Optional[str]) -> Optional[Dict[str, Any]]:
1414
"""
15-
Build an evaluation response schema from evaluation metric keys.
15+
Build an evaluation response schema from evaluation metric key.
1616
17-
:param evaluation_metric_keys: List of evaluation metric keys
18-
:return: Schema dictionary for structured output
17+
:param evaluation_metric_key: Evaluation metric key, or None if not available
18+
:return: Schema dictionary for structured output, or None if evaluation_metric_key is None
1919
"""
20+
if not evaluation_metric_key:
21+
return None
22+
2023
return {
2124
'title': 'EvaluationResponse',
22-
'description': f"Response containing evaluation results for {', '.join(evaluation_metric_keys)} metrics",
25+
'description': f"Response containing evaluation results for {evaluation_metric_key} metric",
2326
'type': 'object',
2427
'properties': {
2528
'evaluations': {
2629
'type': 'object',
2730
'description': (
2831
f"Object containing evaluation results for "
29-
f"{', '.join(evaluation_metric_keys)} metrics"
32+
f"{evaluation_metric_key} metric"
3033
),
31-
'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_keys),
32-
'required': evaluation_metric_keys,
34+
'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_key),
35+
'required': [evaluation_metric_key],
3336
'additionalProperties': False,
3437
},
3538
},
@@ -38,17 +41,16 @@ def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
3841
}
3942

4043
@staticmethod
41-
def _build_key_properties(evaluation_metric_keys: list[str]) -> Dict[str, Any]:
44+
def _build_key_properties(evaluation_metric_key: str) -> Dict[str, Any]:
4245
"""
43-
Build properties for each evaluation metric key.
46+
Build properties for a single evaluation metric key.
4447
45-
:param evaluation_metric_keys: List of evaluation metric keys
46-
:return: Dictionary of properties for each key
48+
:param evaluation_metric_key: Evaluation metric key
49+
:return: Dictionary of properties for the key
4750
"""
48-
result: Dict[str, Any] = {}
49-
for key in evaluation_metric_keys:
50-
result[key] = EvaluationSchemaBuilder._build_key_schema(key)
51-
return result
51+
return {
52+
evaluation_metric_key: EvaluationSchemaBuilder._build_key_schema(evaluation_metric_key)
53+
}
5254

5355
@staticmethod
5456
def _build_key_schema(key: str) -> Dict[str, Any]:

0 commit comments

Comments
 (0)