Skip to content

Commit 7365be2

Browse files
committed
feat: bake sampling_rate into Judge at construction; simplify Evaluator to List[Judge]
1 parent 1e1f36b commit 7365be2

4 files changed

Lines changed: 99 additions & 70 deletions

File tree

packages/sdk/server-ai/src/ldai/client.py

Lines changed: 35 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -306,14 +306,24 @@ def create_judge(
306306
print('Relevance score:', relevance_eval.score)
307307
"""
308308
self._client.track(_TRACK_USAGE_CREATE_JUDGE, context, key, 1)
309+
return self._create_judge_instance(key, context, default, variables, default_ai_provider)
309310

310-
try:
311-
if variables:
312-
if 'message_history' in variables:
313-
pass
314-
if 'response_to_evaluate' in variables:
315-
pass
311+
def _create_judge_instance(
312+
self,
313+
key: str,
314+
context: Context,
315+
default: Optional[AIJudgeConfigDefault] = None,
316+
variables: Optional[Dict[str, Any]] = None,
317+
default_ai_provider: Optional[str] = None,
318+
sample_rate: float = 1.0,
319+
) -> Optional[Judge]:
320+
"""
321+
Construct a Judge for ``key`` without emitting the public create-judge usage event.
316322
323+
Used both by the public :meth:`create_judge` and by :meth:`_build_evaluator`
324+
when materializing judges referenced by an AI config's judge configuration.
325+
"""
326+
try:
317327
extended_variables = dict(variables) if variables else {}
318328
extended_variables['message_history'] = '{{message_history}}'
319329
extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'
@@ -329,45 +339,10 @@ def create_judge(
329339
if not provider:
330340
return None
331341

332-
return Judge(judge_config, provider)
333-
except Exception as error:
342+
return Judge(judge_config, provider, sample_rate=sample_rate)
343+
except Exception:
334344
return None
335345

336-
def _initialize_judges(
337-
self,
338-
judge_configs: List[JudgeConfiguration.Judge],
339-
context: Context,
340-
variables: Optional[Dict[str, Any]] = None,
341-
default_ai_provider: Optional[str] = None,
342-
) -> Dict[str, Judge]:
343-
"""
344-
Initialize judges from judge configurations.
345-
346-
:param judge_configs: List of judge configurations
347-
:param context: Standard Context used when evaluating flags
348-
:param variables: Dictionary of values for instruction interpolation
349-
:param default_ai_provider: Optional default AI provider to use
350-
:return: Dictionary of judge instances keyed by their configuration keys
351-
"""
352-
judges: Dict[str, Judge] = {}
353-
354-
for judge_config in judge_configs:
355-
try:
356-
judge = self.create_judge(
357-
judge_config.key,
358-
context,
359-
AIJudgeConfigDefault.disabled(),
360-
variables,
361-
default_ai_provider,
362-
)
363-
if judge:
364-
judges[judge_config.key] = judge
365-
except Exception as e:
366-
log.warning(f'Failed to initialize judge {judge_config.key!r}: {e}')
367-
continue
368-
369-
return judges
370-
371346
def _build_evaluator(
372347
self,
373348
judge_configuration: Optional[JudgeConfiguration],
@@ -387,11 +362,23 @@ def _build_evaluator(
387362
"""
388363
if not judge_configuration or not judge_configuration.judges:
389364
return Evaluator.noop()
390-
judges = self._initialize_judges(
391-
judge_configuration.judges, context, default_ai_provider=default_ai_provider,
392-
variables=variables,
393-
)
394-
return Evaluator(judges, judge_configuration)
365+
judge_instances: List[Judge] = []
366+
for jc in judge_configuration.judges:
367+
try:
368+
judge = self._create_judge_instance(
369+
jc.key,
370+
context,
371+
AIJudgeConfigDefault.disabled(),
372+
variables,
373+
default_ai_provider,
374+
sample_rate=jc.sampling_rate,
375+
)
376+
if judge is not None:
377+
judge_instances.append(judge)
378+
except Exception as e:
379+
log.warning(f'Failed to initialize judge {jc.key!r}: {e}')
380+
continue
381+
return Evaluator(judge_instances)
395382

396383
async def create_model(
397384
self,

packages/sdk/server-ai/src/ldai/evaluator.py

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33
from __future__ import annotations
44

55
import asyncio
6-
from typing import Dict, List
6+
from typing import List
77

88
from ldai import log
99
from ldai.judge import Judge
10-
from ldai.models import JudgeConfiguration
1110
from ldai.providers.types import JudgeResult
1211

1312

@@ -20,19 +19,18 @@ class Evaluator:
2019
not need to construct this directly.
2120
"""
2221

23-
def __init__(self, judges: Dict[str, Judge], judge_configuration: JudgeConfiguration):
22+
def __init__(self, judges: List[Judge]):
2423
"""
2524
Initialize the Evaluator.
2625
27-
:param judges: Mapping of judge config key to initialized Judge instances
28-
:param judge_configuration: The judge configuration specifying which judges to run
26+
:param judges: List of initialized Judge instances. Each Judge already
27+
carries its own ``sample_rate`` set at construction time.
2928
"""
3029
self._judges = judges
31-
self._judge_configuration = judge_configuration
3230

3331
@classmethod
3432
def noop(cls) -> Evaluator:
35-
return cls({}, JudgeConfiguration(judges=[]))
33+
return cls([])
3634

3735
def evaluate(
3836
self,
@@ -62,16 +60,12 @@ async def _run_judges(
6260
6361
:param input_text: The input that was provided to the AI model
6462
:param output_text: The AI-generated output to evaluate
65-
:return: List of JudgeResult instances (one per configured judge that was found)
63+
:return: List of JudgeResult instances, one per Judge held by this evaluator (including judges skipped by sampling)
6664
"""
67-
if not self._judge_configuration.judges:
65+
if not self._judges:
6866
log.debug('No judges configured, no evaluations to run')
6967
return []
7068
results: List[JudgeResult] = []
71-
for jc in self._judge_configuration.judges:
72-
judge = self._judges.get(jc.key)
73-
if not judge:
74-
log.warning(f'Judge not enabled: {jc.key}')
75-
continue
76-
results.append(await judge.evaluate(input_text, output_text, jc.sampling_rate))
69+
for judge in self._judges:
70+
results.append(await judge.evaluate(input_text, output_text))
7771
return results

packages/sdk/server-ai/src/ldai/judge/__init__.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,31 +24,37 @@ def __init__(
2424
self,
2525
ai_config: AIJudgeConfig,
2626
model_runner: ModelRunner,
27+
sample_rate: float = 1.0,
2728
):
2829
"""
2930
Initialize the Judge.
3031
3132
:param ai_config: The judge AI configuration
3233
:param model_runner: The model runner to use for evaluation
34+
:param sample_rate: Default sampling rate (0-1) used when ``evaluate``
35+
or ``evaluate_messages`` is called without an explicit rate (defaults to 1).
3336
"""
3437
self._ai_config = ai_config
3538
self._model_runner = model_runner
39+
self.sample_rate = sample_rate
3640
self._evaluation_response_structure = EvaluationSchemaBuilder.build()
3741

3842
async def evaluate(
3943
self,
4044
input_text: str,
4145
output_text: str,
42-
sampling_rate: float = 1.0,
46+
sampling_rate: Optional[float] = None,
4347
) -> JudgeResult:
4448
"""
4549
Evaluates an AI response using the judge's configuration.
4650
4751
:param input_text: The input prompt or question that was provided to the AI
4852
:param output_text: The AI-generated response to be evaluated
49-
:param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1)
53+
:param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed.
54+
When ``None`` (the default), falls back to ``self.sample_rate``.
5055
:return: The result of the judge evaluation.
5156
"""
57+
effective_rate = sampling_rate if sampling_rate is not None else self.sample_rate
5258
judge_result = JudgeResult(judge_config_key=self._ai_config.key)
5359

5460
try:
@@ -64,8 +70,8 @@ async def evaluate(
6470
judge_result.error_message = 'Judge configuration must include messages'
6571
return judge_result
6672

67-
if random.random() > sampling_rate:
68-
log.debug(f'Judge evaluation skipped due to sampling rate: {sampling_rate}')
73+
if random.random() > effective_rate:
74+
log.debug(f'Judge evaluation skipped due to sampling rate: {effective_rate}')
6975
return judge_result
7076

7177
judge_result.sampled = True
@@ -100,20 +106,22 @@ async def evaluate_messages(
100106
self,
101107
messages: list[LDMessage],
102108
response: ModelResponse,
103-
sampling_ratio: float = 1.0,
109+
sampling_ratio: Optional[float] = None,
104110
) -> JudgeResult:
105111
"""
106112
Evaluates an AI response from chat messages and response.
107113
108114
:param messages: Array of messages representing the conversation history
109115
:param response: The AI response to be evaluated
110-
:param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
116+
:param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed.
117+
When ``None`` (the default), falls back to ``self.sample_rate``.
111118
:return: The result of the judge evaluation.
112119
"""
120+
effective_rate = sampling_ratio if sampling_ratio is not None else self.sample_rate
113121
input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
114122
output_text = response.message.content
115123

116-
return await self.evaluate(input_text, output_text, sampling_ratio)
124+
return await self.evaluate(input_text, output_text, effective_rate)
117125

118126
def get_ai_config(self) -> AIJudgeConfig:
119127
"""

packages/sdk/server-ai/tests/test_judge.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,20 @@ def test_judge_initializes_with_evaluation_metric_key(
121121
assert 'score' in judge._evaluation_response_structure['properties']
122122
assert 'reasoning' in judge._evaluation_response_structure['properties']
123123

124+
def test_judge_sample_rate_defaults_to_one(
125+
self, judge_config_with_key: AIJudgeConfig, mock_runner
126+
):
127+
"""sample_rate should default to 1.0 when not provided."""
128+
judge = Judge(judge_config_with_key, mock_runner)
129+
assert judge.sample_rate == 1.0
130+
131+
def test_judge_sample_rate_can_be_set(
132+
self, judge_config_with_key: AIJudgeConfig, mock_runner
133+
):
134+
"""sample_rate should be settable via the constructor."""
135+
judge = Judge(judge_config_with_key, mock_runner, sample_rate=0.25)
136+
assert judge.sample_rate == 0.25
137+
124138

125139
class TestJudgeEvaluate:
126140
"""Tests for Judge.evaluate() method."""
@@ -308,6 +322,32 @@ async def test_evaluate_respects_sampling_rate(
308322
assert result.success is False
309323
mock_runner.invoke_structured_model.assert_not_called()
310324

325+
@pytest.mark.asyncio
326+
async def test_evaluate_uses_instance_sample_rate_when_arg_omitted(
327+
self, judge_config_with_key: AIJudgeConfig, mock_runner
328+
):
329+
"""When sampling_rate arg is omitted, the instance's sample_rate is used."""
330+
judge = Judge(judge_config_with_key, mock_runner, sample_rate=0.0)
331+
332+
result = await judge.evaluate("input", "output")
333+
334+
assert isinstance(result, JudgeResult)
335+
assert result.sampled is False
336+
mock_runner.invoke_structured_model.assert_not_called()
337+
338+
@pytest.mark.asyncio
339+
async def test_evaluate_arg_overrides_instance_sample_rate(
340+
self, judge_config_with_key: AIJudgeConfig, mock_runner
341+
):
342+
"""An explicit sampling_rate=0.0 must override an instance sample_rate of 1.0."""
343+
judge = Judge(judge_config_with_key, mock_runner, sample_rate=1.0)
344+
345+
result = await judge.evaluate("input", "output", sampling_rate=0.0)
346+
347+
assert isinstance(result, JudgeResult)
348+
assert result.sampled is False
349+
mock_runner.invoke_structured_model.assert_not_called()
350+
311351

312352
class TestJudgeEvaluateMessages:
313353
"""Tests for Judge.evaluate_messages() method."""

0 commit comments

Comments
 (0)