Skip to content

Commit 23baeb4

Browse files
feat: adds ability to use inverted judges (#168)
**Requirements** - [x] I have added test coverage for new or changed functionality - [x] I have followed the repository's [pull request submission guidelines](../blob/main/CONTRIBUTING.md#submitting-pull-requests) - [x] I have validated my changes against all supported platform versions **Describe the solution you've provided** Implements handling for "inverted" judges. **Describe alternatives you've considered** This gets feature parity with our online evals functionality; no alternatives considered. **Additional context** When a metric has `is_inverted` set, it's intended that the evaluation of the score flips from `>=` to `<=`. This adds a util `judge_passed` to handle that logic and implements it throughout. We don't surface the inverted property in the SDK, so we fetch the judge directly to get this information. <!-- CURSOR_SUMMARY --> --- > [!NOTE] > **Medium Risk** > Changes core judge pass/fail semantics and adds per-judge REST calls (`get_ai_config`) during config-driven runs, which could affect optimization outcomes and introduce new failure/performance modes if the API is unavailable or slow. > > **Overview** > Adds first-class support for **inverted judges** (where *lower* scores are better) by introducing a shared `judge_passed` helper and using it for pass/fail decisions in `OptimizationClient` and in prompt feedback generation. > > Extends `OptimizationJudge` with an `is_inverted` flag and, for `optimize_from_config`, fetches each judge’s `isInverted` value via `api_client.get_ai_config` when building options. Updates logging to include the inverted status, and adds targeted tests covering the helper, mixed inverted/standard evaluation, config building behavior, and `variation_prompt_feedback` output. > > <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit a8f14de. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup> <!-- /CURSOR_SUMMARY -->
2 parents e8c6692 + a8f14de commit 23baeb4

5 files changed

Lines changed: 277 additions & 6 deletions

File tree

packages/optimization/src/ldai_optimizer/client.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
extract_json_from_response,
6161
generate_slug,
6262
interpolate_variables,
63+
judge_passed,
6364
restore_variable_placeholders,
6465
validate_variation_response,
6566
)
@@ -142,6 +143,7 @@ def _compute_validation_count(pool_size: int) -> int:
142143
}
143144

144145

146+
145147
class OptimizationClient:
146148
_options: OptimizationOptions
147149
_ldClient: LDAIClient
@@ -470,13 +472,14 @@ async def _call_judges(
470472
if optimization_judge.threshold is not None
471473
else 1.0
472474
)
473-
passed = result.score >= threshold
475+
passed = judge_passed(result.score, threshold, optimization_judge.is_inverted)
474476
logger.debug(
475-
"[Iteration %d] -> Judge '%s' scored %.3f (threshold=%.3f) -> %s%s",
477+
"[Iteration %d] -> Judge '%s' scored %.3f (threshold=%.3f, inverted=%s) -> %s%s",
476478
iteration,
477479
judge_key,
478480
result.score,
479481
threshold,
482+
optimization_judge.is_inverted,
480483
"PASSED" if passed else "FAILED",
481484
f" | {result.rationale}" if result.rationale else "",
482485
)
@@ -1492,9 +1495,13 @@ def _build_options_from_config(
14921495
)
14931496

14941497
for judge in config["judges"]:
1495-
judges[judge["key"]] = OptimizationJudge(
1498+
judge_key = judge["key"]
1499+
ai_config = api_client.get_ai_config(options.project_key, judge_key)
1500+
is_inverted = bool(ai_config.get("isInverted", False)) if ai_config else False
1501+
judges[judge_key] = OptimizationJudge(
14961502
threshold=float(judge.get("threshold", 0.95)),
1497-
judge_key=judge["key"],
1503+
judge_key=judge_key,
1504+
is_inverted=is_inverted,
14981505
)
14991506

15001507
raw_ground_truth: List[str] = config.get("groundTruthResponses") or []
@@ -1852,7 +1859,7 @@ def _evaluate_response(self, optimize_context: OptimizationContext) -> bool:
18521859
if optimization_judge.threshold is not None
18531860
else 1.0
18541861
)
1855-
if result.score < threshold:
1862+
if not judge_passed(result.score, threshold, optimization_judge.is_inverted):
18561863
return False
18571864

18581865
return True

packages/optimization/src/ldai_optimizer/dataclasses.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ class OptimizationJudge:
196196
threshold: float
197197
judge_key: Optional[str] = None
198198
acceptance_statement: Optional[str] = None
199+
is_inverted: bool = False
199200

200201

201202
@dataclass

packages/optimization/src/ldai_optimizer/prompts.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
OptimizationContext,
88
OptimizationJudge,
99
)
10+
from ldai_optimizer.util import judge_passed
1011

1112
_DURATION_KEYWORDS = re.compile(
1213
r"\b(fast|faster|quickly|quick|latency|low-latency|duration|response\s+time|"
@@ -285,7 +286,7 @@ def variation_prompt_feedback(
285286
if optimization_judge:
286287
score = result.score
287288
if optimization_judge.threshold is not None:
288-
passed = score >= optimization_judge.threshold
289+
passed = judge_passed(score, optimization_judge.threshold, optimization_judge.is_inverted)
289290
status = "PASSED" if passed else "FAILED"
290291
feedback_line = (
291292
f"- {judge_key}: Score {score:.3f}"

packages/optimization/src/ldai_optimizer/util.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,3 +303,13 @@ def extract_json_from_response(response_str: str) -> Dict[str, Any]:
303303
)
304304

305305
return response_data
306+
307+
308+
def judge_passed(score: float, threshold: float, is_inverted: bool) -> bool:
    """Decide whether a judge's score satisfies its threshold.

    Args:
        score: The score produced by the judge.
        threshold: The boundary value the score is compared against.
        is_inverted: When truthy, lower scores are better (e.g. toxicity)
            and the score passes if ``score <= threshold``. Otherwise higher
            scores are better and the score passes if ``score >= threshold``.

    Returns:
        True when the score passes. A score exactly equal to the threshold
        passes in both modes.
    """
    if is_inverted:
        # Lower is better: stay at or below the threshold.
        return score <= threshold
    # Higher is better: reach the threshold.
    return score >= threshold

packages/optimization/tests/test_client.py

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from ldclient import Context
1212

1313
from ldai_optimizer.client import OptimizationClient, _compute_validation_count, _find_model_config
14+
from ldai_optimizer.util import judge_passed
1415
from ldai_optimizer.dataclasses import (
1516
AIJudgeCallConfig,
1617
GroundTruthOptimizationOptions,
@@ -28,6 +29,7 @@
2829
_acceptance_criteria_implies_duration_optimization,
2930
build_new_variation_prompt,
3031
variation_prompt_acceptance_criteria,
32+
variation_prompt_feedback,
3133
variation_prompt_improvement_instructions,
3234
variation_prompt_overfit_warning,
3335
variation_prompt_preamble,
@@ -1847,6 +1849,8 @@ def _make_mock_api_client() -> MagicMock:
18471849
mock.post_agent_optimization_result = MagicMock(return_value="result-uuid-789")
18481850
mock.patch_agent_optimization_result = MagicMock()
18491851
mock.get_model_configs = MagicMock(return_value=[])
1852+
# Default: AI Configs do not have isInverted set
1853+
mock.get_ai_config = MagicMock(return_value={})
18501854
return mock
18511855

18521856

@@ -4404,3 +4408,251 @@ async def test_optimization_key_in_post_url_uses_string_key_not_uuid(self):
44044408
assert opt_key_arg == "my-optimization", (
44054409
f"Expected string key 'my-optimization', got '{opt_key_arg}'"
44064410
)
4411+
4412+
4413+
# ---------------------------------------------------------------------------
4414+
# judge_passed helper
4415+
# ---------------------------------------------------------------------------
4416+
4417+
4418+
class TestJudgePassed:
    """Direction and boundary checks for the ``judge_passed`` helper."""

    def test_standard_judge_passes_at_or_above_threshold(self):
        # Equality counts as a pass for standard (higher-is-better) judges.
        for score in (0.8, 1.0):
            assert judge_passed(score, 0.8, is_inverted=False) is True

    def test_standard_judge_fails_below_threshold(self):
        verdict = judge_passed(0.5, 0.8, is_inverted=False)
        assert verdict is False

    def test_inverted_judge_passes_at_or_below_threshold(self):
        # Equality counts as a pass for inverted (lower-is-better) judges too.
        for score in (0.1, 0.3):
            assert judge_passed(score, 0.3, is_inverted=True) is True

    def test_inverted_judge_fails_above_threshold(self):
        verdict = judge_passed(0.8, 0.3, is_inverted=True)
        assert verdict is False
4432+
4433+
4434+
# ---------------------------------------------------------------------------
4435+
# _evaluate_response with inverted judges
4436+
# ---------------------------------------------------------------------------
4437+
4438+
4439+
class TestEvaluateResponseInvertedJudges:
    """``_evaluate_response`` outcomes when inverted judges are configured."""

    def setup_method(self):
        self.client = _make_client()

    @staticmethod
    def _toxicity_judge() -> OptimizationJudge:
        """Return an inverted judge with a 0.3 threshold."""
        return OptimizationJudge(threshold=0.3, acceptance_statement="Low toxicity.", is_inverted=True)

    @staticmethod
    def _relevance_judge() -> OptimizationJudge:
        """Return a standard judge with a 0.8 threshold."""
        return OptimizationJudge(threshold=0.8, acceptance_statement="Relevant.", is_inverted=False)

    def _use_toxicity_only(self) -> None:
        """Configure the client with only the inverted toxicity judge."""
        self.client._options = _make_options(judges={"toxicity": self._toxicity_judge()})

    def _use_mixed_judges(self) -> None:
        """Configure the client with one standard and one inverted judge."""
        self.client._options = _make_options(
            judges={
                "relevance": self._relevance_judge(),
                "toxicity": self._toxicity_judge(),
            }
        )

    def _ctx_with_scores(self, scores: Dict[str, JudgeResult]) -> OptimizationContext:
        """Build an iteration-1 context carrying the given judge scores."""
        fixed_fields = dict(
            completion_response="Some response.",
            current_instructions="Do X.",
            current_parameters={},
            current_variables={},
            iteration=1,
        )
        return OptimizationContext(scores=scores, **fixed_fields)

    def test_inverted_judge_passes_when_score_below_threshold(self):
        self._use_toxicity_only()
        context = self._ctx_with_scores({"toxicity": JudgeResult(score=0.1)})
        assert self.client._evaluate_response(context) is True

    def test_inverted_judge_passes_at_exact_threshold(self):
        self._use_toxicity_only()
        context = self._ctx_with_scores({"toxicity": JudgeResult(score=0.3)})
        assert self.client._evaluate_response(context) is True

    def test_inverted_judge_fails_when_score_above_threshold(self):
        self._use_toxicity_only()
        context = self._ctx_with_scores({"toxicity": JudgeResult(score=0.8)})
        assert self.client._evaluate_response(context) is False

    def test_mixed_judges_all_must_pass(self):
        """A standard judge and an inverted judge must both pass for overall pass."""
        self._use_mixed_judges()
        # Relevance is high enough and toxicity is low enough.
        judge_scores = {
            "relevance": JudgeResult(score=0.9),
            "toxicity": JudgeResult(score=0.1),
        }
        assert self.client._evaluate_response(self._ctx_with_scores(judge_scores)) is True

    def test_mixed_judges_fails_when_inverted_judge_too_high(self):
        self._use_mixed_judges()
        # Relevance passes, but the toxicity score exceeds its threshold.
        judge_scores = {
            "relevance": JudgeResult(score=0.9),
            "toxicity": JudgeResult(score=0.8),
        }
        assert self.client._evaluate_response(self._ctx_with_scores(judge_scores)) is False

    def test_mixed_judges_fails_when_standard_judge_too_low(self):
        self._use_mixed_judges()
        # Toxicity passes, but the relevance score falls short of its threshold.
        judge_scores = {
            "relevance": JudgeResult(score=0.5),
            "toxicity": JudgeResult(score=0.1),
        }
        assert self.client._evaluate_response(self._ctx_with_scores(judge_scores)) is False
4516+
4517+
4518+
# ---------------------------------------------------------------------------
4519+
# _build_options_from_config reads isInverted via get_ai_config REST call
4520+
# ---------------------------------------------------------------------------
4521+
4522+
4523+
class TestBuildOptionsFromConfigIsInverted:
    """``_build_options_from_config`` takes ``is_inverted`` from ``get_ai_config``."""

    def setup_method(self):
        client = _make_client()
        client._agent_key = "my-agent"
        client._initialize_class_members_from_config(_make_agent_config())
        client._options = _make_options()
        self.client = client
        self.api_client = _make_mock_api_client()

    @staticmethod
    def _judges_only_config(*judges):
        """Return the base API config with the given judges and no acceptance statements."""
        return dict(_API_CONFIG, acceptanceStatements=[], judges=list(judges))

    def _build(self, config=None, options=None) -> OptimizationOptions:
        """Run the builder, falling back to default config/options when none are given."""
        effective_config = config if config else dict(_API_CONFIG)
        effective_options = options if options else _make_from_config_options()
        return self.client._build_options_from_config(
            effective_config,
            effective_options,
            self.api_client,
            optimization_key="opt-key-123",
            run_id="run-uuid-456",
            model_configs=[],
        )

    def test_is_inverted_true_when_ai_config_returns_isInverted(self):
        """is_inverted is set from the AI Config REST API response for each judge."""
        self.api_client.get_ai_config.return_value = {"isInverted": True}
        built = self._build(config=self._judges_only_config({"key": "toxicity", "threshold": 0.3}))
        assert built.judges["toxicity"].is_inverted is True

    def test_is_inverted_false_when_ai_config_has_no_isInverted(self):
        self.api_client.get_ai_config.return_value = {}
        built = self._build(config=self._judges_only_config({"key": "relevance", "threshold": 0.8}))
        assert built.judges["relevance"].is_inverted is False

    def test_is_inverted_false_when_ai_config_has_isInverted_false(self):
        self.api_client.get_ai_config.return_value = {"isInverted": False}
        built = self._build(config=self._judges_only_config({"key": "relevance", "threshold": 0.8}))
        assert built.judges["relevance"].is_inverted is False

    def test_get_ai_config_called_once_per_judge(self):
        two_judges = self._judges_only_config(
            {"key": "toxicity", "threshold": 0.3},
            {"key": "relevance", "threshold": 0.8},
        )
        self._build(config=two_judges)
        assert self.api_client.get_ai_config.call_count == 2

    def test_acceptance_statements_skip_get_ai_config(self):
        """Acceptance statement judges are not backed by AI Configs."""
        statements_only = dict(
            _API_CONFIG,
            judges=[],
            acceptanceStatements=[{"statement": "Be accurate.", "threshold": 0.9}],
        )
        self._build(config=statements_only)
        self.api_client.get_ai_config.assert_not_called()

    def test_raises_when_get_ai_config_fails(self):
        """A failing get_ai_config call propagates — the build should not silently ignore it."""
        self.api_client.get_ai_config.side_effect = Exception("API error")
        failing_config = self._judges_only_config({"key": "toxicity", "threshold": 0.3})
        with pytest.raises(Exception, match="API error"):
            self._build(config=failing_config)

    def test_per_judge_isInverted_mixed(self):
        """Different judges can have different isInverted values."""

        def _get_ai_config_side_effect(project_key, config_key):
            # Only the toxicity judge is reported as inverted.
            return {"isInverted": config_key == "toxicity"}

        self.api_client.get_ai_config.side_effect = _get_ai_config_side_effect
        two_judges = self._judges_only_config(
            {"key": "toxicity", "threshold": 0.3},
            {"key": "relevance", "threshold": 0.8},
        )
        built = self._build(config=two_judges)
        assert built.judges["toxicity"].is_inverted is True
        assert built.judges["relevance"].is_inverted is False
4604+
4605+
4606+
# ---------------------------------------------------------------------------
4607+
# variation_prompt_feedback with inverted judges
4608+
# ---------------------------------------------------------------------------
4609+
4610+
4611+
class TestVariationPromptFeedbackInvertedJudges:
    """PASSED/FAILED labels in ``variation_prompt_feedback`` follow judge inversion."""

    @staticmethod
    def _toxicity_judge() -> OptimizationJudge:
        """Return an inverted judge with a 0.3 threshold."""
        return OptimizationJudge(threshold=0.3, acceptance_statement="Low toxicity.", is_inverted=True)

    @staticmethod
    def _relevance_judge() -> OptimizationJudge:
        """Return a standard judge with a 0.8 threshold."""
        return OptimizationJudge(threshold=0.8, acceptance_statement="Relevant.", is_inverted=False)

    def _make_ctx(self, scores: Dict[str, JudgeResult], iteration: int = 1) -> OptimizationContext:
        """Build a context carrying the given judge scores for one iteration."""
        fixed_fields = dict(
            completion_response="Some response.",
            current_instructions="Do X.",
            current_parameters={},
            current_variables={},
        )
        return OptimizationContext(scores=scores, iteration=iteration, **fixed_fields)

    def test_inverted_judge_shows_passed_when_score_below_threshold(self):
        history = [self._make_ctx({"toxicity": JudgeResult(score=0.1, rationale="Very clean.")})]
        feedback = variation_prompt_feedback(history, {"toxicity": self._toxicity_judge()})
        assert "PASSED" in feedback

    def test_inverted_judge_shows_failed_when_score_above_threshold(self):
        history = [self._make_ctx({"toxicity": JudgeResult(score=0.8, rationale="Very toxic.")})]
        feedback = variation_prompt_feedback(history, {"toxicity": self._toxicity_judge()})
        assert "FAILED" in feedback

    def test_standard_judge_shows_passed_when_score_above_threshold(self):
        history = [self._make_ctx({"relevance": JudgeResult(score=0.9)})]
        feedback = variation_prompt_feedback(history, {"relevance": self._relevance_judge()})
        assert "PASSED" in feedback

    def test_standard_judge_shows_failed_when_score_below_threshold(self):
        history = [self._make_ctx({"relevance": JudgeResult(score=0.5)})]
        feedback = variation_prompt_feedback(history, {"relevance": self._relevance_judge()})
        assert "FAILED" in feedback

    def test_mixed_judges_feedback_reflects_correct_pass_fail(self):
        history = [
            self._make_ctx({
                "relevance": JudgeResult(score=0.9),
                "toxicity": JudgeResult(score=0.05),
            })
        ]
        judges = {
            "relevance": self._relevance_judge(),
            "toxicity": self._toxicity_judge(),
        }
        feedback = variation_prompt_feedback(history, judges)
        # Both should be PASSED — relevance high enough, toxicity low enough
        assert feedback.count("PASSED") == 2
        assert "FAILED" not in feedback

0 commit comments

Comments
 (0)