Skip to content

Commit 806b564

Browse files
committed
circumvents judge_config calls to make judge evaluations in optimization
1 parent b506fef commit 806b564

2 files changed

Lines changed: 132 additions & 48 deletions

File tree

packages/optimization/src/ldai_optimizer/client.py

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import logging
2121
import os
2222
import random
23+
import re
2324
import time
2425
import uuid
2526
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
@@ -69,6 +70,15 @@
6970
logger.addFilter(RedactionFilter())
7071

7172

73+
def _interpolate(template: str, variables: Dict[str, Any]) -> str:
74+
"""Replace {{key}} tokens with values from variables; unresolved tokens become empty string."""
75+
return re.sub(
76+
r"\{\{(\w+)\}\}",
77+
lambda m: str(variables.get(m.group(1), "")),
78+
template,
79+
)
80+
81+
7282
def _find_model_config(
7383
model_name: str, configs: List[Dict[str, Any]]
7484
) -> Optional[Dict[str, Any]]:
@@ -402,18 +412,65 @@ def _judge_config(
402412
variables: Dict[str, Any],
403413
) -> AIJudgeConfig:
404414
"""
405-
Fetch a judge configuration from the LaunchDarkly client.
415+
Fetch a judge configuration by evaluating the flag variation directly.
406416
407-
Thin wrapper around LDAIClient.judge_config so callers do not need a
408-
direct reference to the client.
417+
Bypasses LDAIClient.judge_config to avoid the reserved-variable warnings
418+
for 'message_history' and 'response_to_evaluate'. Those variables are
419+
interpolated here with their actual values instead of being neutralised
420+
by the SDK. If the template contains only a system message, a user turn
421+
is synthesised from the provided message_history and response_to_evaluate
422+
so that _evaluate_config_judge always receives a complete conversation.
409423
410424
:param judge_key: The key for the judge configuration in LaunchDarkly
411425
:param context: The evaluation context
412-
:param default: Fallback config when the flag is disabled or unreachable
413-
:param variables: Template variables for instruction interpolation
426+
:param default: Unused; kept for signature compatibility
427+
:param variables: Template variables including message_history and response_to_evaluate
414428
:return: The resolved AIJudgeConfig
415429
"""
416-
return self._ldClient.judge_config(judge_key, context, default, variables)
430+
variation: Dict[str, Any] = self._ldClient._client.variation(judge_key, context, {})
431+
enabled: bool = bool(variation.get("_ldMeta", {}).get("enabled", False))
432+
433+
all_variables: Dict[str, Any] = {"ldctx": context.to_dict(), **variables}
434+
435+
messages: List[LDMessage] = []
436+
raw_messages = variation.get("messages")
437+
if isinstance(raw_messages, list) and all(isinstance(m, dict) for m in raw_messages):
438+
messages = [
439+
LDMessage(
440+
role=m["role"],
441+
content=_interpolate(m.get("content", ""), all_variables),
442+
)
443+
for m in raw_messages
444+
]
445+
446+
# New-style templates only have a system message. Auto-generate a user
447+
# turn so _evaluate_config_judge always has a complete conversation to split.
448+
if not any(m.role == "user" for m in messages):
449+
message_history = variables.get("message_history", "")
450+
response_to_evaluate = variables.get("response_to_evaluate", "")
451+
parts: List[str] = []
452+
if message_history:
453+
parts.append(str(message_history))
454+
parts.append(f"Here is the response to evaluate: {response_to_evaluate}")
455+
messages.append(LDMessage(role="user", content="\n\n".join(parts)))
456+
457+
model: Optional[ModelConfig] = None
458+
raw_model = variation.get("model")
459+
if isinstance(raw_model, dict):
460+
model = ModelConfig(
461+
name=raw_model.get("name", ""),
462+
parameters=raw_model.get("parameters"),
463+
custom=raw_model.get("custom"),
464+
)
465+
466+
return AIJudgeConfig(
467+
key=judge_key,
468+
enabled=enabled,
469+
create_tracker=lambda: None,
470+
model=model,
471+
messages=messages,
472+
evaluation_metric_key=variation.get("evaluationMetricKey"),
473+
)
417474

418475
def _serialize_scores(
419476
self, judge_results: Dict[str, JudgeResult]

packages/optimization/tests/test_client.py

Lines changed: 69 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from unittest.mock import AsyncMock, MagicMock, patch
77

88
import pytest
9-
from ldai import AIAgentConfig, AIJudgeConfig, LDAIClient
9+
from ldai import AIAgentConfig, LDAIClient
1010
from ldai.client import Evaluator
1111
from ldai.models import LDMessage, ModelConfig
1212
from ldai.tracker import TokenUsage
@@ -717,20 +717,19 @@ def setup_method(self):
717717
self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
718718
self.client._options = _make_options(handle_judge_call=self.handle_judge_call)
719719

720-
def _make_judge_config(self, enabled: bool = True) -> AIJudgeConfig:
721-
return AIJudgeConfig(
722-
key="ld-judge-key",
723-
enabled=enabled,
724-
create_tracker=MagicMock,
725-
model=ModelConfig(name="gpt-4o", parameters={}),
726-
messages=[
727-
LDMessage(role="system", content="You are an evaluator."),
728-
LDMessage(role="user", content="Evaluate this response."),
720+
def _make_raw_variation(self, enabled: bool = True) -> Dict[str, Any]:
721+
"""Raw variation dict as returned by _client.variation for a judge flag."""
722+
return {
723+
"_ldMeta": {"enabled": enabled},
724+
"messages": [
725+
{"role": "system", "content": "You are an evaluator."},
726+
{"role": "user", "content": "Evaluate this response."},
729727
],
730-
)
728+
"model": {"name": "gpt-4o", "parameters": {}},
729+
}
731730

732731
async def test_calls_handle_judge_call_with_correct_config_type(self):
733-
self.mock_ldai.judge_config.return_value = self._make_judge_config()
732+
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
734733
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
735734
await self.client._evaluate_config_judge(
736735
judge_key="quality",
@@ -748,7 +747,7 @@ async def test_calls_handle_judge_call_with_correct_config_type(self):
748747
assert isinstance(ctx, OptimizationJudgeContext)
749748

750749
async def test_messages_has_system_and_user_turns(self):
751-
self.mock_ldai.judge_config.return_value = self._make_judge_config()
750+
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
752751
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
753752
await self.client._evaluate_config_judge(
754753
judge_key="quality",
@@ -763,7 +762,7 @@ async def test_messages_has_system_and_user_turns(self):
763762
assert roles == ["system", "user"]
764763

765764
async def test_messages_system_content_matches_instructions(self):
766-
self.mock_ldai.judge_config.return_value = self._make_judge_config()
765+
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
767766
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
768767
await self.client._evaluate_config_judge(
769768
judge_key="quality",
@@ -778,7 +777,7 @@ async def test_messages_system_content_matches_instructions(self):
778777
assert system_msg.content == config.instructions
779778

780779
async def test_messages_user_content_matches_context_user_input(self):
781-
self.mock_ldai.judge_config.return_value = self._make_judge_config()
780+
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
782781
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
783782
await self.client._evaluate_config_judge(
784783
judge_key="quality",
@@ -793,7 +792,7 @@ async def test_messages_user_content_matches_context_user_input(self):
793792
assert user_msg.content == ctx.user_input
794793

795794
async def test_messages_user_content_contains_ld_user_message(self):
796-
self.mock_ldai.judge_config.return_value = self._make_judge_config()
795+
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
797796
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
798797
await self.client._evaluate_config_judge(
799798
judge_key="quality",
@@ -808,7 +807,7 @@ async def test_messages_user_content_contains_ld_user_message(self):
808807
assert "Evaluate this response." in user_msg.content
809808

810809
async def test_returns_zero_score_when_judge_disabled(self):
811-
self.mock_ldai.judge_config.return_value = self._make_judge_config(enabled=False)
810+
self.mock_ldai._client.variation.return_value = self._make_raw_variation(enabled=False)
812811
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
813812
result = await self.client._evaluate_config_judge(
814813
judge_key="quality",
@@ -821,48 +820,76 @@ async def test_returns_zero_score_when_judge_disabled(self):
821820
assert result.score == 0.0
822821
self.handle_judge_call.assert_not_called()
823822

824-
async def test_returns_zero_score_when_judge_has_no_messages(self):
825-
judge_config = AIJudgeConfig(
826-
key="ld-judge-key",
827-
enabled=True,
828-
create_tracker=MagicMock,
829-
model=ModelConfig(name="gpt-4o", parameters={}),
830-
messages=None,
831-
)
832-
self.mock_ldai.judge_config.return_value = judge_config
823+
async def test_system_only_template_auto_generates_user_message(self):
824+
"""When the flag template has only a system message, a user turn is synthesised."""
825+
self.mock_ldai._client.variation.return_value = {
826+
"_ldMeta": {"enabled": True},
827+
"messages": [{"role": "system", "content": "You are an evaluator."}],
828+
"model": {"name": "gpt-4o", "parameters": {}},
829+
}
833830
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
834-
result = await self.client._evaluate_config_judge(
831+
await self.client._evaluate_config_judge(
835832
judge_key="quality",
836833
optimization_judge=judge,
837-
completion_response="Any.",
834+
completion_response="The answer is 42.",
838835
iteration=1,
839836
reasoning_history="",
840-
user_input="Anything?",
837+
user_input="What is the answer?",
841838
)
842-
assert result.score == 0.0
843-
self.handle_judge_call.assert_not_called()
844-
845-
async def test_template_variables_merged_into_judge_config_call(self):
846-
self.mock_ldai.judge_config.return_value = self._make_judge_config()
839+
_, config, _, _ = self.handle_judge_call.call_args.args
840+
user_msg = next(m for m in config.messages if m.role == "user")
841+
assert "The answer is 42." in user_msg.content
842+
843+
async def test_template_variables_interpolated_into_messages(self):
844+
"""Custom agent variables are interpolated into judge template messages."""
845+
self.mock_ldai._client.variation.return_value = {
846+
"_ldMeta": {"enabled": True},
847+
"messages": [
848+
{"role": "system", "content": "Evaluate in {{language}}."},
849+
{"role": "user", "content": "Evaluate this response."},
850+
],
851+
"model": {"name": "gpt-4o", "parameters": {}},
852+
}
847853
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
848-
variables = {"language": "Spanish"}
849854
await self.client._evaluate_config_judge(
850855
judge_key="quality",
851856
optimization_judge=judge,
852857
completion_response="Answer.",
853858
iteration=1,
854859
reasoning_history="",
855860
user_input="Q?",
856-
variables=variables,
861+
variables={"language": "Spanish"},
857862
)
858-
call_kwargs = self.mock_ldai.judge_config.call_args
859-
passed_vars = call_kwargs.args[3] if call_kwargs.args else call_kwargs.kwargs.get("variables", {})
860-
assert passed_vars.get("language") == "Spanish"
861-
assert "message_history" in passed_vars
862-
assert "response_to_evaluate" in passed_vars
863+
_, config, _, _ = self.handle_judge_call.call_args.args
864+
assert "Spanish" in config.instructions
865+
866+
async def test_reserved_variables_interpolated_into_template_messages(self):
    """A template that references the reserved judge variables still yields a usable judge call.

    The mocked flag variation uses ``{{message_history}}`` in its system
    message and ``{{response_to_evaluate}}`` in its user message. After
    ``_evaluate_config_judge`` runs, the config handed to ``handle_judge_call``
    must keep the system prefix and carry the completion text in its user turn.
    """
    template_messages = [
        {"role": "system", "content": "History: {{message_history}}"},
        {"role": "user", "content": "Response: {{response_to_evaluate}}"},
    ]
    self.mock_ldai._client.variation.return_value = {
        "_ldMeta": {"enabled": True},
        "messages": template_messages,
        "model": {"name": "gpt-4o", "parameters": {}},
    }

    optimization_judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
    await self.client._evaluate_config_judge(
        judge_key="quality",
        optimization_judge=optimization_judge,
        completion_response="My answer.",
        iteration=1,
        reasoning_history="",
        user_input="Q?",
    )

    # handle_judge_call receives four positional arguments; only the second
    # (the judge config) is inspected here.
    _, config, _, _ = self.handle_judge_call.call_args.args

    def first_with_role(role):
        # First message with the given role; raises StopIteration if absent.
        return next(msg for msg in config.messages if msg.role == role)

    # Only the literal prefix of the system template is checked.
    assert "History:" in first_with_role("system").content
    assert "My answer." in first_with_role("user").content
863890

864891
async def test_agent_tools_included_without_evaluation_tool(self):
865-
self.mock_ldai.judge_config.return_value = self._make_judge_config()
892+
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
866893
agent_tool = ToolDefinition(name="search", description="Search", input_schema={})
867894
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
868895
await self.client._evaluate_config_judge(

0 commit comments

Comments
 (0)