Skip to content

Commit f62ae83

Browse files
jsonbailey and claude committed
fix: strip legacy judge messages on direct judge_config() path
Move reserved-variable warnings and ``message_history``/``response_to_evaluate`` marker injection from ``_create_judge_instance`` into ``_judge_config`` so both the direct ``judge_config()`` API and the ``create_judge()`` wrapper get correct legacy-message stripping. Previously the markers were only injected by ``_create_judge_instance``, so callers using ``judge_config()`` directly saw legacy ``{{message_history}}``/``{{response_to_evaluate}}`` placeholders rendered to empty strings by Mustache before ``_strip_legacy_judge_messages`` ran, leaving the legacy messages intact in the returned config. Mirrors the JS sibling fix in launchdarkly/js-core#1364.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 583939d commit f62ae83

2 files changed

Lines changed: 153 additions & 7 deletions

File tree

packages/sdk/server-ai/src/ldai/client.py

Lines changed: 20 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -215,9 +215,27 @@ def _judge_config(
215215
default: AIJudgeConfigDefault,
216216
variables: Optional[Dict[str, Any]] = None,
217217
) -> AIJudgeConfig:
218+
if variables is not None:
219+
if variables.get('message_history') is not None:
220+
log.warning(
221+
"The variable 'message_history' is reserved by the judge and will be ignored."
222+
)
223+
if variables.get('response_to_evaluate') is not None:
224+
log.warning(
225+
"The variable 'response_to_evaluate' is reserved by the judge and will be ignored."
226+
)
227+
228+
# Re-inject the reserved variables as their literal placeholders so they
229+
# survive Mustache interpolation in ``__evaluate``. Without this, legacy
230+
# templates like ``{{message_history}}`` get rendered to empty strings and
231+
# ``_strip_legacy_judge_messages`` below cannot detect them.
232+
extended_variables = dict(variables) if variables else {}
233+
extended_variables['message_history'] = '{{message_history}}'
234+
extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'
235+
218236
(model, provider, messages, instructions,
219237
tracker_factory, enabled, judge_configuration, variation) = self.__evaluate(
220-
key, context, default.to_dict(), variables
238+
key, context, default.to_dict(), extended_variables
221239
)
222240

223241
def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]:
@@ -336,12 +354,8 @@ def _create_judge_instance(
336354
when materializing judges referenced by an AI config's judge configuration.
337355
"""
338356
try:
339-
extended_variables = dict(variables) if variables else {}
340-
extended_variables['message_history'] = '{{message_history}}'
341-
extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'
342-
343357
judge_config = self._judge_config(
344-
key, context, default or _DISABLED_JUDGE_DEFAULT, extended_variables
358+
key, context, default or _DISABLED_JUDGE_DEFAULT, variables
345359
)
346360

347361
if not judge_config.enabled:

packages/sdk/server-ai/tests/test_judge.py

Lines changed: 133 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,12 @@
11
"""Tests for Judge functionality."""
22

3-
from unittest.mock import AsyncMock, MagicMock, call
3+
from unittest.mock import AsyncMock, MagicMock, call, patch
44

55
import pytest
66
from ldclient import Config, Context, LDClient
77
from ldclient.integrations.test_data import TestData
88

9+
from ldai import LDAIClient
910
from ldai.judge import Judge, _strip_legacy_judge_messages
1011
from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
1112
from ldai.models import (
@@ -543,6 +544,137 @@ async def test_evaluate_messages_calls_evaluate(
543544
assert tracker.track_metrics_of_async.called
544545

545546

547+
class TestJudgeConfigStripsLegacyMessages:
548+
"""Tests for ``LDAIClient.judge_config()`` legacy-message stripping.
549+
550+
Both ``LDAIClient.judge_config()`` (the direct public API) and
551+
``LDAIClient.create_judge()`` (the wrapper API) reach the same internal
552+
``_judge_config()`` method. ``_judge_config()`` is responsible for
553+
injecting ``message_history``/``response_to_evaluate`` markers so they
554+
survive Mustache rendering, then stripping any legacy template messages
555+
before returning the config.
556+
"""
557+
558+
@pytest.fixture
559+
def context(self) -> Context:
560+
return Context.create('user-key')
561+
562+
def _make_client(self, td: TestData) -> LDAIClient:
563+
config = Config('sdk-key', update_processor_class=td, send_events=False)
564+
return LDAIClient(LDClient(config=config))
565+
566+
def test_judge_config_strips_legacy_messages_from_returned_config(self, context):
567+
"""Calling ``judge_config()`` directly (no variables) still strips legacy messages.
568+
569+
This is the regression for the bug where legacy messages leaked through
570+
the public ``judge_config()`` entry point because reserved-variable
571+
markers were only injected in ``_create_judge_instance``.
572+
"""
573+
td = TestData.data_source()
574+
td.update(
575+
td.flag('legacy-judge')
576+
.variations({
577+
'model': {'name': 'gpt-4'},
578+
'provider': {'name': 'openai'},
579+
'messages': [
580+
{'role': 'system', 'content': 'You are a judge.'},
581+
{'role': 'assistant', 'content': '{{message_history}}'},
582+
{'role': 'user', 'content': 'Evaluate: {{response_to_evaluate}}'},
583+
],
584+
'evaluationMetricKey': '$ld:ai:judge:relevance',
585+
'_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
586+
})
587+
.variation_for_all(0)
588+
)
589+
client = self._make_client(td)
590+
591+
result = client.judge_config('legacy-judge', context)
592+
593+
assert result.enabled is True
594+
assert result.messages is not None
595+
assert len(result.messages) == 1
596+
assert result.messages[0].role == 'system'
597+
assert result.messages[0].content == 'You are a judge.'
598+
599+
def test_judge_config_passes_user_variables_to_template(self, context):
600+
"""User variables are still interpolated into the system message."""
601+
td = TestData.data_source()
602+
td.update(
603+
td.flag('parametric-judge')
604+
.variations({
605+
'model': {'name': 'gpt-4'},
606+
'provider': {'name': 'openai'},
607+
'messages': [
608+
{'role': 'system', 'content': 'You are a {{tone}} judge.'},
609+
],
610+
'evaluationMetricKey': '$ld:ai:judge:relevance',
611+
'_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
612+
})
613+
.variation_for_all(0)
614+
)
615+
client = self._make_client(td)
616+
617+
result = client.judge_config(
618+
'parametric-judge', context, variables={'tone': 'strict'}
619+
)
620+
621+
assert result.messages is not None
622+
assert result.messages[0].content == 'You are a strict judge.'
623+
624+
def test_judge_config_warns_on_reserved_variables(self, context):
625+
"""``_judge_config`` warns when callers pass reserved variable names."""
626+
td = TestData.data_source()
627+
td.update(
628+
td.flag('judge-config')
629+
.variations({
630+
'model': {'name': 'gpt-4'},
631+
'provider': {'name': 'openai'},
632+
'messages': [{'role': 'system', 'content': 'You are a judge.'}],
633+
'evaluationMetricKey': '$ld:ai:judge:relevance',
634+
'_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
635+
})
636+
.variation_for_all(0)
637+
)
638+
client = self._make_client(td)
639+
640+
with patch('ldai.client.log') as mock_log:
641+
client.judge_config(
642+
'judge-config',
643+
context,
644+
variables={
645+
'message_history': 'should be ignored',
646+
'response_to_evaluate': 'should be ignored',
647+
},
648+
)
649+
650+
warning_messages = [c.args[0] for c in mock_log.warning.call_args_list]
651+
assert any("'message_history' is reserved" in m for m in warning_messages)
652+
assert any("'response_to_evaluate' is reserved" in m for m in warning_messages)
653+
654+
def test_judge_config_does_not_warn_without_reserved_variables(self, context):
655+
"""No warnings should be emitted when callers pass non-reserved variables."""
656+
td = TestData.data_source()
657+
td.update(
658+
td.flag('judge-config')
659+
.variations({
660+
'model': {'name': 'gpt-4'},
661+
'provider': {'name': 'openai'},
662+
'messages': [{'role': 'system', 'content': 'You are a judge.'}],
663+
'evaluationMetricKey': '$ld:ai:judge:relevance',
664+
'_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
665+
})
666+
.variation_for_all(0)
667+
)
668+
client = self._make_client(td)
669+
670+
with patch('ldai.client.log') as mock_log:
671+
client.judge_config('judge-config', context, variables={'tone': 'strict'})
672+
673+
warning_messages = [c.args[0] for c in mock_log.warning.call_args_list]
674+
assert not any("'message_history' is reserved" in m for m in warning_messages)
675+
assert not any("'response_to_evaluate' is reserved" in m for m in warning_messages)
676+
677+
546678
class TestEvaluationSchemaBuilder:
547679
"""Tests for EvaluationSchemaBuilder."""
548680

0 commit comments

Comments
 (0)