66from unittest .mock import AsyncMock , MagicMock , patch
77
88import pytest
9- from ldai import AIAgentConfig , AIJudgeConfig , LDAIClient
9+ from ldai import AIAgentConfig , LDAIClient
1010from ldai .client import Evaluator
1111from ldai .models import LDMessage , ModelConfig
1212from ldai .tracker import TokenUsage
@@ -717,20 +717,19 @@ def setup_method(self):
717717 self .handle_judge_call = AsyncMock (return_value = OptimizationResponse (output = JUDGE_PASS_RESPONSE ))
718718 self .client ._options = _make_options (handle_judge_call = self .handle_judge_call )
719719
720- def _make_judge_config (self , enabled : bool = True ) -> AIJudgeConfig :
721- return AIJudgeConfig (
722- key = "ld-judge-key" ,
723- enabled = enabled ,
724- create_tracker = MagicMock ,
725- model = ModelConfig (name = "gpt-4o" , parameters = {}),
726- messages = [
727- LDMessage (role = "system" , content = "You are an evaluator." ),
728- LDMessage (role = "user" , content = "Evaluate this response." ),
720+ def _make_raw_variation (self , enabled : bool = True ) -> Dict [str , Any ]:  # NOTE(review): Dict/Any need a typing import; none is visible in this diff - confirm
721+ """Raw variation dict as returned by _client.variation for a judge flag."""
722+ return {
723+ "_ldMeta" : {"enabled" : enabled },  # the enabled argument is carried under _ldMeta
724+ "messages" : [  # one system turn followed by one user turn
725+ {"role" : "system" , "content" : "You are an evaluator." },
726+ {"role" : "user" , "content" : "Evaluate this response." },
729727 ],
730- )
728+ "model" : {"name" : "gpt-4o" , "parameters" : {}},  # plain dict in place of the former ModelConfig object
729+ }
731730
732731 async def test_calls_handle_judge_call_with_correct_config_type (self ):
733- self .mock_ldai .judge_config . return_value = self ._make_judge_config ()
732+ self .mock_ldai ._client . variation . return_value = self ._make_raw_variation ()
734733 judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
735734 await self .client ._evaluate_config_judge (
736735 judge_key = "quality" ,
@@ -748,7 +747,7 @@ async def test_calls_handle_judge_call_with_correct_config_type(self):
748747 assert isinstance (ctx , OptimizationJudgeContext )
749748
750749 async def test_messages_has_system_and_user_turns (self ):
751- self .mock_ldai .judge_config . return_value = self ._make_judge_config ()
750+ self .mock_ldai ._client . variation . return_value = self ._make_raw_variation ()
752751 judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
753752 await self .client ._evaluate_config_judge (
754753 judge_key = "quality" ,
@@ -763,7 +762,7 @@ async def test_messages_has_system_and_user_turns(self):
763762 assert roles == ["system" , "user" ]
764763
765764 async def test_messages_system_content_matches_instructions (self ):
766- self .mock_ldai .judge_config . return_value = self ._make_judge_config ()
765+ self .mock_ldai ._client . variation . return_value = self ._make_raw_variation ()
767766 judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
768767 await self .client ._evaluate_config_judge (
769768 judge_key = "quality" ,
@@ -778,7 +777,7 @@ async def test_messages_system_content_matches_instructions(self):
778777 assert system_msg .content == config .instructions
779778
780779 async def test_messages_user_content_matches_context_user_input (self ):
781- self .mock_ldai .judge_config . return_value = self ._make_judge_config ()
780+ self .mock_ldai ._client . variation . return_value = self ._make_raw_variation ()
782781 judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
783782 await self .client ._evaluate_config_judge (
784783 judge_key = "quality" ,
@@ -793,7 +792,7 @@ async def test_messages_user_content_matches_context_user_input(self):
793792 assert user_msg .content == ctx .user_input
794793
795794 async def test_messages_user_content_contains_ld_user_message (self ):
796- self .mock_ldai .judge_config . return_value = self ._make_judge_config ()
795+ self .mock_ldai ._client . variation . return_value = self ._make_raw_variation ()
797796 judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
798797 await self .client ._evaluate_config_judge (
799798 judge_key = "quality" ,
@@ -808,7 +807,7 @@ async def test_messages_user_content_contains_ld_user_message(self):
808807 assert "Evaluate this response." in user_msg .content
809808
810809 async def test_returns_zero_score_when_judge_disabled (self ):
811- self .mock_ldai .judge_config . return_value = self ._make_judge_config (enabled = False )
810+ self .mock_ldai ._client . variation . return_value = self ._make_raw_variation (enabled = False )
812811 judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
813812 result = await self .client ._evaluate_config_judge (
814813 judge_key = "quality" ,
@@ -821,48 +820,76 @@ async def test_returns_zero_score_when_judge_disabled(self):
821820 assert result .score == 0.0
822821 self .handle_judge_call .assert_not_called ()
823822
824- async def test_returns_zero_score_when_judge_has_no_messages (self ):
825- judge_config = AIJudgeConfig (
826- key = "ld-judge-key" ,
827- enabled = True ,
828- create_tracker = MagicMock ,
829- model = ModelConfig (name = "gpt-4o" , parameters = {}),
830- messages = None ,
831- )
832- self .mock_ldai .judge_config .return_value = judge_config
823+ async def test_system_only_template_auto_generates_user_message (self ):
824+ """When the flag template has only a system message, a user turn is synthesised."""
825+ self .mock_ldai ._client .variation .return_value = {  # stub the raw flag payload: enabled, system message only
826+ "_ldMeta" : {"enabled" : True },
827+ "messages" : [{"role" : "system" , "content" : "You are an evaluator." }],  # deliberately no user turn
828+ "model" : {"name" : "gpt-4o" , "parameters" : {}},
829+ }
833830 judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
834- result = await self .client ._evaluate_config_judge (
831+ await self .client ._evaluate_config_judge (
835832 judge_key = "quality" ,
836833 optimization_judge = judge ,
837- completion_response = "Any ." ,
834+ completion_response = "The answer is 42." ,  # must match the substring asserted below exactly
838835 iteration = 1 ,
839836 reasoning_history = "" ,
840- user_input = "Anything ?" ,
837+ user_input = "What is the answer ?" ,
841838 )
842- assert result .score == 0.0
843- self .handle_judge_call .assert_not_called ()
844-
845- async def test_template_variables_merged_into_judge_config_call (self ):
846- self .mock_ldai .judge_config .return_value = self ._make_judge_config ()
839+ _ , config , _ , _ = self .handle_judge_call .call_args .args  # handler received four positional args; the second is bound as config
840+ user_msg = next (m for m in config .messages if m .role == "user" )  # StopIteration here means no user turn was produced
841+ assert "The answer is 42." in user_msg .content  # the synthesised user turn carries the completion under evaluation
842+
843+ async def test_template_variables_interpolated_into_messages (self ):
844+ """Custom agent variables are interpolated into judge template messages."""
845+ self .mock_ldai ._client .variation .return_value = {  # stub the raw flag payload returned for the judge key
846+ "_ldMeta" : {"enabled" : True },
847+ "messages" : [
848+ {"role" : "system" , "content" : "Evaluate in {{language}}." },  # {{language}} is the placeholder under test
849+ {"role" : "user" , "content" : "Evaluate this response." },
850+ ],
851+ "model" : {"name" : "gpt-4o" , "parameters" : {}},
852+ }
847853 judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
848- variables = {"language" : "Spanish" }
849854 await self .client ._evaluate_config_judge (
850855 judge_key = "quality" ,
851856 optimization_judge = judge ,
852857 completion_response = "Answer." ,
853858 iteration = 1 ,
854859 reasoning_history = "" ,
855860 user_input = "Q?" ,
856- variables = variables ,
861+ variables = { "language" : "Spanish" } ,  # only source of the word "Spanish" in this test
857862 )
858- call_kwargs = self .mock_ldai .judge_config .call_args
859- passed_vars = call_kwargs .args [3 ] if call_kwargs .args else call_kwargs .kwargs .get ("variables" , {})
860- assert passed_vars .get ("language" ) == "Spanish"
861- assert "message_history" in passed_vars
862- assert "response_to_evaluate" in passed_vars
863+ _ , config , _ , _ = self .handle_judge_call .call_args .args  # handler received four positional args; the second is bound as config
864+ assert "Spanish" in config .instructions  # presumably instructions holds the rendered system message - confirm
865+
866+ async def test_reserved_variables_interpolated_into_template_messages (self ):
867+ """message_history and response_to_evaluate are interpolated when present in the template."""
868+ self .mock_ldai ._client .variation .return_value = {  # stub the raw flag payload; both messages use a reserved placeholder
869+ "_ldMeta" : {"enabled" : True },
870+ "messages" : [
871+ {"role" : "system" , "content" : "History: {{message_history}}" },
872+ {"role" : "user" , "content" : "Response: {{response_to_evaluate}}" },
873+ ],
874+ "model" : {"name" : "gpt-4o" , "parameters" : {}},
875+ }
876+ judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
877+ await self .client ._evaluate_config_judge (
878+ judge_key = "quality" ,
879+ optimization_judge = judge ,
880+ completion_response = "My answer." ,
881+ iteration = 1 ,
882+ reasoning_history = "" ,
883+ user_input = "Q?" ,
884+ )
885+ _ , config , _ , _ = self .handle_judge_call .call_args .args  # handler received four positional args; the second is bound as config
886+ system_msg = next (m for m in config .messages if m .role == "system" )
887+ assert "History:" in system_msg .content and "{{message_history}}" not in system_msg .content  # the literal prefix alone would pass without interpolation, so also require the placeholder to be gone
888+ user_msg = next (m for m in config .messages if m .role == "user" )
889+ assert "My answer." in user_msg .content and "{{response_to_evaluate}}" not in user_msg .content  # the completion must replace the placeholder, not merely sit next to it
863890
864891 async def test_agent_tools_included_without_evaluation_tool (self ):
865- self .mock_ldai .judge_config . return_value = self ._make_judge_config ()
892+ self .mock_ldai ._client . variation . return_value = self ._make_raw_variation ()
866893 agent_tool = ToolDefinition (name = "search" , description = "Search" , input_schema = {})
867894 judge = OptimizationJudge (threshold = 0.8 , judge_key = "ld-judge-key" )
868895 await self .client ._evaluate_config_judge (
0 commit comments