11"""Judge implementation for AI evaluation."""
22
33import random
4- from typing import Any , Dict , Optional
4+ from typing import Any , Dict , Optional , Tuple
55
66import chevron
77
88from ldai import log
99from ldai .judge .evaluation_schema_builder import EvaluationSchemaBuilder
1010from ldai .models import AIJudgeConfig , LDMessage
1111from ldai .providers .model_runner import ModelRunner
12- from ldai .providers .types import EvalScore , JudgeResponse , ModelResponse
12+ from ldai .providers .types import JudgeResult , ModelResponse
1313from ldai .tracker import LDAIConfigTracker
1414
1515
@@ -44,29 +44,34 @@ async def evaluate(
4444 input_text : str ,
4545 output_text : str ,
4646 sampling_rate : float = 1.0 ,
47- ) -> Optional [ JudgeResponse ] :
47+ ) -> JudgeResult :
4848 """
4949 Evaluates an AI response using the judge's configuration.
5050
5151 :param input_text: The input prompt or question that was provided to the AI
5252 :param output_text: The AI-generated response to be evaluated
5353 :param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1)
54- :return: Evaluation results or None if not sampled
54+ :return: Evaluation result; ``sampled=True`` when skipped due to sampling rate
5555 """
56+ judge_result = JudgeResult (judge_config_key = self ._ai_config .key )
57+
5658 try :
5759 if not self ._ai_config .evaluation_metric_key :
5860 log .warning (
5961 'Judge configuration is missing required evaluationMetricKey'
6062 )
61- return None
63+ judge_result .error_message = 'Judge configuration is missing required evaluationMetricKey'
64+ return judge_result
6265
6366 if not self ._ai_config .messages :
6467 log .warning ('Judge configuration must include messages' )
65- return None
68+ judge_result .error_message = 'Judge configuration must include messages'
69+ return judge_result
6670
6771 if random .random () > sampling_rate :
6872 log .debug (f'Judge evaluation skipped due to sampling rate: { sampling_rate } ' )
69- return None
73+ judge_result .sampled = True
74+ return judge_result
7075
7176 messages = self ._construct_evaluation_messages (input_text , output_text )
7277 assert self ._evaluation_response_structure is not None
@@ -76,39 +81,36 @@ async def evaluate(
7681 lambda result : result .metrics ,
7782 )
7883
79- success = response .metrics .success
80- evals = self ._parse_evaluation_response (response .data )
84+ parsed = self ._parse_evaluation_response (response .data )
8185
82- if not evals :
86+ if parsed is None :
8387 log .warning ('Judge evaluation did not return the expected evaluation' )
84- success = False
85-
86- return JudgeResponse (
87- judge_config_key = self ._ai_config .key ,
88- evals = evals ,
89- success = success ,
90- )
88+ return judge_result
89+
90+ score , reasoning = parsed
91+ judge_result .metric_key = self ._ai_config .evaluation_metric_key
92+ judge_result .score = score
93+ judge_result .reasoning = reasoning
94+ judge_result .success = response .metrics .success
95+ return judge_result
9196 except Exception as error :
9297 log .error (f'Judge evaluation failed: { error } ' )
93- return JudgeResponse (
94- evals = {},
95- success = False ,
96- error = str (error ) if isinstance (error , Exception ) else 'Unknown error' ,
97- )
98+ judge_result .error_message = str (error ) if isinstance (error , Exception ) else 'Unknown error'
99+ return judge_result
98100
99101 async def evaluate_messages (
100102 self ,
101103 messages : list [LDMessage ],
102104 response : ModelResponse ,
103105 sampling_ratio : float = 1.0 ,
104- ) -> Optional [ JudgeResponse ] :
106+ ) -> JudgeResult :
105107 """
106108 Evaluates an AI response from chat messages and response.
107109
108110 :param messages: Array of messages representing the conversation history
109111 :param response: The AI response to be evaluated
110112 :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1)
111- :return: Evaluation results or None if not sampled
113+ :return: Evaluation result; ``sampled=True`` when skipped due to sampling rate
112114 """
113115 input_text = '\r \n ' .join ([msg .content for msg in messages ]) if messages else ''
114116 output_text = response .message .content
@@ -172,28 +174,23 @@ def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str:
172174 # Use chevron (Mustache) for templating, with no escaping
173175 return chevron .render (content , variables )
174176
175- def _parse_evaluation_response (self , data : Dict [str , Any ]) -> Dict [ str , EvalScore ]:
177+ def _parse_evaluation_response (self , data : Dict [str , Any ]) -> Optional [ Tuple [ float , str ] ]:
176178 """
177179 Parses the structured evaluation response. Expects {"score": n, "reasoning": "..."}.
178- """
179- results : Dict [str , EvalScore ] = {}
180- metric_key = self ._ai_config .evaluation_metric_key
181- if not metric_key :
182- log .warning ('Evaluation metric key is missing' )
183- return results
184180
181+ :return: ``(score, reasoning)`` on success, or ``None`` if the response is invalid.
182+ """
185183 if not isinstance (data , dict ):
186184 log .warning ('Invalid response: missing or invalid evaluation' )
187- return results
185+ return None
188186
189187 score = data .get ('score' )
190188 reasoning = data .get ('reasoning' )
191189 if not isinstance (score , (int , float )) or score < 0 or score > 1 :
192190 log .warning (f'Invalid score: { score } . Score must be a number between 0 and 1 inclusive' )
193- return results
191+ return None
194192 if not isinstance (reasoning , str ):
195193 log .warning ('Invalid reasoning: must be a string' )
196- return results
194+ return None
197195
198- results [metric_key ] = EvalScore (score = float (score ), reasoning = reasoning )
199- return results
196+ return (float (score ), reasoning )
0 commit comments