From eca4f903aa556640f55f2aa6006d4a7f48e3dff2 Mon Sep 17 00:00:00 2001 From: Nagendra Posani Date: Fri, 29 May 2026 21:06:33 -0700 Subject: [PATCH] Fixed remaining evaluators --- .../evaluator/_customer_satisfaction.py | 5 ++++- .../builtin/fluency/evaluator/_fluency.py | 14 +++++++++--- .../groundedness/evaluator/_groundedness.py | 19 ++++++++++++---- .../evaluator/_intent_resolution.py | 11 ++++++++-- .../builtin/relevance/evaluator/_relevance.py | 14 +++++++++--- .../evaluator/_response_completeness.py | 14 +++++++++--- .../builtin/retrieval/evaluator/_retrieval.py | 14 +++++++++--- .../similarity/evaluator/_similarity.py | 14 +++++++++--- .../evaluator/_task_adherence.py | 22 ++++++++++++++----- .../evaluator/_task_completion.py | 19 ++++++++++++---- .../evaluator/_tool_call_success.py | 14 +++++++++--- .../evaluator/_tool_input_accuracy.py | 11 ++++++++-- .../evaluator/_tool_output_utilization.py | 14 +++++++++--- .../evaluator/_tool_selection.py | 11 ++++++++-- 14 files changed, 155 insertions(+), 41 deletions(-) diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py index 538c666538..577016aa68 100644 --- a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py +++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py @@ -1270,7 +1270,7 @@ def _build_result( "sample_input": p.get("sample_input", ""), "sample_output": p.get("sample_output", ""), } - return { + result_payload = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_result": result, @@ -1280,6 +1280,9 @@ def _build_result( f"{self._result_key}_status": status, f"{self._result_key}_properties": {**properties, **metadata}, } + # Add top-level token metadata fields for backward compatibility. + result_payload.update({f"{self._result_key}_{key}": value for key, value in metadata.items()}) + return result_payload def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Any]: """Parse the prompty output into a standardized result dictionary. diff --git a/assets/evaluators/builtin/fluency/evaluator/_fluency.py b/assets/evaluators/builtin/fluency/evaluator/_fluency.py index f2a1175166..286c0c1e06 100644 --- a/assets/evaluators/builtin/fluency/evaluator/_fluency.py +++ b/assets/evaluators/builtin/fluency/evaluator/_fluency.py @@ -712,7 +712,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -722,6 +723,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: """Do a relevance evaluation. @@ -782,8 +786,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s score = float(match.group()) score = float(score) if score is not None else math.nan score_result = self._get_binary_result(score) - llm_properties.update(self._get_token_metadata(prompty_output_dict)) - return { + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) + result = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": score_result == "pass", @@ -793,6 +798,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result raise EvaluationException( message="Evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, diff --git a/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py b/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py index 9f13c5eb13..24ef995c77 100644 --- a/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py +++ b/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py @@ -1253,7 +1253,8 @@ def _build_result( """Build a standardized groundedness result dictionary.""" p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {} properties = dict(properties) if isinstance(properties, dict) else {} - properties.update(self._get_token_metadata(p)) + token_metadata = self._get_token_metadata(p) + properties.update(token_metadata) parsed_result: Dict[str, Union[str, int, float, Dict, None]] = { self._result_key: score, f"{self._result_key}_score": score, @@ -1264,6 +1265,8 @@ def _build_result( } if status is not None: parsed_result[f"{self._result_key}_status"] = status + # Add top-level token metadata fields for backward compatibility. + parsed_result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) return parsed_result def _return_not_applicable_result( @@ -1278,7 +1281,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -1288,6 +1292,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: """Do a relevance evaluation. @@ -1348,8 +1355,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s score = float(match.group()) score = float(score) if score is not None else math.nan score_result = self._get_binary_result(score) - llm_properties.update(self._get_token_metadata(prompty_output_dict)) - return { + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) + result = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": score_result == "pass", @@ -1359,6 +1367,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result raise EvaluationException( message="Evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, diff --git a/assets/evaluators/builtin/intent_resolution/evaluator/_intent_resolution.py b/assets/evaluators/builtin/intent_resolution/evaluator/_intent_resolution.py index 285552ad34..073c9b9a5e 100644 --- a/assets/evaluators/builtin/intent_resolution/evaluator/_intent_resolution.py +++ b/assets/evaluators/builtin/intent_resolution/evaluator/_intent_resolution.py @@ -827,7 +827,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -837,6 +838,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result @staticmethod def _get_token_metadata(prompty_output: Dict) -> Dict: @@ -995,7 +999,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t score = float(score) score_result = "pass" if score >= self._threshold else "fail" llm_properties = llm_output.get("properties", {}) or {} - llm_properties.update(self._get_token_metadata(prompty_output_dict)) + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) response_dict = { self._result_key: score, @@ -1007,6 +1012,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + response_dict.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) return response_dict raise EvaluationException( message="Evaluator returned invalid output.", diff --git a/assets/evaluators/builtin/relevance/evaluator/_relevance.py b/assets/evaluators/builtin/relevance/evaluator/_relevance.py index 891f3975d6..e14a288501 100644 --- a/assets/evaluators/builtin/relevance/evaluator/_relevance.py +++ b/assets/evaluators/builtin/relevance/evaluator/_relevance.py @@ -720,7 +720,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -730,6 +731,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result @staticmethod def _get_token_metadata(prompty_output: Dict) -> Dict: @@ -872,8 +876,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t reason = llm_output.get("reason", "") llm_properties = llm_output.get("properties", {}) or {} score_result = self._get_binary_result(score) - llm_properties.update(self._get_token_metadata(result)) - return { + token_metadata = self._get_token_metadata(result) + llm_properties.update(token_metadata) + response_dict = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": score_result == "pass", @@ -883,6 +888,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + response_dict.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return response_dict raise EvaluationException( message="Evaluator returned invalid output.", diff --git a/assets/evaluators/builtin/response_completeness/evaluator/_response_completeness.py b/assets/evaluators/builtin/response_completeness/evaluator/_response_completeness.py index 5a1aba1495..b5cb29a4be 100644 --- a/assets/evaluators/builtin/response_completeness/evaluator/_response_completeness.py +++ b/assets/evaluators/builtin/response_completeness/evaluator/_response_completeness.py @@ -238,7 +238,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -248,6 +249,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result @staticmethod def _get_token_metadata(prompty_output: Dict) -> Dict: @@ -310,9 +314,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t llm_properties = llm_output.get("properties", {}) or {} score_result = self._get_binary_result(score) - llm_properties.update(self._get_token_metadata(result if isinstance(result, dict) else {})) + token_metadata = self._get_token_metadata(result if isinstance(result, dict) else {}) + llm_properties.update(token_metadata) - return { + response_dict = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": score_result == "pass", @@ -322,6 +327,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + response_dict.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return response_dict raise EvaluationException( message="Evaluator returned invalid output.", diff --git a/assets/evaluators/builtin/retrieval/evaluator/_retrieval.py b/assets/evaluators/builtin/retrieval/evaluator/_retrieval.py index a2b37b5e5a..43b9b84089 100644 --- a/assets/evaluators/builtin/retrieval/evaluator/_retrieval.py +++ b/assets/evaluators/builtin/retrieval/evaluator/_retrieval.py @@ -245,7 +245,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -255,6 +256,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: """Do a relevance evaluation. @@ -315,8 +319,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s score = float(match.group()) score = float(score) if score is not None else math.nan score_result = self._get_binary_result(score) - llm_properties.update(self._get_token_metadata(prompty_output_dict)) - return { + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) + result = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": score_result == "pass", @@ -326,6 +331,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result raise EvaluationException( message="Evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, diff --git a/assets/evaluators/builtin/similarity/evaluator/_similarity.py b/assets/evaluators/builtin/similarity/evaluator/_similarity.py index b591a78a4f..a1b7b8f69e 100644 --- a/assets/evaluators/builtin/similarity/evaluator/_similarity.py +++ b/assets/evaluators/builtin/similarity/evaluator/_similarity.py @@ -270,7 +270,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -280,6 +281,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: """Do a relevance evaluation. @@ -340,8 +344,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s score = float(match.group()) score = float(score) if score is not None else math.nan score_result = self._get_binary_result(score) - llm_properties.update(self._get_token_metadata(prompty_output_dict)) - return { + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) + result = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": score_result == "pass", @@ -351,6 +356,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result raise EvaluationException( message="Evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, diff --git a/assets/evaluators/builtin/task_adherence/evaluator/_task_adherence.py b/assets/evaluators/builtin/task_adherence/evaluator/_task_adherence.py index abf35e7d95..e63496cc9c 100644 --- a/assets/evaluators/builtin/task_adherence/evaluator/_task_adherence.py +++ b/assets/evaluators/builtin/task_adherence/evaluator/_task_adherence.py @@ -1099,8 +1099,9 @@ def _build_result( p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {} resolved_threshold = threshold if threshold is not None else self._threshold properties = dict(properties) if isinstance(properties, dict) else {} - properties.update(self._get_token_metadata(p)) - return { + token_metadata = self._get_token_metadata(p) + properties.update(token_metadata) + result_payload = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_result": result, @@ -1108,6 +1109,9 @@ def _build_result( f"{self._result_key}_reason": reason, f"{self._result_key}_properties": properties, } + # Add top-level token metadata fields for backward compatibility. + result_payload.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result_payload def _return_not_applicable_result( self, error_message: str, threshold: Union[int, float] @@ -1121,7 +1125,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -1131,6 +1136,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result @staticmethod def _get_token_metadata(prompty_output: Dict) -> Dict: @@ -1371,9 +1379,10 @@ def _parse_prompty_output(self, prompty_output_dict: Dict[str, Any]) -> Dict[str score = float(llm_output.get("score", 0.0)) score_result = "pass" if score >= 1.0 else "fail" llm_properties = llm_output.get("properties", {}) or {} - llm_properties.update(self._get_token_metadata(prompty_output_dict)) + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) - return { + result = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": score_result == "pass", @@ -1383,3 +1392,6 @@ def _parse_prompty_output(self, prompty_output_dict: Dict[str, Any]) -> Dict[str f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result diff --git a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py index 35f89dd24f..f946611adf 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py +++ b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py @@ -1230,7 +1230,7 @@ def _build_result( "sample_input": p.get("sample_input", ""), "sample_output": p.get("sample_output", ""), } - return { + result_payload = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_result": result, @@ -1240,6 +1240,9 @@ def _build_result( f"{self._result_key}_status": status, f"{self._result_key}_properties": {**properties, **metadata} } + # Add top-level token metadata fields for backward compatibility. + result_payload.update({f"{self._result_key}_{key}": value for key, value in metadata.items()}) + return result_payload def _return_not_applicable_result( self, error_message: str, threshold: Union[int, float] @@ -1253,7 +1256,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -1263,6 +1267,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result @staticmethod def _get_token_metadata(prompty_output: Dict) -> Dict: @@ -1470,8 +1477,9 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in success_result = "pass" if score >= 1.0 else "fail" reason = llm_output.get("reason", "") llm_properties = llm_output.get("properties", {}) or {} - llm_properties.update(self._get_token_metadata(prompty_output_dict)) - return { + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) + result = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": success_result == "pass", @@ -1481,3 +1489,6 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result diff --git a/assets/evaluators/builtin/tool_call_success/evaluator/_tool_call_success.py b/assets/evaluators/builtin/tool_call_success/evaluator/_tool_call_success.py index 32d5b79344..b10e2885fe 100644 --- a/assets/evaluators/builtin/tool_call_success/evaluator/_tool_call_success.py +++ b/assets/evaluators/builtin/tool_call_success/evaluator/_tool_call_success.py @@ -840,7 +840,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -850,6 +851,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result @staticmethod def _get_token_metadata(prompty_output: Dict) -> Dict: @@ -940,8 +944,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t score = float(llm_output.get("score", 0)) success_result = "pass" if score >= 1.0 else "fail" reason = llm_output.get("reason", "") - llm_properties.update(self._get_token_metadata(prompty_output_dict)) - return { + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) + result = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": success_result == "pass", @@ -951,6 +956,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result raise EvaluationException( message="Evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, diff --git a/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py b/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py index 2b9e41d224..62123d1ba4 100644 --- a/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py +++ b/assets/evaluators/builtin/tool_input_accuracy/evaluator/_tool_input_accuracy.py @@ -1196,7 +1196,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: reason = llm_output.get("reason", "") score = float(score) score_result = "pass" if score == 1 else "fail" - llm_properties.update(self._get_token_metadata(prompty_output_dict)) + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) response_dict = { self._result_key: score, f"{self._result_key}_score": score, @@ -1207,6 +1208,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + response_dict.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) return response_dict else: @@ -1278,7 +1281,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -1288,6 +1292,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result @staticmethod def _get_token_metadata(prompty_output: Dict) -> Dict: diff --git a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py index ca7836eb19..5452cfb5d1 100644 --- a/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py +++ b/assets/evaluators/builtin/tool_output_utilization/evaluator/_tool_output_utilization.py @@ -1188,7 +1188,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -1198,6 +1199,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result @staticmethod def _get_token_metadata(prompty_output: Dict) -> Dict: @@ -1288,9 +1292,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t score_result = "pass" if score >= 1.0 else "fail" reason = llm_output.get("reason", "") llm_properties = llm_output.get("properties", {}) or {} - llm_properties.update(self._get_token_metadata(prompty_output_dict)) + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) - return { + result = { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_passed": score_result == "pass", @@ -1300,6 +1305,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result raise EvaluationException( message="Evaluator returned invalid output.", blame=ErrorBlame.SYSTEM_ERROR, diff --git a/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py b/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py index a9fe20b982..fdbd51f224 100644 --- a/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py +++ b/assets/evaluators/builtin/tool_selection/evaluator/_tool_selection.py @@ -1235,7 +1235,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: tool_selection_accuracy = self._calculate_tool_selection_accuracy(llm_properties) llm_properties["tool_selection_accuracy"] = tool_selection_accuracy - llm_properties.update(self._get_token_metadata(prompty_output_dict)) + token_metadata = self._get_token_metadata(prompty_output_dict) + llm_properties.update(token_metadata) response_dict = { self._result_key: score, @@ -1247,6 +1248,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_properties": llm_properties, } + # Add top-level token metadata fields for backward compatibility. + response_dict.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) return response_dict else: @@ -1299,7 +1302,8 @@ def _return_not_applicable_result( :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float, None]] """ - return { + token_metadata = self._get_token_metadata({}) + result = { f"{self._result_key}": None, f"{self._result_key}_score": None, f"{self._result_key}_passed": None, @@ -1309,6 +1313,9 @@ def _return_not_applicable_result( f"{self._result_key}_threshold": threshold, f"{self._result_key}_properties": None, } + # Add top-level token metadata fields for backward compatibility. + result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()}) + return result @staticmethod def _get_token_metadata(prompty_output: Dict) -> Dict: