Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1270,7 +1270,7 @@ def _build_result(
"sample_input": p.get("sample_input", ""),
"sample_output": p.get("sample_output", ""),
}
return {
result_payload = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_result": result,
Expand All @@ -1280,6 +1280,9 @@ def _build_result(
f"{self._result_key}_status": status,
f"{self._result_key}_properties": {**properties, **metadata},
}
# Add top-level token metadata fields for backward compatibility.
result_payload.update({f"{self._result_key}_{key}": value for key, value in metadata.items()})
return result_payload

def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Any]:
"""Parse the prompty output into a standardized result dictionary.
Expand Down
14 changes: 11 additions & 3 deletions assets/evaluators/builtin/fluency/evaluator/_fluency.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,7 +712,8 @@ def _return_not_applicable_result(
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, None]]
"""
return {
token_metadata = self._get_token_metadata({})
result = {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_passed": None,
Expand All @@ -722,6 +723,9 @@ def _return_not_applicable_result(
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result

async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
"""Do a relevance evaluation.
Expand Down Expand Up @@ -782,8 +786,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s
score = float(match.group())
score = float(score) if score is not None else math.nan
score_result = self._get_binary_result(score)
llm_properties.update(self._get_token_metadata(prompty_output_dict))
return {
token_metadata = self._get_token_metadata(prompty_output_dict)
llm_properties.update(token_metadata)
result = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_passed": score_result == "pass",
Expand All @@ -793,6 +798,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1253,7 +1253,8 @@ def _build_result(
"""Build a standardized groundedness result dictionary."""
p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
properties = dict(properties) if isinstance(properties, dict) else {}
properties.update(self._get_token_metadata(p))
token_metadata = self._get_token_metadata(p)
properties.update(token_metadata)
parsed_result: Dict[str, Union[str, int, float, Dict, None]] = {
self._result_key: score,
f"{self._result_key}_score": score,
Expand All @@ -1264,6 +1265,8 @@ def _build_result(
}
if status is not None:
parsed_result[f"{self._result_key}_status"] = status
# Add top-level token metadata fields for backward compatibility.
parsed_result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return parsed_result

def _return_not_applicable_result(
Expand All @@ -1278,7 +1281,8 @@ def _return_not_applicable_result(
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, None]]
"""
return {
token_metadata = self._get_token_metadata({})
result = {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_passed": None,
Expand All @@ -1288,6 +1292,9 @@ def _return_not_applicable_result(
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result

async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
"""Do a relevance evaluation.
Expand Down Expand Up @@ -1348,8 +1355,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s
score = float(match.group())
score = float(score) if score is not None else math.nan
score_result = self._get_binary_result(score)
llm_properties.update(self._get_token_metadata(prompty_output_dict))
return {
token_metadata = self._get_token_metadata(prompty_output_dict)
llm_properties.update(token_metadata)
result = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_passed": score_result == "pass",
Expand All @@ -1359,6 +1367,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -827,7 +827,8 @@ def _return_not_applicable_result(
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, None]]
"""
return {
token_metadata = self._get_token_metadata({})
result = {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_passed": None,
Expand All @@ -837,6 +838,9 @@ def _return_not_applicable_result(
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result

@staticmethod
def _get_token_metadata(prompty_output: Dict) -> Dict:
Expand Down Expand Up @@ -995,7 +999,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
score = float(score)
score_result = "pass" if score >= self._threshold else "fail"
llm_properties = llm_output.get("properties", {}) or {}
llm_properties.update(self._get_token_metadata(prompty_output_dict))
token_metadata = self._get_token_metadata(prompty_output_dict)
llm_properties.update(token_metadata)

response_dict = {
self._result_key: score,
Expand All @@ -1007,6 +1012,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
# Add top-level token metadata fields for backward compatibility.
response_dict.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return response_dict
raise EvaluationException(
message="Evaluator returned invalid output.",
Expand Down
14 changes: 11 additions & 3 deletions assets/evaluators/builtin/relevance/evaluator/_relevance.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,8 @@ def _return_not_applicable_result(
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, None]]
"""
return {
token_metadata = self._get_token_metadata({})
result = {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_passed": None,
Expand All @@ -730,6 +731,9 @@ def _return_not_applicable_result(
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result

@staticmethod
def _get_token_metadata(prompty_output: Dict) -> Dict:
Expand Down Expand Up @@ -872,8 +876,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
reason = llm_output.get("reason", "")
llm_properties = llm_output.get("properties", {}) or {}
score_result = self._get_binary_result(score)
llm_properties.update(self._get_token_metadata(result))
return {
token_metadata = self._get_token_metadata(result)
llm_properties.update(token_metadata)
response_dict = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_passed": score_result == "pass",
Expand All @@ -883,6 +888,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
# Add top-level token metadata fields for backward compatibility.
response_dict.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return response_dict

raise EvaluationException(
message="Evaluator returned invalid output.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,8 @@ def _return_not_applicable_result(
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, None]]
"""
return {
token_metadata = self._get_token_metadata({})
result = {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_passed": None,
Expand All @@ -248,6 +249,9 @@ def _return_not_applicable_result(
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result

@staticmethod
def _get_token_metadata(prompty_output: Dict) -> Dict:
Expand Down Expand Up @@ -310,9 +314,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
llm_properties = llm_output.get("properties", {}) or {}
score_result = self._get_binary_result(score)

llm_properties.update(self._get_token_metadata(result if isinstance(result, dict) else {}))
token_metadata = self._get_token_metadata(result if isinstance(result, dict) else {})
llm_properties.update(token_metadata)

return {
response_dict = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_passed": score_result == "pass",
Expand All @@ -322,6 +327,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
# Add top-level token metadata fields for backward compatibility.
response_dict.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return response_dict

raise EvaluationException(
message="Evaluator returned invalid output.",
Expand Down
14 changes: 11 additions & 3 deletions assets/evaluators/builtin/retrieval/evaluator/_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,8 @@ def _return_not_applicable_result(
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, None]]
"""
return {
token_metadata = self._get_token_metadata({})
result = {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_passed": None,
Expand All @@ -255,6 +256,9 @@ def _return_not_applicable_result(
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result

async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
"""Do a relevance evaluation.
Expand Down Expand Up @@ -315,8 +319,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s
score = float(match.group())
score = float(score) if score is not None else math.nan
score_result = self._get_binary_result(score)
llm_properties.update(self._get_token_metadata(prompty_output_dict))
return {
token_metadata = self._get_token_metadata(prompty_output_dict)
llm_properties.update(token_metadata)
result = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_passed": score_result == "pass",
Expand All @@ -326,6 +331,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
Expand Down
14 changes: 11 additions & 3 deletions assets/evaluators/builtin/similarity/evaluator/_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,8 @@ def _return_not_applicable_result(
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, None]]
"""
return {
token_metadata = self._get_token_metadata({})
result = {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_passed": None,
Expand All @@ -280,6 +281,9 @@ def _return_not_applicable_result(
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result

async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
"""Do a relevance evaluation.
Expand Down Expand Up @@ -340,8 +344,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s
score = float(match.group())
score = float(score) if score is not None else math.nan
score_result = self._get_binary_result(score)
llm_properties.update(self._get_token_metadata(prompty_output_dict))
return {
token_metadata = self._get_token_metadata(prompty_output_dict)
llm_properties.update(token_metadata)
result = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_passed": score_result == "pass",
Expand All @@ -351,6 +356,9 @@ async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, s
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result
raise EvaluationException(
message="Evaluator returned invalid output.",
blame=ErrorBlame.SYSTEM_ERROR,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1099,15 +1099,19 @@ def _build_result(
p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
resolved_threshold = threshold if threshold is not None else self._threshold
properties = dict(properties) if isinstance(properties, dict) else {}
properties.update(self._get_token_metadata(p))
return {
token_metadata = self._get_token_metadata(p)
properties.update(token_metadata)
result_payload = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_result": result,
f"{self._result_key}_threshold": resolved_threshold,
f"{self._result_key}_reason": reason,
f"{self._result_key}_properties": properties,
}
# Add top-level token metadata fields for backward compatibility.
result_payload.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result_payload

def _return_not_applicable_result(
self, error_message: str, threshold: Union[int, float]
Expand All @@ -1121,7 +1125,8 @@ def _return_not_applicable_result(
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, None]]
"""
return {
token_metadata = self._get_token_metadata({})
result = {
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_passed": None,
Expand All @@ -1131,6 +1136,9 @@ def _return_not_applicable_result(
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result

@staticmethod
def _get_token_metadata(prompty_output: Dict) -> Dict:
Expand Down Expand Up @@ -1371,9 +1379,10 @@ def _parse_prompty_output(self, prompty_output_dict: Dict[str, Any]) -> Dict[str
score = float(llm_output.get("score", 0.0))
score_result = "pass" if score >= 1.0 else "fail"
llm_properties = llm_output.get("properties", {}) or {}
llm_properties.update(self._get_token_metadata(prompty_output_dict))
token_metadata = self._get_token_metadata(prompty_output_dict)
llm_properties.update(token_metadata)

return {
result = {
self._result_key: score,
f"{self._result_key}_score": score,
f"{self._result_key}_passed": score_result == "pass",
Expand All @@ -1383,3 +1392,6 @@ def _parse_prompty_output(self, prompty_output_dict: Dict[str, Any]) -> Dict[str
f"{self._result_key}_threshold": self._threshold,
f"{self._result_key}_properties": llm_properties,
}
# Add top-level token metadata fields for backward compatibility.
result.update({f"{self._result_key}_{key}": value for key, value in token_metadata.items()})
return result
Loading
Loading