Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,4 @@ mlruns/

# ignore config files
config.json

78 changes: 78 additions & 0 deletions assets/evaluators/builtin/bleu_score/evaluator/_bleu.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import logging
from typing import Dict
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from typing_extensions import overload, override
Expand All @@ -9,6 +10,9 @@

from azure.ai.evaluation._evaluators._common import EvaluatorBase
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
from azure.ai.evaluation._exceptions import EvaluationException, ErrorCategory, ErrorTarget

logger = logging.getLogger(__name__)


class BleuScoreEvaluator(EvaluatorBase):
Expand Down Expand Up @@ -91,11 +95,85 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
binary_result = score <= self._threshold

return {
"bleu": score,
"bleu_score": score,
"bleu_passed": binary_result,
"bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
"bleu_reason": None,
"bleu_status": "completed",
"bleu_threshold": self._threshold,
"bleu_properties": None,
}

@override
async def _real_call(self, **kwargs):
    """Run the end-to-end asynchronous evaluation for the supplied inputs.

    The keyword arguments are converted into a list of evaluable inputs, each
    input is passed to ``_do_eval``, and any missing ``*_threshold`` /
    ``*_result`` entries are derived from the ``*_score`` entries of that result.

    :keyword kwargs: The inputs to evaluate.
    :type kwargs: Dict
    :return: The single result when exactly one input was evaluated, an empty
        dict when there were none, otherwise the aggregated result.
    :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
    """
    # Turn the raw keyword arguments into individual evaluation inputs.
    try:
        pending_inputs = self._convert_kwargs_to_eval_input(**kwargs)
    except Exception as e:
        logger.error(f"Error converting kwargs to eval_input_list: {e}")
        raise e

    collected = []
    for single_input in pending_inputs:
        outcome = await self._do_eval(single_input)
        # Best effort: fill in threshold and pass/fail entries that
        # _do_eval did not already provide. Failures are only logged.
        try:
            names = list(outcome.keys())
            has_result = any(name.endswith("_result") for name in names)
            has_threshold = any(name.endswith("_threshold") for name in names)
            if not (has_result and has_threshold):
                for name in names:
                    if not name.endswith("_score"):
                        continue
                    raw_score = outcome[name]
                    stem = name[:-6]  # Remove "_score" suffix
                    if isinstance(self._threshold, dict):
                        limit = self._threshold.get(stem)
                    else:
                        limit = self._threshold
                    if not isinstance(limit, (int, float)):
                        raise EvaluationException(
                            "Threshold value must be a number.",
                            internal_message=str(limit),
                            target=ErrorTarget.EVALUATE,
                            category=ErrorCategory.INVALID_VALUE,
                        )

                    if not has_threshold:
                        outcome[f"{stem}_threshold"] = limit

                    if has_result:
                        continue
                    # Direction of the comparison depends on the metric.
                    if self._higher_is_better:
                        passed = float(raw_score) >= limit
                    else:
                        passed = float(raw_score) <= limit
                    outcome[f"{stem}_result"] = EVALUATION_PASS_FAIL_MAPPING[passed]
        except Exception as e:
            logger.warning(f"Error calculating binary result: {e}")
        collected.append(outcome)

    # A lone result is returned unchanged.
    if len(collected) == 1:
        return collected[0]
    if not collected:
        return {}  # TODO raise something?
    # Several results: hand them to the aggregator.
    return self._aggregate_results(per_turn_results=collected)

@overload # type: ignore
def __call__(self, *, response: str, ground_truth: str):
"""
Expand Down
2 changes: 1 addition & 1 deletion assets/evaluators/builtin/bleu_score/spec.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
type: "evaluator"
name: "builtin.bleu_score"
version: 1
version: 2
displayName: "Bleu-Score-Evaluator"
description: "Measures how similar the model’s output is to a reference text. Useful for assessing alignment between generated and expected responses. It’s best used for natural language processing (NLP) tasks, including text summarization and text generation use cases."
evaluatorType: "builtin"
Expand Down
207 changes: 183 additions & 24 deletions assets/evaluators/builtin/coherence/evaluator/_coherence.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import json
import math
import os
import logging
import re
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, Dict, Optional, Union, List, Tuple
Expand All @@ -18,9 +20,12 @@
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
from azure.ai.evaluation._model_configurations import Conversation
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
from azure.ai.evaluation._common.utils import (
construct_prompty_model_config,
validate_model_config,
parse_quality_evaluator_reason_score,
_extract_text_from_content,
_get_agent_response,
_pretty_format_conversation_history,
Expand Down Expand Up @@ -1006,23 +1011,117 @@ def __call__( # pylint: disable=docstring-missing-param
"""
return super().__call__(*args, **kwargs)

def _not_applicable_result(
def _return_not_applicable_result(
self, error_message: str, threshold: Union[int, float]
) -> Dict[str, Union[str, float, Dict]]:
"""Return a result indicating that the evaluation is not applicable."""
) -> Dict[str, Union[str, float, Dict, None]]:
"""Return a result indicating that the tool call is not applicable for evaluation.

:param error_message: The error message indicating why the evaluation is not applicable.
:type error_message: str
:param threshold: The threshold value for the evaluation.
:type threshold: Union[int, float]
:return: A dictionary containing the result of the evaluation.
:rtype: Dict[str, Union[str, float, None]]
"""
return {
self._result_key: threshold,
f"{self._result_key}_result": "pass",
f"{self._result_key}_threshold": threshold,
f"{self._result_key}": None,
f"{self._result_key}_score": None,
f"{self._result_key}_passed": None,
f"{self._result_key}_result": "not_applicable",
f"{self._result_key}_reason": f"Not applicable: {error_message}",
f"{self._result_key}_properties": {},
f"{self._result_key}_prompt_tokens": 0,
f"{self._result_key}_completion_tokens": 0,
f"{self._result_key}_total_tokens": 0,
f"{self._result_key}_finish_reason": "",
f"{self._result_key}_model": "",
f"{self._result_key}_sample_input": "",
f"{self._result_key}_sample_output": "",
f"{self._result_key}_status": "skipped",
f"{self._result_key}_threshold": threshold,
f"{self._result_key}_properties": None,
}

async def _the_super_do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
    """Run the prompty flow on a single query/response input and build the result.

    The LLM output may be a dict, a JSON string, or free text. Dict/JSON output
    supplies ``score``, ``reason`` and ``properties`` directly, and a ``status``
    of ``"skipped"`` produces a not-applicable result. For free text the score is
    parsed with ``parse_quality_evaluator_reason_score`` when the result key is
    listed in ``PROMPT_BASED_REASON_EVALUATORS``; otherwise the first digit found
    in the text is used. A score that is missing or cannot be converted to a
    number is reported as ``nan``.

    :param eval_input: The input to the evaluator.
    :type eval_input: Dict
    :return: The evaluation result.
    :rtype: Dict
    :raises EvaluationException: If neither ``query`` nor ``response`` is present
        in the input, or if the prompty flow returns an empty output.
    """
    if "query" not in eval_input and "response" not in eval_input:
        raise EvaluationException(
            message="Only text conversation inputs are supported.",
            internal_message="Only text conversation inputs are supported.",
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.CONVERSATION,
        )
    # Check for intermediate response
    if _is_intermediate_response(eval_input.get("response")):
        return self._return_not_applicable_result(
            "Intermediate response. Please provide the agent's final response for evaluation.",
            self._threshold,
        )
    # Preprocess messages if they are lists
    if isinstance(eval_input.get("response"), list):
        eval_input["response"] = _preprocess_messages(eval_input["response"])
    if isinstance(eval_input.get("query"), list):
        eval_input["query"] = _preprocess_messages(eval_input["query"])
    # Call the prompty flow to get the evaluation result.
    prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
    if not prompty_output_dict:
        raise EvaluationException(
            message="Evaluator returned invalid output.",
            blame=ErrorBlame.SYSTEM_ERROR,
            category=ErrorCategory.FAILED_EXECUTION,
            target=ErrorTarget.EVALUATE,
        )

    score = math.nan
    reason = ""
    llm_properties = {}
    llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
    parsed_output = None
    if isinstance(llm_output, dict):
        parsed_output = llm_output
    elif isinstance(llm_output, str):
        try:
            parsed_output = json.loads(llm_output)
        except (json.JSONDecodeError, TypeError):
            parsed_output = None
    if parsed_output and isinstance(parsed_output, dict):
        llm_status = parsed_output.get("status", "completed")
        if llm_status == "skipped":
            skip_reason = parsed_output.get("reason", "")
            return self._return_not_applicable_result(skip_reason, self._threshold)
        score = parsed_output.get("score", math.nan)
        reason = parsed_output.get("reason", "")
        # Copy so the token metadata merged in below does not mutate the LLM
        # output, and ignore a non-dict value instead of failing on ``.update``.
        raw_properties = parsed_output.get("properties")
        llm_properties = dict(raw_properties) if isinstance(raw_properties, dict) else {}
    elif isinstance(llm_output, str):
        if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
            score, reason = parse_quality_evaluator_reason_score(llm_output)
        else:
            match = re.search(r"\d", llm_output)
            if match:
                score = float(match.group())
    # A missing or non-numeric score (e.g. ``None`` or ``"N/A"``) becomes nan
    # rather than raising from ``float``.
    try:
        score = float(score)
    except (TypeError, ValueError):
        score = math.nan
    score_result = self._get_binary_result(score)
    llm_properties.update(self._get_token_metadata(prompty_output_dict))
    return {
        self._result_key: score,
        f"{self._result_key}_score": score,
        f"{self._result_key}_passed": score_result == "pass",
        f"{self._result_key}_result": score_result,
        f"{self._result_key}_reason": reason,
        f"{self._result_key}_status": "completed",
        f"{self._result_key}_threshold": self._threshold,
        f"{self._result_key}_properties": llm_properties,
    }

@staticmethod
def _get_token_metadata(prompty_output: Dict) -> Dict:
"""Extract token usage and model metadata from the prompty output dict."""
return {
"prompt_tokens": prompty_output.get("input_token_count", 0),
"completion_tokens": prompty_output.get("output_token_count", 0),
"total_tokens": prompty_output.get("total_token_count", 0),
"finish_reason": prompty_output.get("finish_reason", ""),
"model": prompty_output.get("model_id", ""),
"sample_input": prompty_output.get("sample_input", ""),
"sample_output": prompty_output.get("sample_output", ""),
}

def _should_use_conversation_level(self, eval_input: Dict) -> bool:
Expand All @@ -1044,6 +1143,8 @@ def _build_result(
) -> Dict[str, Union[str, int, float, Dict, None]]:
"""Build a standardized result dictionary for multi-turn coherence outputs."""
p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
properties = dict(properties) if isinstance(properties, dict) else {}
properties.update(self._get_token_metadata(p))
return {
self._result_key: score,
f"{self._result_key}_score": score,
Expand All @@ -1052,13 +1153,6 @@ def _build_result(
f"{self._result_key}_reason": reason,
f"{self._result_key}_status": status,
f"{self._result_key}_properties": properties,
f"{self._result_key}_prompt_tokens": p.get("input_token_count", 0),
f"{self._result_key}_completion_tokens": p.get("output_token_count", 0),
f"{self._result_key}_total_tokens": p.get("total_token_count", 0),
f"{self._result_key}_finish_reason": p.get("finish_reason", ""),
f"{self._result_key}_model": p.get("model_id", ""),
f"{self._result_key}_sample_input": p.get("sample_input", ""),
f"{self._result_key}_sample_output": p.get("sample_output", ""),
}

@override
Expand Down Expand Up @@ -1087,7 +1181,72 @@ async def _real_call(self, **kwargs):
# Validate input before processing
self._validator.validate_eval_input(kwargs)

return await super()._real_call(**kwargs)
return await self._the_super_real_call(**kwargs)

async def _the_super_real_call(self, **kwargs):
    """Evaluate every input derived from ``kwargs`` and return the combined outcome.

    Each converted input is passed to ``_do_eval``. When a result lacks a
    ``*_result`` or ``*_threshold`` entry, the missing entries are derived from
    its ``*_score`` entries and the configured threshold.

    :keyword kwargs: The inputs to evaluate.
    :type kwargs: Dict
    :return: The single result when exactly one input was evaluated, an empty
        dict when there were none, otherwise the aggregated result.
    :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
    """
    # Expand the keyword arguments into the list of inputs to evaluate.
    try:
        inputs_to_run = self._convert_kwargs_to_eval_input(**kwargs)
    except Exception as e:
        logger.error(f"Error converting kwargs to eval_input_list: {e}")
        raise e

    turn_outputs = []
    for one_input in inputs_to_run:
        turn = await self._do_eval(one_input)
        # Derive pass/fail and threshold entries when _do_eval left them out.
        # Any failure here is logged and the result is kept as it is.
        try:
            key_names = list(turn.keys())
            result_present = any(k.endswith("_result") for k in key_names)
            threshold_present = any(k.endswith("_threshold") for k in key_names)
            if not (result_present and threshold_present):
                for k in key_names:
                    if not k.endswith("_score"):
                        continue
                    observed = turn[k]
                    prefix = k[:-6]  # Remove "_score" suffix
                    if isinstance(self._threshold, dict):
                        cutoff = self._threshold.get(prefix)
                    else:
                        cutoff = self._threshold
                    if not isinstance(cutoff, (int, float)):
                        raise EvaluationException(
                            "Threshold value must be a number.",
                            internal_message=str(cutoff),
                            target=ErrorTarget.EVALUATE,
                            category=ErrorCategory.INVALID_VALUE,
                        )
                    if not threshold_present:
                        turn[f"{prefix}_threshold"] = cutoff
                    if result_present:
                        continue
                    # Comparison direction follows the metric's orientation.
                    if self._higher_is_better:
                        is_pass = float(observed) >= cutoff
                    else:
                        is_pass = float(observed) <= cutoff
                    turn[f"{prefix}_result"] = EVALUATION_PASS_FAIL_MAPPING[is_pass]
        except Exception as e:
            logger.warning(f"Error calculating binary result: {e}")
        turn_outputs.append(turn)

    # One result: return it directly.
    if len(turn_outputs) == 1:
        return turn_outputs[0]
    if not turn_outputs:
        return {}  # TODO raise something?
    # More than one result: aggregate them.
    return self._aggregate_results(per_turn_results=turn_outputs)

@override
async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override]
Expand All @@ -1102,7 +1261,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
return await self._do_eval_conversation_level(eval_input)

if _is_intermediate_response(eval_input.get("response")):
return self._not_applicable_result(
return self._return_not_applicable_result(
"Intermediate response. Please provide the agent's final response for evaluation.",
self._threshold,
)
Expand All @@ -1112,7 +1271,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
eval_input["query"] = _preprocess_messages(eval_input["query"])
eval_input.pop("messages", None)

result = await super()._do_eval(eval_input)
result = await self._the_super_do_eval(eval_input)

# Check if base returned nan (invalid output case)
if math.isnan(result.get(self._result_key, 0)):
Expand Down
Loading
Loading