From 6c81f6ab4fa366acf78520e872ca3583a1ab1404 Mon Sep 17 00:00:00 2001 From: mohessie Date: Tue, 16 Jun 2026 19:21:25 +0300 Subject: [PATCH 1/4] feat(evaluation): unify validators with azureml-assets - add DEVELOPER role, EvaluationLevel, MessagesOrQueryResponseInputValidator + level utils - support actions/expected_actions aliases in TaskNavigationEfficiencyValidator - align check_for_unsupported_tools flags in tool_call/input/output evaluators --- .../_common/_validators/__init__.py | 16 ++ .../_validators/_conversation_validator.py | 57 +++---- .../_validators/_evaluation_level_utils.py | 65 +++++++ .../_messages_or_query_response_validator.py | 158 ++++++++++++++++++ .../_task_navigation_efficiency_validator.py | 34 +++- .../_validators/_validation_constants.py | 12 ++ .../_tool_call_accuracy.py | 2 +- .../_tool_input_accuracy.py | 2 +- .../_tool_output_utilization.py | 4 +- 9 files changed, 316 insertions(+), 34 deletions(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py index 92be4feac022..d7aefa8ccbd4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py @@ -3,16 +3,32 @@ """Validators package init.""" +from ._validation_constants import MessageRole, ContentType, EvaluationLevel from ._validator_interface import ValidatorInterface from ._conversation_validator import ConversationValidator from ._tool_definitions_validator import ToolDefinitionsValidator from ._tool_calls_validator import ToolCallsValidator from ._task_navigation_efficiency_validator import TaskNavigationEfficiencyValidator +from ._messages_or_query_response_validator import MessagesOrQueryResponseInputValidator +from ._evaluation_level_utils import ( + _resolve_evaluation_level, + _merge_query_response_messages, + _split_messages_at_latest_user, + _wrap_string_messages, +) __all__ = [ + "MessageRole", + "ContentType", + "EvaluationLevel", "ValidatorInterface", "ConversationValidator", "ToolDefinitionsValidator", "ToolCallsValidator", "TaskNavigationEfficiencyValidator", + "MessagesOrQueryResponseInputValidator", + "_resolve_evaluation_level", + "_merge_query_response_messages", + "_split_messages_at_latest_user", + "_wrap_string_messages", ] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py index 5e43f0265b0f..a1c375340bfc 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py @@ -130,7 +130,7 @@ def _validate_text_content_item(self, content_item: Dict[str, Any], role: str) - if not isinstance(content_item["text"], str): return EvaluationException( - message=f"The 'text' field must be a string in content items.", + message="The 'text' field must be a string in content items.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, @@ -196,16 +196,16 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu """Validate assistant message content.""" content = message["content"] - valid_assistant_content_types = [ - ContentType.TEXT, - ContentType.OUTPUT_TEXT, - ContentType.TOOL_CALL, - ContentType.FUNCTION_CALL, - ContentType.MCP_APPROVAL_REQUEST, - ContentType.OPENAPI_CALL, - ] - valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types] if isinstance(content, list): + valid_assistant_content_types = [ + ContentType.TEXT, + ContentType.OUTPUT_TEXT, + ContentType.TOOL_CALL, + ContentType.FUNCTION_CALL, + ContentType.MCP_APPROVAL_REQUEST, + ContentType.OPENAPI_CALL, + ] + valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types] for content_item in content: content_type = content_item["type"] if content_type not in valid_assistant_content_types: @@ -225,19 +225,21 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu if error: return error - # Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools - if self.check_for_unsupported_tools: - if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL: - name = ( - "openapi_call" if content_type == ContentType.OPENAPI_CALL else content_item["name"].lower() - ) - if name in self.UNSUPPORTED_TOOLS: - return EvaluationException( - message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.", - blame=ErrorBlame.USER_ERROR, - category=ErrorCategory.NOT_APPLICABLE, - target=self.error_target, + # Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools + if self.check_for_unsupported_tools: + if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL: + name = ( + "openapi_call" + if content_type == ContentType.OPENAPI_CALL + else content_item["name"].lower() ) + if name in self.UNSUPPORTED_TOOLS: + return EvaluationException( + message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.NOT_APPLICABLE, + target=self.error_target, + ) return None def _validate_tool_message(self, message: Dict[str, Any]) -> Optional[EvaluationException]: @@ -314,7 +316,7 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation ) if not content_is_string_or_list_of_dicts: return EvaluationException( - message=f"The 'content' field must be a string or a list of dictionaries messages.", + message="The 'content' field must be a string or a list of dictionaries messages.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, @@ -322,23 +324,22 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation if len(content) == 0: return EvaluationException( - message=f"The 'content' field can't be empty.", + message="The 'content' field can't be empty.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) if isinstance(content, list): - all_messages_have_type_field = all("type" in item for item in content) - if not all_messages_have_type_field: + if not all("type" in item for item in content): return EvaluationException( - message=f"Each content item in the 'content' list must contain a 'type' field.", + message="Each content item in the 'content' list must contain a 'type' field.", blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=self.error_target, ) - if role in [MessageRole.USER, MessageRole.SYSTEM]: + if role in [MessageRole.USER, MessageRole.SYSTEM, MessageRole.DEVELOPER]: error = self._validate_user_or_system_message(message, role) if error: return error diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py new file mode 100644 index 000000000000..7dcfefed147d --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py @@ -0,0 +1,65 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Utilities for resolving evaluation levels and reshaping query/response/messages inputs. +""" + +from typing import List, Optional, Tuple, Union +from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from ._validation_constants import MessageRole, EvaluationLevel + + +def _resolve_evaluation_level( + evaluation_level: Optional[Union[EvaluationLevel, str]], + error_target: ErrorTarget, +) -> Optional[EvaluationLevel]: + """Validate and normalize the evaluation_level parameter. + + :param evaluation_level: The evaluation level to resolve. + :type evaluation_level: Optional[Union[EvaluationLevel, str]] + :param error_target: The error target for exceptions. + :type error_target: ErrorTarget + :return: The resolved EvaluationLevel or None for auto-detect. + :rtype: Optional[EvaluationLevel] + """ + valid = [level.value for level in EvaluationLevel] + if evaluation_level is None or evaluation_level == "": + return None + if isinstance(evaluation_level, EvaluationLevel): + return evaluation_level + if isinstance(evaluation_level, str): + try: + return EvaluationLevel(evaluation_level) + except ValueError as exc: + raise EvaluationException( + message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=error_target, + ) from exc + raise EvaluationException( + message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {valid}."), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=error_target, + ) + + +def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]: + """Merge query and response message lists into a single conversation.""" + return [*query, *response] + + +def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]: + """Split messages into query/response slices at the latest user turn.""" + latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == MessageRole.USER) + return messages[: latest_user_index + 1], messages[latest_user_index + 1 :] + + +def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]: + """Wrap string query/response into separate message lists.""" + return ( + [{"role": "user", "content": [{"type": "text", "text": query}]}], + [{"role": "assistant", "content": [{"type": "text", "text": response}]}], + ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py new file mode 100644 index 000000000000..370d3d3edd1d --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py @@ -0,0 +1,158 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +""" +Validator that supports both single-turn (query/response) and multi-turn (messages) inputs. +""" + +from typing import Any, Dict +from typing_extensions import override +from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget +from ._validation_constants import MessageRole, ContentType +from ._conversation_validator import ConversationValidator +from ._tool_definitions_validator import ToolDefinitionsValidator + + +class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator): + """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs. + + A single implementation serves all evaluators via two behavior flags: + - ``enforce_tool_definitions`` (default True): validate ``tool_definitions`` in both the + messages path and the query/response path. Set False for evaluators that do not accept + tool definitions (parity with a plain ``ConversationValidator``). + - ``deep_validate_messages`` (default False): additionally run full per-message + ``_validate_message_dict`` checks in the messages path. + """ + + enforce_tool_definitions: bool = True + deep_validate_messages: bool = False + + def __init__( + self, + error_target: ErrorTarget, + requires_query: bool = True, + optional_tool_definitions: bool = True, + check_for_unsupported_tools: bool = False, + *, + enforce_tool_definitions: bool = True, + deep_validate_messages: bool = False, + ): + """Initialize MessagesOrQueryResponseInputValidator.""" + super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools) + self.enforce_tool_definitions = enforce_tool_definitions + self.deep_validate_messages = deep_validate_messages + + @override + def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: + """Validate evaluation input, supporting messages as an alternative to query/response.""" + # Multi-turn path (messages list) + messages = eval_input.get("messages") + if messages is not None: + if not isinstance(messages, list): + raise EvaluationException( + message="messages must be provided as a list of message dictionaries.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + if len(messages) == 0: + raise EvaluationException( + message="messages list must not be empty.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + + # Per-message structural checks + valid_roles = {role.value for role in MessageRole} + roles_present = set() + for index, message in enumerate(messages): + if not isinstance(message, dict): + raise EvaluationException( + message=( + f"Each item in 'messages' must be a dictionary, " + f"but item at index {index} is {type(message).__name__}." + ), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + role = message.get("role") + if role is None: + raise EvaluationException( + message=f"Each message must contain a 'role' key, but message at index {index} is missing it.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + if role not in valid_roles: + raise EvaluationException( + message=( + f"Invalid role '{role}' at message index {index}. " + f"Must be one of: {sorted(valid_roles)}." + ), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + roles_present.add(role) + + # Conversation-level checks + if MessageRole.USER.value not in roles_present: + raise EvaluationException( + message="messages must contain at least one message with role 'user'.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + if MessageRole.ASSISTANT.value not in roles_present: + raise EvaluationException( + message="messages must contain at least one message with role 'assistant'.", + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + # The final assistant message must contain text + last_content = messages[-1].get("content", "") + if isinstance(last_content, list): + has_text = any( + ( + isinstance(content_item, dict) + and content_item.get("type") + in ( + ContentType.TEXT, + ContentType.INPUT_TEXT, + ContentType.OUTPUT_TEXT, + ) + ) + or isinstance(content_item, str) + for content_item in last_content + ) + if not has_text: + raise EvaluationException( + message=( + "The last message must contain text content, " + "not only tool calls. The conversation appears to be " + "mid-execution \u2014 provide the agent's final text response." + ), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=self.error_target, + ) + + if self.deep_validate_messages: + for message in messages: + error = self._validate_message_dict(message) + if error: + raise error + + if self.enforce_tool_definitions: + tool_definitions = eval_input.get("tool_definitions") + tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions) + if tool_definitions_validation_exception: + raise tool_definitions_validation_exception + return True + + if self.enforce_tool_definitions: + return super().validate_eval_input(eval_input) + return ConversationValidator.validate_eval_input(self, eval_input) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py index 132303129546..3c0d6018b2eb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py @@ -17,17 +17,38 @@ class TaskNavigationEfficiencyValidator(ValidatorInterface): """ Validate task navigation efficiency inputs (response and ground_truth). + Accepts either the SDK input names (``response``/``ground_truth``) or the + azureml-assets names (``actions``/``expected_actions``). + Validates: - - response: List of assistant messages containing tool calls - - ground_truth: Either a list of expected tool names, or a tuple of (tool names, parameters dict) + - response (alias ``actions``): List of assistant messages containing tool calls + - ground_truth (alias ``expected_actions``): Either a list of expected tool names, or a + tuple of (tool names, parameters dict) """ error_target: ErrorTarget + # Canonical input key -> accepted alternate (azureml-assets) key name. + _INPUT_ALIASES: Dict[str, str] = { + "response": "actions", + "ground_truth": "expected_actions", + } + def __init__(self, error_target: ErrorTarget): """Initialize with error target.""" self.error_target = error_target + def _normalize_input_aliases(self, eval_input: Dict[str, Any]) -> None: + """Map azureml-assets-style input keys onto the canonical keys in place. + + If a canonical key (``response``/``ground_truth``) is absent but its alias + (``actions``/``expected_actions``) is provided, copy the alias value to the canonical + key so the rest of the pipeline can rely on a single set of names. + """ + for canonical, alias in self._INPUT_ALIASES.items(): + if eval_input.get(canonical) is None and eval_input.get(alias) is not None: + eval_input[canonical] = eval_input[alias] + def _validate_response(self, response: Any) -> Optional[EvaluationException]: """Validate the response parameter.""" if response is None: @@ -221,8 +242,12 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: """ Validate task navigation evaluation input. + Accepts either the SDK input names (``response``/``ground_truth``) or the + azureml-assets names (``actions``/``expected_actions``). + Args: - eval_input: Dictionary containing 'response' and 'ground_truth'. + eval_input: Dictionary containing 'response'/'ground_truth' (or their + 'actions'/'expected_actions' aliases). Returns: True if validation passes. @@ -230,6 +255,9 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: Raises: EvaluationException: If validation fails. """ + # Normalize azureml-assets-style aliases ('actions'/'expected_actions') onto canonical keys. + self._normalize_input_aliases(eval_input) + # If response or ground_truth is a string, try to parse it as JSON for key in ("response", "ground_truth"): value = eval_input.get(key) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py index f4c242a9f02b..3c6795309672 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py @@ -15,6 +15,7 @@ class MessageRole(str, Enum): ASSISTANT = "assistant" SYSTEM = "system" TOOL = "tool" + DEVELOPER = "developer" class ContentType(str, Enum): @@ -31,3 +32,14 @@ class ContentType(str, Enum): MCP_APPROVAL_RESPONSE = "mcp_approval_response" OPENAPI_CALL = "openapi_call" OPENAPI_CALL_OUTPUT = "openapi_call_output" + + +class EvaluationLevel(str, Enum): + """Supported evaluation levels for the evaluator. + + - ``CONVERSATION``: Force conversation-level evaluation using the multi-turn path. + - ``TURN``: Force turn-level evaluation using the single-turn query/response path. + """ + + CONVERSATION = "conversation" + TURN = "turn" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3a2ccb1ace85..f5057f09e947 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, # Initialize input validator self._validator = ToolCallsValidator( error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 7ebc20c7e130..198fefde02d1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -92,7 +92,7 @@ def __init__( self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py index 6339fdab2bb6..6f8605c5a071 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py @@ -86,7 +86,9 @@ def __init__( # Initialize input validator self._validator = ToolDefinitionsValidator( - error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, optional_tool_definitions=False + error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, + optional_tool_definitions=False, + check_for_unsupported_tools=True, ) super().__init__( From 2808e5af81159bee3d897b0179f266db5fa33458 Mon Sep 17 00:00:00 2001 From: Mohamed Hessien Date: Tue, 16 Jun 2026 20:23:28 +0300 Subject: [PATCH 2/4] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../_common/_validators/_evaluation_level_utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py index 7dcfefed147d..379e3e065902 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py @@ -53,7 +53,12 @@ def _merge_query_response_messages(query: List[dict], response: List[dict]) -> L def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]: """Split messages into query/response slices at the latest user turn.""" - latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == MessageRole.USER) + latest_user_index = max( + (i for i, message in enumerate(messages) if message.get("role") == MessageRole.USER.value), + default=-1, + ) + if latest_user_index == -1: + raise ValueError("messages must contain at least one message with role 'user'.") return messages[: latest_user_index + 1], messages[latest_user_index + 1 :] From 106ac42ef1f22b784f8dcd942b8ec2eaa284df08 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Jun 2026 17:28:15 +0000 Subject: [PATCH 3/4] Add unit tests for actions/expected_actions alias input normalization Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> --- ...t_task_navigation_efficiency_evaluators.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py index 2c1a5dfba237..8a1f3b512b9b 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py @@ -404,3 +404,95 @@ def test_matching_mode_validation(self): # Test invalid type for mode with pytest.raises(Exception): # EvaluationException _TaskNavigationEfficiencyEvaluator(matching_mode=123) # type: ignore + + # ==================== ALIAS INPUT NORMALIZATION TESTS ==================== + + def test_alias_actions_normalized_as_response(self): + """Test that 'actions' alias is accepted and normalized to 'response'.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + actions = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + ] + ground_truth = ["search", "analyze"] + + result = evaluator(actions=actions, ground_truth=ground_truth) + assert result["task_navigation_efficiency_passed"] is True + assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 + + def test_alias_expected_actions_normalized_as_ground_truth(self): + """Test that 'expected_actions' alias is accepted and normalized to 'ground_truth'.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + response = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + ] + expected_actions = ["search", "analyze"] + + result = evaluator(response=response, expected_actions=expected_actions) + assert result["task_navigation_efficiency_passed"] is True + assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 + + def test_both_aliases_normalized_and_evaluated(self): + """Test that both 'actions' and 'expected_actions' aliases together produce the correct result.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + actions = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}], + }, + ] + expected_actions = ["search", "analyze"] + + result = evaluator(actions=actions, expected_actions=expected_actions) + assert result["task_navigation_efficiency_passed"] is True + assert result["task_navigation_efficiency_result"] == "pass" + assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 + assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0 + + def test_alias_inputs_mismatch(self): + """Test that alias inputs produce a failing result when actions do not match expected_actions.""" + evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH) + + # Agent performs 'search' and 'extra_step', but expected is 'search' and 'analyze' + actions = [ + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}], + }, + { + "role": "assistant", + "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "extra_step", "arguments": {}}], + }, + ] + expected_actions = ["search", "analyze"] + + result = evaluator(actions=actions, expected_actions=expected_actions) + assert result["task_navigation_efficiency_passed"] is False + assert result["task_navigation_efficiency_result"] == "fail" + # precision: 1 match out of 2 agent steps = 0.5 + assert result["task_navigation_efficiency_properties"]["precision_score"] == 0.5 + # recall: 1 match out of 2 expected steps = 0.5 + assert result["task_navigation_efficiency_properties"]["recall_score"] == 0.5 From aadd11ca772af3e370a1ed910f934b5426d912de Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Jun 2026 17:29:13 +0000 Subject: [PATCH 4/4] Remove redundant assertions from test_both_aliases_normalized_and_evaluated Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com> --- .../unittests/test_task_navigation_efficiency_evaluators.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py index 8a1f3b512b9b..7b3ab816ad22 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py @@ -467,10 +467,6 @@ def test_both_aliases_normalized_and_evaluated(self): result = evaluator(actions=actions, expected_actions=expected_actions) assert result["task_navigation_efficiency_passed"] is True - assert result["task_navigation_efficiency_result"] == "pass" - assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0 - assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0 - assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0 def test_alias_inputs_mismatch(self): """Test that alias inputs produce a failing result when actions do not match expected_actions."""