From 6c81f6ab4fa366acf78520e872ca3583a1ab1404 Mon Sep 17 00:00:00 2001
From: mohessie <mohessie@microsoft.com>
Date: Tue, 16 Jun 2026 19:21:25 +0300
Subject: [PATCH 1/4] feat(evaluation): unify validators with azureml-assets

- add DEVELOPER role, EvaluationLevel, MessagesOrQueryResponseInputValidator + level utils
- support actions/expected_actions aliases in TaskNavigationEfficiencyValidator
- align check_for_unsupported_tools flags in tool_call/input/output evaluators
---
 .../_common/_validators/__init__.py           |  16 ++
 .../_validators/_conversation_validator.py    |  57 +++----
 .../_validators/_evaluation_level_utils.py    |  65 +++++++
 .../_messages_or_query_response_validator.py  | 158 ++++++++++++++++++
 .../_task_navigation_efficiency_validator.py  |  34 +++-
 .../_validators/_validation_constants.py      |  12 ++
 .../_tool_call_accuracy.py                    |   2 +-
 .../_tool_input_accuracy.py                   |   2 +-
 .../_tool_output_utilization.py               |   4 +-
 9 files changed, 316 insertions(+), 34 deletions(-)
 create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py
 create mode 100644 sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py
index 92be4feac022..d7aefa8ccbd4 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py
@@ -3,16 +3,32 @@
 
 """Validators package init."""
 
+from ._validation_constants import MessageRole, ContentType, EvaluationLevel
 from ._validator_interface import ValidatorInterface
 from ._conversation_validator import ConversationValidator
 from ._tool_definitions_validator import ToolDefinitionsValidator
 from ._tool_calls_validator import ToolCallsValidator
 from ._task_navigation_efficiency_validator import TaskNavigationEfficiencyValidator
+from ._messages_or_query_response_validator import MessagesOrQueryResponseInputValidator
+from ._evaluation_level_utils import (
+    _resolve_evaluation_level,
+    _merge_query_response_messages,
+    _split_messages_at_latest_user,
+    _wrap_string_messages,
+)
 
 __all__ = [
+    "MessageRole",
+    "ContentType",
+    "EvaluationLevel",
     "ValidatorInterface",
     "ConversationValidator",
     "ToolDefinitionsValidator",
     "ToolCallsValidator",
     "TaskNavigationEfficiencyValidator",
+    "MessagesOrQueryResponseInputValidator",
+    "_resolve_evaluation_level",
+    "_merge_query_response_messages",
+    "_split_messages_at_latest_user",
+    "_wrap_string_messages",
 ]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py
index 5e43f0265b0f..a1c375340bfc 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py
@@ -130,7 +130,7 @@ def _validate_text_content_item(self, content_item: Dict[str, Any], role: str) -
 
         if not isinstance(content_item["text"], str):
             return EvaluationException(
-                message=f"The 'text' field must be a string in content items.",
+                message="The 'text' field must be a string in content items.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=self.error_target,
@@ -196,16 +196,16 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu
         """Validate assistant message content."""
         content = message["content"]
 
-        valid_assistant_content_types = [
-            ContentType.TEXT,
-            ContentType.OUTPUT_TEXT,
-            ContentType.TOOL_CALL,
-            ContentType.FUNCTION_CALL,
-            ContentType.MCP_APPROVAL_REQUEST,
-            ContentType.OPENAPI_CALL,
-        ]
-        valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types]
         if isinstance(content, list):
+            valid_assistant_content_types = [
+                ContentType.TEXT,
+                ContentType.OUTPUT_TEXT,
+                ContentType.TOOL_CALL,
+                ContentType.FUNCTION_CALL,
+                ContentType.MCP_APPROVAL_REQUEST,
+                ContentType.OPENAPI_CALL,
+            ]
+            valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types]
             for content_item in content:
                 content_type = content_item["type"]
                 if content_type not in valid_assistant_content_types:
@@ -225,19 +225,21 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu
                     if error:
                         return error
 
-                # Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools
-                if self.check_for_unsupported_tools:
-                    if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL:
-                        name = (
-                            "openapi_call" if content_type == ContentType.OPENAPI_CALL else content_item["name"].lower()
-                        )
-                        if name in self.UNSUPPORTED_TOOLS:
-                            return EvaluationException(
-                                message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.",
-                                blame=ErrorBlame.USER_ERROR,
-                                category=ErrorCategory.NOT_APPLICABLE,
-                                target=self.error_target,
+                    # Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools
+                    if self.check_for_unsupported_tools:
+                        if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL:
+                            name = (
+                                "openapi_call"
+                                if content_type == ContentType.OPENAPI_CALL
+                                else content_item["name"].lower()
                             )
+                            if name in self.UNSUPPORTED_TOOLS:
+                                return EvaluationException(
+                                    message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.",
+                                    blame=ErrorBlame.USER_ERROR,
+                                    category=ErrorCategory.NOT_APPLICABLE,
+                                    target=self.error_target,
+                                )
         return None
 
     def _validate_tool_message(self, message: Dict[str, Any]) -> Optional[EvaluationException]:
@@ -314,7 +316,7 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation
         )
         if not content_is_string_or_list_of_dicts:
             return EvaluationException(
-                message=f"The 'content' field must be a string or a list of dictionaries messages.",
+                message="The 'content' field must be a string or a list of dictionaries messages.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=self.error_target,
@@ -322,23 +324,22 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation
 
         if len(content) == 0:
             return EvaluationException(
-                message=f"The 'content' field can't be empty.",
+                message="The 'content' field can't be empty.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=self.error_target,
             )
 
         if isinstance(content, list):
-            all_messages_have_type_field = all("type" in item for item in content)
-            if not all_messages_have_type_field:
+            if not all("type" in item for item in content):
                 return EvaluationException(
-                    message=f"Each content item in the 'content' list must contain a 'type' field.",
+                    message="Each content item in the 'content' list must contain a 'type' field.",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
 
-        if role in [MessageRole.USER, MessageRole.SYSTEM]:
+        if role in [MessageRole.USER, MessageRole.SYSTEM, MessageRole.DEVELOPER]:
             error = self._validate_user_or_system_message(message, role)
             if error:
                 return error
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py
new file mode 100644
index 000000000000..7dcfefed147d
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py
@@ -0,0 +1,65 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Utilities for resolving evaluation levels and reshaping query/response/messages inputs.
+"""
+
+from typing import List, Optional, Tuple, Union
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from ._validation_constants import MessageRole, EvaluationLevel
+
+
+def _resolve_evaluation_level(
+    evaluation_level: Optional[Union[EvaluationLevel, str]],
+    error_target: ErrorTarget,
+) -> Optional[EvaluationLevel]:
+    """Normalize ``evaluation_level`` to an ``EvaluationLevel`` member, or ``None`` for auto-detect.
+
+    ``None`` and the empty string both resolve to ``None``. An ``EvaluationLevel`` member is
+    returned as-is, and a string is looked up by value (the match is case-sensitive).
+
+    :param evaluation_level: The evaluation level to resolve.
+    :type evaluation_level: Optional[Union[EvaluationLevel, str]]
+    :param error_target: The error target attached to any raised exception.
+    :type error_target: ErrorTarget
+    :return: The resolved EvaluationLevel, or None when no level was requested.
+    :rtype: Optional[EvaluationLevel]
+    :raises EvaluationException: If the value is neither a known level string nor a member.
+    """
+    if evaluation_level is None or evaluation_level == "":
+        return None
+    if isinstance(evaluation_level, EvaluationLevel):
+        return evaluation_level
+    lookup_error: Optional[ValueError] = None
+    if isinstance(evaluation_level, str):
+        try:
+            return EvaluationLevel(evaluation_level)
+        except ValueError as exc:
+            lookup_error = exc  # kept so the single raise below can chain it
+    valid = [level.value for level in EvaluationLevel]
+    raise EvaluationException(
+        message=f"Invalid evaluation_level '{evaluation_level}'. Must be one of: {valid}.",
+        blame=ErrorBlame.USER_ERROR,
+        category=ErrorCategory.INVALID_VALUE,
+        target=error_target,
+    ) from lookup_error
+
+
+def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]:
+    """Return a new list holding the query messages followed by the response messages."""
+    return list(query) + list(response)
+
+
+def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]:
+    """Split messages into query/response slices at the latest user turn."""
+    latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == MessageRole.USER)
+    return messages[: latest_user_index + 1], messages[latest_user_index + 1 :]
+
+
+def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]:
+    """Wrap plain query/response strings as one-message user/assistant lists."""
+    # Each string becomes the sole "text" content item of its message.
+    user_message = {"role": "user", "content": [{"type": "text", "text": query}]}
+    assistant_message = {"role": "assistant", "content": [{"type": "text", "text": response}]}
+    return [user_message], [assistant_message]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py
new file mode 100644
index 000000000000..370d3d3edd1d
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_messages_or_query_response_validator.py
@@ -0,0 +1,158 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""
+Validator that supports both single-turn (query/response) and multi-turn (messages) inputs.
+"""
+
+from typing import Any, Dict
+from typing_extensions import override
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from ._validation_constants import MessageRole, ContentType
+from ._conversation_validator import ConversationValidator
+from ._tool_definitions_validator import ToolDefinitionsValidator
+
+
+class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator):
+    """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs.
+
+    A single implementation serves all evaluators via two behavior flags:
+      - ``enforce_tool_definitions`` (default True): validate ``tool_definitions`` in both the
+        messages path and the query/response path. Set False for evaluators that do not accept
+        tool definitions (parity with a plain ``ConversationValidator``).
+      - ``deep_validate_messages`` (default False): additionally run full per-message
+        ``_validate_message_dict`` checks in the messages path.
+    """
+
+    enforce_tool_definitions: bool = True
+    deep_validate_messages: bool = False
+
+    def __init__(
+        self,
+        error_target: ErrorTarget,
+        requires_query: bool = True,
+        optional_tool_definitions: bool = True,
+        check_for_unsupported_tools: bool = False,
+        *,
+        enforce_tool_definitions: bool = True,
+        deep_validate_messages: bool = False,
+    ):
+        """Store the two behavior flags and forward the remaining options to the parent validator."""
+        super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools)
+        self.enforce_tool_definitions = enforce_tool_definitions
+        self.deep_validate_messages = deep_validate_messages
+
+    @override
+    def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
+        """Validate evaluation input, supporting messages as an alternative to query/response.
+
+        A non-``None`` ``messages`` value selects the multi-turn checks in this method. Otherwise
+        validation is delegated to the inherited implementation or, when ``enforce_tool_definitions``
+        is False, directly to ``ConversationValidator.validate_eval_input``.
+
+        :raises EvaluationException: If the ``messages`` input fails a check.
+        """
+        # Multi-turn path (messages list)
+        messages = eval_input.get("messages")
+        if messages is not None:
+            if not isinstance(messages, list):
+                raise EvaluationException(
+                    message="messages must be provided as a list of message dictionaries.",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=self.error_target,
+                )
+            if len(messages) == 0:
+                raise EvaluationException(
+                    message="messages list must not be empty.",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=self.error_target,
+                )
+
+            # Per-message structural checks
+            valid_roles = {role.value for role in MessageRole}
+            roles_present = set()
+            for index, message in enumerate(messages):
+                if not isinstance(message, dict):
+                    raise EvaluationException(
+                        message=(
+                            "Each item in 'messages' must be a dictionary, "
+                            f"but item at index {index} is {type(message).__name__}."
+                        ),
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=self.error_target,
+                    )
+                role = message.get("role")
+                if role is None:
+                    raise EvaluationException(
+                        message=f"Each message must contain a 'role' key, but message at index {index} is missing it.",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=self.error_target,
+                    )
+                if not isinstance(role, str) or role not in valid_roles:
+                    raise EvaluationException(
+                        message=(
+                            f"Invalid role '{role}' at message index {index}. "
+                            f"Must be one of: {sorted(valid_roles)}."
+                        ),
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=self.error_target,
+                    )
+                roles_present.add(role)
+
+            # Conversation-level checks
+            if MessageRole.USER.value not in roles_present:
+                raise EvaluationException(
+                    message="messages must contain at least one message with role 'user'.",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=self.error_target,
+                )
+            if MessageRole.ASSISTANT.value not in roles_present:
+                raise EvaluationException(
+                    message="messages must contain at least one message with role 'assistant'.",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=self.error_target,
+                )
+            # The last message (whatever its role) must carry text when its content is a list
+            last_content = messages[-1].get("content", "")
+            if isinstance(last_content, list):
+                text_types = (ContentType.TEXT, ContentType.INPUT_TEXT, ContentType.OUTPUT_TEXT)
+                has_text = any(
+                    isinstance(content_item, str)
+                    or (isinstance(content_item, dict) and content_item.get("type") in text_types)
+                    for content_item in last_content
+                )
+                if not has_text:
+                    raise EvaluationException(
+                        message=(
+                            "The last message must contain text content, "
+                            "not only tool calls. The conversation appears to be "
+                            "mid-execution \u2014 provide the agent's final text response."
+                        ),
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=self.error_target,
+                    )
+
+            if self.deep_validate_messages:
+                for message in messages:
+                    error = self._validate_message_dict(message)
+                    if error:
+                        raise error
+
+            if self.enforce_tool_definitions:
+                tool_definitions = eval_input.get("tool_definitions")
+                tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions)
+                if tool_definitions_validation_exception:
+                    raise tool_definitions_validation_exception
+            return True
+
+        if self.enforce_tool_definitions:
+            return super().validate_eval_input(eval_input)
+        return ConversationValidator.validate_eval_input(self, eval_input)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py
index 132303129546..3c0d6018b2eb 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py
@@ -17,17 +17,38 @@ class TaskNavigationEfficiencyValidator(ValidatorInterface):
     """
     Validate task navigation efficiency inputs (response and ground_truth).
 
+    Accepts either the SDK input names (``response``/``ground_truth``) or the
+    azureml-assets names (``actions``/``expected_actions``).
+
     Validates:
-    - response: List of assistant messages containing tool calls
-    - ground_truth: Either a list of expected tool names, or a tuple of (tool names, parameters dict)
+    - response (alias ``actions``): List of assistant messages containing tool calls
+    - ground_truth (alias ``expected_actions``): Either a list of expected tool names, or a
+      tuple of (tool names, parameters dict)
     """
 
     error_target: ErrorTarget
 
+    # Canonical input key -> accepted alternate (azureml-assets) key name.
+    _INPUT_ALIASES: Dict[str, str] = {
+        "response": "actions",
+        "ground_truth": "expected_actions",
+    }
+
     def __init__(self, error_target: ErrorTarget):
         """Initialize with error target."""
         self.error_target = error_target
 
+    def _normalize_input_aliases(self, eval_input: Dict[str, Any]) -> None:
+        """Fill missing canonical inputs from their azureml-assets aliases, mutating ``eval_input``.
+
+        A canonical key (``response``/``ground_truth``) that is missing or ``None`` receives the
+        value of its alias (``actions``/``expected_actions``) when that alias is not ``None``.
+        """
+        for canonical_key, alias_key in self._INPUT_ALIASES.items():
+            alias_value = eval_input.get(alias_key)
+            if alias_value is not None and eval_input.get(canonical_key) is None:
+                eval_input[canonical_key] = alias_value
+
     def _validate_response(self, response: Any) -> Optional[EvaluationException]:
         """Validate the response parameter."""
         if response is None:
@@ -221,8 +242,12 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
         """
         Validate task navigation evaluation input.
 
+        Accepts either the SDK input names (``response``/``ground_truth``) or the
+        azureml-assets names (``actions``/``expected_actions``).
+
         Args:
-            eval_input: Dictionary containing 'response' and 'ground_truth'.
+            eval_input: Dictionary containing 'response'/'ground_truth' (or their
+                'actions'/'expected_actions' aliases).
 
         Returns:
             True if validation passes.
@@ -230,6 +255,9 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
         Raises:
             EvaluationException: If validation fails.
         """
+        # Normalize azureml-assets-style aliases ('actions'/'expected_actions') onto canonical keys.
+        self._normalize_input_aliases(eval_input)
+
         # If response or ground_truth is a string, try to parse it as JSON
         for key in ("response", "ground_truth"):
             value = eval_input.get(key)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py
index f4c242a9f02b..3c6795309672 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_validation_constants.py
@@ -15,6 +15,7 @@ class MessageRole(str, Enum):
     ASSISTANT = "assistant"
     SYSTEM = "system"
     TOOL = "tool"
+    DEVELOPER = "developer"
 
 
 class ContentType(str, Enum):
@@ -31,3 +32,14 @@ class ContentType(str, Enum):
     MCP_APPROVAL_RESPONSE = "mcp_approval_response"
     OPENAPI_CALL = "openapi_call"
     OPENAPI_CALL_OUTPUT = "openapi_call_output"
+
+
+class EvaluationLevel(str, Enum):
+    """Evaluation levels a caller can request; a ``str`` mixin, so members equal their values.
+
+    - ``CONVERSATION``: Force conversation-level evaluation using the multi-turn path.
+    - ``TURN``: Force turn-level evaluation using the single-turn query/response path.
+    """
+
+    CONVERSATION = "conversation"
+    TURN = "turn"
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 3a2ccb1ace85..f5057f09e947 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
         # Initialize input validator
         self._validator = ToolCallsValidator(
             error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
index 7ebc20c7e130..198fefde02d1 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -92,7 +92,7 @@ def __init__(
         self._validator = ToolDefinitionsValidator(
             error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
             optional_tool_definitions=False,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
index 6339fdab2bb6..6f8605c5a071 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
@@ -86,7 +86,9 @@ def __init__(
 
         # Initialize input validator
         self._validator = ToolDefinitionsValidator(
-            error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR, optional_tool_definitions=False
+            error_target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
+            optional_tool_definitions=False,
+            check_for_unsupported_tools=True,
         )
 
         super().__init__(

From 2808e5af81159bee3d897b0179f266db5fa33458 Mon Sep 17 00:00:00 2001
From: Mohamed Hessien <mohessie@microsoft.com>
Date: Tue, 16 Jun 2026 20:23:28 +0300
Subject: [PATCH 2/4] fix(evaluation): guard _split_messages_at_latest_user against a missing user turn

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 .../_common/_validators/_evaluation_level_utils.py         | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py
index 7dcfefed147d..379e3e065902 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_evaluation_level_utils.py
@@ -53,7 +53,12 @@ def _merge_query_response_messages(query: List[dict], response: List[dict]) -> L
 
 def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]:
     """Split messages into query/response slices at the latest user turn."""
-    latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == MessageRole.USER)
+    latest_user_index = max(
+        (i for i, message in enumerate(messages) if message.get("role") == MessageRole.USER.value),
+        default=-1,
+    )
+    if latest_user_index == -1:
+        raise ValueError("messages must contain at least one message with role 'user'.")
     return messages[: latest_user_index + 1], messages[latest_user_index + 1 :]
 
 

From 106ac42ef1f22b784f8dcd942b8ec2eaa284df08 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 16 Jun 2026 17:28:15 +0000
Subject: [PATCH 3/4] Add unit tests for actions/expected_actions alias input
 normalization

Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
---
 ...t_task_navigation_efficiency_evaluators.py | 92 +++++++++++++++++++
 1 file changed, 92 insertions(+)

diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
index 2c1a5dfba237..8a1f3b512b9b 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
@@ -404,3 +404,95 @@ def test_matching_mode_validation(self):
         # Test invalid type for mode
         with pytest.raises(Exception):  # EvaluationException
             _TaskNavigationEfficiencyEvaluator(matching_mode=123)  # type: ignore
+
+    # ==================== ALIAS INPUT NORMALIZATION TESTS ====================
+
+    def test_alias_actions_normalized_as_response(self):
+        """Test that 'actions' alias is accepted and normalized to 'response'."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+
+        actions = [
+            {
+                "role": "assistant",
+                "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}],
+            },
+        ]
+        ground_truth = ["search", "analyze"]
+
+        result = evaluator(actions=actions, ground_truth=ground_truth)
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+
+    def test_alias_expected_actions_normalized_as_ground_truth(self):
+        """Test that 'expected_actions' alias is accepted and normalized to 'ground_truth'."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+
+        response = [
+            {
+                "role": "assistant",
+                "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}],
+            },
+        ]
+        expected_actions = ["search", "analyze"]
+
+        result = evaluator(response=response, expected_actions=expected_actions)
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+
+    def test_both_aliases_normalized_and_evaluated(self):
+        """Test that both 'actions' and 'expected_actions' aliases together produce the correct result."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+
+        actions = [
+            {
+                "role": "assistant",
+                "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "analyze", "arguments": {}}],
+            },
+        ]
+        expected_actions = ["search", "analyze"]
+
+        result = evaluator(actions=actions, expected_actions=expected_actions)
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_result"] == "pass"
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
+        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
+
+    def test_alias_inputs_mismatch(self):
+        """Test that alias inputs produce a failing result when actions do not match expected_actions."""
+        evaluator = _TaskNavigationEfficiencyEvaluator(matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH)
+
+        # Agent performs 'search' and 'extra_step', but the expected actions are 'search' and 'analyze'.
+        actions = [
+            {
+                "role": "assistant",
+                "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {}}],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "extra_step", "arguments": {}}],
+            },
+        ]
+        expected_actions = ["search", "analyze"]
+
+        result = evaluator(actions=actions, expected_actions=expected_actions)
+        assert result["task_navigation_efficiency_passed"] is False
+        assert result["task_navigation_efficiency_result"] == "fail"
+        # precision: 1 match out of 2 agent steps = 0.5
+        assert result["task_navigation_efficiency_properties"]["precision_score"] == 0.5
+        # recall: 1 match out of 2 expected steps = 0.5
+        assert result["task_navigation_efficiency_properties"]["recall_score"] == 0.5

From aadd11ca772af3e370a1ed910f934b5426d912de Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 16 Jun 2026 17:29:13 +0000
Subject: [PATCH 4/4] Remove redundant assertions from
 test_both_aliases_normalized_and_evaluated

Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
---
 .../unittests/test_task_navigation_efficiency_evaluators.py   | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
index 8a1f3b512b9b..7b3ab816ad22 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_evaluators.py
@@ -467,10 +467,6 @@ def test_both_aliases_normalized_and_evaluated(self):
 
         result = evaluator(actions=actions, expected_actions=expected_actions)
         assert result["task_navigation_efficiency_passed"] is True
-        assert result["task_navigation_efficiency_result"] == "pass"
-        assert result["task_navigation_efficiency_properties"]["precision_score"] == 1.0
-        assert result["task_navigation_efficiency_properties"]["recall_score"] == 1.0
-        assert result["task_navigation_efficiency_properties"]["f1_score"] == 1.0
 
     def test_alias_inputs_mismatch(self):
         """Test that alias inputs produce a failing result when actions do not match expected_actions."""