-
Notifications
You must be signed in to change notification settings - Fork 3.3k
feat(evaluation): unify validators with azureml-assets #47526
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
6c81f6a
2808e5a
106ac42
aadd11c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| # Copyright (c) Microsoft Corporation. | ||
| # Licensed under the MIT License. | ||
|
|
||
| """ | ||
| Utilities for resolving evaluation levels and reshaping query/response/messages inputs. | ||
| """ | ||
|
|
||
| from typing import List, Optional, Tuple, Union | ||
| from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget | ||
| from ._validation_constants import MessageRole, EvaluationLevel | ||
|
|
||
|
|
||
def _resolve_evaluation_level(
    evaluation_level: Optional[Union[EvaluationLevel, str]],
    error_target: ErrorTarget,
) -> Optional[EvaluationLevel]:
    """Normalize ``evaluation_level`` into an ``EvaluationLevel`` member.

    ``None`` and the empty string resolve to ``None``. An ``EvaluationLevel``
    member is returned unchanged. Any other string is handed to the
    ``EvaluationLevel`` constructor.

    :param evaluation_level: The evaluation level to resolve.
    :type evaluation_level: Optional[Union[EvaluationLevel, str]]
    :param error_target: The error target attached to any raised exception.
    :type error_target: ErrorTarget
    :return: The resolved EvaluationLevel, or None for auto-detect.
    :rtype: Optional[EvaluationLevel]
    :raises EvaluationException: If the value is a string that ``EvaluationLevel``
        rejects, or is neither a string nor an ``EvaluationLevel``.
    """
    # Nothing supplied: the caller gets None back.
    if evaluation_level is None or evaluation_level == "":
        return None

    # Already normalized. This must be tested before the ``str`` check below.
    if isinstance(evaluation_level, EvaluationLevel):
        return evaluation_level

    def _invalid_level_error() -> EvaluationException:
        # Built lazily so the list of allowed values is only computed on failure.
        allowed_values = [member.value for member in EvaluationLevel]
        return EvaluationException(
            message=(
                f"Invalid evaluation_level '{evaluation_level}'. "
                f"Must be one of: {allowed_values}."
            ),
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=error_target,
        )

    if not isinstance(evaluation_level, str):
        raise _invalid_level_error()

    try:
        return EvaluationLevel(evaluation_level)
    except ValueError as exc:
        # Keep the enum lookup failure attached as the cause.
        raise _invalid_level_error() from exc
|
|
||
|
|
||
| def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]: | ||
| """Merge query and response message lists into a single conversation.""" | ||
| return [*query, *response] | ||
|
|
||
|
|
||
def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]:
    """Split a conversation into query and response slices at the latest user turn.

    The query slice runs from the start of ``messages`` through the last message
    whose ``role`` equals ``MessageRole.USER.value`` (inclusive). The response
    slice is everything after that message and may be empty.

    :param messages: The conversation to split.
    :type messages: List[dict]
    :return: A ``(query_messages, response_messages)`` pair.
    :rtype: Tuple[List[dict], List[dict]]
    :raises ValueError: If no message has the user role.
    """
    split_at = -1
    # Scan forward over every message and remember the last match.
    for position, entry in enumerate(messages):
        if entry.get("role") == MessageRole.USER.value:
            split_at = position

    if split_at == -1:
        raise ValueError("messages must contain at least one message with role 'user'.")

    boundary = split_at + 1
    return messages[:boundary], messages[boundary:]
|
|
||
|
|
||
| def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]: | ||
| """Wrap string query/response into separate message lists.""" | ||
| return ( | ||
| [{"role": "user", "content": [{"type": "text", "text": query}]}], | ||
| [{"role": "assistant", "content": [{"type": "text", "text": response}]}], | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,158 @@ | ||
| # Copyright (c) Microsoft Corporation. | ||
| # Licensed under the MIT License. | ||
|
|
||
| """ | ||
| Validator that supports both single-turn (query/response) and multi-turn (messages) inputs. | ||
| """ | ||
|
|
||
| from typing import Any, Dict | ||
| from typing_extensions import override | ||
| from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget | ||
| from ._validation_constants import MessageRole, ContentType | ||
| from ._conversation_validator import ConversationValidator | ||
| from ._tool_definitions_validator import ToolDefinitionsValidator | ||
|
|
||
|
|
||
class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator):
    """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs.

    A single implementation serves all evaluators via two behavior flags:
    - ``enforce_tool_definitions`` (default True): validate ``tool_definitions`` in both the
      messages path and the query/response path. Set False for evaluators that do not accept
      tool definitions (parity with a plain ``ConversationValidator``).
    - ``deep_validate_messages`` (default False): additionally run full per-message
      ``_validate_message_dict`` checks in the messages path.
    """

    # TODO(review): unit tests for this validator were requested during code review.

    enforce_tool_definitions: bool = True
    deep_validate_messages: bool = False

    def __init__(
        self,
        error_target: ErrorTarget,
        requires_query: bool = True,
        optional_tool_definitions: bool = True,
        check_for_unsupported_tools: bool = False,
        *,
        enforce_tool_definitions: bool = True,
        deep_validate_messages: bool = False,
    ):
        """Initialize MessagesOrQueryResponseInputValidator.

        ``error_target``, ``requires_query``, ``optional_tool_definitions`` and
        ``check_for_unsupported_tools`` are forwarded unchanged to the parent initializer.

        :param error_target: The error target for exceptions.
        :type error_target: ErrorTarget
        :param requires_query: Forwarded to the parent initializer.
        :type requires_query: bool
        :param optional_tool_definitions: Forwarded to the parent initializer.
        :type optional_tool_definitions: bool
        :param check_for_unsupported_tools: Forwarded to the parent initializer.
        :type check_for_unsupported_tools: bool
        :keyword enforce_tool_definitions: Whether ``tool_definitions`` is validated.
        :paramtype enforce_tool_definitions: bool
        :keyword deep_validate_messages: Whether every message is also passed through
            ``_validate_message_dict`` in the messages path.
        :paramtype deep_validate_messages: bool
        """
        super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools)
        self.enforce_tool_definitions = enforce_tool_definitions
        self.deep_validate_messages = deep_validate_messages

    @override
    def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
        """Validate evaluation input, supporting messages as an alternative to query/response.

        When ``eval_input`` has a non-None ``messages`` entry the multi-turn checks run;
        otherwise validation is delegated to the query/response validators.

        :param eval_input: The evaluator input to validate.
        :type eval_input: Dict[str, Any]
        :return: True when the messages path passes; otherwise the result of the
            delegated query/response validation.
        :rtype: bool
        :raises EvaluationException: If any messages-path check fails.
        """
        messages = eval_input.get("messages")

        # Single-turn path (query/response)
        if messages is None:
            if self.enforce_tool_definitions:
                return super().validate_eval_input(eval_input)
            # Deliberately bypass the ToolDefinitionsValidator override.
            return ConversationValidator.validate_eval_input(self, eval_input)

        # Multi-turn path (messages list)
        self._check_messages_container(messages)
        roles_present = self._check_message_roles(messages)
        self._check_conversation_roles(roles_present)
        self._check_final_message_has_text(messages[-1])

        if self.deep_validate_messages:
            for message in messages:
                error = self._validate_message_dict(message)
                if error:
                    raise error

        if self.enforce_tool_definitions:
            tool_definitions = eval_input.get("tool_definitions")
            tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions)
            if tool_definitions_validation_exception:
                raise tool_definitions_validation_exception
        return True

    def _invalid_messages_error(self, message: str) -> EvaluationException:
        """Build the user-error exception shared by every messages-path check."""
        return EvaluationException(
            message=message,
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=self.error_target,
        )

    def _check_messages_container(self, messages: Any) -> None:
        """Require ``messages`` to be a non-empty list."""
        if not isinstance(messages, list):
            raise self._invalid_messages_error("messages must be provided as a list of message dictionaries.")
        if not messages:
            raise self._invalid_messages_error("messages list must not be empty.")

    def _check_message_roles(self, messages: list) -> set:
        """Check each message is a dict with a known role; return the set of roles seen."""
        valid_roles = {role.value for role in MessageRole}
        roles_present = set()
        for index, message in enumerate(messages):
            if not isinstance(message, dict):
                raise self._invalid_messages_error(
                    f"Each item in 'messages' must be a dictionary, "
                    f"but item at index {index} is {type(message).__name__}."
                )
            role = message.get("role")
            if role is None:
                raise self._invalid_messages_error(
                    f"Each message must contain a 'role' key, but message at index {index} is missing it."
                )
            try:
                role_is_valid = role in valid_roles
            except TypeError:
                # An unhashable role (e.g. a list or dict) cannot be looked up in a set.
                # Report it as an invalid role instead of leaking a raw TypeError.
                role_is_valid = False
            if not role_is_valid:
                raise self._invalid_messages_error(
                    f"Invalid role '{role}' at message index {index}. " f"Must be one of: {sorted(valid_roles)}."
                )
            roles_present.add(role)
        return roles_present

    def _check_conversation_roles(self, roles_present: set) -> None:
        """Require at least one user message and at least one assistant message."""
        if MessageRole.USER.value not in roles_present:
            raise self._invalid_messages_error("messages must contain at least one message with role 'user'.")
        if MessageRole.ASSISTANT.value not in roles_present:
            raise self._invalid_messages_error("messages must contain at least one message with role 'assistant'.")

    def _check_final_message_has_text(self, last_message: Dict[str, Any]) -> None:
        """Require list-shaped content on the last message to include a text item.

        The check applies to the last message whatever its role, and is skipped when
        its content is not a list.
        """
        # NOTE(review): code review asked for an explicit check that the last message's role
        # is 'assistant' before this content check. That is not enforced here because it
        # would reject inputs that are accepted today - confirm the intended contract.
        last_content = last_message.get("content", "")
        if not isinstance(last_content, list):
            return

        text_types = (
            ContentType.TEXT,
            ContentType.INPUT_TEXT,
            ContentType.OUTPUT_TEXT,
        )
        has_text = any(
            isinstance(content_item, str)
            or (isinstance(content_item, dict) and content_item.get("type") in text_types)
            for content_item in last_content
        )
        if not has_text:
            raise self._invalid_messages_error(
                "The last message must contain text content, "
                "not only tool calls. The conversation appears to be "
                "mid-execution \u2014 provide the agent's final text response."
            )
Uh oh!
There was an error while loading. Please reload this page.