Skip to content

Commit 6c81f6a

Browse files
committed
feat(evaluation): unify validators with azureml-assets
- add DEVELOPER role, EvaluationLevel, MessagesOrQueryResponseInputValidator + level utils
- support actions/expected_actions aliases in TaskNavigationEfficiencyValidator
- align check_for_unsupported_tools flags in tool_call/input/output evaluators
1 parent 89c1029 commit 6c81f6a

9 files changed

Lines changed: 316 additions & 34 deletions

File tree

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,32 @@
33

44
"""Validators package init."""
55

6+
from ._validation_constants import MessageRole, ContentType, EvaluationLevel
67
from ._validator_interface import ValidatorInterface
78
from ._conversation_validator import ConversationValidator
89
from ._tool_definitions_validator import ToolDefinitionsValidator
910
from ._tool_calls_validator import ToolCallsValidator
1011
from ._task_navigation_efficiency_validator import TaskNavigationEfficiencyValidator
12+
from ._messages_or_query_response_validator import MessagesOrQueryResponseInputValidator
13+
from ._evaluation_level_utils import (
14+
_resolve_evaluation_level,
15+
_merge_query_response_messages,
16+
_split_messages_at_latest_user,
17+
_wrap_string_messages,
18+
)
1119

1220
# Names exported by the validators package.
# NOTE: ``from <package> import *`` normally skips underscore-prefixed names; listing the
# ``_``-prefixed evaluation-level helpers here makes a star-import include them as well.
__all__ = [
    "MessageRole",
    "ContentType",
    "EvaluationLevel",
    "ValidatorInterface",
    "ConversationValidator",
    "ToolDefinitionsValidator",
    "ToolCallsValidator",
    "TaskNavigationEfficiencyValidator",
    "MessagesOrQueryResponseInputValidator",
    "_resolve_evaluation_level",
    "_merge_query_response_messages",
    "_split_messages_at_latest_user",
    "_wrap_string_messages",
]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_conversation_validator.py

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def _validate_text_content_item(self, content_item: Dict[str, Any], role: str) -
130130

131131
if not isinstance(content_item["text"], str):
132132
return EvaluationException(
133-
message=f"The 'text' field must be a string in content items.",
133+
message="The 'text' field must be a string in content items.",
134134
blame=ErrorBlame.USER_ERROR,
135135
category=ErrorCategory.INVALID_VALUE,
136136
target=self.error_target,
@@ -196,16 +196,16 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu
196196
"""Validate assistant message content."""
197197
content = message["content"]
198198

199-
valid_assistant_content_types = [
200-
ContentType.TEXT,
201-
ContentType.OUTPUT_TEXT,
202-
ContentType.TOOL_CALL,
203-
ContentType.FUNCTION_CALL,
204-
ContentType.MCP_APPROVAL_REQUEST,
205-
ContentType.OPENAPI_CALL,
206-
]
207-
valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types]
208199
if isinstance(content, list):
200+
valid_assistant_content_types = [
201+
ContentType.TEXT,
202+
ContentType.OUTPUT_TEXT,
203+
ContentType.TOOL_CALL,
204+
ContentType.FUNCTION_CALL,
205+
ContentType.MCP_APPROVAL_REQUEST,
206+
ContentType.OPENAPI_CALL,
207+
]
208+
valid_assistant_content_types_as_strings = [t.value for t in valid_assistant_content_types]
209209
for content_item in content:
210210
content_type = content_item["type"]
211211
if content_type not in valid_assistant_content_types:
@@ -225,19 +225,21 @@ def _validate_assistant_message(self, message: Dict[str, Any]) -> Optional[Evalu
225225
if error:
226226
return error
227227

228-
# Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools
229-
if self.check_for_unsupported_tools:
230-
if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL:
231-
name = (
232-
"openapi_call" if content_type == ContentType.OPENAPI_CALL else content_item["name"].lower()
233-
)
234-
if name in self.UNSUPPORTED_TOOLS:
235-
return EvaluationException(
236-
message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.",
237-
blame=ErrorBlame.USER_ERROR,
238-
category=ErrorCategory.NOT_APPLICABLE,
239-
target=self.error_target,
228+
# Raise error in case of unsupported tools for evaluators that enabled check_for_unsupported_tools
229+
if self.check_for_unsupported_tools:
230+
if content_type == ContentType.TOOL_CALL or content_type == ContentType.OPENAPI_CALL:
231+
name = (
232+
"openapi_call"
233+
if content_type == ContentType.OPENAPI_CALL
234+
else content_item["name"].lower()
240235
)
236+
if name in self.UNSUPPORTED_TOOLS:
237+
return EvaluationException(
238+
message=f"{name} tool call is currently not supported for {self.error_target.value} evaluator.",
239+
blame=ErrorBlame.USER_ERROR,
240+
category=ErrorCategory.NOT_APPLICABLE,
241+
target=self.error_target,
242+
)
241243
return None
242244

243245
def _validate_tool_message(self, message: Dict[str, Any]) -> Optional[EvaluationException]:
@@ -314,31 +316,30 @@ def _validate_message_dict(self, message: Dict[str, Any]) -> Optional[Evaluation
314316
)
315317
if not content_is_string_or_list_of_dicts:
316318
return EvaluationException(
317-
message=f"The 'content' field must be a string or a list of dictionaries messages.",
319+
message="The 'content' field must be a string or a list of dictionaries messages.",
318320
blame=ErrorBlame.USER_ERROR,
319321
category=ErrorCategory.INVALID_VALUE,
320322
target=self.error_target,
321323
)
322324

323325
if len(content) == 0:
324326
return EvaluationException(
325-
message=f"The 'content' field can't be empty.",
327+
message="The 'content' field can't be empty.",
326328
blame=ErrorBlame.USER_ERROR,
327329
category=ErrorCategory.INVALID_VALUE,
328330
target=self.error_target,
329331
)
330332

331333
if isinstance(content, list):
332-
all_messages_have_type_field = all("type" in item for item in content)
333-
if not all_messages_have_type_field:
334+
if not all("type" in item for item in content):
334335
return EvaluationException(
335-
message=f"Each content item in the 'content' list must contain a 'type' field.",
336+
message="Each content item in the 'content' list must contain a 'type' field.",
336337
blame=ErrorBlame.USER_ERROR,
337338
category=ErrorCategory.INVALID_VALUE,
338339
target=self.error_target,
339340
)
340341

341-
if role in [MessageRole.USER, MessageRole.SYSTEM]:
342+
if role in [MessageRole.USER, MessageRole.SYSTEM, MessageRole.DEVELOPER]:
342343
error = self._validate_user_or_system_message(message, role)
343344
if error:
344345
return error
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""
5+
Utilities for resolving evaluation levels and reshaping query/response/messages inputs.
6+
"""
7+
8+
from typing import List, Optional, Tuple, Union
9+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
10+
from ._validation_constants import MessageRole, EvaluationLevel
11+
12+
13+
def _resolve_evaluation_level(
    evaluation_level: Optional[Union[EvaluationLevel, str]],
    error_target: ErrorTarget,
) -> Optional[EvaluationLevel]:
    """Normalize ``evaluation_level`` into an ``EvaluationLevel`` member.

    ``None`` and the empty string both yield ``None`` (auto-detect). An existing
    ``EvaluationLevel`` member is returned unchanged, and a string is looked up by enum value.

    :param evaluation_level: The requested level, as an enum member, its string value, or None.
    :type evaluation_level: Optional[Union[EvaluationLevel, str]]
    :param error_target: The error target attached to any raised exception.
    :type error_target: ErrorTarget
    :return: The matching EvaluationLevel, or None when no level was requested.
    :rtype: Optional[EvaluationLevel]
    :raises EvaluationException: If the value is a string that matches no enum value, or is
        neither a string nor an EvaluationLevel.
    """
    allowed_values = [member.value for member in EvaluationLevel]

    def _invalid_level_error() -> EvaluationException:
        # Single place that builds the user-facing error shared by both failure paths.
        return EvaluationException(
            message=(f"Invalid evaluation_level '{evaluation_level}'. " f"Must be one of: {allowed_values}."),
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=error_target,
        )

    if evaluation_level is None or evaluation_level == "":
        return None
    if isinstance(evaluation_level, EvaluationLevel):
        return evaluation_level
    if not isinstance(evaluation_level, str):
        # Unsupported type: there is no lookup failure to chain from.
        raise _invalid_level_error()

    try:
        return EvaluationLevel(evaluation_level)
    except ValueError as lookup_error:
        # Keep the enum lookup failure as the explicit cause.
        raise _invalid_level_error() from lookup_error
47+
48+
49+
def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]:
    """Concatenate query and response messages into one new conversation list.

    The query messages come first, followed by the response messages. A new list is
    returned; neither input list is modified.
    """
    conversation = list(query)
    conversation.extend(response)
    return conversation
52+
53+
54+
def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]:
    """Split messages into query/response slices at the latest user turn.

    The query slice runs from the first message through the last message whose ``role``
    equals ``MessageRole.USER`` (inclusive); the response slice is everything after it.

    :param messages: The conversation to split.
    :type messages: List[dict]
    :return: A ``(query_messages, response_messages)`` tuple of list slices.
    :rtype: Tuple[List[dict], List[dict]]
    :raises ValueError: If no message has the user role (including an empty list).
    """
    # Walk backwards so the search stops at the most recent user turn.
    for index in range(len(messages) - 1, -1, -1):
        # ``.get`` treats a message without a role as "not a user turn" instead of raising KeyError.
        if messages[index].get("role") == MessageRole.USER:
            return messages[: index + 1], messages[index + 1 :]

    # Same exception type the bare ``max()`` over an empty sequence produced, with a usable message.
    raise ValueError("messages must contain at least one message with role 'user' to split on.")
58+
59+
60+
def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]:
    """Turn a plain-text query/response pair into two single-message lists.

    The query becomes one ``user`` message and the response one ``assistant`` message,
    each carrying a single ``text`` content item.

    :return: A ``(query_messages, response_messages)`` tuple.
    """
    user_turn = {"role": "user", "content": [{"type": "text", "text": query}]}
    assistant_turn = {"role": "assistant", "content": [{"type": "text", "text": response}]}
    return [user_turn], [assistant_turn]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""
5+
Validator that supports both single-turn (query/response) and multi-turn (messages) inputs.
6+
"""
7+
8+
from typing import Any, Dict
9+
from typing_extensions import override
10+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
11+
from ._validation_constants import MessageRole, ContentType
12+
from ._conversation_validator import ConversationValidator
13+
from ._tool_definitions_validator import ToolDefinitionsValidator
14+
15+
16+
class MessagesOrQueryResponseInputValidator(ToolDefinitionsValidator):
    """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs.

    When ``eval_input`` carries a non-None ``messages`` entry, the messages path is validated
    here; otherwise validation is delegated to a parent validator's ``validate_eval_input``.

    A single implementation serves all evaluators via two behavior flags:
    - ``enforce_tool_definitions`` (default True): validate ``tool_definitions`` in both the
      messages path and the query/response path. Set False for evaluators that do not accept
      tool definitions (parity with a plain ``ConversationValidator``).
    - ``deep_validate_messages`` (default False): additionally run full per-message
      ``_validate_message_dict`` checks in the messages path.
    """

    # Class-level defaults; ``__init__`` overwrites both on every instance.
    enforce_tool_definitions: bool = True
    deep_validate_messages: bool = False

    def __init__(
        self,
        error_target: ErrorTarget,
        requires_query: bool = True,
        optional_tool_definitions: bool = True,
        check_for_unsupported_tools: bool = False,
        *,
        enforce_tool_definitions: bool = True,
        deep_validate_messages: bool = False,
    ):
        """Initialize MessagesOrQueryResponseInputValidator.

        The first four arguments are forwarded positionally, unchanged, to the parent
        initializer; the two keyword-only flags are stored on the instance.

        :param error_target: The error target attached to raised exceptions.
        :type error_target: ErrorTarget
        :param requires_query: Forwarded to the parent validator.
        :type requires_query: bool
        :param optional_tool_definitions: Forwarded to the parent validator.
        :type optional_tool_definitions: bool
        :param check_for_unsupported_tools: Forwarded to the parent validator.
        :type check_for_unsupported_tools: bool
        :keyword enforce_tool_definitions: Whether ``tool_definitions`` are validated.
        :paramtype enforce_tool_definitions: bool
        :keyword deep_validate_messages: Whether each message is also checked with
            ``_validate_message_dict`` in the messages path.
        :paramtype deep_validate_messages: bool
        """
        super().__init__(error_target, requires_query, optional_tool_definitions, check_for_unsupported_tools)
        self.enforce_tool_definitions = enforce_tool_definitions
        self.deep_validate_messages = deep_validate_messages

    @override
    def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
        """Validate evaluation input, supporting messages as an alternative to query/response.

        :param eval_input: The evaluator input; ``messages`` and ``tool_definitions`` are read from it.
        :type eval_input: Dict[str, Any]
        :return: True when the messages path passes; otherwise whatever the delegated
            parent ``validate_eval_input`` returns.
        :rtype: bool
        :raises EvaluationException: On the first failed check in the messages path.
        """
        # Multi-turn path (messages list)
        messages = eval_input.get("messages")
        if messages is not None:
            if not isinstance(messages, list):
                raise EvaluationException(
                    message="messages must be provided as a list of message dictionaries.",
                    blame=ErrorBlame.USER_ERROR,
                    category=ErrorCategory.INVALID_VALUE,
                    target=self.error_target,
                )
            if len(messages) == 0:
                raise EvaluationException(
                    message="messages list must not be empty.",
                    blame=ErrorBlame.USER_ERROR,
                    category=ErrorCategory.INVALID_VALUE,
                    target=self.error_target,
                )

            # Per-message structural checks
            # Roles are compared by their enum *values*, and the same raw values are collected below.
            valid_roles = {role.value for role in MessageRole}
            roles_present = set()
            for index, message in enumerate(messages):
                if not isinstance(message, dict):
                    raise EvaluationException(
                        message=(
                            f"Each item in 'messages' must be a dictionary, "
                            f"but item at index {index} is {type(message).__name__}."
                        ),
                        blame=ErrorBlame.USER_ERROR,
                        category=ErrorCategory.INVALID_VALUE,
                        target=self.error_target,
                    )
                role = message.get("role")
                # A missing key and an explicit ``"role": None`` are reported the same way.
                if role is None:
                    raise EvaluationException(
                        message=f"Each message must contain a 'role' key, but message at index {index} is missing it.",
                        blame=ErrorBlame.USER_ERROR,
                        category=ErrorCategory.INVALID_VALUE,
                        target=self.error_target,
                    )
                if role not in valid_roles:
                    raise EvaluationException(
                        message=(
                            f"Invalid role '{role}' at message index {index}. "
                            f"Must be one of: {sorted(valid_roles)}."
                        ),
                        blame=ErrorBlame.USER_ERROR,
                        category=ErrorCategory.INVALID_VALUE,
                        target=self.error_target,
                    )
                roles_present.add(role)

            # Conversation-level checks
            if MessageRole.USER.value not in roles_present:
                raise EvaluationException(
                    message="messages must contain at least one message with role 'user'.",
                    blame=ErrorBlame.USER_ERROR,
                    category=ErrorCategory.INVALID_VALUE,
                    target=self.error_target,
                )
            if MessageRole.ASSISTANT.value not in roles_present:
                raise EvaluationException(
                    message="messages must contain at least one message with role 'assistant'.",
                    blame=ErrorBlame.USER_ERROR,
                    category=ErrorCategory.INVALID_VALUE,
                    target=self.error_target,
                )
            # The last message (whatever its role) must contain text when its content is a list.
            # Non-list content (e.g. a plain string, or a missing 'content' key) is not inspected here.
            last_content = messages[-1].get("content", "")
            if isinstance(last_content, list):
                # Text means a dict item typed text/input_text/output_text, or a bare string item.
                has_text = any(
                    (
                        isinstance(content_item, dict)
                        and content_item.get("type")
                        in (
                            ContentType.TEXT,
                            ContentType.INPUT_TEXT,
                            ContentType.OUTPUT_TEXT,
                        )
                    )
                    or isinstance(content_item, str)
                    for content_item in last_content
                )
                if not has_text:
                    raise EvaluationException(
                        message=(
                            "The last message must contain text content, "
                            "not only tool calls. The conversation appears to be "
                            "mid-execution \u2014 provide the agent's final text response."
                        ),
                        blame=ErrorBlame.USER_ERROR,
                        category=ErrorCategory.INVALID_VALUE,
                        target=self.error_target,
                    )

            # Optional deep pass: the helper returns an exception (or a falsy value); the first one is raised.
            if self.deep_validate_messages:
                for message in messages:
                    error = self._validate_message_dict(message)
                    if error:
                        raise error

            # Tool definitions are validated last; a missing key is passed to the helper as None.
            if self.enforce_tool_definitions:
                tool_definitions = eval_input.get("tool_definitions")
                tool_definitions_validation_exception = self._validate_tool_definitions(tool_definitions)
                if tool_definitions_validation_exception:
                    raise tool_definitions_validation_exception
            return True

        # Query/response path: no 'messages' entry (or it is None), so delegate to a parent validator.
        if self.enforce_tool_definitions:
            return super().validate_eval_input(eval_input)
        # Explicit class call instead of super().
        # NOTE(review): presumably ToolDefinitionsValidator derives from ConversationValidator and this
        # skips its tool-definition checks; confirm against the parent classes.
        return ConversationValidator.validate_eval_input(self, eval_input)

0 commit comments

Comments
 (0)