diff --git a/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py b/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py index 99fdeb0fe7..5acc69e152 100644 --- a/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py +++ b/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py @@ -4,7 +4,7 @@ import os import logging import math -from typing import Dict, List, Optional, Union, Any +from typing import Dict, List, Optional, Union, Any, Tuple from typing_extensions import overload, override @@ -20,10 +20,14 @@ ErrorTarget, EvaluationException, ErrorCategory, + check_score_is_valid, construct_prompty_model_config, validate_model_config, _extract_text_from_content, + _get_agent_response, + _pretty_format_conversation_history, ) +from azure.ai.evaluation._common.utils import reformat_tool_definitions from abc import ABC, abstractmethod from enum import Enum @@ -66,6 +70,78 @@ class ContentType(str, Enum): OPENAPI_CALL_OUTPUT = "openapi_call_output" +class EvaluationLevel(str, Enum): + """Supported evaluation levels for GroundednessEvaluator. + + - ``CONVERSATION``: Force conversation-level evaluation using the multi-turn path. + - ``TURN``: Force turn-level evaluation using the single-turn query/response path. + """ + + CONVERSATION = "conversation" + TURN = "turn" + + +def _resolve_evaluation_level( + evaluation_level: Optional[Union[EvaluationLevel, str]], + error_target: ErrorTarget, +) -> Optional[EvaluationLevel]: + """Validate and normalize the evaluation_level parameter. + + :param evaluation_level: The evaluation level to resolve. + :type evaluation_level: Optional[Union[EvaluationLevel, str]] + :param error_target: The error target for exceptions. + :type error_target: ErrorTarget + :return: The resolved EvaluationLevel or None for auto-detect. 
+ :rtype: Optional[EvaluationLevel] + """ + valid = [level.value for level in EvaluationLevel] + if evaluation_level is None: + return None + if isinstance(evaluation_level, EvaluationLevel): + return evaluation_level + if isinstance(evaluation_level, str): + try: + return EvaluationLevel(evaluation_level) + except ValueError: + raise EvaluationException( + message=( + f"Invalid evaluation_level '{evaluation_level}'. " + f"Must be one of: {valid}." + ), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=error_target, + ) + raise EvaluationException( + message=( + f"Invalid evaluation_level '{evaluation_level}'. " + f"Must be one of: {valid}." + ), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=error_target, + ) + + +def _merge_query_response_messages(query: List[dict], response: List[dict]) -> List[dict]: + """Merge query and response message lists into a single conversation.""" + return [*query, *response] + + +def _split_messages_at_latest_user(messages: List[dict]) -> Tuple[List[dict], List[dict]]: + """Split messages into query/response slices at the latest user turn.""" + latest_user_index = max(i for i, message in enumerate(messages) if message["role"] == MessageRole.USER) + return messages[: latest_user_index + 1], messages[latest_user_index + 1:] + + +def _wrap_string_messages(query: str, response: str) -> Tuple[List[dict], List[dict]]: + """Wrap string query/response into separate message lists.""" + return ( + [{"role": "user", "content": [{"type": "text", "text": query}]}], + [{"role": "assistant", "content": [{"type": "text", "text": response}]}], + ) + + class ConversationValidator(ValidatorInterface): """Validate conversation inputs (queries and responses) comprised of message lists.""" @@ -492,6 +568,115 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool: return True +class MessagesOrQueryResponseInputValidator(ConversationValidator): + """Validator that supports both 
class MessagesOrQueryResponseInputValidator(ConversationValidator):
    """Validator that supports both single-turn (query/response) and multi-turn (messages) inputs.

    When ``messages`` is provided, it validates the messages list.
    Otherwise, it delegates to the parent ``ConversationValidator`` for the query/response path.
    """

    @override
    def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
        """Validate evaluation input, supporting messages as an alternative to query/response.

        :param eval_input: The evaluation input; ``messages`` is inspected when present.
        :type eval_input: Dict[str, Any]
        :return: ``True`` when ``messages`` passes every check, otherwise the parent's result.
        :rtype: bool
        :raises EvaluationException: If ``messages`` is not a non-empty list of dicts with known
            roles, lacks a user or assistant message, does not end with an assistant message,
            or ends with list content that has no text item.
        """
        messages = eval_input.get("messages")
        if messages is None:
            return super().validate_eval_input(eval_input)

        def invalid(message: str) -> EvaluationException:
            # Every failure below is the same kind of user error; only the text differs.
            return EvaluationException(
                message=message,
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.INVALID_VALUE,
                target=self.error_target,
            )

        if not isinstance(messages, list):
            raise invalid("messages must be provided as a list of message dictionaries.")
        if not messages:
            raise invalid("messages list must not be empty.")

        # Per-message structural checks
        valid_roles = {r.value for r in MessageRole}
        roles_present: set = set()
        for i, msg in enumerate(messages):
            if not isinstance(msg, dict):
                raise invalid(
                    f"Each item in 'messages' must be a dictionary, "
                    f"but item at index {i} is {type(msg).__name__}."
                )
            role = msg.get("role")
            if role is None:
                raise invalid(
                    f"Each message must contain a 'role' key, but message at index {i} is missing it."
                )
            try:
                known_role = role in valid_roles
            except TypeError:
                # An unhashable role (e.g. a list or dict) cannot be looked up in the set;
                # report it as an invalid role rather than leaking the TypeError.
                known_role = False
            if not known_role:
                raise invalid(
                    f"Invalid role '{role}' at message index {i}. "
                    f"Must be one of: {sorted(valid_roles)}."
                )
            roles_present.add(role)

        # Conversation-level checks
        if MessageRole.USER not in roles_present:
            raise invalid("messages must contain at least one message with role 'user'.")
        if MessageRole.ASSISTANT not in roles_present:
            raise invalid("messages must contain at least one message with role 'assistant'.")

        last_message = messages[-1]
        if last_message["role"] != MessageRole.ASSISTANT:
            raise invalid(
                f"The last message must have role 'assistant', "
                f"but found role '{last_message['role']}'."
            )

        # The final assistant message must contain text. Only list content is inspected;
        # string (or missing) content is accepted as-is.
        # NOTE(review): only blocks typed exactly "text" count as text here — confirm whether
        # other text-like block types should also be accepted.
        last_content = last_message.get("content", "")
        if isinstance(last_content, list):
            has_text = any(
                isinstance(c, str) or (isinstance(c, dict) and c.get("type") == "text")
                for c in last_content
            )
            if not has_text:
                raise invalid(
                    "The last assistant message must contain text content, "
                    "not only tool calls. The conversation appears to be "
                    "mid-execution — provide the agent's final text response."
                )
        return True
def serialize_messages(messages: List[dict]) -> str:
    """Serialize a list of chat messages into a labeled text transcript for the multi-turn prompty.

    **Input format:** List of message dicts, each with ``"role"`` (``user``, ``assistant``,
    ``tool``, ``system``) and ``"content"`` (string or list of content-block dicts like
    ``{"type": "text", "text": "..."}``).

    **Output format:** Plain-text transcript made of the string returned by
    ``_pretty_format_conversation_history`` followed by ``Agent turn N:`` sections for the agent
    turns that were not handed to that helper; trailing newlines are stripped.

    Consecutive user messages are grouped into one user turn and consecutive assistant/tool
    messages into one agent turn. Assistant string content is wrapped into a single text content
    block before formatting. The content of the last system message seen is passed along as
    ``system_message``. Entries that are not dicts or have no role are ignored.

    :param messages: Chat messages with role and content.
    :type messages: List[dict]
    :return: Formatted text transcript; an empty string when ``messages`` is empty.
    :rtype: str
    """
    if not messages:
        return ""

    # NOTE(review): the indent is copied from the text visible in this diff, where runs of
    # whitespace may have been collapsed — confirm the width against the original file.
    indent = " "

    all_user_queries: List = []
    all_agent_responses: List = []
    cur_user_query: List = []
    cur_agent_response: List = []
    system_message = None

    for msg in messages:
        if not isinstance(msg, dict):
            continue
        role = msg.get("role")
        if not role:
            continue

        if role == MessageRole.SYSTEM:
            system_message = msg.get("content", "")

        elif role == MessageRole.USER and "content" in msg:
            # A user message closes the agent turn that was being collected.
            if cur_agent_response:
                formatted = _get_agent_response(cur_agent_response, include_tool_messages=True)
                all_agent_responses.append([formatted])
                cur_agent_response = []
            content = msg["content"]
            text_in_msg = [content] if isinstance(content, str) else _extract_text_from_content(content)
            if text_in_msg:
                cur_user_query.append(text_in_msg)

        elif role in (MessageRole.ASSISTANT, MessageRole.TOOL):
            # An assistant/tool message closes the user turn that was being collected.
            if cur_user_query:
                all_user_queries.append(cur_user_query)
                cur_user_query = []
            entry = msg
            if role == MessageRole.ASSISTANT and isinstance(msg.get("content"), str):
                # Wrap plain-string assistant content only where it is actually used.
                entry = {**msg, "content": [{"type": "text", "text": msg["content"]}]}
            cur_agent_response.append(entry)

    if cur_user_query:
        all_user_queries.append(cur_user_query)
    if cur_agent_response:
        formatted = _get_agent_response(cur_agent_response, include_tool_messages=True)
        all_agent_responses.append([formatted])

    # The helper receives one agent turn fewer than there are user turns; the remaining
    # agent turns are appended below.
    # NOTE(review): when the conversation starts with an assistant/tool message, that turn is
    # also appended after the helper's output and numbered from the last user turn — confirm
    # this ordering is intended.
    history_agent_turns = all_agent_responses[: len(all_user_queries) - 1] if all_user_queries else []
    conversation_history: Dict = {
        "user_queries": all_user_queries,
        "agent_responses": history_agent_turns,
    }
    if system_message:
        conversation_history["system_message"] = system_message

    # Collect pieces and join once instead of growing a string with ``+=``.
    parts: List[str] = [_pretty_format_conversation_history(conversation_history)]

    start = max(len(all_user_queries) - 1, 0)
    for turn_number, agent_response in enumerate(all_agent_responses[start:], start=start + 1):
        parts.append(f"Agent turn {turn_number}:\n")
        for msg_text in agent_response:
            # An item is either one string or a list of strings; format both the same way.
            chunks = msg_text if isinstance(msg_text, list) else [msg_text]
            for chunk in chunks:
                parts.append(indent + f"\n{indent}".join(chunk.split("\n")) + "\n")
        parts.append("\n")

    return "".join(parts).rstrip("\n")
@@ -710,25 +999,47 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]): _PROMPTY_FILE_NO_QUERY = "groundedness_without_query.prompty" _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty" + _MULTI_TURN_PROMPTY_FILE = "groundedness_multi_turn.prompty" _RESULT_KEY = "groundedness" - _OPTIONAL_PARAMS = ["query"] + _MIN_GROUNDEDNESS_SCORE = 1 + _MAX_GROUNDEDNESS_SCORE = 5 + _OPTIONAL_PARAMS = ["query", "messages", "tool_definitions"] _SUPPORTED_TOOLS = ["file_search"] _validator: ValidatorInterface _validator_with_query: ValidatorInterface + _validator_messages: ValidatorInterface id = "azureai://built-in/evaluators/groundedness" """Evaluator identifier, experimental and to be used only with evaluation in cloud.""" @override - def __init__(self, model_config, *, threshold=3, credential=None, **kwargs): - """Initialize a GroundednessEvaluator instance.""" + def __init__(self, model_config, *, threshold=3, credential=None, evaluation_level=None, **kwargs): + """Initialize a GroundednessEvaluator instance. + + :param model_config: Configuration for the Azure OpenAI model. + :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration] + :keyword threshold: The threshold for the groundedness evaluator. Default is 3. + :type threshold: int + :keyword credential: Credential for authentication. + :type credential: Optional[TokenCredential] + :keyword evaluation_level: Force a specific evaluation level for this invocation. When ``None`` + (default), the level is auto-detected from input shape (``messages`` -> conversation, + ``query``/``response`` -> turn). Set to ``EvaluationLevel.CONVERSATION`` or + ``EvaluationLevel.TURN`` to override auto-detection. 
+ :type evaluation_level: Optional[Union[EvaluationLevel, str]] + """ current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY) # Default to no query self._higher_is_better = True - # Initialize input validator + # Validate and store evaluation level + self._evaluation_level = _resolve_evaluation_level( + evaluation_level, ErrorTarget.GROUNDEDNESS_EVALUATOR + ) + + # Initialize input validators self._validator = ConversationValidator( error_target=ErrorTarget.GROUNDEDNESS_EVALUATOR, requires_query=False, @@ -740,6 +1051,12 @@ def __init__(self, model_config, *, threshold=3, credential=None, **kwargs): check_for_unsupported_tools=True ) + self._validator_messages = MessagesOrQueryResponseInputValidator( + error_target=ErrorTarget.GROUNDEDNESS_EVALUATOR, + requires_query=False, + check_for_unsupported_tools=False + ) + super().__init__( model_config=model_config, prompty_file=prompty_path, @@ -752,7 +1069,20 @@ def __init__(self, model_config, *, threshold=3, credential=None, **kwargs): self._model_config = model_config self.threshold = threshold self._credential = credential - # Needs to be set because it's used in call method to re-validate prompt if `query` is provided + + # Load the multi-turn prompty flow for conversation-level evaluation + multi_turn_prompty_path = os.path.join(current_dir, self._MULTI_TURN_PROMPTY_FILE) + prompty_model_config = construct_prompty_model_config( + validate_model_config(model_config), + self._DEFAULT_OPEN_API_VERSION, + UserAgentSingleton().value, + ) + self._multi_turn_flow = AsyncPrompty.load( + source=multi_turn_prompty_path, + model=prompty_model_config, + is_reasoning_model=self._is_reasoning_model, + token_credential=credential, + ) @overload def __call__( @@ -811,6 +1141,26 @@ def __call__( :rtype: Dict[str, Union[float, Dict[str, List[float]]]] """ + @overload + def __call__( + self, + *, + messages: List[dict], + tool_definitions: Optional[Union[dict, List[dict]]] = 
None, + ) -> Dict[str, Union[str, float]]: + """Evaluate groundedness for a full multi-turn conversation. + + Evaluates whether the agent's responses remain grounded in the provided context, + tool results, and user-provided information throughout all turns. + + :keyword messages: The full multi-turn conversation as a list of message dicts. + :paramtype messages: List[dict] + :keyword tool_definitions: An optional list of tool definitions the agent is aware of. + :paramtype tool_definitions: Optional[Union[dict, List[dict]]] + :return: The groundedness score. + :rtype: Dict[str, Union[str, float]] + """ + @override def __call__( # pylint: disable=docstring-missing-param self, @@ -820,23 +1170,23 @@ def __call__( # pylint: disable=docstring-missing-param """Evaluate groundedness. Accepts either a query, response, and context for a single evaluation, - or a conversation for a multi-turn evaluation. + a conversation for a multi-turn per-turn evaluation, or messages for + conversation-level evaluation. - If the conversation has more than one turn, the evaluator will aggregate the results of each turn. - - :keyword query: The query to be evaluated. Mutually exclusive with `conversation`. Optional parameter for use - with the `response` and `context` parameters. If provided, a different prompt template will be used for - evaluation. + :keyword query: The query to be evaluated. Optional parameter for use + with the `response` and `context` parameters. :paramtype query: Optional[str] - :keyword response: The response to be evaluated. Mutually exclusive with the `conversation` parameter. + :keyword response: The response to be evaluated. :paramtype response: Optional[str] - :keyword context: The context to be evaluated. Mutually exclusive with the `conversation` parameter. + :keyword context: The context to be evaluated. :paramtype context: Optional[str] - :keyword conversation: The conversation to evaluate. 
Expected to contain a list of conversation turns under the - key "messages", and potentially a global context under the key "context". Conversation turns are expected - to be dictionaries with keys "content", "role", and possibly "context". + :keyword conversation: The conversation to evaluate per-turn. :paramtype conversation: Optional[~azure.ai.evaluation.Conversation] - :return: The relevance score. + :keyword messages: The full multi-turn conversation for conversation-level evaluation. + :paramtype messages: Optional[List[dict]] + :keyword tool_definitions: Optional tool definitions for conversation-level evaluation. + :paramtype tool_definitions: Optional[Union[dict, List[dict]]] + :return: The groundedness score. :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]] """ if kwargs.get("query", None): @@ -891,26 +1241,72 @@ def _validate_context(self, context) -> bool: return bool(context.strip()) return True + def _build_result( + self, + score: Optional[Union[int, float]], + result: str, + reason: str, + properties: Dict, + prompty_output_dict: Optional[Dict] = None, + status: Optional[str] = None, + ) -> Dict[str, Union[str, int, float, Dict, None]]: + """Build a standardized groundedness result dictionary.""" + p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {} + parsed_result: Dict[str, Union[str, int, float, Dict, None]] = { + self._result_key: score, + f"{self._result_key}_score": score, + f"{self._result_key}_result": result, + f"{self._result_key}_threshold": self.threshold, + f"{self._result_key}_reason": reason, + f"{self._result_key}_details": properties, + f"{self._result_key}_properties": properties, + f"{self._result_key}_prompt_tokens": p.get("input_token_count", 0), + f"{self._result_key}_completion_tokens": p.get("output_token_count", 0), + f"{self._result_key}_total_tokens": p.get("total_token_count", 0), + f"{self._result_key}_finish_reason": p.get("finish_reason", ""), 
+ f"{self._result_key}_model": p.get("model_id", ""), + f"{self._result_key}_sample_input": p.get("sample_input", ""), + f"{self._result_key}_sample_output": p.get("sample_output", ""), + } + if status is not None: + parsed_result[f"{self._result_key}_status"] = status + return parsed_result + def _not_applicable_result( self, error_message: str, threshold: Union[int, float] ) -> Dict[str, Union[str, float, Dict]]: """Return a result indicating that the evaluation is not applicable.""" - return { - self._result_key: threshold, - f"{self._result_key}_result": "pass", - f"{self._result_key}_threshold": threshold, - f"{self._result_key}_reason": f"Not applicable: {error_message}", - f"{self._result_key}_prompt_tokens": 0, - f"{self._result_key}_completion_tokens": 0, - f"{self._result_key}_total_tokens": 0, - f"{self._result_key}_finish_reason": "", - f"{self._result_key}_model": "", - f"{self._result_key}_sample_input": "", - f"{self._result_key}_sample_output": "", - } + return self._build_result( + score=threshold, + result="not_applicable", + reason=f"Not applicable: {error_message}", + properties={}, + ) + + def _should_use_conversation_level(self, eval_input: Dict) -> bool: + """Determine whether to use conversation-level evaluation. + + When ``_evaluation_level`` is set, it takes precedence. Otherwise, auto-detect + based on whether ``messages`` is present in the input. + + :param eval_input: The evaluation input. + :type eval_input: Dict + :return: True if conversation-level evaluation should be used. 
+ :rtype: bool + """ + if self._evaluation_level == EvaluationLevel.CONVERSATION: + return True + if self._evaluation_level == EvaluationLevel.TURN: + return False + # Auto-detect (_evaluation_level is None) + return eval_input.get("messages") is not None @override async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: + # Route to conversation-level evaluation if appropriate + if self._should_use_conversation_level(eval_input): + return await self._do_eval_conversation_level(eval_input) + if _is_intermediate_response(eval_input.get("response")): return self._not_applicable_result( "Intermediate response. Please provide the agent's final response for evaluation.", @@ -956,6 +1352,87 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: ) return result + async def _do_eval_conversation_level(self, eval_input: Dict) -> Dict[str, Union[float, str]]: + """Evaluate groundedness for a full conversation-level evaluation. + + :param eval_input: The input containing ``messages`` and optionally ``tool_definitions``. + :type eval_input: Dict + :return: The evaluation result. + :rtype: Dict + """ + messages = eval_input["messages"] + + messages = _preprocess_messages(messages) + conversation_text = serialize_messages(messages) + + prompty_kwargs: Dict[str, Any] = {"messages": conversation_text} + tool_definitions = eval_input.get("tool_definitions") + if tool_definitions: + prompty_kwargs["tool_definitions"] = reformat_tool_definitions(tool_definitions, logger) + + prompty_output_dict = await self._multi_turn_flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_kwargs) + return self._parse_prompty_output(prompty_output_dict) + + def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Any]: + """Parse the prompty output into a standardized result dictionary. + + :param prompty_output_dict: Raw output from the prompty flow. + :type prompty_output_dict: Dict + :return: The parsed evaluation result. 
+ :rtype: Dict + """ + llm_output = prompty_output_dict.get("llm_output", prompty_output_dict) + + score: Optional[Union[int, float]] = None + score_result = "error" + reason = "Evaluator returned invalid output." + status = "error" + properties: Dict[str, Any] = {} + + if isinstance(llm_output, dict): + status = str(llm_output.get("status", "completed")).strip().lower() + reason = llm_output.get("reason", llm_output.get("explanation", "")) + properties = llm_output.get("properties", llm_output.get("properties", {})) or {} + if not isinstance(properties, dict): + properties = {} + + if status in ["skipped", "error"]: + score = None + score_result = "not_applicable" + else: + score_value = llm_output.get("score", self.threshold) + if isinstance(score_value, str): + normalized_score = score_value.strip() + score = float(normalized_score) if normalized_score.replace(".", "", 1).isdigit() else None + elif isinstance(score_value, (int, float)): + score = float(score_value) + else: + score = None + + if score is None or not check_score_is_valid( + score, + GroundednessEvaluator._MIN_GROUNDEDNESS_SCORE, + GroundednessEvaluator._MAX_GROUNDEDNESS_SCORE, + ): + score_result = "error" + reason = reason or ( + f"Invalid score value: {score}. Expected a number in range " + f"[{GroundednessEvaluator._MIN_GROUNDEDNESS_SCORE}, " + f"{GroundednessEvaluator._MAX_GROUNDEDNESS_SCORE}]." + ) + status = "error" + else: + score_result = "pass" if score >= self.threshold else "fail" + + return self._build_result( + score=score, + result=score_result, + reason=reason, + properties=properties, + status=status, + prompty_output_dict=prompty_output_dict, + ) + async def _real_call(self, **kwargs): """Asynchronous call where real end-to-end evaluation logic is performed. @@ -964,8 +1441,24 @@ async def _real_call(self, **kwargs): :return: The evaluation result. 
:rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]] """ + # Reshape inputs based on evaluation level before validation + if self._evaluation_level == EvaluationLevel.CONVERSATION and not kwargs.get("messages"): + query = kwargs.get("query") + response = kwargs.get("response") + if isinstance(query, str) and isinstance(response, str) and query and response: + query, response = _wrap_string_messages(query, response) + if isinstance(query, list) and isinstance(response, list): + kwargs["messages"] = _merge_query_response_messages(query, response) + elif self._evaluation_level == EvaluationLevel.TURN and kwargs.get("messages"): + if any(m.get("role") == MessageRole.USER for m in kwargs["messages"]): + query_messages, response_messages = _split_messages_at_latest_user(kwargs["messages"]) + kwargs["query"] = query_messages + kwargs["response"] = response_messages + # Validate input before processing - if kwargs.get("query"): + if kwargs.get("messages"): + self._validator_messages.validate_eval_input(kwargs) + elif kwargs.get("query"): self._validator_with_query.validate_eval_input(kwargs) else: self._validator.validate_eval_input(kwargs) @@ -975,20 +1468,12 @@ async def _real_call(self, **kwargs): return await super()._real_call(**kwargs) except EvaluationException as ex: if ex.category == ErrorCategory.NOT_APPLICABLE: - return { - self._result_key: self.threshold, - f"{self._result_key}_result": "pass", - f"{self._result_key}_threshold": self.threshold, - f"{self._result_key}_reason": f"Not applicable: {ex.message}", - f"{self._result_key}_details": {}, - f"{self._result_key}_prompt_tokens": 0, - f"{self._result_key}_completion_tokens": 0, - f"{self._result_key}_total_tokens": 0, - f"{self._result_key}_finish_reason": "", - f"{self._result_key}_model": "", - f"{self._result_key}_sample_input": "", - f"{self._result_key}_sample_output": "", - } + return self._build_result( + score=self.threshold, + result="pass", + reason=f"Not applicable: 
{ex.message}", + properties={}, + ) else: raise ex @@ -1001,6 +1486,9 @@ def _is_single_entry(self, value): return False def _convert_kwargs_to_eval_input(self, **kwargs): + if kwargs.get("messages") is not None: + return super()._convert_kwargs_to_eval_input(**kwargs) + if kwargs.get("context") or kwargs.get("conversation"): return super()._convert_kwargs_to_eval_input(**kwargs) query = kwargs.get("query") diff --git a/assets/evaluators/builtin/groundedness/evaluator/groundedness_multi_turn.prompty b/assets/evaluators/builtin/groundedness/evaluator/groundedness_multi_turn.prompty new file mode 100644 index 0000000000..a126467ecf --- /dev/null +++ b/assets/evaluators/builtin/groundedness/evaluator/groundedness_multi_turn.prompty @@ -0,0 +1,189 @@ +--- +name: Groundedness Multi-Turn +description: Evaluates groundedness across a full multi-turn conversation +model: + api: chat + parameters: + temperature: 0.0 + max_tokens: 3000 + top_p: 1.0 + presence_penalty: 0 + frequency_penalty: 0 + response_format: + type: json_object +inputs: + messages: + type: string + tool_definitions: + type: string + optional: true + default: "" +--- +system: +# Instruction +## Goal +### You are an expert in evaluating the quality of an AI agent's responses across a full multi-turn conversation based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include a multi-turn CONVERSATION and optionally TOOL_DEFINITIONS. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways. + +user: +# Definition +**Groundedness** refers to how well an AI agent's responses remain anchored in the provided context, tool results, and user-provided information throughout a multi-turn conversation. 
It assesses whether the agent's claims are supported by available grounding sources, whether the agent correctly retains and references information from earlier turns, and whether the agent avoids contradicting, distorting, or fabricating information. + +> Grounding sources include: information explicitly stated by the user, tool results returned during the conversation, context provided by the system. These are the ONLY sources against which the agent's claims should be verified. +> Tool calls and tool results are part of the agent's working process and serve as grounding sources. +> If the agent contradicts or forgets information that was explicitly provided by the user or returned by a tool in an earlier turn, and that information is directly relevant to the current response, this counts as a grounding failure. + +## Important Evaluation Checks +Before scoring, follow this procedure: +1. **Locate the agent's natural-language response:** In multi-turn conversations, agent turns may contain [TOOL_CALL] and [TOOL_RESULT] blocks followed by natural-language text. The text that appears AFTER the tool results IS the agent's response and MUST be evaluated. Do not skip responses that follow tool call blocks or treat the entire turn as "just a tool call." +2. **Extract factual propositions:** From the agent's natural-language response, identify every factual proposition about the world, entities, events, or data. Strip away conversational wrappers first — greetings, self-referential framing ("I know...", "I'm not sure but...", "I can't confirm..."), expressions of uncertainty, polite filler, and question framing are NOT factual propositions and should be set aside before evaluation. +3. **Check each factual proposition against grounding sources:** For each proposition, determine whether its meaning is supported by the grounding sources (user statements, tool results, system context). Focus on whether the core meaning is preserved, not exact wording. 
Groundedness evaluates whether facts are supported by grounding sources, NOT whether the agent's response is topically relevant to the user's question. An off-topic but factually supported response is still grounded. +4. **Classify unsupported content:** + - **Substantive hallucinations** (MUST lower score): Fabricated facts about the topic not in any grounding source, personal opinions or stated preferences about the topic ("my favorite", "I love it"), made-up anecdotes or experiences about the topic ("I had heard of...", "I read that..."), specific factual claims that contradict or materially distort information from the grounding sources. + - **NOT hallucinations** (do NOT penalize these — they are normal conversational behavior): + - Self-referential framing: "I'm a bot", "I can't eat", "I know a bit about it", "I can't confirm that", "I don't have examples right now", "I don't know that detail", "Swimming is not possible for a bot like me". These are meta-statements about the agent, not factual claims about the world. Always ignore them. + - Expressions of uncertainty: "I'm not sure", "I don't really know, but...", "Not really", "No, I don't know" + - Polite filler and engagement: "That's interesting!", "It is great!", "Wow!", "I'm happy to help", "Cool!" + - Follow-up questions and topic-steering: "Have you been there?", "Did you know...?", "What about you?", "Have you heard of [entity]?", "Do you like X?", "Have you ever tried...?" + - Pronoun references ("it", "they", "them") where the referent is identifiable from the conversation context + - Minor paraphrasing that preserves the core meaning: synonym substitutions ("last" → "final"), word form changes ("skateboarders" → "skateboards"), softening/strengthening hedges ("may be the most familiar" → "the most familiar"), restating facts as questions, summarizing with slightly different wording. If the original meaning is preserved, it is grounded. 
+ - Minor typos or grammar errors that don't change the meaning +5. **Apply this decision tree to score:** + - **No factual propositions to evaluate** (response consists entirely of greetings, questions, or acknowledgments with zero factual assertions) → Score 3. Do NOT use score 3 if the response contains at least one factual proposition — instead, evaluate whether each proposition is supported. + - **Any substantive hallucination identified** (even one) → Score 2 (or 1 if pervasive fabrication) + - **All factual propositions grounded but response is incomplete** → Score 4 + - **All factual propositions grounded and response is complete** → Score 5 + +> CRITICAL: If you identify a substantive hallucination (fabricated facts, unsupported opinions about the topic, or claims that contradict or materially distort grounding sources), the score MUST be 2 or lower. Scores 4-5 require zero substantive hallucinations. Self-referential framing, expressions of uncertainty, engagement phrases, follow-up questions, polite filler, pronoun references, and meaning-preserving paraphrasing are NOT hallucinations and must NOT lower the score. + +# Ratings +## [Groundedness: 1] (Not Grounded) +**Definition:** Agent responses are largely disconnected from the available grounding sources. The agent fabricates information, ignores context and tool results, or provides responses unrelated to the conversation content. + +**Examples:** + **CONVERSATION:** + User turn 1: What medications am I currently prescribed? + [TOOL_CALL] get_patient_medications(patient_id="12345") + [TOOL_RESULT] Current medications: Lisinopril 10mg daily, Metformin 500mg twice daily + Agent turn 1: You are currently taking Aspirin 81mg and Atorvastatin 20mg daily. Make sure to take them with food for best absorption. + User turn 2: Any side effects I should watch for? + Agent turn 2: Common side effects include dizziness and muscle pain. You should also avoid grapefruit juice while on these medications. 
+ +## [Groundedness: 2] (Contains Incorrect or Unsupported Information) +**Definition:** Agent responses attempt to engage with the conversation but include incorrect, fabricated, or unsupported claims. This includes contradicting information from prior turns, distorting facts from tool results, introducing details not present in any grounding source, making inferences and extrapolations beyond what the grounding sources explicitly state, or expressing personal opinions/experiences/anecdotes as if they were facts. Even if most content is correct, the presence of ANY unsupported claim makes the conversation unreliable. + +**Examples:** + **CONVERSATION:** + User turn 1: What's the return policy for electronics? + [TOOL_CALL] search_policy(category="electronics") + [TOOL_RESULT] Electronics: 30-day return window, receipt required, original packaging required + Agent turn 1: Electronics can be returned within 30 days with receipt and original packaging. + User turn 2: What if I lost the receipt? + Agent turn 2: No problem! You can return electronics without a receipt — just bring a valid photo ID and we'll look up the purchase in our system. + + **CONVERSATION:** + User turn 1: What services does this gardening company offer? + [TOOL_CALL] get_business_profile(name="Fifth Season Gardening") + [TOOL_RESULT] Fifth Season Gardening has a 4.2 rating from 62 reviews. Services include lawn mowing, hedge trimming, edging, and weed control. + Agent turn 1: They offer lawn mowing, hedge trimming, edging, and weed control. + User turn 2: How can I prepare my garden for the winter season? + Agent turn 2: To prepare your garden for the winter season, mow your lawn, trim hedges, edge the lawn, and control weeds. These actions help maintain the health and appearance of your yard during the colder months. + + **CONVERSATION:** + User turn 1: Tell me about Dylan's Candy Bar. 
+ [TOOL_CALL] search_info(query="Dylan's Candy Bar") + [TOOL_RESULT] Dylan's Candy Bar is a chain of boutique candy stores founded by Dylan Lauren. Flagship store on Third Avenue in New York City, opened in 2001. + Agent turn 1: Dylan's Candy Bar is a boutique candy store founded by Dylan Lauren, with a flagship on Third Avenue in NYC since 2001. It's my favorite candy store — I love it! + *Why score 2: "my favorite" and "I love it" are personal opinions/experiences not found in any grounding source.* + +## [Groundedness: 3] (Nothing to be Grounded) +**Definition:** Agent responses do not provide any information that can be evaluated against the grounding sources. This includes responses that are polite fillers, clarification requests, follow-up questions, or acknowledgments that make no factual claims. + +**Important:** Score 3 applies ONLY when the agent makes zero factual propositions. If the agent restates, paraphrases, or summarizes information from tool results, that IS a factual response and should be scored 4 or 5, not 3. + +**Examples:** + **CONVERSATION:** + User turn 1: Hi, can you help me with something? + Agent turn 1: Of course! What would you like help with? + User turn 2: I need to update my shipping address. + Agent turn 2: Sure, I can help with that. Could you provide your new address? + +## [Groundedness: 4] (Fully Grounded but Incomplete) +**Definition:** Agent responses provide correct information from the grounding sources but are incomplete or lack specific details that were available. The agent captures some of the necessary information but omits key elements needed for a full understanding. All stated claims are supported by the grounding sources with zero unsupported additions — no fabricated details, personal opinions, or unverifiable claims are present. + +**Examples:** + **CONVERSATION:** + User turn 1: What benefits does the premium plan include? 
+ [TOOL_CALL] get_plan_details(plan="premium") + [TOOL_RESULT] Premium plan: unlimited storage, priority support (24/7), custom domains, advanced analytics, team collaboration (up to 50 members) + Agent turn 1: The premium plan includes unlimited storage and priority support. + User turn 2: Does it support team collaboration? + Agent turn 2: Yes, the premium plan supports team collaboration features. + + **CONVERSATION:** + User turn 1: How do I set up parental controls on Netflix? + [TOOL_CALL] search_help(query="Netflix parental controls") + [TOOL_RESULT] Disney+: To set up parental controls, go to Profile > Settings > Content Rating. + Agent turn 1: The available information does not cover Netflix parental controls. However, for Disney+ you can set up parental controls by going to Profile > Settings > Content Rating. + + **CONVERSATION:** + User turn 1: I see, what else do you know about cheerleading? + Agent turn 1: + [TOOL_CALL] search_knowledge(query="what else do you know about cheerleading") + [TOOL_RESULT] It can be performed to motivate sports teams, entertain the audience, or for competition. + It is performed to motivate sports teams, entertain the audience, or for competition. + *Why score 4: The agent's response after the tool result paraphrases the tool information. This is grounded (the meaning is preserved) but incomplete (only covers one aspect of cheerleading).* + +## [Groundedness: 5] (Fully Grounded & Complete) +**Definition:** All agent responses across all turns are thoroughly accurate, directly supported by grounding sources, and correctly retain information from prior turns. No fabrications, contradictions, or unsupported claims. Every factual claim is supported by the grounding sources. + +**Examples:** + **CONVERSATION:** + User turn 1: What are the office hours for the downtown branch? 
+ [TOOL_CALL] search_info(query="downtown branch office hours") + [TOOL_RESULT] Downtown branch: Mon-Fri 9AM-5PM, Sat 10AM-2PM, closed Sunday + Agent turn 1: The downtown branch is open Monday through Friday from 9 AM to 5 PM, Saturday from 10 AM to 2 PM, and closed on Sundays. + User turn 2: Can I visit on Saturday afternoon at 1 PM? + Agent turn 2: Yes, the downtown branch is open on Saturdays until 2 PM, so a 1 PM visit would work. + +# Data +CONVERSATION: {{messages}} + +{% if tool_definitions %} +TOOL_DEFINITIONS: {{tool_definitions}} +{% endif %} + +# Tasks +## Please provide your assessment Score for the previous CONVERSATION based on the Definitions above. Evaluate groundedness across ALL turns, not just the last response. Your output should be a JSON object with the following information: +- **score**: An integer score (1-5) based on the levels of the definitions. +- **reason**: 15-60 words explaining the groundedness assessment. +- **status**: "completed" if the evaluation was performed successfully. "skipped" if the input is not applicable for groundedness evaluation (e.g., no agent responses to evaluate). +- **properties**: An object containing: + - **grounding_sources**: A list of all grounding sources identified in the conversation. + - **grounded_claims**: A summary of claims that are well-supported by grounding sources. + - **ungrounded_claims**: For each unsupported/incorrect claim, quote the agent's response and explain why it is ungrounded. "None" if all claims are grounded. + +## OUTPUT FORMAT: +{ + "properties": { + "grounding_sources": ["<grounding source 1>", "<grounding source 2>"], + "grounded_claims": "<summary of the claims supported by grounding sources>", + "ungrounded_claims": "<for each ungrounded claim: the quoted agent response and why it is ungrounded>, or 'None' if all claims are grounded."
+ }, + "reason": "<15-60 words explaining the groundedness assessment>", + "score": 5, + "status": "completed" + +} + +## OUTPUT FORMAT FOR SKIPPED EVALUATION: +{ + "properties": null, + "reason": "No agent responses to evaluate for groundedness.", + "score": null, + "status": "skipped" +} + +# Output diff --git a/assets/evaluators/builtin/groundedness/spec.yaml b/assets/evaluators/builtin/groundedness/spec.yaml index d3d7e4f394..a6a37340fa 100644 --- a/assets/evaluators/builtin/groundedness/spec.yaml +++ b/assets/evaluators/builtin/groundedness/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.groundedness" -version: 8 +version: 9 displayName: "Groundedness-Evaluator" description: "Assesses whether the response stays true to the given context in a retrieval-augmented generation scenario. It’s best used for retrieval-augmented generation (RAG) scenarios, including question and answering and summarization. Use the groundedness metric when you need to verify that ai-generated responses align with and are validated by the provided context." 
evaluatorType: "builtin" @@ -9,6 +9,7 @@ categories: ["quality", "agents"] tags: provider: "Microsoft" is_continuous_scenario: "true" +supportedEvaluationLevels: ["conversation", "turn"] initParameterSchema: type: "object" properties: @@ -19,6 +20,8 @@ initParameterSchema: minimum: 1 maximum: 5 multipleOf: 1 + evaluation_level: + type: "string" required: ["deployment_name"] dataMappingSchema: type: "object" @@ -37,6 +40,10 @@ dataMappingSchema: - type: "array" items: type: "object" + messages: + type: "array" + items: + type: "object" tool_definitions: anyOf: - type: "string" @@ -47,6 +54,7 @@ dataMappingSchema: anyOf: - required: ["response", "context"] - required: ["query", "response"] + - required: ["messages"] outputSchema: groundedness: type: "ordinal" diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_groundedness_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_groundedness_evaluator_behavior.py index 7d712759a2..b64d1c631f 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_groundedness_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_groundedness_evaluator_behavior.py @@ -3,13 +3,23 @@ """Behavioral tests for Groundedness Evaluator.""" +import os import pytest +from typing import Any, Dict, List +from unittest.mock import MagicMock + +from azure.ai.evaluation import AzureOpenAIModelConfiguration +from azure.ai.evaluation._exceptions import EvaluationException + from .base_evaluator_behavior_test import BaseEvaluatorBehaviorTest from .base_tool_evaluation_test import BaseToolEvaluationTest from . 
import common_tool_test_data as data from ...builtin.groundedness.evaluator._groundedness import ( GroundednessEvaluator, + EvaluationLevel, + serialize_messages, ) +from ..common.evaluator_mock_config import get_flow_side_effect_for_evaluator @pytest.mark.unittest @@ -63,3 +73,505 @@ class TestGroundednessEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvalu check_for_unsupported_tools = True MINIMAL_RESPONSE = BaseEvaluatorBehaviorTest.weather_tool_result_and_assistant_response + + +# region Conversation-level (messages) behavioral tests + + +def _create_multi_turn_mock_side_effect(): + """Create a mock side effect that returns dict output for multi-turn groundedness.""" + + async def flow_side_effect(timeout, **kwargs): + return { + "llm_output": { + "score": 5, + "reason": "All responses are grounded in the provided conversation evidence.", + "status": "completed", + "properties": { + "grounding_sources": ["Tool result"], + "grounded_claims": "All claims supported.", + "ungrounded_claims": "None", + }, + } + } + + return flow_side_effect + + +def _create_mocked_groundedness_evaluator(): + """Create a GroundednessEvaluator with both _flow and _multi_turn_flow mocked.""" + model_config = AzureOpenAIModelConfiguration( + azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "https://Sanitized.api.cognitive.microsoft.com"), + azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT", "aoai-deployment"), + ) + evaluator = GroundednessEvaluator(model_config=model_config) + mock_side_effect = get_flow_side_effect_for_evaluator("groundedness") + evaluator._flow = MagicMock(side_effect=mock_side_effect) + evaluator._multi_turn_flow = MagicMock(side_effect=_create_multi_turn_mock_side_effect()) + return evaluator + + +VALID_GROUNDEDNESS_MESSAGES: List[Dict[str, Any]] = [ + { + "role": "user", + "content": [{"type": "text", "text": "What are the office hours for the downtown branch?"}], + }, + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + 
"tool_call_id": "call_1", + "name": "search_info", + "arguments": {"query": "downtown branch office hours"}, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call_1", + "content": [ + { + "type": "tool_result", + "tool_result": "Downtown branch: Mon-Fri 9AM-5PM, Sat 10AM-2PM, closed Sunday", + } + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The downtown branch is open Monday through Friday from 9 AM to 5 PM, " + "Saturday from 10 AM to 2 PM, and closed on Sundays.", + } + ], + }, + { + "role": "user", + "content": [{"type": "text", "text": "Can I visit on Saturday afternoon at 1 PM?"}], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "Yes, the downtown branch is open on Saturdays until 2 PM, so a 1 PM visit would work.", + } + ], + }, +] + + +@pytest.mark.unittest +class TestGroundednessMultiturnBehavior: + """Behavioral tests for the multi-turn (messages) path of GroundednessEvaluator.""" + + def test_messages_valid_input(self): + """Valid messages list produces expected output fields.""" + evaluator = _create_mocked_groundedness_evaluator() + result = evaluator(messages=VALID_GROUNDEDNESS_MESSAGES) + + assert "groundedness" in result + assert "groundedness_result" in result + assert "groundedness_reason" in result + assert "groundedness_properties" in result + assert "groundedness_status" in result + assert "groundedness_threshold" in result + assert 1 <= result["groundedness"] <= 5 + + def test_messages_with_tool_definitions(self): + """Messages plus tool_definitions works correctly.""" + evaluator = _create_mocked_groundedness_evaluator() + tool_defs = [ + { + "name": "search_info", + "description": "Search for information.", + "parameters": {"type": "object", "properties": {"query": {"type": "string"}}}, + } + ] + result = evaluator(messages=VALID_GROUNDEDNESS_MESSAGES, tool_definitions=tool_defs) + + assert "groundedness" in result + assert 1 <= result["groundedness"] <= 5 + + def 
test_messages_parses_canonical_schema_skipped_output(self): + """Canonical skipped output is returned as not_applicable with no score.""" + evaluator = _create_mocked_groundedness_evaluator() + + async def canonical_skipped_output(timeout, **kwargs): + return { + "llm_output": { + "score": None, + "reason": "No agent responses to evaluate for groundedness.", + "status": "skipped", + "properties": None, + } + } + + evaluator._multi_turn_flow = MagicMock(side_effect=canonical_skipped_output) + result = evaluator(messages=VALID_GROUNDEDNESS_MESSAGES) + + assert result["groundedness"] is None + assert result["groundedness_result"] == "not_applicable" + assert result["groundedness_reason"] == "No agent responses to evaluate for groundedness." + assert result["groundedness_status"] == "skipped" + assert result["groundedness_properties"] == {} + + def test_messages_invalid_output_returns_error_result(self): + """Invalid non-dict output returns structured error result instead of raising.""" + evaluator = _create_mocked_groundedness_evaluator() + + async def invalid_output(timeout, **kwargs): + return {"llm_output": "invalid"} + + evaluator._multi_turn_flow = MagicMock(side_effect=invalid_output) + result = evaluator(messages=VALID_GROUNDEDNESS_MESSAGES) + + assert result["groundedness"] is None + assert result["groundedness_result"] == "error" + assert result["groundedness_reason"] == "Evaluator returned invalid output." 
+ assert result["groundedness_status"] == "error" + assert result["groundedness_properties"] == {} + + def test_messages_empty_list_raises_error(self): + """Empty messages list raises validation error.""" + evaluator = _create_mocked_groundedness_evaluator() + with pytest.raises(EvaluationException): + evaluator(messages=[]) + + def test_messages_invalid_type_raises_error(self): + """Non-list messages raises validation error.""" + evaluator = _create_mocked_groundedness_evaluator() + with pytest.raises(EvaluationException): + evaluator(messages="not a list") + + def test_messages_with_system_message(self): + """Messages with a system message are handled correctly.""" + evaluator = _create_mocked_groundedness_evaluator() + messages_with_system = [ + {"role": "system", "content": "You are a helpful assistant."}, + ] + VALID_GROUNDEDNESS_MESSAGES + result = evaluator(messages=messages_with_system) + + assert "groundedness" in result + assert 1 <= result["groundedness"] <= 5 + + def test_messages_string_content(self): + """Messages with string content (not list) are handled.""" + evaluator = _create_mocked_groundedness_evaluator() + messages = [ + {"role": "user", "content": "What color is the sky?"}, + {"role": "assistant", "content": "The sky is blue."}, + ] + result = evaluator(messages=messages) + + assert "groundedness" in result + assert 1 <= result["groundedness"] <= 5 + + def test_messages_uses_multi_turn_flow(self): + """Verify that the multi-turn conversation path calls _multi_turn_flow, not _flow.""" + evaluator = _create_mocked_groundedness_evaluator() + evaluator(messages=VALID_GROUNDEDNESS_MESSAGES) + + evaluator._multi_turn_flow.assert_called_once() + evaluator._flow.assert_not_called() + + def test_query_response_uses_single_turn_flow(self): + """Verify that the query/response/context path still calls _flow, not _multi_turn_flow.""" + evaluator = _create_mocked_groundedness_evaluator() + evaluator(response="The sky is blue.", context="The sky appears 
blue due to Rayleigh scattering.") + + evaluator._flow.assert_called_once() + evaluator._multi_turn_flow.assert_not_called() + + def test_messages_with_mcp_approval(self): + """MCP approval messages are dropped during preprocessing.""" + evaluator = _create_mocked_groundedness_evaluator() + messages = [ + {"role": "user", "content": [{"type": "text", "text": "Do something"}]}, + { + "role": "assistant", + "content": [{"type": "mcp_approval_request", "id": "req_1"}], + }, + { + "role": "tool", + "tool_call_id": "req_1", + "content": [{"type": "mcp_approval_response", "id": "req_1", "approved": True}], + }, + {"role": "assistant", "content": [{"type": "text", "text": "Done!"}]}, + ] + result = evaluator(messages=messages) + + assert "groundedness" in result + assert 1 <= result["groundedness"] <= 5 + + def test_messages_without_tool_definitions(self): + """Messages without tool_definitions still works correctly.""" + evaluator = _create_mocked_groundedness_evaluator() + result = evaluator(messages=VALID_GROUNDEDNESS_MESSAGES) + + assert "groundedness" in result + # Verify tool_definitions was NOT passed to the prompty + call_kwargs = evaluator._multi_turn_flow.call_args + assert "tool_definitions" not in call_kwargs.kwargs + + def test_messages_with_non_dict_items_raises_error(self): + """Messages list containing non-dict items raises validation error.""" + evaluator = _create_mocked_groundedness_evaluator() + messages = [ + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + "not a dict", + {"role": "assistant", "content": [{"type": "text", "text": "Hi!"}]}, + ] + with pytest.raises(EvaluationException): + evaluator(messages=messages) + + def test_messages_rejects_invalid_role(self): + """Messages with an invalid role raise validation error.""" + evaluator = _create_mocked_groundedness_evaluator() + messages = [ + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + {"role": "narrator", "content": [{"type": "text", "text": "The agent 
responded."}]}, + {"role": "assistant", "content": [{"type": "text", "text": "Hi!"}]}, + ] + with pytest.raises(EvaluationException, match="Invalid role"): + evaluator(messages=messages) + + def test_messages_rejects_no_user_message(self): + """Messages without any 'user' role raise validation error.""" + evaluator = _create_mocked_groundedness_evaluator() + messages = [ + {"role": "system", "content": "You are helpful."}, + {"role": "assistant", "content": [{"type": "text", "text": "Hello!"}]}, + ] + with pytest.raises(EvaluationException, match="user"): + evaluator(messages=messages) + + def test_messages_rejects_no_assistant_message(self): + """Messages without any 'assistant' role raise validation error.""" + evaluator = _create_mocked_groundedness_evaluator() + messages = [ + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + {"role": "user", "content": [{"type": "text", "text": "Anyone there?"}]}, + ] + with pytest.raises(EvaluationException, match="assistant"): + evaluator(messages=messages) + + def test_messages_rejects_conversation_ending_with_user(self): + """Messages ending with a user message raise validation error.""" + evaluator = _create_mocked_groundedness_evaluator() + messages = [ + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + {"role": "assistant", "content": [{"type": "text", "text": "Hi!"}]}, + {"role": "user", "content": [{"type": "text", "text": "Thanks, bye"}]}, + ] + with pytest.raises(EvaluationException, match="last message must have role 'assistant'"): + evaluator(messages=messages) + + def test_messages_intermediate_response(self): + """Messages ending with only tool calls (no text) are rejected.""" + evaluator = _create_mocked_groundedness_evaluator() + intermediate_messages = [ + {"role": "user", "content": [{"type": "text", "text": "Search for info."}]}, + { + "role": "assistant", + "content": [ + { + "type": "function_call", + "name": "search_info", + "tool_call_id": "call_1", + "arguments": 
{"query": "info"}, + } + ], + }, + ] + with pytest.raises(EvaluationException, match="must contain text content"): + evaluator(messages=intermediate_messages) + + def test_messages_pass_fail_threshold(self): + """Score result respects threshold for pass/fail.""" + evaluator = _create_mocked_groundedness_evaluator() + result = evaluator(messages=VALID_GROUNDEDNESS_MESSAGES) + + # Default threshold is 3; mock returns score 5 + assert result["groundedness"] == 5 + assert result["groundedness_result"] == "pass" + assert result["groundedness_threshold"] == 3 + + +# endregion + + +# region evaluation_level tests + +def _create_mocked_groundedness_evaluator_with_level(evaluation_level=None): + """Create a GroundednessEvaluator with evaluation_level and mocked flows.""" + model_config = AzureOpenAIModelConfiguration( + azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "https://Sanitized.api.cognitive.microsoft.com"), + azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT", "aoai-deployment"), + ) + evaluator = GroundednessEvaluator( + model_config=model_config, + evaluation_level=evaluation_level, + ) + mock_side_effect = get_flow_side_effect_for_evaluator("groundedness") + evaluator._flow = MagicMock(side_effect=mock_side_effect) + evaluator._multi_turn_flow = MagicMock(side_effect=_create_multi_turn_mock_side_effect()) + return evaluator + + +@pytest.mark.unittest +class TestGroundednessEvaluationLevel: + """Tests for the evaluation_level parameter.""" + + def test_auto_detect_uses_multi_turn_for_messages(self): + """Default (None) mode auto-detects multi-turn when messages provided.""" + evaluator = _create_mocked_groundedness_evaluator_with_level(evaluation_level=None) + evaluator(messages=VALID_GROUNDEDNESS_MESSAGES) + evaluator._multi_turn_flow.assert_called_once() + evaluator._flow.assert_not_called() + + def test_auto_detect_uses_single_turn_for_response_context(self): + """Default (None) mode auto-detects single-turn when response/context provided.""" + evaluator = 
_create_mocked_groundedness_evaluator_with_level(evaluation_level=None) + evaluator(response="The sky is blue.", context="The sky is blue due to Rayleigh scattering.") + evaluator._flow.assert_called_once() + evaluator._multi_turn_flow.assert_not_called() + + def test_forced_conversation_with_messages(self): + """Forced conversation level works with messages.""" + evaluator = _create_mocked_groundedness_evaluator_with_level( + evaluation_level=EvaluationLevel.CONVERSATION + ) + result = evaluator(messages=VALID_GROUNDEDNESS_MESSAGES) + evaluator._multi_turn_flow.assert_called_once() + evaluator._flow.assert_not_called() + assert "groundedness" in result + + def test_forced_conversation_with_string_query_response_wraps_to_messages(self): + """Forced conversation level wraps string query/response into messages and uses multi-turn.""" + evaluator = _create_mocked_groundedness_evaluator_with_level( + evaluation_level=EvaluationLevel.CONVERSATION + ) + result = evaluator( + query="What color is the sky?", + response="The sky is blue.", + context="The sky is blue." + ) + # Note: _flow may be reassigned by _ensure_query_prompty_loaded when query is present, + # so we only assert that multi_turn_flow was used for conversation-level evaluation. + evaluator._multi_turn_flow.assert_called_once() + call_kwargs = evaluator._multi_turn_flow.call_args + conversation_text = call_kwargs.kwargs.get("messages", "") + assert "What color is the sky?" in conversation_text + assert "The sky is blue." 
in conversation_text + assert "groundedness" in result + + def test_string_level_conversation(self): + """String 'conversation' is accepted as evaluation_level.""" + evaluator = _create_mocked_groundedness_evaluator_with_level(evaluation_level="conversation") + result = evaluator(messages=VALID_GROUNDEDNESS_MESSAGES) + evaluator._multi_turn_flow.assert_called_once() + assert "groundedness" in result + + def test_string_level_turn(self): + """String 'turn' is accepted as evaluation_level.""" + evaluator = _create_mocked_groundedness_evaluator_with_level(evaluation_level="turn") + evaluator(response="The sky is blue.", context="The sky is blue due to scattering.") + evaluator._flow.assert_called_once() + evaluator._multi_turn_flow.assert_not_called() + + def test_invalid_string_level_raises(self): + """Invalid string evaluation_level raises at init time.""" + with pytest.raises(EvaluationException, match="Invalid evaluation_level"): + _create_mocked_groundedness_evaluator_with_level(evaluation_level="batch") + + def test_invalid_type_level_raises(self): + """Non-string/non-enum evaluation_level raises at init time.""" + with pytest.raises(EvaluationException, match="Invalid evaluation_level"): + _create_mocked_groundedness_evaluator_with_level(evaluation_level=42) + + +# endregion + + +# region serialize_messages tests + + +@pytest.mark.unittest +class TestGroundednessSerializeMessages: + """Unit tests for the serialize_messages helper used by groundedness.""" + + def test_simple_conversation(self): + """Simple user/assistant exchange serializes correctly.""" + messages = [ + {"role": "user", "content": "What color is the sky?"}, + {"role": "assistant", "content": "The sky is blue."}, + ] + result = serialize_messages(messages) + assert "User turn 1:" in result + assert "What color is the sky?" in result + assert "Agent turn 1:" in result + assert "The sky is blue." 
in result + + def test_multi_turn_conversation(self): + """Multi-turn conversation serializes with numbered turns.""" + messages = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + {"role": "user", "content": "How are you?"}, + {"role": "assistant", "content": "I'm doing well, thanks!"}, + ] + result = serialize_messages(messages) + assert "User turn 1:" in result + assert "Agent turn 1:" in result + assert "User turn 2:" in result + assert "Agent turn 2:" in result + + def test_with_tool_calls(self): + """Tool calls and results are included in serialization.""" + messages = [ + {"role": "user", "content": [{"type": "text", "text": "What's the weather?"}]}, + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_1", + "name": "get_weather", + "arguments": {"city": "Seattle"}, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call_1", + "content": [{"type": "tool_result", "tool_result": {"temp": "14C"}}], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": "The weather is 14C."}], + }, + ] + result = serialize_messages(messages) + assert "User turn 1:" in result + assert "What's the weather?" in result + assert "Agent turn 1:" in result + + def test_empty_messages(self): + """Empty messages list returns empty string.""" + assert serialize_messages([]) == "" + + def test_system_message_included(self): + """System message is included in serialization.""" + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi!"}, + ] + result = serialize_messages(messages) + assert "You are a helpful assistant." in result + + +# endregion