Azure
diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Lines changed: 2 additions & 0 deletions
diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json
Lines changed: 2 additions & 2 deletions
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py
Lines changed: 2 additions & 12 deletions
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_bleu/_bleu.py
Lines changed: 6 additions & 1 deletion
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty
Lines changed: 8 additions & 7 deletions
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py
Lines changed: 35 additions & 27 deletions
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
Lines changed: 67 additions & 83 deletions
@@ -12,6 +12,8 @@
 
 ### Breaking Changes
 
+- Updated `EVALUATOR_NAME_METRICS_MAPPINGS` so that `document_retrieval` and `rouge_score` each report a single primary metric (`document_retrieval` and `rouge`, respectively). The previous sub-metrics (e.g., `ndcg@3`, `fidelity`, `rouge_precision`, `rouge_recall`) are now represented in each evaluator's `*_properties` payload.
+
 ### Bugs Fixed
 
 - `_TaskNavigationEfficiencyEvaluator` now accepts JSON-stringified `response` and `ground_truth` inputs (e.g., from data pipelines that serialize list/tuple inputs to strings). String inputs are parsed as JSON; on parse failure the original value is preserved so downstream validation surfaces the error as before.
 
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_0748353c8d"
-}
+  "Tag": "python/evaluation/azure-ai-evaluation_f30e4bdde3"
+}
@@ -103,24 +103,14 @@ class _EvaluatorMetricMapping:
     EVALUATOR_NAME_METRICS_MAPPINGS = {
         "bleu_score": ["bleu"],
         "coherence": ["coherence"],
-        "document_retrieval": [
-            "xdcg@3",
-            "ndcg@3",
-            "fidelity",
-            "top1_relevance",
-            "top3_max_relevance",
-            "holes",
-            "holes_ratio",
-            "total_retrieved_documents",
-            "total_ground_truth_documents",
-        ],
+        "document_retrieval": ["document_retrieval"],
         "f1_score": ["f1_score"],
         "fluency": ["fluency"],
         "gleu_score": ["gleu"],
         "meteor_score": ["meteor"],
         "relevance": ["relevance"],
         "response_completeness": ["response_completeness"],
-        "rouge_score": ["rouge_f1_score", "rouge_precision", "rouge_recall"],
+        "rouge_score": ["rouge"],
         "groundedness_pro": ["groundedness_pro"],
         "similarity": ["similarity"],
         "intent_resolution": ["intent_resolution"],
 
@@ -6,9 +6,9 @@
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._common.utils import nltk_tokenize
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
-from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 
 
 class BleuScoreEvaluator(EvaluatorBase):
@@ -87,9 +87,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
             binary_result = score <= self._threshold
 
         return {
+            "bleu": score,
             "bleu_score": score,
+            "bleu_passed": binary_result,
             "bleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+            "bleu_reason": None,
+            "bleu_status": "completed",
             "bleu_threshold": self._threshold,
+            "bleu_properties": None,
         }
 
     @overload  # type: ignore
 
@@ -10,7 +10,7 @@ model:
     presence_penalty: 0
     frequency_penalty: 0
     response_format:
-      type: text
+      type: json_object
 
 inputs:
   query:
@@ -89,11 +89,12 @@ RESPONSE: {{response}}
 
 
 # Tasks
-## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
-- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
-- **Explanation**: a very short explanation of why you think the input Data should get that Score.
-- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
+## Please provide your assessment for the previous RESPONSE in relation to the QUERY based on the Definitions above.
+Your output must be a valid JSON object with exactly these keys:
+  - reason: a string explaining your thought process and assessment. Start with "Let's think step by step:". When status is "skipped", explain why evaluation was skipped.
+  - score: an integer value between 1 and 5 based on the level definitions above. The score you give MUST be an integer score (i.e., 1, 2...) based on the levels of the definitions. Set to null when status is "skipped".
+  - status: a string indicating the evaluation status. Must be one of:
+      - "completed": evaluation was performed normally.
+      - "skipped": evaluation was not performed because the QUERY or RESPONSE is empty or not provided. When skipped, set score to null.
 
-
-## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
 # Output
@@ -619,35 +619,43 @@ async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], Aggrega
         for eval_input in eval_input_list:
             result = await self._do_eval(eval_input)
             # logic to determine threshold pass/fail
+            # if it wasn't computed in _do_eval
             try:
-                for key in list(result.keys()):
-                    if key.endswith("_score") and "rouge" not in key:
-                        score_value = result[key]
-                        base_key = key[:-6]  # Remove "_score" suffix
-                        result_key = f"{base_key}_result"
-                        threshold_key = f"{base_key}_threshold"
-                        threshold_value = (
-                            self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
-                        )
-                        if not isinstance(threshold_value, (int, float)):
-                            raise EvaluationException(
-                                "Threshold value must be a number.",
-                                internal_message=str(threshold_value),
-                                target=ErrorTarget.EVALUATE,
-                                category=ErrorCategory.INVALID_VALUE,
+                keys = list(result.keys())
+                contains_result_key = any(key.endswith("_result") for key in keys)
+                contains_threshold_key = any(key.endswith("_threshold") for key in keys)
+                if not contains_result_key or not contains_threshold_key:
+                    for key in keys:
+                        if key.endswith("_score"):
+                            score_value = result[key]
+                            base_key = key[:-6]  # Remove "_score" suffix
+                            result_key = f"{base_key}_result"
+                            threshold_key = f"{base_key}_threshold"
+                            threshold_value = (
+                                self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
                             )
-
-                        result[threshold_key] = threshold_value
-                        if self._higher_is_better:
-                            if float(score_value) >= threshold_value:
-                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
-                            else:
-                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
-                        else:
-                            if float(score_value) <= threshold_value:
-                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
-                            else:
-                                result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+                            if not isinstance(threshold_value, (int, float)):
+                                raise EvaluationException(
+                                    "Threshold value must be a number.",
+                                    internal_message=str(threshold_value),
+                                    target=ErrorTarget.EVALUATE,
+                                    category=ErrorCategory.INVALID_VALUE,
+                                )
+
+                            if not contains_threshold_key:
+                                result[threshold_key] = threshold_value
+
+                            if not contains_result_key:
+                                if self._higher_is_better:
+                                    if float(score_value) >= threshold_value:
+                                        result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                                    else:
+                                        result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+                                else:
+                                    if float(score_value) <= threshold_value:
+                                        result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                                    else:
+                                        result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
             except Exception as e:
                 logger.warning(f"Error calculating binary result: {e}")
             per_turn_results.append(result)
 
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import json
 import math
 import re
 import os
@@ -201,7 +202,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
 
         # Check for intermediate response
         if _is_intermediate_response(eval_input.get("response")):
-            return self._not_applicable_result(
+            return self._return_not_applicable_result(
                 "Intermediate response. Please provide the agent's final response for evaluation.",
                 self._threshold,
             )
@@ -216,59 +217,83 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
         score = math.nan
+        reason = ""
+        llm_properties = {}
+
         if prompty_output_dict:
             llm_output = prompty_output_dict.get("llm_output", "")
-            input_token_count = prompty_output_dict.get("input_token_count", 0)
-            output_token_count = prompty_output_dict.get("output_token_count", 0)
-            total_token_count = prompty_output_dict.get("total_token_count", 0)
-            finish_reason = prompty_output_dict.get("finish_reason", "")
-            model_id = prompty_output_dict.get("model_id", "")
-            sample_input = prompty_output_dict.get("sample_input", "")
-            sample_output = prompty_output_dict.get("sample_output", "")
-            # Parse out score and reason from evaluators known to possess them.
-            if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
-                score, reason = parse_quality_evaluator_reason_score(llm_output)
-                binary_result = self._get_binary_result(score)
-                return {
-                    self._result_key: float(score),
-                    f"gpt_{self._result_key}": float(score),
-                    f"{self._result_key}_reason": reason,
-                    f"{self._result_key}_result": binary_result,
-                    f"{self._result_key}_threshold": self._threshold,
-                    f"{self._result_key}_prompt_tokens": input_token_count,
-                    f"{self._result_key}_completion_tokens": output_token_count,
-                    f"{self._result_key}_total_tokens": total_token_count,
-                    f"{self._result_key}_finish_reason": finish_reason,
-                    f"{self._result_key}_model": model_id,
-                    f"{self._result_key}_sample_input": sample_input,
-                    f"{self._result_key}_sample_output": sample_output,
-                }
-            match = re.search(r"\d", llm_output)
-            if match:
-                score = float(match.group())
-                binary_result = self._get_binary_result(score)
+
+            # Parse JSON output from LLM
+            parsed_output = None
+            if isinstance(llm_output, dict):
+                parsed_output = llm_output
+            elif isinstance(llm_output, str):
+                try:
+                    parsed_output = json.loads(llm_output)
+                except (json.JSONDecodeError, TypeError):
+                    parsed_output = None
+
+            if parsed_output and isinstance(parsed_output, dict):
+                # Handle skipped status from LLM
+                llm_status = parsed_output.get("status", "completed")
+                if llm_status == "skipped":
+                    skip_reason = parsed_output.get("reason", "")
+                    return self._return_not_applicable_result(skip_reason, self._threshold)
+
+                score = parsed_output.get("score", math.nan)
+                reason = parsed_output.get("reason", "")
+                llm_properties = parsed_output.get("properties", {}) or {}
+            else:
+                # Fallback: try to parse legacy XML format or extract digit
+                if isinstance(llm_output, str) and self._result_key in PROMPT_BASED_REASON_EVALUATORS:
+                    score, reason = parse_quality_evaluator_reason_score(llm_output)
+                elif isinstance(llm_output, str):
+                    match = re.search(r"\d", llm_output)
+                    if match:
+                        score = float(match.group())
+
+            score = float(score) if score is not None else math.nan
+            score_result = self._get_binary_result(score)
+
+            llm_properties.update(self._get_token_metadata(prompty_output_dict))
+
             return {
-                self._result_key: float(score),
-                f"gpt_{self._result_key}": float(score),
-                f"{self._result_key}_result": binary_result,
+                self._result_key: score,
+                f"{self._result_key}_score": score,
+                f"{self._result_key}_passed": score_result == "pass",
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_status": "completed",
                 f"{self._result_key}_threshold": self._threshold,
-                f"{self._result_key}_prompt_tokens": input_token_count,
-                f"{self._result_key}_completion_tokens": output_token_count,
-                f"{self._result_key}_total_tokens": total_token_count,
-                f"{self._result_key}_finish_reason": finish_reason,
-                f"{self._result_key}_model": model_id,
-                f"{self._result_key}_sample_input": sample_input,
-                f"{self._result_key}_sample_output": sample_output,
+                f"{self._result_key}_properties": llm_properties,
             }
 
-        binary_result = self._get_binary_result(score)
         raise EvaluationException(
             message="Evaluator returned invalid output.",
             blame=ErrorBlame.SYSTEM_ERROR,
             category=ErrorCategory.FAILED_EXECUTION,
             target=ErrorTarget.EVALUATE,
         )
 
+    @staticmethod
+    def _get_token_metadata(prompty_output: Dict) -> Dict:
+        """Extract token usage and model metadata from the prompty output dict.
+
+        :param prompty_output: The raw output dictionary from the prompty flow.
+        :type prompty_output: Dict
+        :return: A dictionary with token counts, finish reason, model, and sample I/O.
+        :rtype: Dict
+        """
+        return {
+            "prompt_tokens": prompty_output.get("input_token_count", 0),
+            "completion_tokens": prompty_output.get("output_token_count", 0),
+            "total_tokens": prompty_output.get("total_token_count", 0),
+            "finish_reason": prompty_output.get("finish_reason", ""),
+            "model": prompty_output.get("model_id", ""),
+            "sample_input": prompty_output.get("sample_input", ""),
+            "sample_output": prompty_output.get("sample_output", ""),
+        }
+
     @staticmethod
     def _get_built_in_tool_definition(tool_name: str):
         """Get the definition for the built-in tool."""
@@ -401,45 +426,6 @@ def _extract_needed_tool_definitions(
 
         return needed_tool_definitions
 
-    def _not_applicable_result(
-        self, error_message: str, threshold: Union[int, float], has_details: bool = False
-    ) -> Dict[str, Union[str, int, float, Dict]]:
-        """Return a result indicating that the evaluation is not applicable.
-
-        When evaluation cannot be performed (e.g., no tool calls, missing definitions),
-        this returns the threshold value as the score with a "pass" result.
-
-        :param error_message: The error message explaining why evaluation is not applicable.
-        :type error_message: str
-        :param threshold: The threshold value for the evaluator, used as the score.
-        :type threshold: Union[int, float]
-        :param has_details: Whether to include an empty details field in the result.
-        :type has_details: bool
-        :return: A dictionary containing the result of the evaluation.
-        :rtype: Dict[str, Union[str, float, Dict]]
-        """
-        # If no tool calls were made or tool call type is not supported, return threshold as score with pass result
-        result = {
-            self._result_key: threshold,
-            f"{self._result_key}_result": "pass",
-            f"{self._result_key}_threshold": threshold,
-            f"{self._result_key}_reason": f"Not applicable: {error_message}",
-            f"{self._result_key}_prompt_tokens": 0,
-            f"{self._result_key}_completion_tokens": 0,
-            f"{self._result_key}_total_tokens": 0,
-            f"{self._result_key}_finish_reason": "",
-            f"{self._result_key}_model": "",
-            f"{self._result_key}_sample_input": "",
-            f"{self._result_key}_sample_output": "",
-        }
-
-        # Add empty details field if requested
-        if has_details:
-            result[f"{self._result_key}_details"] = {}
-
-        return result
-
-    # TODO: After all evaluators output are updated, we can remove the _not_applicable_result method and replace calls to it with _return_not_applicable_result, which returns a "skipped" status instead of "pass" to avoid confusion.
     def _return_not_applicable_result(
         self, error_message: str, threshold: Union[int, float]
     ) -> Dict[str, Union[str, float, Dict, None]]:
@@ -455,10 +441,8 @@ def _return_not_applicable_result(
         return {
             f"{self._result_key}": None,
             f"{self._result_key}_score": None,
-            # TODO: Return "not_applicable" instead of "pass" once the
-            # evaluation service accepts it as a valid result value.
-            f"{self._result_key}_result": "pass",
             f"{self._result_key}_passed": None,
+            f"{self._result_key}_result": "not_applicable",
             f"{self._result_key}_reason": f"Not applicable: {error_message}",
             f"{self._result_key}_status": "skipped",
             f"{self._result_key}_threshold": threshold,