Update CSAT evaluator output schema (#4945)

salma-elshafey · Copilot · web-flow · commit cf221e5d03d6 · 2026-05-12T00:53:31.000+03:00
* Update CSAT evaluator output schema

* run docstyle

* update not applicable

* Fix not-applicable CSAT result label

Set not-applicable output result to not_applicable while keeping status as skipped, and update behavior tests to match.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* update csat output schema

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import math
 import os
 import logging
 from typing import Dict, Union, List, Optional, Tuple
@@ -1057,21 +1056,18 @@ def __call__(  # pylint: disable=docstring-missing-param
     def _not_applicable_result(
         self, error_message: str, threshold: Union[int, float]
     ) -> Dict[str, Union[str, float, Dict]]:
-        """Return a result indicating that the evaluation is not applicable."""
-        return {
-            self._result_key: threshold,
-            f"{self._result_key}_result": "pass",
-            f"{self._result_key}_threshold": threshold,
-            f"{self._result_key}_reason": f"Not applicable: {error_message}",
-            f"{self._result_key}_dimensions": {},
-            f"{self._result_key}_prompt_tokens": 0,
-            f"{self._result_key}_completion_tokens": 0,
-            f"{self._result_key}_total_tokens": 0,
-            f"{self._result_key}_finish_reason": "",
-            f"{self._result_key}_model": "",
-            f"{self._result_key}_sample_input": "",
-            f"{self._result_key}_sample_output": "",
-        }
+        """Return a result indicating that the evaluation is not applicable (skipped).
+
+        Not-applicable results have no score since the evaluator cannot make a judgment
+        (e.g., intermediate responses that are not final agent responses).
+        """
+        return self._build_result(
+            score=None,
+            result="not_applicable",
+            reason=f"Not applicable: {error_message}",
+            status="skipped",
+            properties={},
+        )
 
     def _should_use_conversation_level(self, eval_input: Dict) -> bool:
         """Determine whether to use conversation-level evaluation.
@@ -1187,7 +1183,47 @@ async def _do_eval_multi_turn(self, eval_input: Dict) -> Dict[str, Union[float,
         prompty_output_dict = await self._multi_turn_flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_kwargs)
         return self._parse_prompty_output(prompty_output_dict)
 
-    def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[float, str]]:
+    def _build_result(
+        self,
+        score: Optional[int],
+        result: str,
+        reason: str,
+        status: str,
+        properties: Dict,
+        prompty_output_dict: Optional[Dict] = None,
+    ) -> Dict[str, Union[str, int, float, Dict, None]]:
+        """Build a standardized result dictionary.
+
+        :param score: The evaluation score (1, 0, or None).
+        :param result: The result label ("pass", "fail", "not_applicable", or "error").
+        :param reason: The reasoning or explanation string.
+        :param status: The evaluation status ("completed", "skipped", or "error").
+        :param properties: The properties dictionary.
+        :param prompty_output_dict: Optional raw prompty output for extracting token metadata.
+        :return: The standardized result dictionary.
+        """
+        p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
+        metadata = {
+            "prompt_tokens": p.get("input_token_count", 0),
+            "completion_tokens": p.get("output_token_count", 0),
+            "total_tokens": p.get("total_token_count", 0),
+            "finish_reason": p.get("finish_reason", ""),
+            "model": p.get("model_id", ""),
+            "sample_input": p.get("sample_input", ""),
+            "sample_output": p.get("sample_output", ""),
+        }
+        return {
+            self._result_key: score,
+            f"{self._result_key}_score": score,
+            f"{self._result_key}_result": result,
+            f"{self._result_key}_passed": result == "pass" if result in ["pass", "fail"] else None,
+            f"{self._result_key}_threshold": self._threshold,
+            f"{self._result_key}_reason": reason,
+            f"{self._result_key}_status": status,
+            f"{self._result_key}_properties": {**properties, **metadata},
+        }
+
+    def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Any]:
         """Parse the prompty output into a standardized result dictionary.
 
         Shared between single-turn and multi-turn evaluation paths.
@@ -1199,47 +1235,29 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[fl
         """
         llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
-        if isinstance(llm_output, dict):
-            score_value = llm_output.get("score", 3)
-            if isinstance(score_value, str):
-                score = float(score_value) if score_value.replace(".", "").isdigit() else 3.0
+        if not isinstance(llm_output, dict):
+            score = None
+            result = "error"
+            reason = "Evaluator returned invalid output."
+            status = "error"
+            properties = {}
+        else:
+            status = llm_output.get("status", "completed")
+            reason = llm_output.get("reason", "")
+            properties = llm_output.get("properties") or {}
+
+            if status == "skipped":
+                score = None
+                result = "skipped"
             else:
-                score = float(score_value) if score_value else 3.0
-
-            # Clamp score to 1-5 range
-            score = max(1.0, min(5.0, score))
-
-            success_result = "pass" if score >= self._threshold else "fail"
-            reason = llm_output.get("explanation", "")
-            dimensions = llm_output.get("dimensions", {})
-
-            return {
-                self._result_key: score,
-                f"{self._result_key}_result": success_result,
-                f"{self._result_key}_threshold": self._threshold,
-                f"{self._result_key}_reason": reason,
-                f"{self._result_key}_dimensions": dimensions,
-                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
-                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
-                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
-                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
-                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
-                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
-                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
-            }
-
-        # Check if base returned nan (invalid output case)
-        if isinstance(llm_output, float) and math.isnan(llm_output):
-            raise EvaluationException(
-                message="Evaluator returned invalid output.",
-                blame=ErrorBlame.SYSTEM_ERROR,
-                category=ErrorCategory.FAILED_EXECUTION,
-                target=ExtendedErrorTarget.CUSTOMER_SATISFACTION_EVALUATOR,
-            )
-
-        raise EvaluationException(
-            message="Evaluator returned invalid output.",
-            blame=ErrorBlame.SYSTEM_ERROR,
-            category=ErrorCategory.FAILED_EXECUTION,
-            target=ExtendedErrorTarget.CUSTOMER_SATISFACTION_EVALUATOR,
+                score = llm_output.get("score", self._threshold)
+                result = "pass" if score >= self._threshold else "fail"
+
+        return self._build_result(
+            score=score,
+            result=result,
+            reason=reason,
+            status=status,
+            properties=properties,
+            prompty_output_dict=prompty_output_dict,
         )
diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty
@@ -89,15 +89,22 @@ OUTPUT FORMAT
 =============
 Output a JSON object with these keys:
 {
-  "score": <1, 2, 3, 4, or 5>,
-  "explanation": "<30-60 words explaining the predicted satisfaction level>",
-  "dimensions": {
+  "reason": "<30-60 words explaining the predicted satisfaction level>",
+  "properties": {
     "helpfulness": "<1-2 sentences assessing helpfulness>",
     "completeness": "<1-2 sentences assessing completeness>",
     "tone": "<1-2 sentences assessing tone>"
-  }
+  },
+  "score": <1, 2, 3, 4, or 5, or null when skipped>,
+  "status": "completed"
 }
 
+**Status: Skipped**
+If the USER QUERY or AGENT RESPONSE is empty or not provided, return status "skipped" immediately without scoring:
+```json
+{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
+```
+
 EXAMPLES
 ========
 
@@ -109,13 +116,14 @@ AGENT RESPONSE: "I've successfully cancelled your order #12345. Your payment of
 
 OUTPUT:
 {
-  "score": 5,
-  "explanation": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.",
-  "dimensions": {
+  "reason": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.",
+  "properties": {
     "helpfulness": "Directly addressed the cancellation request and completed it immediately.",
     "completeness": "Provided all relevant details: confirmation, refund amount, timeline, and email notification.",
     "tone": "Professional and helpful, ended with an offer for further assistance."
-  }
+  },
+  "score": 5,
+  "status": "completed"
 }
 
 ### Score 3 - Neutral
@@ -126,13 +134,14 @@ AGENT RESPONSE: "Our return policy allows returns within 30 days."
 
 OUTPUT:
 {
-  "score": 3,
-  "explanation": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.",
-  "dimensions": {
+  "reason": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.",
+  "properties": {
     "helpfulness": "Answered the basic question but minimal detail provided.",
     "completeness": "Missing key information about conditions, exceptions, and return process.",
     "tone": "Neutral tone, neither particularly warm nor cold."
-  }
+  },
+  "score": 3,
+  "status": "completed"
 }
 
 ### Score 1 - Very Dissatisfied
@@ -143,13 +152,14 @@ AGENT RESPONSE: "According to our records, the package was delivered. Have you c
 
 OUTPUT:
 {
-  "score": 1,
-  "explanation": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.",
-  "dimensions": {
+  "reason": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.",
+  "properties": {
     "helpfulness": "Failed to offer any meaningful assistance or resolution options.",
     "completeness": "Did not offer to investigate, file a claim, or provide alternatives.",
     "tone": "Dismissive tone that implies the customer is wrong or didn't look properly."
-  }
+  },
+  "score": 1,
+  "status": "completed"
 }
 
 # Output
diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty
@@ -103,15 +103,23 @@ OUTPUT FORMAT
 =============
 Output a JSON object with these keys:
 {
-  "score": <1, 2, 3, 4, or 5>,
-  "explanation": "<30-60 words explaining the predicted satisfaction level for the full session>",
-  "dimensions": {
+  
+  "reason": "<30-60 words explaining the predicted satisfaction level for the full session>",
+  "properties": {
     "helpfulness": "<1-2 sentences assessing helpfulness across all turns>",
     "completeness": "<1-2 sentences assessing completeness of all requests>",
     "tone": "<1-2 sentences assessing tone throughout the session>"
-  }
+  },
+  "score": <1, 2, 3, 4, or 5, or null when skipped>,
+  "status": "completed"
 }
 
+**Status: Skipped**
+If the CONVERSATION is empty or not provided, or doesn't end with the agent response, return status "skipped" immediately without scoring:
+```json
+{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
+```
+
 SCORING EXAMPLES
 ================
 
@@ -125,13 +133,14 @@ Agent turn 2: Order #12346 shipped yesterday via FedEx. Tracking number: FX12345
 
 EXPECTED OUTPUT:
 {
-  "score": 5,
-  "explanation": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.",
-  "dimensions": {
+  "reason": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.",
+  "properties": {
     "helpfulness": "Both the cancellation and shipping inquiry were addressed immediately and completely.",
     "completeness": "All details provided: refund timeline, confirmation email, tracking number, and delivery estimate.",
     "tone": "Professional and proactive throughout, offering further assistance after the first request."
-  }
+  },
+  "score": 5,
+  "status": "completed"
 }
 
 ### Score 3 - Neutral (Partial resolution across turns)
@@ -144,13 +153,14 @@ Agent turn 2: I see. Account locks usually expire after 30 minutes. Please try a
 
 EXPECTED OUTPUT:
 {
-  "score": 3,
-  "explanation": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.",
-  "dimensions": {
+  "reason": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.",
+  "properties": {
     "helpfulness": "Initial suggestion was generic and unhelpful. Second response addressed the specific error but offered only passive waiting.",
     "completeness": "Missing proactive options like unlocking the account, password reset, or escalation to support.",
     "tone": "Polite but somewhat dismissive of the customer's frustration with a 'try again later' response."
-  }
+  },
+  "score": 3,
+  "status": "completed"
 }
 
 ### Score 1 - Very Dissatisfied (Failed session)
@@ -163,13 +173,14 @@ Agent turn 2: Unfortunately, since the package shows as delivered, we cannot pro
 
 EXPECTED OUTPUT:
 {
-  "score": 1,
-  "explanation": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.",
-  "dimensions": {
+  "reason": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.",
+  "properties": {
     "helpfulness": "Failed to offer any meaningful assistance. Deflected responsibility to the customer.",
     "completeness": "Did not offer investigation, replacement, refund, or escalation options.",
     "tone": "Dismissive in both turns, implying the customer is wrong and offering no empathy for the situation."
-  }
+  },
+  "score": 1,
+  "status": "completed"
 }
 
 KEY PRINCIPLES
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py