From f706a3b2921b387627d096c75c2d1de935b7724c Mon Sep 17 00:00:00 2001
From: salma-elshafey <selshafey@microsoft.com>
Date: Tue, 21 Apr 2026 13:12:08 +0200
Subject: [PATCH 1/5] Update CSAT evaluator output schema

---
 .../evaluator/_customer_satisfaction.py       | 116 ++++++++++--------
 .../evaluator/customer_satisfaction.prompty   |  42 ++++---
 .../customer_satisfaction_multi_turn.prompty  |  43 ++++---
 ...ustomer_satisfaction_evaluator_behavior.py |  46 ++++++-
 4 files changed, 164 insertions(+), 83 deletions(-)

diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
index f9e920715c..60d48e9c59 100644
--- a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
+++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
@@ -1,6 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-import math
 import os
 import logging
 from typing import Dict, Union, List, Optional, Tuple
@@ -1052,14 +1051,16 @@ def __call__(  # pylint: disable=docstring-missing-param
 
     def _not_applicable_result(
         self, error_message: str, threshold: Union[int, float]
-    ) -> Dict[str, Union[str, float, Dict]]:
+    ) -> Dict[str, Any]:
         """Return a result indicating that the evaluation is not applicable."""
         return {
-            self._result_key: threshold,
-            f"{self._result_key}_result": "pass",
+            self._result_key: None,
+            f"{self._result_key}_score": None,
+            f"{self._result_key}_result": "skipped",
             f"{self._result_key}_threshold": threshold,
             f"{self._result_key}_reason": f"Not applicable: {error_message}",
-            f"{self._result_key}_dimensions": {},
+            f"{self._result_key}_status": "skipped",
+            f"{self._result_key}_properties": {},
             f"{self._result_key}_prompt_tokens": 0,
             f"{self._result_key}_completion_tokens": 0,
             f"{self._result_key}_total_tokens": 0,
@@ -1183,7 +1184,44 @@ async def _do_eval_multi_turn(self, eval_input: Dict) -> Dict[str, Union[float,
         prompty_output_dict = await self._multi_turn_flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_kwargs)
         return self._parse_prompty_output(prompty_output_dict)
 
-    def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[float, str]]:
+    def _build_result(
+        self,
+        score: Optional[int],
+        result: str,
+        reason: str,
+        status: str,
+        properties: Dict,
+        prompty_output_dict: Optional[Dict] = None,
+    ) -> Dict[str, Union[str, int, float, Dict, None]]:
+        """Build a standardized result dictionary.
+
+        :param score: The evaluation score (1, 0, or None).
+        :param result: The result label ("pass", "fail", "skipped", or "error").
+        :param reason: The reasoning or explanation string.
+        :param status: The evaluation status ("completed", "skipped", or "error").
+        :param properties: The properties dictionary.
+        :param prompty_output_dict: Optional raw prompty output for extracting token metadata.
+        :return: The standardized result dictionary.
+        """
+        p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
+        return {
+            self._result_key: score,
+            f"{self._result_key}_score": score,
+            f"{self._result_key}_result": result,
+            f"{self._result_key}_threshold": self._threshold,
+            f"{self._result_key}_reason": reason,
+            f"{self._result_key}_status": status,
+            f"{self._result_key}_properties": properties,
+            f"{self._result_key}_prompt_tokens": p.get("input_token_count", 0),
+            f"{self._result_key}_completion_tokens": p.get("output_token_count", 0),
+            f"{self._result_key}_total_tokens": p.get("total_token_count", 0),
+            f"{self._result_key}_finish_reason": p.get("finish_reason", ""),
+            f"{self._result_key}_model": p.get("model_id", ""),
+            f"{self._result_key}_sample_input": p.get("sample_input", ""),
+            f"{self._result_key}_sample_output": p.get("sample_output", ""),
+        }
+
+    def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Any]:
         """Parse the prompty output into a standardized result dictionary.
 
         Shared between single-turn and multi-turn evaluation paths.
@@ -1195,47 +1233,29 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[fl
         """
         llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
-        if isinstance(llm_output, dict):
-            score_value = llm_output.get("score", 3)
-            if isinstance(score_value, str):
-                score = float(score_value) if score_value.replace(".", "").isdigit() else 3.0
+        if not isinstance(llm_output, dict):
+            score = None
+            result = "error"
+            reason = "Evaluator returned invalid output."
+            status = "error"
+            properties = {}
+        else:
+            status = llm_output.get("status", "completed")
+            reason = llm_output.get("reason", "")
+            properties = llm_output.get("properties") or {}
+
+            if status == "skipped":
+                score = None
+                result = "skipped"
             else:
-                score = float(score_value) if score_value else 3.0
-
-            # Clamp score to 1-5 range
-            score = max(1.0, min(5.0, score))
-
-            success_result = "pass" if score >= self._threshold else "fail"
-            reason = llm_output.get("explanation", "")
-            dimensions = llm_output.get("dimensions", {})
-
-            return {
-                self._result_key: score,
-                f"{self._result_key}_result": success_result,
-                f"{self._result_key}_threshold": self._threshold,
-                f"{self._result_key}_reason": reason,
-                f"{self._result_key}_dimensions": dimensions,
-                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
-                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
-                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
-                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
-                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
-                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
-                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
-            }
-
-        # Check if base returned nan (invalid output case)
-        if isinstance(llm_output, float) and math.isnan(llm_output):
-            raise EvaluationException(
-                message="Evaluator returned invalid output.",
-                blame=ErrorBlame.SYSTEM_ERROR,
-                category=ErrorCategory.FAILED_EXECUTION,
-                target=ExtendedErrorTarget.CUSTOMER_SATISFACTION_EVALUATOR,
-            )
-
-        raise EvaluationException(
-            message="Evaluator returned invalid output.",
-            blame=ErrorBlame.SYSTEM_ERROR,
-            category=ErrorCategory.FAILED_EXECUTION,
-            target=ExtendedErrorTarget.CUSTOMER_SATISFACTION_EVALUATOR,
+                score = llm_output.get("score", self._threshold)
+                result = "pass" if score >= self._threshold else "fail"
+
+        return self._build_result(
+            score=score,
+            result=result,
+            reason=reason,
+            status=status,
+            properties=properties,
+            prompty_output_dict=prompty_output_dict,
         )
diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty
index 749686c756..6efef89e08 100644
--- a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty
+++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty
@@ -88,15 +88,22 @@ OUTPUT FORMAT
 =============
 Output a JSON object with these keys:
 {
-  "score": <1, 2, 3, 4, or 5>,
-  "explanation": "<30-60 words explaining the predicted satisfaction level>",
-  "dimensions": {
+  "reason": "<30-60 words explaining the predicted satisfaction level>",
+  "properties": {
     "helpfulness": "<1-2 sentences assessing helpfulness>",
     "completeness": "<1-2 sentences assessing completeness>",
     "tone": "<1-2 sentences assessing tone>"
-  }
+  },
+  "score": <1, 2, 3, 4, or 5, or null when skipped>,
+  "status": "completed"
 }
 
+**Status: Skipped**
+If the USER QUERY or AGENT RESPONSE is empty or not provided, return status "skipped" immediately without scoring:
+```json
+{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
+```
+
 EXAMPLES
 ========
 
@@ -108,13 +115,14 @@ AGENT RESPONSE: "I've successfully cancelled your order #12345. Your payment of
 
 OUTPUT:
 {
-  "score": 5,
-  "explanation": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.",
-  "dimensions": {
+  "reason": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.",
+  "properties": {
     "helpfulness": "Directly addressed the cancellation request and completed it immediately.",
     "completeness": "Provided all relevant details: confirmation, refund amount, timeline, and email notification.",
     "tone": "Professional and helpful, ended with an offer for further assistance."
-  }
+  },
+  "score": 5,
+  "status": "completed"
 }
 
 ### Score 3 - Neutral
@@ -125,13 +133,14 @@ AGENT RESPONSE: "Our return policy allows returns within 30 days."
 
 OUTPUT:
 {
-  "score": 3,
-  "explanation": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.",
-  "dimensions": {
+  "reason": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.",
+  "properties": {
     "helpfulness": "Answered the basic question but minimal detail provided.",
     "completeness": "Missing key information about conditions, exceptions, and return process.",
     "tone": "Neutral tone, neither particularly warm nor cold."
-  }
+  },
+  "score": 3,
+  "status": "completed"
 }
 
 ### Score 1 - Very Dissatisfied
@@ -142,13 +151,14 @@ AGENT RESPONSE: "According to our records, the package was delivered. Have you c
 
 OUTPUT:
 {
-  "score": 1,
-  "explanation": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.",
-  "dimensions": {
+  "reason": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.",
+  "properties": {
     "helpfulness": "Failed to offer any meaningful assistance or resolution options.",
     "completeness": "Did not offer to investigate, file a claim, or provide alternatives.",
     "tone": "Dismissive tone that implies the customer is wrong or didn't look properly."
-  }
+  },
+  "score": 1,
+  "status": "completed"
 }
 
 # Output
diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty
index caefb753d3..b0ae426916 100644
--- a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty
+++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty
@@ -101,15 +101,23 @@ OUTPUT FORMAT
 =============
 Output a JSON object with these keys:
 {
-  "score": <1, 2, 3, 4, or 5>,
-  "explanation": "<30-60 words explaining the predicted satisfaction level for the full session>",
-  "dimensions": {
+  
+  "reason": "<30-60 words explaining the predicted satisfaction level for the full session>",
+  "properties": {
     "helpfulness": "<1-2 sentences assessing helpfulness across all turns>",
     "completeness": "<1-2 sentences assessing completeness of all requests>",
     "tone": "<1-2 sentences assessing tone throughout the session>"
-  }
+  },
+  "score": <1, 2, 3, 4, or 5, or null when skipped>,
+  "status": "completed"
 }
 
+**Status: Skipped**
+If the CONVERSATION is empty or not provided, or does not end with an agent response, return status "skipped" immediately without scoring:
+```json
+{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
+```
+
 SCORING EXAMPLES
 ================
 
@@ -123,13 +131,14 @@ Agent turn 2: Order #12346 shipped yesterday via FedEx. Tracking number: FX12345
 
 EXPECTED OUTPUT:
 {
-  "score": 5,
-  "explanation": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.",
-  "dimensions": {
+  "reason": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.",
+  "properties": {
     "helpfulness": "Both the cancellation and shipping inquiry were addressed immediately and completely.",
     "completeness": "All details provided: refund timeline, confirmation email, tracking number, and delivery estimate.",
     "tone": "Professional and proactive throughout, offering further assistance after the first request."
-  }
+  },
+  "score": 5,
+  "status": "completed"
 }
 
 ### Score 3 - Neutral (Partial resolution across turns)
@@ -142,13 +151,14 @@ Agent turn 2: I see. Account locks usually expire after 30 minutes. Please try a
 
 EXPECTED OUTPUT:
 {
-  "score": 3,
-  "explanation": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.",
-  "dimensions": {
+  "reason": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.",
+  "properties": {
     "helpfulness": "Initial suggestion was generic and unhelpful. Second response addressed the specific error but offered only passive waiting.",
     "completeness": "Missing proactive options like unlocking the account, password reset, or escalation to support.",
     "tone": "Polite but somewhat dismissive of the customer's frustration with a 'try again later' response."
-  }
+  },
+  "score": 3,
+  "status": "completed"
 }
 
 ### Score 1 - Very Dissatisfied (Failed session)
@@ -161,13 +171,14 @@ Agent turn 2: Unfortunately, since the package shows as delivered, we cannot pro
 
 EXPECTED OUTPUT:
 {
-  "score": 1,
-  "explanation": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.",
-  "dimensions": {
+  "reason": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.",
+  "properties": {
     "helpfulness": "Failed to offer any meaningful assistance. Deflected responsibility to the customer.",
     "completeness": "Did not offer investigation, replacement, refund, or escalation options.",
     "tone": "Dismissive in both turns, implying the customer is wrong and offering no empathy for the situation."
-  }
+  },
+  "score": 1,
+  "status": "completed"
 }
 
 KEY PRINCIPLES
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
index 16c2e7beb8..674322e655 100644
--- a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
+++ b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
@@ -30,17 +30,19 @@ class TestCustomerSatisfactionEvaluatorBehavior(BaseEvaluatorBehaviorTest):
 
     MINIMAL_RESPONSE = BaseEvaluatorBehaviorTest.MINIMAL_RESPONSE
 
-    _additional_expected_field_suffixes = ["dimensions"]
+    _additional_expected_field_suffixes = ["status", "properties"]
 
     @property
     def expected_result_fields(self):
         """Get the expected result fields for customer satisfaction evaluator."""
         return [
             f"{self._result_prefix}",
+            f"{self._result_prefix}_score",
             f"{self._result_prefix}_reason",
             f"{self._result_prefix}_threshold",
             f"{self._result_prefix}_result",
-            f"{self._result_prefix}_dimensions",
+            f"{self._result_prefix}_status",
+            f"{self._result_prefix}_properties",
             f"{self._result_prefix}_prompt_tokens",
             f"{self._result_prefix}_completion_tokens",
             f"{self._result_prefix}_total_tokens",
@@ -50,6 +52,12 @@ def expected_result_fields(self):
             f"{self._result_prefix}_sample_output",
         ]
 
+    def assert_not_applicable(self, result_data: Dict[str, Any]):
+        """Customer satisfaction treats intermediate responses as skipped results."""
+        assert result_data["score"] is None
+        assert result_data["label"] == "skipped"
+        assert "Not applicable" in result_data.get("reason", "")
+
 
 def _create_mocked_evaluator():
     """Create a CustomerSatisfactionEvaluator with both _flow and _multi_turn_flow mocked."""
@@ -98,11 +106,16 @@ def test_messages_valid_input(self):
         result = evaluator(messages=VALID_MESSAGES)
 
         assert "customer_satisfaction" in result
+        assert "customer_satisfaction_score" in result
         assert "customer_satisfaction_result" in result
         assert "customer_satisfaction_reason" in result
-        assert "customer_satisfaction_dimensions" in result
+        assert "customer_satisfaction_status" in result
+        assert "customer_satisfaction_properties" in result
         assert "customer_satisfaction_threshold" in result
         assert 1.0 <= result["customer_satisfaction"] <= 5.0
+        assert result["customer_satisfaction_score"] == result["customer_satisfaction"]
+        assert result["customer_satisfaction_status"] == "completed"
+        assert isinstance(result["customer_satisfaction_properties"], dict)
 
     def test_messages_empty_list_raises_error(self):
         """Empty messages list raises validation error."""
@@ -194,6 +207,33 @@ def test_messages_string_content(self):
         conversation_text = call_kwargs.kwargs.get("conversation", "")
         assert "I need help with my order." in conversation_text
 
+    def test_query_response_intermediate_returns_skipped_schema(self):
+        """Intermediate single-turn responses return the standardized skipped schema."""
+        evaluator = _create_mocked_evaluator()
+        result = evaluator(
+            query="Cancel my order.",
+            response=[
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "function_call",
+                            "name": "cancel_order",
+                            "tool_call_id": "call_1",
+                            "arguments": {"order_id": "12345"},
+                        }
+                    ],
+                }
+            ],
+        )
+
+        assert result["customer_satisfaction"] is None
+        assert result["customer_satisfaction_score"] is None
+        assert result["customer_satisfaction_result"] == "skipped"
+        assert result["customer_satisfaction_status"] == "skipped"
+        assert result["customer_satisfaction_reason"].startswith("Not applicable:")
+        assert result["customer_satisfaction_properties"] == {}
+
     def test_messages_uses_multi_turn_flow(self):
         """Verify that the session path calls _multi_turn_flow, not _flow."""
         evaluator = _create_mocked_evaluator()

From f656ac965d0954048acbdacd82cf1b798bf25e10 Mon Sep 17 00:00:00 2001
From: salma-elshafey <selshafey@microsoft.com>
Date: Tue, 21 Apr 2026 13:21:18 +0200
Subject: [PATCH 2/5] run docstyle

---
 .../test_customer_satisfaction_evaluator_behavior.py            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
index 674322e655..0c4ad09e0c 100644
--- a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
+++ b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
@@ -53,7 +53,7 @@ def expected_result_fields(self):
         ]
 
     def assert_not_applicable(self, result_data: Dict[str, Any]):
-        """Customer satisfaction treats intermediate responses as skipped results."""
+        """Assert that the result is not applicable."""
         assert result_data["score"] is None
         assert result_data["label"] == "skipped"
         assert "Not applicable" in result_data.get("reason", "")

From 0b2d48713b656238b8a559c4a093aec34ee01afa Mon Sep 17 00:00:00 2001
From: salma-elshafey <selshafey@microsoft.com>
Date: Tue, 21 Apr 2026 13:23:25 +0200
Subject: [PATCH 3/5] update not applicable

---
 .../customer_satisfaction/evaluator/_customer_satisfaction.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
index 60d48e9c59..8b8a5b5b87 100644
--- a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
+++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
@@ -1051,7 +1051,7 @@ def __call__(  # pylint: disable=docstring-missing-param
 
     def _not_applicable_result(
         self, error_message: str, threshold: Union[int, float]
-    ) -> Dict[str, Any]:
+    ) -> Dict[str, Union[str, float, Dict]]:
         """Return a result indicating that the evaluation is not applicable."""
         return {
             self._result_key: None,

From b31333a8dd21b36506de9f155a00ad06b117a142 Mon Sep 17 00:00:00 2001
From: salma-elshafey <selshafey@microsoft.com>
Date: Wed, 22 Apr 2026 12:59:00 +0200
Subject: [PATCH 4/5] Fix not-applicable CSAT result label

Set not-applicable output result to not_applicable while keeping status as skipped, and update behavior tests to match.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../evaluator/_customer_satisfaction.py                   | 4 ++--
 .../test_customer_satisfaction_evaluator_behavior.py      | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
index 8b8a5b5b87..1ac351b934 100644
--- a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
+++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
@@ -1056,7 +1056,7 @@ def _not_applicable_result(
         return {
             self._result_key: None,
             f"{self._result_key}_score": None,
-            f"{self._result_key}_result": "skipped",
+            f"{self._result_key}_result": "not_applicable",
             f"{self._result_key}_threshold": threshold,
             f"{self._result_key}_reason": f"Not applicable: {error_message}",
             f"{self._result_key}_status": "skipped",
@@ -1196,7 +1196,7 @@ def _build_result(
         """Build a standardized result dictionary.
 
         :param score: The evaluation score (1, 0, or None).
-        :param result: The result label ("pass", "fail", "skipped", or "error").
+        :param result: The result label ("pass", "fail", "not_applicable", or "error").
         :param reason: The reasoning or explanation string.
         :param status: The evaluation status ("completed", "skipped", or "error").
         :param properties: The properties dictionary.
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
index 0c4ad09e0c..297aa18c62 100644
--- a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
+++ b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
@@ -55,7 +55,7 @@ def expected_result_fields(self):
     def assert_not_applicable(self, result_data: Dict[str, Any]):
         """Assert that the result is not applicable."""
         assert result_data["score"] is None
-        assert result_data["label"] == "skipped"
+        assert result_data["label"] == "not_applicable"
         assert "Not applicable" in result_data.get("reason", "")
 
 
@@ -207,8 +207,8 @@ def test_messages_string_content(self):
         conversation_text = call_kwargs.kwargs.get("conversation", "")
         assert "I need help with my order." in conversation_text
 
-    def test_query_response_intermediate_returns_skipped_schema(self):
-        """Intermediate single-turn responses return the standardized skipped schema."""
+    def test_query_response_intermediate_returns_not_applicable_schema(self):
+        """Intermediate single-turn responses return the standardized not-applicable schema."""
         evaluator = _create_mocked_evaluator()
         result = evaluator(
             query="Cancel my order.",
@@ -229,7 +229,7 @@ def test_query_response_intermediate_returns_skipped_schema(self):
 
         assert result["customer_satisfaction"] is None
         assert result["customer_satisfaction_score"] is None
-        assert result["customer_satisfaction_result"] == "skipped"
+        assert result["customer_satisfaction_result"] == "not_applicable"
         assert result["customer_satisfaction_status"] == "skipped"
         assert result["customer_satisfaction_reason"].startswith("Not applicable:")
         assert result["customer_satisfaction_properties"] == {}

From 095628c642867a286873cda127421919c1bfbed9 Mon Sep 17 00:00:00 2001
From: salma-elshafey <selshafey@microsoft.com>
Date: Thu, 30 Apr 2026 13:45:52 +0300
Subject: [PATCH 5/5] update csat output schema

---
 .../evaluator/_customer_satisfaction.py       | 48 +++++++++----------
 ...ustomer_satisfaction_evaluator_behavior.py | 17 ++++---
 2 files changed, 31 insertions(+), 34 deletions(-)

diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
index 8d668c6585..447932e4fd 100644
--- a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
+++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
@@ -1052,23 +1052,18 @@ def __call__(  # pylint: disable=docstring-missing-param
     def _not_applicable_result(
         self, error_message: str, threshold: Union[int, float]
     ) -> Dict[str, Union[str, float, Dict]]:
-        """Return a result indicating that the evaluation is not applicable."""
-        return {
-            self._result_key: None,
-            f"{self._result_key}_score": None,
-            f"{self._result_key}_result": "not_applicable",
-            f"{self._result_key}_threshold": threshold,
-            f"{self._result_key}_reason": f"Not applicable: {error_message}",
-            f"{self._result_key}_status": "skipped",
-            f"{self._result_key}_properties": {},
-            f"{self._result_key}_prompt_tokens": 0,
-            f"{self._result_key}_completion_tokens": 0,
-            f"{self._result_key}_total_tokens": 0,
-            f"{self._result_key}_finish_reason": "",
-            f"{self._result_key}_model": "",
-            f"{self._result_key}_sample_input": "",
-            f"{self._result_key}_sample_output": "",
-        }
+        """Return a result indicating that the evaluation is not applicable (skipped).
+
+        Not-applicable results have no score since the evaluator cannot make a judgment
+        (e.g., intermediate responses that are not final agent responses).
+        """
+        return self._build_result(
+            score=None,
+            result="not_applicable",
+            reason=f"Not applicable: {error_message}",
+            status="skipped",
+            properties={},
+        )
 
     def _should_use_conversation_level(self, eval_input: Dict) -> bool:
         """Determine whether to use conversation-level evaluation.
@@ -1204,21 +1199,24 @@ def _build_result(
         :return: The standardized result dictionary.
         """
         p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
+        metadata = {
+            "prompt_tokens": p.get("input_token_count", 0),
+            "completion_tokens": p.get("output_token_count", 0),
+            "total_tokens": p.get("total_token_count", 0),
+            "finish_reason": p.get("finish_reason", ""),
+            "model": p.get("model_id", ""),
+            "sample_input": p.get("sample_input", ""),
+            "sample_output": p.get("sample_output", ""),
+        }
         return {
             self._result_key: score,
             f"{self._result_key}_score": score,
             f"{self._result_key}_result": result,
+            f"{self._result_key}_passed": result == "pass" if result in ["pass", "fail"] else None,
             f"{self._result_key}_threshold": self._threshold,
             f"{self._result_key}_reason": reason,
             f"{self._result_key}_status": status,
-            f"{self._result_key}_properties": properties,
-            f"{self._result_key}_prompt_tokens": p.get("input_token_count", 0),
-            f"{self._result_key}_completion_tokens": p.get("output_token_count", 0),
-            f"{self._result_key}_total_tokens": p.get("total_token_count", 0),
-            f"{self._result_key}_finish_reason": p.get("finish_reason", ""),
-            f"{self._result_key}_model": p.get("model_id", ""),
-            f"{self._result_key}_sample_input": p.get("sample_input", ""),
-            f"{self._result_key}_sample_output": p.get("sample_output", ""),
+            f"{self._result_key}_properties": {**properties, **metadata},
         }
 
     def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Any]:
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
index 297aa18c62..79f923b7bd 100644
--- a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
+++ b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
@@ -34,22 +34,16 @@ class TestCustomerSatisfactionEvaluatorBehavior(BaseEvaluatorBehaviorTest):
 
     @property
     def expected_result_fields(self):
-        """Get the expected result fields for customer satisfaction evaluator."""
+        """Get expected result fields — metadata now lives inside properties, not as top-level keys."""
         return [
             f"{self._result_prefix}",
             f"{self._result_prefix}_score",
             f"{self._result_prefix}_reason",
             f"{self._result_prefix}_threshold",
             f"{self._result_prefix}_result",
+            f"{self._result_prefix}_passed",
             f"{self._result_prefix}_status",
             f"{self._result_prefix}_properties",
-            f"{self._result_prefix}_prompt_tokens",
-            f"{self._result_prefix}_completion_tokens",
-            f"{self._result_prefix}_total_tokens",
-            f"{self._result_prefix}_finish_reason",
-            f"{self._result_prefix}_model",
-            f"{self._result_prefix}_sample_input",
-            f"{self._result_prefix}_sample_output",
         ]
 
     def assert_not_applicable(self, result_data: Dict[str, Any]):
@@ -232,7 +226,12 @@ def test_query_response_intermediate_returns_not_applicable_schema(self):
         assert result["customer_satisfaction_result"] == "not_applicable"
         assert result["customer_satisfaction_status"] == "skipped"
         assert result["customer_satisfaction_reason"].startswith("Not applicable:")
-        assert result["customer_satisfaction_properties"] == {}
+        # properties contains default metadata (all zeros/empty) when no prompty output
+        props = result["customer_satisfaction_properties"]
+        assert isinstance(props, dict)
+        assert props["prompt_tokens"] == 0
+        assert props["completion_tokens"] == 0
+        assert props["total_tokens"] == 0
 
     def test_messages_uses_multi_turn_flow(self):
         """Verify that the session path calls _multi_turn_flow, not _flow."""