From f706a3b2921b387627d096c75c2d1de935b7724c Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 21 Apr 2026 13:12:08 +0200 Subject: [PATCH 1/5] Update CSAT evaluator output schema --- .../evaluator/_customer_satisfaction.py | 116 ++++++++++-------- .../evaluator/customer_satisfaction.prompty | 42 ++++--- .../customer_satisfaction_multi_turn.prompty | 43 ++++--- ...ustomer_satisfaction_evaluator_behavior.py | 46 ++++++- 4 files changed, 164 insertions(+), 83 deletions(-) diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py index f9e920715c..60d48e9c59 100644 --- a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py +++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py @@ -1,6 +1,5 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import math import os import logging from typing import Dict, Union, List, Optional, Tuple @@ -1052,14 +1051,16 @@ def __call__( # pylint: disable=docstring-missing-param def _not_applicable_result( self, error_message: str, threshold: Union[int, float] - ) -> Dict[str, Union[str, float, Dict]]: + ) -> Dict[str, Any]: """Return a result indicating that the evaluation is not applicable.""" return { - self._result_key: threshold, - f"{self._result_key}_result": "pass", + self._result_key: None, + f"{self._result_key}_score": None, + f"{self._result_key}_result": "skipped", f"{self._result_key}_threshold": threshold, f"{self._result_key}_reason": f"Not applicable: {error_message}", - f"{self._result_key}_dimensions": {}, + f"{self._result_key}_status": "skipped", + f"{self._result_key}_properties": {}, f"{self._result_key}_prompt_tokens": 0, f"{self._result_key}_completion_tokens": 0, f"{self._result_key}_total_tokens": 0, @@ -1183,7 +1184,44 @@ async def _do_eval_multi_turn(self, eval_input: Dict) -> Dict[str, Union[float, prompty_output_dict = await self._multi_turn_flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_kwargs) return self._parse_prompty_output(prompty_output_dict) - def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[float, str]]: + def _build_result( + self, + score: Optional[int], + result: str, + reason: str, + status: str, + properties: Dict, + prompty_output_dict: Optional[Dict] = None, + ) -> Dict[str, Union[str, int, float, Dict, None]]: + """Build a standardized result dictionary. + + :param score: The evaluation score (1, 0, or None). + :param result: The result label ("pass", "fail", "skipped", or "error"). + :param reason: The reasoning or explanation string. + :param status: The evaluation status ("completed", "skipped", or "error"). + :param properties: The properties dictionary. + :param prompty_output_dict: Optional raw prompty output for extracting token metadata. + :return: The standardized result dictionary. + """ + p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {} + return { + self._result_key: score, + f"{self._result_key}_score": score, + f"{self._result_key}_result": result, + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_reason": reason, + f"{self._result_key}_status": status, + f"{self._result_key}_properties": properties, + f"{self._result_key}_prompt_tokens": p.get("input_token_count", 0), + f"{self._result_key}_completion_tokens": p.get("output_token_count", 0), + f"{self._result_key}_total_tokens": p.get("total_token_count", 0), + f"{self._result_key}_finish_reason": p.get("finish_reason", ""), + f"{self._result_key}_model": p.get("model_id", ""), + f"{self._result_key}_sample_input": p.get("sample_input", ""), + f"{self._result_key}_sample_output": p.get("sample_output", ""), + } + + def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Any]: """Parse the prompty output into a standardized result dictionary. Shared between single-turn and multi-turn evaluation paths. @@ -1195,47 +1233,29 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[fl """ llm_output = prompty_output_dict.get("llm_output", prompty_output_dict) - if isinstance(llm_output, dict): - score_value = llm_output.get("score", 3) - if isinstance(score_value, str): - score = float(score_value) if score_value.replace(".", "").isdigit() else 3.0 + if not isinstance(llm_output, dict): + score = None + result = "error" + reason = "Evaluator returned invalid output." + status = "error" + properties = {} + else: + status = llm_output.get("status", "completed") + reason = llm_output.get("reason", "") + properties = llm_output.get("properties") or {} + + if status == "skipped": + score = None + result = "skipped" else: - score = float(score_value) if score_value else 3.0 - - # Clamp score to 1-5 range - score = max(1.0, min(5.0, score)) - - success_result = "pass" if score >= self._threshold else "fail" - reason = llm_output.get("explanation", "") - dimensions = llm_output.get("dimensions", {}) - - return { - self._result_key: score, - f"{self._result_key}_result": success_result, - f"{self._result_key}_threshold": self._threshold, - f"{self._result_key}_reason": reason, - f"{self._result_key}_dimensions": dimensions, - f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0), - f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0), - f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0), - f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""), - f"{self._result_key}_model": prompty_output_dict.get("model_id", ""), - f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""), - f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""), - } - - # Check if base returned nan (invalid output case) - if isinstance(llm_output, float) and math.isnan(llm_output): - raise EvaluationException( - message="Evaluator returned invalid output.", - blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.FAILED_EXECUTION, - target=ExtendedErrorTarget.CUSTOMER_SATISFACTION_EVALUATOR, - ) - - raise EvaluationException( - message="Evaluator returned invalid output.", - blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.FAILED_EXECUTION, - target=ExtendedErrorTarget.CUSTOMER_SATISFACTION_EVALUATOR, + score = llm_output.get("score", self._threshold) + result = "pass" if score >= self._threshold else "fail" + + return self._build_result( + score=score, + result=result, + reason=reason, + status=status, + properties=properties, + prompty_output_dict=prompty_output_dict, ) diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty index 749686c756..6efef89e08 100644 --- a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty +++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty @@ -88,15 +88,22 @@ OUTPUT FORMAT ============= Output a JSON object with these keys: { - "score": <1, 2, 3, 4, or 5>, - "explanation": "<30-60 words explaining the predicted satisfaction level>", - "dimensions": { + "reason": "<30-60 words explaining the predicted satisfaction level>", + "properties": { "helpfulness": "<1-2 sentences assessing helpfulness>", "completeness": "<1-2 sentences assessing completeness>", "tone": "<1-2 sentences assessing tone>" - } + }, + "score": <1, 2, 3, 4, or 5, or null when skipped>, + "status": "completed", } +**Status: Skipped** +If the USER QUERY or AGENT_RESPONSE is empty or not provided, return status "skipped" immediately without scoring: +```json +{"reason": "", "properties": null, "score": null, "status": "skipped"} +``` + EXAMPLES ======== @@ -108,13 +115,14 @@ AGENT RESPONSE: "I've successfully cancelled your order #12345. Your payment of OUTPUT: { - "score": 5, - "explanation": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.", - "dimensions": { + "reason": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.", + "properties": { "helpfulness": "Directly addressed the cancellation request and completed it immediately.", "completeness": "Provided all relevant details: confirmation, refund amount, timeline, and email notification.", "tone": "Professional and helpful, ended with an offer for further assistance." - } + }, + "score": 5, + "status": "completed", } ### Score 3 - Neutral @@ -125,13 +133,14 @@ AGENT RESPONSE: "Our return policy allows returns within 30 days." OUTPUT: { - "score": 3, - "explanation": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.", - "dimensions": { + "reason": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.", + "properties": { "helpfulness": "Answered the basic question but minimal detail provided.", "completeness": "Missing key information about conditions, exceptions, and return process.", "tone": "Neutral tone, neither particularly warm nor cold." - } + }, + "score": 3, + "status": "completed" } ### Score 1 - Very Dissatisfied @@ -142,13 +151,14 @@ AGENT RESPONSE: "According to our records, the package was delivered. Have you c OUTPUT: { - "score": 1, - "explanation": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.", - "dimensions": { + "reason": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.", + "properties": { "helpfulness": "Failed to offer any meaningful assistance or resolution options.", "completeness": "Did not offer to investigate, file a claim, or provide alternatives.", "tone": "Dismissive tone that implies the customer is wrong or didn't look properly." - } + }, + "score": 1, + "status": "completed" } # Output diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty index caefb753d3..b0ae426916 100644 --- a/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty +++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty @@ -101,15 +101,23 @@ OUTPUT FORMAT ============= Output a JSON object with these keys: { - "score": <1, 2, 3, 4, or 5>, - "explanation": "<30-60 words explaining the predicted satisfaction level for the full session>", - "dimensions": { + + "reason": "<30-60 words explaining the predicted satisfaction level for the full session>", + "properties": { "helpfulness": "<1-2 sentences assessing helpfulness across all turns>", "completeness": "<1-2 sentences assessing completeness of all requests>", "tone": "<1-2 sentences assessing tone throughout the session>" - } + }, + "score": <1, 2, 3, 4, or 5, or null when skipped>, + "status": "completed" } +**Status: Skipped** +If the CONVERSATION is empty or not provided, or doesn't end with the agent response, return status "skipped" immediately without scoring: +```json +{"reason": "", "properties": null, "score": null, "status": "skipped"} +``` + SCORING EXAMPLES ================ @@ -123,13 +131,14 @@ Agent turn 2: Order #12346 shipped yesterday via FedEx. Tracking number: FX12345 EXPECTED OUTPUT: { - "score": 5, - "explanation": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.", - "dimensions": { + "reason": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.", + "properties": { "helpfulness": "Both the cancellation and shipping inquiry were addressed immediately and completely.", "completeness": "All details provided: refund timeline, confirmation email, tracking number, and delivery estimate.", "tone": "Professional and proactive throughout, offering further assistance after the first request." - } + }, + "score": 5, + "status": "completed" } ### Score 3 - Neutral (Partial resolution across turns) @@ -142,13 +151,14 @@ Agent turn 2: I see. Account locks usually expire after 30 minutes. Please try a EXPECTED OUTPUT: { - "score": 3, - "explanation": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.", - "dimensions": { + "reason": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.", + "properties": { "helpfulness": "Initial suggestion was generic and unhelpful. Second response addressed the specific error but offered only passive waiting.", "completeness": "Missing proactive options like unlocking the account, password reset, or escalation to support.", "tone": "Polite but somewhat dismissive of the customer's frustration with a 'try again later' response." - } + }, + "score": 3, + "status": "completed" } ### Score 1 - Very Dissatisfied (Failed session) @@ -161,13 +171,14 @@ Agent turn 2: Unfortunately, since the package shows as delivered, we cannot pro EXPECTED OUTPUT: { - "score": 1, - "explanation": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.", - "dimensions": { + "reason": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.", + "properties": { "helpfulness": "Failed to offer any meaningful assistance. Deflected responsibility to the customer.", "completeness": "Did not offer investigation, replacement, refund, or escalation options.", "tone": "Dismissive in both turns, implying the customer is wrong and offering no empathy for the situation." - } + }, + "score": 1, + "status": "completed" } KEY PRINCIPLES diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py index 16c2e7beb8..674322e655 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py @@ -30,17 +30,19 @@ class TestCustomerSatisfactionEvaluatorBehavior(BaseEvaluatorBehaviorTest): MINIMAL_RESPONSE = BaseEvaluatorBehaviorTest.MINIMAL_RESPONSE - _additional_expected_field_suffixes = ["dimensions"] + _additional_expected_field_suffixes = ["status", "properties"] @property def expected_result_fields(self): """Get the expected result fields for customer satisfaction evaluator.""" return [ f"{self._result_prefix}", + f"{self._result_prefix}_score", f"{self._result_prefix}_reason", f"{self._result_prefix}_threshold", f"{self._result_prefix}_result", - f"{self._result_prefix}_dimensions", + f"{self._result_prefix}_status", + f"{self._result_prefix}_properties", f"{self._result_prefix}_prompt_tokens", f"{self._result_prefix}_completion_tokens", f"{self._result_prefix}_total_tokens", @@ -50,6 +52,12 @@ def expected_result_fields(self): f"{self._result_prefix}_sample_output", ] + def assert_not_applicable(self, result_data: Dict[str, Any]): + """Customer satisfaction treats intermediate responses as skipped results.""" + assert result_data["score"] is None + assert result_data["label"] == "skipped" + assert "Not applicable" in result_data.get("reason", "") + def _create_mocked_evaluator(): """Create a CustomerSatisfactionEvaluator with both _flow and _multi_turn_flow mocked.""" @@ -98,11 +106,16 @@ def test_messages_valid_input(self): result = evaluator(messages=VALID_MESSAGES) assert "customer_satisfaction" in result + assert "customer_satisfaction_score" in result assert "customer_satisfaction_result" in result assert "customer_satisfaction_reason" in result - assert "customer_satisfaction_dimensions" in result + assert "customer_satisfaction_status" in result + assert "customer_satisfaction_properties" in result assert "customer_satisfaction_threshold" in result assert 1.0 <= result["customer_satisfaction"] <= 5.0 + assert result["customer_satisfaction_score"] == result["customer_satisfaction"] + assert result["customer_satisfaction_status"] == "completed" + assert isinstance(result["customer_satisfaction_properties"], dict) def test_messages_empty_list_raises_error(self): """Empty messages list raises validation error.""" @@ -194,6 +207,33 @@ def test_messages_string_content(self): conversation_text = call_kwargs.kwargs.get("conversation", "") assert "I need help with my order." in conversation_text + def test_query_response_intermediate_returns_skipped_schema(self): + """Intermediate single-turn responses return the standardized skipped schema.""" + evaluator = _create_mocked_evaluator() + result = evaluator( + query="Cancel my order.", + response=[ + { + "role": "assistant", + "content": [ + { + "type": "function_call", + "name": "cancel_order", + "tool_call_id": "call_1", + "arguments": {"order_id": "12345"}, + } + ], + } + ], + ) + + assert result["customer_satisfaction"] is None + assert result["customer_satisfaction_score"] is None + assert result["customer_satisfaction_result"] == "skipped" + assert result["customer_satisfaction_status"] == "skipped" + assert result["customer_satisfaction_reason"].startswith("Not applicable:") + assert result["customer_satisfaction_properties"] == {} + def test_messages_uses_multi_turn_flow(self): """Verify that the session path calls _multi_turn_flow, not _flow.""" evaluator = _create_mocked_evaluator() From f656ac965d0954048acbdacd82cf1b798bf25e10 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 21 Apr 2026 13:21:18 +0200 Subject: [PATCH 2/5] run docstyle --- .../test_customer_satisfaction_evaluator_behavior.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py index 674322e655..0c4ad09e0c 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py @@ -53,7 +53,7 @@ def expected_result_fields(self): ] def assert_not_applicable(self, result_data: Dict[str, Any]): - """Customer satisfaction treats intermediate responses as skipped results.""" + """Assert that the result is not applicable.""" assert result_data["score"] is None assert result_data["label"] == "skipped" assert "Not applicable" in result_data.get("reason", "") From 0b2d48713b656238b8a559c4a093aec34ee01afa Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 21 Apr 2026 13:23:25 +0200 Subject: [PATCH 3/5] update not applicable --- .../customer_satisfaction/evaluator/_customer_satisfaction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py index 60d48e9c59..8b8a5b5b87 100644 --- a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py +++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py @@ -1051,7 +1051,7 @@ def __call__( # pylint: disable=docstring-missing-param def _not_applicable_result( self, error_message: str, threshold: Union[int, float] - ) -> Dict[str, Any]: + ) -> Dict[str, Union[str, float, Dict]]: """Return a result indicating that the evaluation is not applicable.""" return { self._result_key: None, From b31333a8dd21b36506de9f155a00ad06b117a142 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Wed, 22 Apr 2026 12:59:00 +0200 Subject: [PATCH 4/5] Fix not-applicable CSAT result label Set not-applicable output result to not_applicable while keeping status as skipped, and update behavior tests to match. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../evaluator/_customer_satisfaction.py | 4 ++-- .../test_customer_satisfaction_evaluator_behavior.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py index 8b8a5b5b87..1ac351b934 100644 --- a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py +++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py @@ -1056,7 +1056,7 @@ def _not_applicable_result( return { self._result_key: None, f"{self._result_key}_score": None, - f"{self._result_key}_result": "skipped", + f"{self._result_key}_result": "not_applicable", f"{self._result_key}_threshold": threshold, f"{self._result_key}_reason": f"Not applicable: {error_message}", f"{self._result_key}_status": "skipped", @@ -1196,7 +1196,7 @@ def _build_result( """Build a standardized result dictionary. :param score: The evaluation score (1, 0, or None). - :param result: The result label ("pass", "fail", "skipped", or "error"). + :param result: The result label ("pass", "fail", "not_applicable", or "error"). :param reason: The reasoning or explanation string. :param status: The evaluation status ("completed", "skipped", or "error"). :param properties: The properties dictionary. diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py index 0c4ad09e0c..297aa18c62 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py @@ -55,7 +55,7 @@ def expected_result_fields(self): def assert_not_applicable(self, result_data: Dict[str, Any]): """Assert that the result is not applicable.""" assert result_data["score"] is None - assert result_data["label"] == "skipped" + assert result_data["label"] == "not_applicable" assert "Not applicable" in result_data.get("reason", "") @@ -207,8 +207,8 @@ def test_messages_string_content(self): conversation_text = call_kwargs.kwargs.get("conversation", "") assert "I need help with my order." in conversation_text - def test_query_response_intermediate_returns_skipped_schema(self): - """Intermediate single-turn responses return the standardized skipped schema.""" + def test_query_response_intermediate_returns_not_applicable_schema(self): + """Intermediate single-turn responses return the standardized not-applicable schema.""" evaluator = _create_mocked_evaluator() result = evaluator( query="Cancel my order.", @@ -229,7 +229,7 @@ def test_query_response_intermediate_returns_skipped_schema(self): assert result["customer_satisfaction"] is None assert result["customer_satisfaction_score"] is None - assert result["customer_satisfaction_result"] == "skipped" + assert result["customer_satisfaction_result"] == "not_applicable" assert result["customer_satisfaction_status"] == "skipped" assert result["customer_satisfaction_reason"].startswith("Not applicable:") assert result["customer_satisfaction_properties"] == {} From 095628c642867a286873cda127421919c1bfbed9 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 30 Apr 2026 13:45:52 +0300 Subject: [PATCH 5/5] update csat output schema --- .../evaluator/_customer_satisfaction.py | 48 +++++++++---------- ...ustomer_satisfaction_evaluator_behavior.py | 17 ++++--- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py index 8d668c6585..447932e4fd 100644 --- a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py +++ b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py @@ -1052,23 +1052,18 @@ def __call__( # pylint: disable=docstring-missing-param def _not_applicable_result( self, error_message: str, threshold: Union[int, float] ) -> Dict[str, Union[str, float, Dict]]: - """Return a result indicating that the evaluation is not applicable.""" - return { - self._result_key: None, - f"{self._result_key}_score": None, - f"{self._result_key}_result": "not_applicable", - f"{self._result_key}_threshold": threshold, - f"{self._result_key}_reason": f"Not applicable: {error_message}", - f"{self._result_key}_status": "skipped", - f"{self._result_key}_properties": {}, - f"{self._result_key}_prompt_tokens": 0, - f"{self._result_key}_completion_tokens": 0, - f"{self._result_key}_total_tokens": 0, - f"{self._result_key}_finish_reason": "", - f"{self._result_key}_model": "", - f"{self._result_key}_sample_input": "", - f"{self._result_key}_sample_output": "", - } + """Return a result indicating that the evaluation is not applicable (skipped). + + Not-applicable results have no score since the evaluator cannot make a judgment + (e.g., intermediate responses that are not final agent responses). + """ + return self._build_result( + score=None, + result="not_applicable", + reason=f"Not applicable: {error_message}", + status="skipped", + properties={}, + ) def _should_use_conversation_level(self, eval_input: Dict) -> bool: """Determine whether to use conversation-level evaluation. @@ -1204,21 +1199,24 @@ def _build_result( :return: The standardized result dictionary. """ p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {} + metadata = { + "prompt_tokens": p.get("input_token_count", 0), + "completion_tokens": p.get("output_token_count", 0), + "total_tokens": p.get("total_token_count", 0), + "finish_reason": p.get("finish_reason", ""), + "model": p.get("model_id", ""), + "sample_input": p.get("sample_input", ""), + "sample_output": p.get("sample_output", ""), + } return { self._result_key: score, f"{self._result_key}_score": score, f"{self._result_key}_result": result, + f"{self._result_key}_passed": result == "pass" if result in ["pass", "fail"] else None, f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_reason": reason, f"{self._result_key}_status": status, - f"{self._result_key}_properties": properties, - f"{self._result_key}_prompt_tokens": p.get("input_token_count", 0), - f"{self._result_key}_completion_tokens": p.get("output_token_count", 0), - f"{self._result_key}_total_tokens": p.get("total_token_count", 0), - f"{self._result_key}_finish_reason": p.get("finish_reason", ""), - f"{self._result_key}_model": p.get("model_id", ""), - f"{self._result_key}_sample_input": p.get("sample_input", ""), - f"{self._result_key}_sample_output": p.get("sample_output", ""), + f"{self._result_key}_properties": {**properties, **metadata}, } def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Any]: diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py index 297aa18c62..79f923b7bd 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py @@ -34,22 +34,16 @@ class TestCustomerSatisfactionEvaluatorBehavior(BaseEvaluatorBehaviorTest): @property def expected_result_fields(self): - """Get the expected result fields for customer satisfaction evaluator.""" + """Get expected result fields — metadata now lives inside properties, not as top-level keys.""" return [ f"{self._result_prefix}", f"{self._result_prefix}_score", f"{self._result_prefix}_reason", f"{self._result_prefix}_threshold", f"{self._result_prefix}_result", + f"{self._result_prefix}_passed", f"{self._result_prefix}_status", f"{self._result_prefix}_properties", - f"{self._result_prefix}_prompt_tokens", - f"{self._result_prefix}_completion_tokens", - f"{self._result_prefix}_total_tokens", - f"{self._result_prefix}_finish_reason", - f"{self._result_prefix}_model", - f"{self._result_prefix}_sample_input", - f"{self._result_prefix}_sample_output", ] def assert_not_applicable(self, result_data: Dict[str, Any]): @@ -232,7 +226,12 @@ def test_query_response_intermediate_returns_not_applicable_schema(self): assert result["customer_satisfaction_result"] == "not_applicable" assert result["customer_satisfaction_status"] == "skipped" assert result["customer_satisfaction_reason"].startswith("Not applicable:") - assert result["customer_satisfaction_properties"] == {} + # properties contains default metadata (all zeros/empty) when no prompty output + props = result["customer_satisfaction_properties"] + assert isinstance(props, dict) + assert props["prompt_tokens"] == 0 + assert props["completion_tokens"] == 0 + assert props["total_tokens"] == 0 def test_messages_uses_multi_turn_flow(self): """Verify that the session path calls _multi_turn_flow, not _flow."""