Skip to content

Commit cf221e5

Browse files
Update CSAT evaluator output schema (#4945)
* Update CSAT evaluator output schema * run docstyle * update not applicable * Fix not-applicable CSAT result label Set not-applicable output result to not_applicable while keeping status as skipped, and update behavior tests to match. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * update csat output schema --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 64a60ba commit cf221e5

4 files changed

Lines changed: 180 additions & 102 deletions

File tree

assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py

Lines changed: 77 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT License.
3-
import math
43
import os
54
import logging
65
from typing import Dict, Union, List, Optional, Tuple
@@ -1057,21 +1056,18 @@ def __call__( # pylint: disable=docstring-missing-param
10571056
def _not_applicable_result(
10581057
self, error_message: str, threshold: Union[int, float]
10591058
) -> Dict[str, Union[str, float, Dict]]:
1060-
"""Return a result indicating that the evaluation is not applicable."""
1061-
return {
1062-
self._result_key: threshold,
1063-
f"{self._result_key}_result": "pass",
1064-
f"{self._result_key}_threshold": threshold,
1065-
f"{self._result_key}_reason": f"Not applicable: {error_message}",
1066-
f"{self._result_key}_dimensions": {},
1067-
f"{self._result_key}_prompt_tokens": 0,
1068-
f"{self._result_key}_completion_tokens": 0,
1069-
f"{self._result_key}_total_tokens": 0,
1070-
f"{self._result_key}_finish_reason": "",
1071-
f"{self._result_key}_model": "",
1072-
f"{self._result_key}_sample_input": "",
1073-
f"{self._result_key}_sample_output": "",
1074-
}
1059+
"""Return a result indicating that the evaluation is not applicable (skipped).
1060+
1061+
Not-applicable results have no score since the evaluator cannot make a judgment
1062+
(e.g., intermediate responses that are not final agent responses).
1063+
"""
1064+
return self._build_result(
1065+
score=None,
1066+
result="not_applicable",
1067+
reason=f"Not applicable: {error_message}",
1068+
status="skipped",
1069+
properties={},
1070+
)
10751071

10761072
def _should_use_conversation_level(self, eval_input: Dict) -> bool:
10771073
"""Determine whether to use conversation-level evaluation.
@@ -1187,7 +1183,47 @@ async def _do_eval_multi_turn(self, eval_input: Dict) -> Dict[str, Union[float,
11871183
prompty_output_dict = await self._multi_turn_flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_kwargs)
11881184
return self._parse_prompty_output(prompty_output_dict)
11891185

1190-
def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[float, str]]:
1186+
def _build_result(
1187+
self,
1188+
score: Optional[int],
1189+
result: str,
1190+
reason: str,
1191+
status: str,
1192+
properties: Dict,
1193+
prompty_output_dict: Optional[Dict] = None,
1194+
) -> Dict[str, Union[str, int, float, Dict, None]]:
1195+
"""Build a standardized result dictionary.
1196+
1197+
:param score: The evaluation score (an integer from 1 to 5, or None when no score was produced).
1198+
:param result: The result label ("pass", "fail", "not_applicable", or "error").
1199+
:param reason: The reasoning or explanation string.
1200+
:param status: The evaluation status ("completed", "skipped", or "error").
1201+
:param properties: The properties dictionary.
1202+
:param prompty_output_dict: Optional raw prompty output for extracting token metadata.
1203+
:return: The standardized result dictionary.
1204+
"""
1205+
p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
1206+
metadata = {
1207+
"prompt_tokens": p.get("input_token_count", 0),
1208+
"completion_tokens": p.get("output_token_count", 0),
1209+
"total_tokens": p.get("total_token_count", 0),
1210+
"finish_reason": p.get("finish_reason", ""),
1211+
"model": p.get("model_id", ""),
1212+
"sample_input": p.get("sample_input", ""),
1213+
"sample_output": p.get("sample_output", ""),
1214+
}
1215+
return {
1216+
self._result_key: score,
1217+
f"{self._result_key}_score": score,
1218+
f"{self._result_key}_result": result,
1219+
f"{self._result_key}_passed": result == "pass" if result in ["pass", "fail"] else None,
1220+
f"{self._result_key}_threshold": self._threshold,
1221+
f"{self._result_key}_reason": reason,
1222+
f"{self._result_key}_status": status,
1223+
f"{self._result_key}_properties": {**properties, **metadata},
1224+
}
1225+
1226+
def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Any]:
11911227
"""Parse the prompty output into a standardized result dictionary.
11921228
11931229
Shared between single-turn and multi-turn evaluation paths.
@@ -1199,47 +1235,29 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[fl
11991235
"""
12001236
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
12011237

1202-
if isinstance(llm_output, dict):
1203-
score_value = llm_output.get("score", 3)
1204-
if isinstance(score_value, str):
1205-
score = float(score_value) if score_value.replace(".", "").isdigit() else 3.0
1238+
if not isinstance(llm_output, dict):
1239+
score = None
1240+
result = "error"
1241+
reason = "Evaluator returned invalid output."
1242+
status = "error"
1243+
properties = {}
1244+
else:
1245+
status = llm_output.get("status", "completed")
1246+
reason = llm_output.get("reason", "")
1247+
properties = llm_output.get("properties") or {}
1248+
1249+
if status == "skipped":
1250+
score = None
1251+
result = "skipped"
12061252
else:
1207-
score = float(score_value) if score_value else 3.0
1208-
1209-
# Clamp score to 1-5 range
1210-
score = max(1.0, min(5.0, score))
1211-
1212-
success_result = "pass" if score >= self._threshold else "fail"
1213-
reason = llm_output.get("explanation", "")
1214-
dimensions = llm_output.get("dimensions", {})
1215-
1216-
return {
1217-
self._result_key: score,
1218-
f"{self._result_key}_result": success_result,
1219-
f"{self._result_key}_threshold": self._threshold,
1220-
f"{self._result_key}_reason": reason,
1221-
f"{self._result_key}_dimensions": dimensions,
1222-
f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
1223-
f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
1224-
f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
1225-
f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
1226-
f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
1227-
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
1228-
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
1229-
}
1230-
1231-
# Check if base returned nan (invalid output case)
1232-
if isinstance(llm_output, float) and math.isnan(llm_output):
1233-
raise EvaluationException(
1234-
message="Evaluator returned invalid output.",
1235-
blame=ErrorBlame.SYSTEM_ERROR,
1236-
category=ErrorCategory.FAILED_EXECUTION,
1237-
target=ExtendedErrorTarget.CUSTOMER_SATISFACTION_EVALUATOR,
1238-
)
1239-
1240-
raise EvaluationException(
1241-
message="Evaluator returned invalid output.",
1242-
blame=ErrorBlame.SYSTEM_ERROR,
1243-
category=ErrorCategory.FAILED_EXECUTION,
1244-
target=ExtendedErrorTarget.CUSTOMER_SATISFACTION_EVALUATOR,
1253+
score = llm_output.get("score", self._threshold)
1254+
result = "pass" if score >= self._threshold else "fail"
1255+
1256+
return self._build_result(
1257+
score=score,
1258+
result=result,
1259+
reason=reason,
1260+
status=status,
1261+
properties=properties,
1262+
prompty_output_dict=prompty_output_dict,
12451263
)

assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction.prompty

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -89,15 +89,22 @@ OUTPUT FORMAT
8989
=============
9090
Output a JSON object with these keys:
9191
{
92-
"score": <1, 2, 3, 4, or 5>,
93-
"explanation": "<30-60 words explaining the predicted satisfaction level>",
94-
"dimensions": {
92+
"reason": "<30-60 words explaining the predicted satisfaction level>",
93+
"properties": {
9594
"helpfulness": "<1-2 sentences assessing helpfulness>",
9695
"completeness": "<1-2 sentences assessing completeness>",
9796
"tone": "<1-2 sentences assessing tone>"
98-
}
97+
},
98+
"score": <1, 2, 3, 4, or 5, or null when skipped>,
99+
"status": "completed"
99100
}
100101

102+
**Status: Skipped**
103+
If the USER QUERY or AGENT RESPONSE is empty or not provided, return status "skipped" immediately without scoring:
104+
```json
105+
{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
106+
```
107+
101108
EXAMPLES
102109
========
103110

@@ -109,13 +116,14 @@ AGENT RESPONSE: "I've successfully cancelled your order #12345. Your payment of
109116

110117
OUTPUT:
111118
{
112-
"score": 5,
113-
"explanation": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.",
114-
"dimensions": {
119+
"reason": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.",
120+
"properties": {
115121
"helpfulness": "Directly addressed the cancellation request and completed it immediately.",
116122
"completeness": "Provided all relevant details: confirmation, refund amount, timeline, and email notification.",
117123
"tone": "Professional and helpful, ended with an offer for further assistance."
118-
}
124+
},
125+
"score": 5,
126+
"status": "completed"
119127
}
120128

121129
### Score 3 - Neutral
@@ -126,13 +134,14 @@ AGENT RESPONSE: "Our return policy allows returns within 30 days."
126134

127135
OUTPUT:
128136
{
129-
"score": 3,
130-
"explanation": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.",
131-
"dimensions": {
137+
"reason": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.",
138+
"properties": {
132139
"helpfulness": "Answered the basic question but minimal detail provided.",
133140
"completeness": "Missing key information about conditions, exceptions, and return process.",
134141
"tone": "Neutral tone, neither particularly warm nor cold."
135-
}
142+
},
143+
"score": 3,
144+
"status": "completed"
136145
}
137146

138147
### Score 1 - Very Dissatisfied
@@ -143,13 +152,14 @@ AGENT RESPONSE: "According to our records, the package was delivered. Have you c
143152

144153
OUTPUT:
145154
{
146-
"score": 1,
147-
"explanation": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.",
148-
"dimensions": {
155+
"reason": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.",
156+
"properties": {
149157
"helpfulness": "Failed to offer any meaningful assistance or resolution options.",
150158
"completeness": "Did not offer to investigate, file a claim, or provide alternatives.",
151159
"tone": "Dismissive tone that implies the customer is wrong or didn't look properly."
152-
}
160+
},
161+
"score": 1,
162+
"status": "completed"
153163
}
154164

155165
# Output

assets/evaluators/builtin/customer_satisfaction/evaluator/customer_satisfaction_multi_turn.prompty

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -103,15 +103,23 @@ OUTPUT FORMAT
103103
=============
104104
Output a JSON object with these keys:
105105
{
106-
"score": <1, 2, 3, 4, or 5>,
107-
"explanation": "<30-60 words explaining the predicted satisfaction level for the full session>",
108-
"dimensions": {
106+
107+
"reason": "<30-60 words explaining the predicted satisfaction level for the full session>",
108+
"properties": {
109109
"helpfulness": "<1-2 sentences assessing helpfulness across all turns>",
110110
"completeness": "<1-2 sentences assessing completeness of all requests>",
111111
"tone": "<1-2 sentences assessing tone throughout the session>"
112-
}
112+
},
113+
"score": <1, 2, 3, 4, or 5, or null when skipped>,
114+
"status": "completed"
113115
}
114116

117+
**Status: Skipped**
118+
If the CONVERSATION is empty or not provided, or does not end with an agent response, return status "skipped" immediately without scoring:
119+
```json
120+
{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
121+
```
122+
115123
SCORING EXAMPLES
116124
================
117125

@@ -125,13 +133,14 @@ Agent turn 2: Order #12346 shipped yesterday via FedEx. Tracking number: FX12345
125133

126134
EXPECTED OUTPUT:
127135
{
128-
"score": 5,
129-
"explanation": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.",
130-
"dimensions": {
136+
"reason": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.",
137+
"properties": {
131138
"helpfulness": "Both the cancellation and shipping inquiry were addressed immediately and completely.",
132139
"completeness": "All details provided: refund timeline, confirmation email, tracking number, and delivery estimate.",
133140
"tone": "Professional and proactive throughout, offering further assistance after the first request."
134-
}
141+
},
142+
"score": 5,
143+
"status": "completed"
135144
}
136145

137146
### Score 3 - Neutral (Partial resolution across turns)
@@ -144,13 +153,14 @@ Agent turn 2: I see. Account locks usually expire after 30 minutes. Please try a
144153

145154
EXPECTED OUTPUT:
146155
{
147-
"score": 3,
148-
"explanation": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.",
149-
"dimensions": {
156+
"reason": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.",
157+
"properties": {
150158
"helpfulness": "Initial suggestion was generic and unhelpful. Second response addressed the specific error but offered only passive waiting.",
151159
"completeness": "Missing proactive options like unlocking the account, password reset, or escalation to support.",
152160
"tone": "Polite but somewhat dismissive of the customer's frustration with a 'try again later' response."
153-
}
161+
},
162+
"score": 3,
163+
"status": "completed"
154164
}
155165

156166
### Score 1 - Very Dissatisfied (Failed session)
@@ -163,13 +173,14 @@ Agent turn 2: Unfortunately, since the package shows as delivered, we cannot pro
163173

164174
EXPECTED OUTPUT:
165175
{
166-
"score": 1,
167-
"explanation": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.",
168-
"dimensions": {
176+
"reason": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.",
177+
"properties": {
169178
"helpfulness": "Failed to offer any meaningful assistance. Deflected responsibility to the customer.",
170179
"completeness": "Did not offer investigation, replacement, refund, or escalation options.",
171180
"tone": "Dismissive in both turns, implying the customer is wrong and offering no empathy for the situation."
172-
}
181+
},
182+
"score": 1,
183+
"status": "completed"
173184
}
174185

175186
KEY PRINCIPLES

0 commit comments

Comments
 (0)