Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import math
import os
import logging
from typing import Dict, Union, List, Optional, Tuple
Expand Down Expand Up @@ -1057,21 +1056,18 @@ def __call__( # pylint: disable=docstring-missing-param
def _not_applicable_result(
    self, error_message: str, threshold: Union[int, float]
) -> Dict[str, Union[str, int, float, Dict, None]]:
    """Return a result indicating that the evaluation is not applicable (skipped).

    Not-applicable results carry no score because the evaluator cannot make a
    judgment (e.g., intermediate responses that are not final agent responses).
    They are reported with result ``"not_applicable"`` and status ``"skipped"``
    rather than as a synthetic "pass".

    :param error_message: Explanation of why the evaluation does not apply; it is
        embedded in the reason as ``"Not applicable: <error_message>"``.
    :type error_message: str
    :param threshold: Accepted for backward compatibility with existing callers.
        It is not forwarded to ``_build_result``.
    :type threshold: Union[int, float]
    :return: The standardized result dictionary produced by ``_build_result``.
    :rtype: Dict[str, Union[str, int, float, Dict, None]]
    """
    return self._build_result(
        score=None,
        result="not_applicable",
        reason=f"Not applicable: {error_message}",
        status="skipped",
        properties={},
    )

def _should_use_conversation_level(self, eval_input: Dict) -> bool:
"""Determine whether to use conversation-level evaluation.
Expand Down Expand Up @@ -1187,7 +1183,47 @@ async def _do_eval_multi_turn(self, eval_input: Dict) -> Dict[str, Union[float,
prompty_output_dict = await self._multi_turn_flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_kwargs)
return self._parse_prompty_output(prompty_output_dict)

def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[float, str]]:
def _build_result(
    self,
    score: Optional[Union[int, float]],
    result: str,
    reason: str,
    status: str,
    properties: Optional[Dict],
    prompty_output_dict: Optional[Dict] = None,
) -> Dict[str, Union[str, int, float, bool, Dict, None]]:
    """Build a standardized result dictionary.

    Every key is prefixed with ``self._result_key``. The score is reported twice
    (under the bare key and under ``<key>_score``), and token/model metadata read
    from the raw prompty output is merged into ``<key>_properties``.

    :param score: The numeric evaluation score, or None when no judgment was made.
    :param result: The result label ("pass", "fail", "not_applicable", or "error").
    :param reason: The reasoning or explanation string.
    :param status: The evaluation status ("completed", "skipped", or "error").
    :param properties: Evaluator-specific properties. ``None`` (or any non-dict
        value) is treated as an empty dictionary.
    :param prompty_output_dict: Optional raw prompty output for extracting token metadata.
        Non-dict values are ignored and metadata falls back to defaults.
    :return: The standardized result dictionary.
    """
    raw_output = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
    metadata = {
        "prompt_tokens": raw_output.get("input_token_count", 0),
        "completion_tokens": raw_output.get("output_token_count", 0),
        "total_tokens": raw_output.get("total_token_count", 0),
        "finish_reason": raw_output.get("finish_reason", ""),
        "model": raw_output.get("model_id", ""),
        "sample_input": raw_output.get("sample_input", ""),
        "sample_output": raw_output.get("sample_output", ""),
    }

    # A skipped evaluation may report ``"properties": null``; unpacking None would raise.
    extra_properties = properties if isinstance(properties, dict) else {}

    # Only a real verdict maps to a boolean; other labels leave ``passed`` undefined.
    if result == "pass":
        passed = True
    elif result == "fail":
        passed = False
    else:
        passed = None

    key = self._result_key
    return {
        key: score,
        f"{key}_score": score,
        f"{key}_result": result,
        f"{key}_passed": passed,
        f"{key}_threshold": self._threshold,
        f"{key}_reason": reason,
        f"{key}_status": status,
        # Metadata is unpacked last, so it wins if a property shares one of its names.
        f"{key}_properties": {**extra_properties, **metadata},
    }

def _parse_prompty_output(
    self, prompty_output_dict: Dict
) -> Dict[str, Union[str, int, float, bool, Dict, None]]:
    """Parse the prompty output into a standardized result dictionary.

    Shared between single-turn and multi-turn evaluation paths.

    The LLM payload is read from the ``"llm_output"`` key when present, otherwise
    the whole dictionary is treated as the payload. Outcomes:

    * payload is not a dict, or a completed payload has an unusable score
      (``null``, non-numeric string, boolean, NaN): an ``"error"`` result with no score;
    * payload status is ``"skipped"``: a ``"skipped"`` result with no score;
    * otherwise: ``"pass"`` when the score is at least ``self._threshold``, else ``"fail"``.

    :param prompty_output_dict: Raw output of the prompty flow.
    :type prompty_output_dict: Dict
    :return: The standardized result dictionary produced by ``_build_result``.
    :rtype: Dict[str, Union[str, int, float, bool, Dict, None]]
    """
    if isinstance(prompty_output_dict, dict):
        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
    else:
        # Nothing to look up in a non-dict output; fall through to the error result.
        llm_output = None

    def _invalid_output() -> Dict[str, Union[str, int, float, bool, Dict, None]]:
        return self._build_result(
            score=None,
            result="error",
            reason="Evaluator returned invalid output.",
            status="error",
            properties={},
            prompty_output_dict=prompty_output_dict,
        )

    if not isinstance(llm_output, dict):
        return _invalid_output()

    status = llm_output.get("status", "completed")
    reason = llm_output.get("reason", "")
    properties = llm_output.get("properties") or {}

    if status == "skipped":
        score = None
        # NOTE(review): `_not_applicable_result` labels skipped evaluations
        # "not_applicable"; confirm whether this path should use the same label.
        result = "skipped"
    else:
        score = llm_output.get("score", self._threshold)
        if isinstance(score, str):
            # Models sometimes quote the number; accept "4" / "4.5" instead of crashing.
            text = score.strip()
            try:
                score = int(text)
            except ValueError:
                try:
                    score = float(text)
                except ValueError:
                    score = None

        is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
        # `score != score` is true only for NaN, which cannot be compared meaningfully.
        if not is_number or score != score:
            return _invalid_output()

        result = "pass" if score >= self._threshold else "fail"

    return self._build_result(
        score=score,
        result=result,
        reason=reason,
        status=status,
        properties=properties,
        prompty_output_dict=prompty_output_dict,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,22 @@ OUTPUT FORMAT
=============
Output a JSON object with these keys:
{
"score": <1, 2, 3, 4, or 5>,
"explanation": "<30-60 words explaining the predicted satisfaction level>",
"dimensions": {
"reason": "<30-60 words explaining the predicted satisfaction level>",
"properties": {
"helpfulness": "<1-2 sentences assessing helpfulness>",
"completeness": "<1-2 sentences assessing completeness>",
"tone": "<1-2 sentences assessing tone>"
}
},
"score": <1, 2, 3, 4, or 5, or null when skipped>,
"status": "completed"
}

**Status: Skipped**
If the USER QUERY or AGENT_RESPONSE is empty or not provided, return status "skipped" immediately without scoring:
```json
{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
```

EXAMPLES
========

Expand All @@ -109,13 +116,14 @@ AGENT RESPONSE: "I've successfully cancelled your order #12345. Your payment of

OUTPUT:
{
"score": 5,
"explanation": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.",
"dimensions": {
"reason": "The agent immediately resolved the cancellation request, provided clear refund timeline, and confirmed next steps. Customer would be very satisfied with the efficient and complete resolution.",
"properties": {
"helpfulness": "Directly addressed the cancellation request and completed it immediately.",
"completeness": "Provided all relevant details: confirmation, refund amount, timeline, and email notification.",
"tone": "Professional and helpful, ended with an offer for further assistance."
}
},
"score": 5,
"status": "completed"
}

### Score 3 - Neutral
Expand All @@ -126,13 +134,14 @@ AGENT RESPONSE: "Our return policy allows returns within 30 days."

OUTPUT:
{
"score": 3,
"explanation": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.",
"dimensions": {
"reason": "The agent provided basic information about the return window but lacked important details like conditions, process, or refund method. Customer got a partial answer but might need to ask follow-up questions.",
"properties": {
"helpfulness": "Answered the basic question but minimal detail provided.",
"completeness": "Missing key information about conditions, exceptions, and return process.",
"tone": "Neutral tone, neither particularly warm nor cold."
}
},
"score": 3,
"status": "completed"
}

### Score 1 - Very Dissatisfied
Expand All @@ -143,13 +152,14 @@ AGENT RESPONSE: "According to our records, the package was delivered. Have you c

OUTPUT:
{
"score": 1,
"explanation": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.",
"dimensions": {
"reason": "The agent dismissed the customer's concern and offered no real help beyond a generic suggestion. Customer has a real problem that wasn't addressed, leaving them frustrated with no resolution path.",
"properties": {
"helpfulness": "Failed to offer any meaningful assistance or resolution options.",
"completeness": "Did not offer to investigate, file a claim, or provide alternatives.",
"tone": "Dismissive tone that implies the customer is wrong or didn't look properly."
}
},
"score": 1,
"status": "completed"
}

# Output
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,23 @@ OUTPUT FORMAT
=============
Output a JSON object with these keys:
{
"score": <1, 2, 3, 4, or 5>,
"explanation": "<30-60 words explaining the predicted satisfaction level for the full session>",
"dimensions": {

"reason": "<30-60 words explaining the predicted satisfaction level for the full session>",
"properties": {
"helpfulness": "<1-2 sentences assessing helpfulness across all turns>",
"completeness": "<1-2 sentences assessing completeness of all requests>",
"tone": "<1-2 sentences assessing tone throughout the session>"
}
},
"score": <1, 2, 3, 4, or 5, or null when skipped>,
"status": "completed"
}

**Status: Skipped**
If the CONVERSATION is empty or not provided, or doesn't end with the agent response, return status "skipped" immediately without scoring:
```json
{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
```

SCORING EXAMPLES
================

Expand All @@ -125,13 +133,14 @@ Agent turn 2: Order #12346 shipped yesterday via FedEx. Tracking number: FX12345

EXPECTED OUTPUT:
{
"score": 5,
"explanation": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.",
"dimensions": {
"reason": "Both requests handled efficiently across two turns. Cancellation processed immediately with refund details, and shipping status provided with tracking info. Customer would be very satisfied with the responsive, complete service.",
"properties": {
"helpfulness": "Both the cancellation and shipping inquiry were addressed immediately and completely.",
"completeness": "All details provided: refund timeline, confirmation email, tracking number, and delivery estimate.",
"tone": "Professional and proactive throughout, offering further assistance after the first request."
}
},
"score": 5,
"status": "completed"
}

### Score 3 - Neutral (Partial resolution across turns)
Expand All @@ -144,13 +153,14 @@ Agent turn 2: I see. Account locks usually expire after 30 minutes. Please try a

EXPECTED OUTPUT:
{
"score": 3,
"explanation": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.",
"dimensions": {
"reason": "The agent provided basic troubleshooting but didn't proactively unlock the account or offer alternative solutions. The customer got a partial answer but would need to wait and try again without certainty of resolution.",
"properties": {
"helpfulness": "Initial suggestion was generic and unhelpful. Second response addressed the specific error but offered only passive waiting.",
"completeness": "Missing proactive options like unlocking the account, password reset, or escalation to support.",
"tone": "Polite but somewhat dismissive of the customer's frustration with a 'try again later' response."
}
},
"score": 3,
"status": "completed"
}

### Score 1 - Very Dissatisfied (Failed session)
Expand All @@ -163,13 +173,14 @@ Agent turn 2: Unfortunately, since the package shows as delivered, we cannot pro

EXPECTED OUTPUT:
{
"score": 1,
"explanation": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.",
"dimensions": {
"reason": "The agent dismissed the customer's concern across both turns, offered no real resolution, and suggested filing a police report instead of helping. The customer's problem was completely unresolved.",
"properties": {
"helpfulness": "Failed to offer any meaningful assistance. Deflected responsibility to the customer.",
"completeness": "Did not offer investigation, replacement, refund, or escalation options.",
"tone": "Dismissive in both turns, implying the customer is wrong and offering no empathy for the situation."
}
},
"score": 1,
"status": "completed"
}

KEY PRINCIPLES
Expand Down
Loading
Loading