Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -1210,24 +1210,61 @@ def _should_use_conversation_level(self, eval_input: Dict) -> bool:
# Auto-detect (_evaluation_level is None)
return eval_input.get("messages") is not None

def _build_result(
    self,
    score: Optional[int],
    result: str,
    reason: str,
    status: str,
    properties: Optional[Dict],
    prompty_output_dict: Optional[Dict] = None,
) -> Dict[str, Union[str, int, float, Dict, None]]:
    """Build a standardized result dictionary.

    Every key is prefixed with ``self._result_key``. The bare key and the
    ``_score`` key both carry ``score``. Token usage and model metadata read
    from ``prompty_output_dict`` are merged into the ``_properties`` entry;
    on a key collision the metadata value wins.

    :param score: The evaluation score (1, 0, or None when no judgment was made).
    :param result: The result label ("pass", "fail", "not_applicable", or "error").
    :param reason: The reasoning or explanation string.
    :param status: The evaluation status ("completed", "skipped", or "error").
    :param properties: Extra properties to report. ``None`` or any non-dict value
        is treated as an empty dictionary.
    :param prompty_output_dict: Optional raw prompty output for extracting token
        metadata. Non-dict values are ignored and the metadata defaults are used.
    :return: The standardized result dictionary.
    """
    prompty_output = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
    metadata = {
        "prompt_tokens": prompty_output.get("input_token_count", 0),
        "completion_tokens": prompty_output.get("output_token_count", 0),
        "total_tokens": prompty_output.get("total_token_count", 0),
        "finish_reason": prompty_output.get("finish_reason", ""),
        "model": prompty_output.get("model_id", ""),
        "sample_input": prompty_output.get("sample_input", ""),
        "sample_output": prompty_output.get("sample_output", ""),
    }
    # The model may emit "properties": null (or a malformed value); unpacking a
    # non-mapping with ** would raise TypeError, so fall back to an empty dict.
    extra_properties = properties if isinstance(properties, dict) else {}
    # Only a real pass/fail verdict yields a boolean; any other label maps to None.
    passed = (result == "pass") if result in ("pass", "fail") else None
    key = self._result_key
    return {
        key: score,
        f"{key}_score": score,
        f"{key}_result": result,
        f"{key}_passed": passed,
        f"{key}_threshold": self._threshold,
        f"{key}_reason": reason,
        f"{key}_status": status,
        f"{key}_properties": {**extra_properties, **metadata},
    }

def _not_applicable_result(
    self, error_message: str, threshold: Union[int, float]
) -> Dict[str, Union[str, int, float, Dict, None]]:
    """Return a result indicating that the evaluation is not applicable (skipped).

    Not-applicable results have no score since the evaluator cannot make a judgment
    (e.g., intermediate responses that are not final agent responses).

    :param error_message: Explanation of why the evaluation does not apply; it is
        embedded in the reason as ``"Not applicable: <error_message>"``.
    :param threshold: Unused here; kept so existing callers that pass a threshold
        keep working. The result is assembled by ``_build_result``.
    :return: The dictionary produced by ``_build_result`` with score ``None``,
        result ``"not_applicable"`` and status ``"skipped"``.
    """
    return self._build_result(
        score=None,
        result="not_applicable",
        reason=f"Not applicable: {error_message}",
        status="skipped",
        properties={},
    )

@override
async def _real_call(self, **kwargs):
def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[int, float, str, Dict, None]]:
    """Parse the prompty output into a standardized result dictionary.

    Shared between single-turn and multi-turn evaluation paths.
    Expects the canonical schema: score (int), reason (str), status (str), properties (dict|null).

    Malformed output does not raise: when the LLM payload is not a dictionary an
    ``"error"`` result is returned instead. A ``"skipped"`` status yields a
    ``"not_applicable"`` result with no score. Otherwise the score is normalized
    to 1 or 0 and mapped to ``"pass"`` / ``"fail"``.

    :param prompty_output_dict: Raw output from the prompty flow.
    :type prompty_output_dict: Dict
    :return: The standardized result dictionary built by ``_build_result``.
    :rtype: Dict
    """
    # The LLM payload may be nested under "llm_output" or be the top-level dict itself.
    if isinstance(prompty_output_dict, dict):
        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
    else:
        llm_output = None

    if not isinstance(llm_output, dict):
        return self._build_result(
            score=None,
            result="error",
            reason="Evaluator returned invalid output.",
            status="error",
            properties={},
            prompty_output_dict=prompty_output_dict,
        )

    status = llm_output.get("status", "completed")
    reason = llm_output.get("reason", "")
    # "properties" may be null or malformed; only a real dict is forwarded.
    raw_properties = llm_output.get("properties")
    properties = raw_properties if isinstance(raw_properties, dict) else {}

    if status == "skipped":
        score = None
        result = "not_applicable"
    else:
        score_value = llm_output.get("score", 0)
        if isinstance(score_value, str):
            # Case-insensitive so "TRUE"/"True" (the previous prompt's literals)
            # are still recognized as a passing score.
            score = 1 if score_value.strip().lower() in ("1", "true") else 0
        elif isinstance(score_value, (int, float)):
            # bool is an int subclass, so True also lands here and compares equal to 1.
            score = 1 if score_value == 1 else 0
        else:
            score = 1 if score_value else 0
        result = "pass" if score == 1 else "fail"

    return self._build_result(
        score=score,
        result=result,
        reason=reason,
        status=status,
        properties=properties,
        prompty_output_dict=prompty_output_dict,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -66,30 +66,37 @@ C. Assess Task Completion:
- **Incomplete**: No usable deliverable or major requirements unmet

D. Assign a Score:
- **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
- **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved.
- **1 (TRUE)**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
- **0 (FALSE)**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved.

**Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (TRUE) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation.
**Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation.

**Note on direct/factual queries:** When the user asks a straightforward factual, yes/no, or verification question (e.g., "What is the capital of France?"), a correct and direct answer fully completes the task. No additional elaboration, context, or "actionable information" beyond the accurate answer is required.

OUTPUT FORMAT
=============
Output a JSON object with these keys:
{
"explanation": "<15-60 words explaining the completion status>",
"details": {
"reason": "<15-60 words explaining the completion status>",
"properties": {
"task_requirements": "<15-60 words on what the user specifically requested>",
"delivered_outcome": "<15-60 words on what the agent actually provided>",
"completion_gaps": "<15-60 words on missing elements if task is incomplete>"
},
"success": <TRUE or FALSE>
"score": <1 if task completed successfully, 0 if task failed or incomplete, null if status is skipped>,
"status": "<completed or skipped>"
}

**Status: Skipped**
If the CONVERSATION_HISTORY or AGENT_RESPONSE is empty or not provided, return status "skipped" immediately without scoring:
```json
{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
```

SCORING EXAMPLES
================

### SUCCESS: TRUE - Example A
### SCORE: 1 - Example A

CONVERSATION_HISTORY:
User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine.
Expand Down Expand Up @@ -117,16 +124,17 @@ Transportation: Metro Day Pass recommended. Book Louvre tickets online in advanc

EXPECTED OUTPUT:
{
"explanation": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.",
"details": {
"reason": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.",
"properties": {
"task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine",
"delivered_outcome": "Detailed 3-day schedule with specific landmarks, restaurants, and practical tips",
"completion_gaps": "None"
},
"success": TRUE
"score": 1,
"status": "completed"
}

### SUCCESS: TRUE - Example B
### SCORE: 1 - Example B

CONVERSATION_HISTORY:
User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
Expand All @@ -142,16 +150,17 @@ Your internet issues are resolved - update firmware and change DNS as instructed

EXPECTED OUTPUT:
{
"explanation": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.",
"details": {
"reason": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.",
"properties": {
"task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
"delivered_outcome": "Diagnostic completed with specific fix instructions, data plan upgrade confirmed active",
"completion_gaps": "None"
},
"success": TRUE
"score": 1,
"status": "completed"
}

### SUCCESS: TRUE - Example C
### SCORE: 1 - Example C

CONVERSATION_HISTORY:
User: Which is better for a beginner, Python or JavaScript?
Expand All @@ -173,16 +182,17 @@ If you're interested in data or automation, start with Python. If you want to bu

EXPECTED OUTPUT:
{
"explanation": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.",
"details": {
"reason": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.",
"properties": {
"task_requirements": "Subjective comparison of Python vs JavaScript for beginners",
"delivered_outcome": "Balanced pros/cons for each language with context-dependent recommendation",
"completion_gaps": "None"
},
"success": TRUE
"score": 1,
"status": "completed"
}

### SUCCESS: TRUE - Example D
### SCORE: 1 - Example D

CONVERSATION_HISTORY:
User: Is the boiling point of water 50°C?
Expand All @@ -192,16 +202,17 @@ No, the boiling point of water is 100°C at standard atmospheric pressure, not 5

EXPECTED OUTPUT:
{
"explanation": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.",
"details": {
"reason": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.",
"properties": {
"task_requirements": "Verify whether the boiling point of water is 50°C",
"delivered_outcome": "Correct answer provided: boiling point is 100°C, not 50°C",
"completion_gaps": "None"
},
"success": TRUE
"score": 1,
"status": "completed"
}

### SUCCESS: FALSE - Example A
### SCORE: 0 - Example A

CONVERSATION_HISTORY:
User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine.
Expand All @@ -211,16 +222,17 @@ I'd recommend visiting the Louvre and Eiffel Tower. Paris has great food - try F

EXPECTED OUTPUT:
{
"explanation": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.",
"details": {
"reason": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.",
"properties": {
"task_requirements": "A detailed 3-day Paris itinerary with cultural landmarks and local cuisine",
"delivered_outcome": "General recommendations without structured itinerary as asked",
"completion_gaps": "No day-by-day schedule, no specific restaurant recommendations, no detailed cultural landmarks"
},
"success": FALSE
"score": 0,
"status": "completed"
}

### SUCCESS: FALSE - Example B
### SCORE: 0 - Example B

CONVERSATION_HISTORY:
User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
Expand All @@ -232,16 +244,17 @@ I found that your router firmware needs updating. You can usually do this throug

EXPECTED OUTPUT:
{
"explanation": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.",
"details": {
"reason": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.",
"properties": {
"task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
"delivered_outcome": "Identified router firmware issue but no specific fix instructions, data upgrade not completed",
"completion_gaps": "No specific firmware update steps, data plan upgrade not processed"
},
"success": FALSE
"score": 0,
"status": "completed"
}

### SUCCESS: FALSE - Example C
### SCORE: 0 - Example C

CONVERSATION_HISTORY:
User: Analyze our Q3 sales data and generate a summary report.
Expand All @@ -251,13 +264,14 @@ I can help you analyze sales data! Sales analysis is important for business grow

EXPECTED OUTPUT:
{
"explanation": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.",
"details": {
"reason": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.",
"properties": {
"task_requirements": "Analyze Q3 sales data and generate summary report",
"delivered_outcome": "General information about sales analysis concepts",
"completion_gaps": "No data analysis performed, no summary report created"
},
"success": FALSE
"score": 0,
"status": "completed"
}

KEY PRINCIPLES
Expand All @@ -273,4 +287,4 @@ KEY PRINCIPLES

Remember: A task can be understood correctly and approached properly but still fail if the final outcome doesn't meet requirements.

# Output
# Output
Loading
Loading