Azure · salma-elshafey · May 12, 2026 · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026
@@ -1210,24 +1210,61 @@ def _should_use_conversation_level(self, eval_input: Dict) -> bool:
         # Auto-detect (_evaluation_level is None)
         return eval_input.get("messages") is not None
 
+    def _build_result(
+        self,
+        score: Optional[int],
+        result: str,
+        reason: str,
+        status: str,
+        properties: Dict,
+        prompty_output_dict: Optional[Dict] = None,
+    ) -> Dict[str, Union[str, int, float, Dict, None]]:
+        """Build a standardized result dictionary.
+
+        :param score: The evaluation score (1, 0, or None).
+        :param result: The result label ("pass", "fail", "not_applicable", or "error").
+        :param reason: The reasoning or explanation string.
+        :param status: The evaluation status ("completed", "skipped", or "error").
+        :param properties: The properties dictionary.
+        :param prompty_output_dict: Optional raw prompty output for extracting token metadata.
+        :return: The standardized result dictionary.
+        """
+        p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
+        metadata = {
+            "prompt_tokens": p.get("input_token_count", 0),
+            "completion_tokens": p.get("output_token_count", 0),
+            "total_tokens": p.get("total_token_count", 0),
+            "finish_reason": p.get("finish_reason", ""),
+            "model": p.get("model_id", ""),
+            "sample_input": p.get("sample_input", ""),
+            "sample_output": p.get("sample_output", ""),
+        }
+        return {
+            self._result_key: score,
+            f"{self._result_key}_score": score,
+            f"{self._result_key}_result": result,
+            f"{self._result_key}_passed": result == "pass" if result in ["pass", "fail"] else None,
+            f"{self._result_key}_threshold": self._threshold,
+            f"{self._result_key}_reason": reason,
+            f"{self._result_key}_status": status,
+            f"{self._result_key}_properties": {**properties, **metadata}
+        }
+
     def _not_applicable_result(
         self, error_message: str, threshold: Union[int, float]
     ) -> Dict[str, Union[str, float, Dict]]:
-        """Return a result indicating that the evaluation is not applicable."""
-        return {
-            self._result_key: threshold,
-            f"{self._result_key}_result": "pass",
-            f"{self._result_key}_threshold": threshold,
-            f"{self._result_key}_reason": f"Not applicable: {error_message}",
-            f"{self._result_key}_details": {},
-            f"{self._result_key}_prompt_tokens": 0,
-            f"{self._result_key}_completion_tokens": 0,
-            f"{self._result_key}_total_tokens": 0,
-            f"{self._result_key}_finish_reason": "",
-            f"{self._result_key}_model": "",
-            f"{self._result_key}_sample_input": "",
-            f"{self._result_key}_sample_output": "",
-        }
+        """Return a result indicating that the evaluation is not applicable (skipped).
+
+        Not-applicable results have no score since the evaluator cannot make a judgment
+        (e.g., intermediate responses that are not final agent responses).
+        """
+        return self._build_result(
+            score=None,
+            result="not_applicable",
+            reason=f"Not applicable: {error_message}",
+            status="skipped",
+            properties={},
+        )
 
     @override
     async def _real_call(self, **kwargs):
@@ -1326,6 +1363,7 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in
         """Parse the prompty output into a standardized result dictionary.
 
         Shared between single-turn and multi-turn evaluation paths.
+        Expects the canonical schema: score (int|null), reason (str), status (str), properties (dict|null).
 
         :param prompty_output_dict: Raw output from the prompty flow.
         :type prompty_output_dict: Dict
@@ -1334,31 +1372,35 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in
         """
         llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
-        if isinstance(llm_output, dict):
-            success_value = llm_output.get("success", False)
-            if isinstance(success_value, str):
-                success = 1 if success_value.lower() == "true" else 0
+        if not isinstance(llm_output, dict):
+            score = None
+            result = "error"
+            reason = "Evaluator returned invalid output."
+            status = "error"
+            properties = {}
+        else:
+            status = llm_output.get("status", "completed")
+            reason = llm_output.get("reason", "")
+            properties = llm_output.get("properties") or {}
+
+            if status == "skipped":
+                score = None
+                result = "not_applicable"
             else:
-                success = 1 if success_value else 0
-            success_result = "pass" if success == 1 else "fail"
-            reason = llm_output.get("explanation", "")
-            return {
-                self._result_key: success,
-                f"{self._result_key}_result": success_result,
-                f"{self._result_key}_threshold": self._threshold,
-                f"{self._result_key}_reason": reason,
-                f"{self._result_key}_details": llm_output.get("details", {}),
-                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
-                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
-                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
-                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
-                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
-                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
-                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
-            }
-        raise EvaluationException(
-            message="Evaluator returned invalid output.",
-            blame=ErrorBlame.SYSTEM_ERROR,
-            category=ErrorCategory.FAILED_EXECUTION,
-            target=ExtendedErrorTarget.TASK_COMPLETION_EVALUATOR,
+                score_value = llm_output.get("score", 0)
+                if isinstance(score_value, str):
+                    score = 1 if score_value.strip() in ("1", "true") else 0
+                elif isinstance(score_value, (int, float)):
+                    score = 1 if score_value == 1 else 0
+                else:
+                    score = 1 if score_value else 0
+                result = "pass" if score == 1 else "fail"
+
+        return self._build_result(
+            score=score,
+            result=result,
+            reason=reason,
+            status=status,
+            properties=properties,
+            prompty_output_dict=prompty_output_dict,
         )
@@ -66,30 +66,37 @@ C. Assess Task Completion:
 - **Incomplete**: No usable deliverable or major requirements unmet
 
 D. Assign a Score:
-   - **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
-   - **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved.
+   - **1 (TRUE)**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
+   - **0 (FALSE)**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved.
 
-   **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (TRUE) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation.
+   **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation.
 
    **Note on direct/factual queries:** When the user asks a straightforward factual, yes/no, or verification question (e.g., "What is the capital of France?"), a correct and direct answer fully completes the task. No additional elaboration, context, or "actionable information" beyond the accurate answer is required.
 
 OUTPUT FORMAT
 =============
 Output a JSON object with these keys:
 {
-  "explanation": "<15-60 words explaining the completion status>",
-  "details": {
+  "reason": "<15-60 words explaining the completion status>",
+  "properties": {
     "task_requirements": "<15-60 words on what the user specifically requested>",
     "delivered_outcome": "<15-60 words on what the agent actually provided>",
     "completion_gaps": "<15-60 words on missing elements if task is incomplete>"
   },
-  "success": <TRUE or FALSE>
+  "score": <1 if task completed successfully, 0 if task failed or incomplete, null if status is skipped>,
+  "status": "<completed or skipped>"
 }
 
+**Status: Skipped**
+If the CONVERSATION_HISTORY or AGENT_RESPONSE is empty or not provided, return status "skipped" immediately without scoring:
+```json
+{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
+```
+
 SCORING EXAMPLES
 ================
 
-### SUCCESS: TRUE - Example A
+### SCORE: 1 - Example A
 
 CONVERSATION_HISTORY:
 User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine.
@@ -117,16 +124,17 @@ Transportation: Metro Day Pass recommended. Book Louvre tickets online in advanc
 
 EXPECTED OUTPUT:
 {
-  "explanation": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.",
-  "details": {
+  "reason": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.",
+  "properties": {
     "task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine",
     "delivered_outcome": "Detailed 3-day schedule with specific landmarks, restaurants, and practical tips",
     "completion_gaps": "None"
   },
-  "success": TRUE
+  "score": 1,
+  "status": "completed"
 }
 
-### SUCCESS: TRUE - Example B
+### SCORE: 1 - Example B
 
 CONVERSATION_HISTORY:
 User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
@@ -142,16 +150,17 @@ Your internet issues are resolved - update firmware and change DNS as instructed
 
 EXPECTED OUTPUT:
 {
-  "explanation": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.",
-  "details": {
+  "reason": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.",
+  "properties": {
     "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
     "delivered_outcome": "Diagnostic completed with specific fix instructions, data plan upgrade confirmed active",
     "completion_gaps": "None"
   },
-  "success": TRUE
+  "score": 1,
+  "status": "completed"
 }
 
-### SUCCESS: TRUE - Example C
+### SCORE: 1 - Example C
 
 CONVERSATION_HISTORY:
 User: Which is better for a beginner, Python or JavaScript?
@@ -173,16 +182,17 @@ If you're interested in data or automation, start with Python. If you want to bu
 
 EXPECTED OUTPUT:
 {
-  "explanation": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.",
-  "details": {
+  "reason": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.",
+  "properties": {
     "task_requirements": "Subjective comparison of Python vs JavaScript for beginners",
     "delivered_outcome": "Balanced pros/cons for each language with context-dependent recommendation",
     "completion_gaps": "None"
   },
-  "success": TRUE
+  "score": 1,
+  "status": "completed"
 }
 
-### SUCCESS: TRUE - Example D
+### SCORE: 1 - Example D
 
 CONVERSATION_HISTORY:
 User: Is the boiling point of water 50°C?
@@ -192,16 +202,17 @@ No, the boiling point of water is 100°C at standard atmospheric pressure, not 5
 
 EXPECTED OUTPUT:
 {
-  "explanation": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.",
-  "details": {
+  "reason": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.",
+  "properties": {
     "task_requirements": "Verify whether the boiling point of water is 50°C",
     "delivered_outcome": "Correct answer provided: boiling point is 100°C, not 50°C",
     "completion_gaps": "None"
   },
-  "success": TRUE
+  "score": 1,
+  "status": "completed"
 }
 
-### SUCCESS: FALSE - Example A
+### SCORE: 0 - Example A
 
 CONVERSATION_HISTORY:
 User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine.
@@ -211,16 +222,17 @@ I'd recommend visiting the Louvre and Eiffel Tower. Paris has great food - try F
 
 EXPECTED OUTPUT:
 {
-  "explanation": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.",
-  "details": {
+  "reason": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.",
+  "properties": {
     "task_requirements": "A detailed 3-day Paris itinerary with cultural landmarks and local cuisine",
     "delivered_outcome": "General recommendations without structured itinerary as asked",
     "completion_gaps": "No day-by-day schedule, no specific restaurant recommendations, no detailed cultural landmarks"
   },
-  "success": FALSE
+  "score": 0,
+  "status": "completed"
 }
 
-### SUCCESS: FALSE - Example B
+### SCORE: 0 - Example B
 
 CONVERSATION_HISTORY:
 User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
@@ -232,16 +244,17 @@ I found that your router firmware needs updating. You can usually do this throug
 
 EXPECTED OUTPUT:
 {
-  "explanation": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.",
-  "details": {
+  "reason": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.",
+  "properties": {
     "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
     "delivered_outcome": "Identified router firmware issue but no specific fix instructions, data upgrade not completed",
     "completion_gaps": "No specific firmware update steps, data plan upgrade not processed"
   },
-  "success": FALSE
+  "score": 0,
+  "status": "completed"
 }
 
-### SUCCESS: FALSE - Example C
+### SCORE: 0 - Example C
 
 CONVERSATION_HISTORY:
 User: Analyze our Q3 sales data and generate a summary report.
@@ -251,13 +264,14 @@ I can help you analyze sales data! Sales analysis is important for business grow
 
 EXPECTED OUTPUT:
 {
-  "explanation": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.",
-  "details": {
+  "reason": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.",
+  "properties": {
     "task_requirements": "Analyze Q3 sales data and generate summary report",
     "delivered_outcome": "General information about sales analysis concepts",
     "completion_gaps": "No data analysis performed, no summary report created"
   },
-  "success": FALSE
+  "score": 0,
+  "status": "completed"
 }
 
 KEY PRINCIPLES
@@ -273,4 +287,4 @@ KEY PRINCIPLES
 
 Remember: A task can be understood correctly and approached properly but still fail if the final outcome doesn't meet requirements.
 
-# Output
+# Output