diff --git a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py index d0599ddea6..5e5a18dd22 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py +++ b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py @@ -1210,24 +1210,61 @@ def _should_use_conversation_level(self, eval_input: Dict) -> bool: # Auto-detect (_evaluation_level is None) return eval_input.get("messages") is not None + def _build_result( + self, + score: Optional[int], + result: str, + reason: str, + status: str, + properties: Dict, + prompty_output_dict: Optional[Dict] = None, + ) -> Dict[str, Union[str, int, float, Dict, None]]: + """Build a standardized result dictionary. + + :param score: The evaluation score (1, 0, or None). + :param result: The result label ("pass", "fail", "skipped", or "error"). + :param reason: The reasoning or explanation string. + :param status: The evaluation status ("completed", "skipped", or "error"). + :param properties: The properties dictionary. + :param prompty_output_dict: Optional raw prompty output for extracting token metadata. + :return: The standardized result dictionary. 
+ """ + p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {} + metadata = { + "prompt_tokens": p.get("input_token_count", 0), + "completion_tokens": p.get("output_token_count", 0), + "total_tokens": p.get("total_token_count", 0), + "finish_reason": p.get("finish_reason", ""), + "model": p.get("model_id", ""), + "sample_input": p.get("sample_input", ""), + "sample_output": p.get("sample_output", ""), + } + return { + self._result_key: score, + f"{self._result_key}_score": score, + f"{self._result_key}_result": result, + f"{self._result_key}_passed": result == "pass" if result in ["pass", "fail"] else None, + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_reason": reason, + f"{self._result_key}_status": status, + f"{self._result_key}_properties": {**properties, **metadata} + } + def _not_applicable_result( self, error_message: str, threshold: Union[int, float] ) -> Dict[str, Union[str, float, Dict]]: - """Return a result indicating that the evaluation is not applicable.""" - return { - self._result_key: threshold, - f"{self._result_key}_result": "pass", - f"{self._result_key}_threshold": threshold, - f"{self._result_key}_reason": f"Not applicable: {error_message}", - f"{self._result_key}_details": {}, - f"{self._result_key}_prompt_tokens": 0, - f"{self._result_key}_completion_tokens": 0, - f"{self._result_key}_total_tokens": 0, - f"{self._result_key}_finish_reason": "", - f"{self._result_key}_model": "", - f"{self._result_key}_sample_input": "", - f"{self._result_key}_sample_output": "", - } + """Return a result indicating that the evaluation is not applicable (skipped). + + Not-applicable results have no score since the evaluator cannot make a judgment + (e.g., intermediate responses that are not final agent responses). 
+ """ + return self._build_result( + score=None, + result="not_applicable", + reason=f"Not applicable: {error_message}", + status="skipped", + properties={}, + ) @override async def _real_call(self, **kwargs): @@ -1326,6 +1363,7 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in """Parse the prompty output into a standardized result dictionary. Shared between single-turn and multi-turn evaluation paths. + Expects the canonical schema: score (int), reason (str), status (str), properties (dict|null). :param prompty_output_dict: Raw output from the prompty flow. :type prompty_output_dict: Dict @@ -1334,31 +1372,35 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in """ llm_output = prompty_output_dict.get("llm_output", prompty_output_dict) - if isinstance(llm_output, dict): - success_value = llm_output.get("success", False) - if isinstance(success_value, str): - success = 1 if success_value.lower() == "true" else 0 + if not isinstance(llm_output, dict): + score = None + result = "error" + reason = "Evaluator returned invalid output." 
+ status = "error" + properties = {} + else: + status = llm_output.get("status", "completed") + reason = llm_output.get("reason", "") + properties = llm_output.get("properties") or {} + + if status == "skipped": + score = None + result = "not_applicable" else: - success = 1 if success_value else 0 - success_result = "pass" if success == 1 else "fail" - reason = llm_output.get("explanation", "") - return { - self._result_key: success, - f"{self._result_key}_result": success_result, - f"{self._result_key}_threshold": self._threshold, - f"{self._result_key}_reason": reason, - f"{self._result_key}_details": llm_output.get("details", {}), - f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0), - f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0), - f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0), - f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""), - f"{self._result_key}_model": prompty_output_dict.get("model_id", ""), - f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""), - f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""), - } - raise EvaluationException( - message="Evaluator returned invalid output.", - blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.FAILED_EXECUTION, - target=ExtendedErrorTarget.TASK_COMPLETION_EVALUATOR, + score_value = llm_output.get("score", 0) + if isinstance(score_value, str): + score = 1 if score_value.strip().lower() in ("1", "true") else 0 + elif isinstance(score_value, (int, float)): + score = 1 if score_value == 1 else 0 + else: + score = 1 if score_value else 0 + result = "pass" if score == 1 else "fail" + + return self._build_result( + score=score, + result=result, + reason=reason, + status=status, + properties=properties, + prompty_output_dict=prompty_output_dict, ) diff --git 
a/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty b/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty index e53ff1a9c4..050b43804a 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty +++ b/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty @@ -66,10 +66,10 @@ C. Assess Task Completion: - **Incomplete**: No usable deliverable or major requirements unmet D. Assign a Score: - - **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for. - - **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved. + - **1 (TRUE)**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for. + - **0 (FALSE)**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved. - **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (TRUE) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation. + **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. 
The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation. **Note on direct/factual queries:** When the user asks a straightforward factual, yes/no, or verification question (e.g., "What is the capital of France?"), a correct and direct answer fully completes the task. No additional elaboration, context, or "actionable information" beyond the accurate answer is required. @@ -77,19 +77,26 @@ OUTPUT FORMAT ============= Output a JSON object with these keys: { - "explanation": "<15-60 words explaining the completion status>", - "details": { + "reason": "<15-60 words explaining the completion status>", + "properties": { "task_requirements": "<15-60 words on what the user specifically requested>", "delivered_outcome": "<15-60 words on what the agent actually provided>", "completion_gaps": "<15-60 words on missing elements if task is incomplete>" }, - "success": + "score": <1 if task completed successfully, 0 if task failed or incomplete, null if status is skipped>, + "status": "<completed or skipped>" } +**Status: Skipped** +If the CONVERSATION_HISTORY or AGENT_RESPONSE is empty or not provided, return status "skipped" immediately without scoring: +```json +{"reason": "", "properties": null, "score": null, "status": "skipped"} +``` + SCORING EXAMPLES ================ -### SUCCESS: TRUE - Example A +### SCORE: 1 - Example A CONVERSATION_HISTORY: User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine. @@ -117,16 +124,17 @@ Transportation: Metro Day Pass recommended. 
Book Louvre tickets online in advanc EXPECTED OUTPUT: { - "explanation": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.", - "details": { + "reason": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.", + "properties": { "task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine", "delivered_outcome": "Detailed 3-day schedule with specific landmarks, restaurants, and practical tips", "completion_gaps": "None" }, - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: TRUE - Example B +### SCORE: 1 - Example B CONVERSATION_HISTORY: User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited. @@ -142,16 +150,17 @@ Your internet issues are resolved - update firmware and change DNS as instructed EXPECTED OUTPUT: { - "explanation": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.", - "details": { + "reason": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.", + "properties": { "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan", "delivered_outcome": "Diagnostic completed with specific fix instructions, data plan upgrade confirmed active", "completion_gaps": "None" }, - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: TRUE - Example C +### SCORE: 1 - Example C CONVERSATION_HISTORY: User: Which is better for a beginner, Python or JavaScript? @@ -173,16 +182,17 @@ If you're interested in data or automation, start with Python. If you want to bu EXPECTED OUTPUT: { - "explanation": "The question is subjective with no single correct answer. 
The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.", - "details": { + "reason": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.", + "properties": { "task_requirements": "Subjective comparison of Python vs JavaScript for beginners", "delivered_outcome": "Balanced pros/cons for each language with context-dependent recommendation", "completion_gaps": "None" }, - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: TRUE - Example D +### SCORE: 1 - Example D CONVERSATION_HISTORY: User: Is the boiling point of water 50°C? @@ -192,16 +202,17 @@ No, the boiling point of water is 100°C at standard atmospheric pressure, not 5 EXPECTED OUTPUT: { - "explanation": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.", - "details": { + "reason": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.", + "properties": { "task_requirements": "Verify whether the boiling point of water is 50°C", "delivered_outcome": "Correct answer provided: boiling point is 100°C, not 50°C", "completion_gaps": "None" }, - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: FALSE - Example A +### SCORE: 0 - Example A CONVERSATION_HISTORY: User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine. @@ -211,16 +222,17 @@ I'd recommend visiting the Louvre and Eiffel Tower. Paris has great food - try F EXPECTED OUTPUT: { - "explanation": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. 
Missing specific landmarks, restaurants, and detailed schedule.", - "details": { + "reason": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.", + "properties": { "task_requirements": "A detailed 3-day Paris itinerary with cultural landmarks and local cuisine", "delivered_outcome": "General recommendations without structured itinerary as asked", "completion_gaps": "No day-by-day schedule, no specific restaurant recommendations, no detailed cultural landmarks" }, - "success": FALSE + "score": 0, + "status": "completed" } -### SUCCESS: FALSE - Example B +### SCORE: 0 - Example B CONVERSATION_HISTORY: User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited. @@ -232,16 +244,17 @@ I found that your router firmware needs updating. You can usually do this throug EXPECTED OUTPUT: { - "explanation": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.", - "details": { + "reason": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.", + "properties": { "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan", "delivered_outcome": "Identified router firmware issue but no specific fix instructions, data upgrade not completed", "completion_gaps": "No specific firmware update steps, data plan upgrade not processed" }, - "success": FALSE + "score": 0, + "status": "completed" } -### SUCCESS: FALSE - Example C +### SCORE: 0 - Example C CONVERSATION_HISTORY: User: Analyze our Q3 sales data and generate a summary report. @@ -251,13 +264,14 @@ I can help you analyze sales data! 
Sales analysis is important for business grow EXPECTED OUTPUT: { - "explanation": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.", - "details": { + "reason": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.", + "properties": { "task_requirements": "Analyze Q3 sales data and generate summary report", "delivered_outcome": "General information about sales analysis concepts", "completion_gaps": "No data analysis performed, no summary report created" }, - "success": FALSE + "score": 0, + "status": "completed" } KEY PRINCIPLES @@ -273,4 +287,4 @@ KEY PRINCIPLES Remember: A task can be understood correctly and approached properly but still fail if the final outcome doesn't meet requirements. -# Output \ No newline at end of file +# Output diff --git a/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty b/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty index 17cffe161c..559a50bee3 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty +++ b/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty @@ -69,10 +69,10 @@ C. Assess Task Completion: - **Incomplete**: No usable deliverable or major requirements unmet D. Assign a Score: - - **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal by the end of the conversation. The user does not need to take further action or ask follow-up questions to get what they originally asked for. - - **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved by the end of the conversation. 
+ - **1 (TRUE)**: The agent delivered a complete and correct solution that accomplishes the user's entire goal by the end of the conversation. The user does not need to take further action or ask follow-up questions to get what they originally asked for. + - **0 (FALSE)**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved by the end of the conversation. - **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question, there is no single correct answer. The task is considered **complete** (TRUE) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs. + **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question, there is no single correct answer. The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs. **Note on direct/factual queries:** When the user asks a straightforward factual, yes/no, or verification question, a correct and direct answer fully completes the task. No additional elaboration is required. 
@@ -82,19 +82,26 @@ OUTPUT FORMAT ============= Output a JSON object with these keys: { - "details": { + "reason": "<15-60 words explaining the completion status>", + "properties": { "user_requests": ["", "", ...], "delivered_outcome": "<15-60 words summarizing what the agent accomplished>", "completion_gaps": "" }, - "explanation": "<15-60 words explaining the completion status>", - "success": + "score": <1 if task completed successfully, 0 if task failed or incomplete, null if status is skipped>, + "status": "<completed or skipped>" } +**Status: Skipped** +If the CONVERSATION is empty or not provided, or doesn't end with the agent response, return status "skipped" immediately without scoring: +```json +{"reason": "", "properties": null, "score": null, "status": "skipped"} +``` + SCORING EXAMPLES ================ -### SUCCESS: TRUE - Multi-turn task with refinement +### SCORE: 1 - Multi-turn task with refinement CONVERSATION: User turn 1: Book me a flight from NYC to London for next Friday. @@ -107,16 +114,17 @@ I found a Holiday Inn 0.3 miles from ExCeL London at $120/night for 3 nights. I' EXPECTED OUTPUT: { - "details": { + "reason": "Both tasks completed: flight booked as requested and hotel near conference venue secured after the user added that requirement in turn 2.", + "properties": { "user_requests": ["Book a flight from NYC to London for next Friday", "Find a hotel near ExCeL London for 3 nights"], "delivered_outcome": "Flight and hotel both booked with confirmations sent, total cost provided", "completion_gaps": "None" }, - "explanation": "Both tasks completed: flight booked as requested and hotel near conference venue secured after the user added that requirement in turn 2.", - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: FALSE - Incomplete multi-turn conversation +### SCORE: 0 - Incomplete multi-turn conversation CONVERSATION: User turn 1: I need you to analyze our Q3 sales data and create a report with charts. 
@@ -129,13 +137,14 @@ Agent turn 2: Revenue by region shows North America at 45%, Europe at 30%, and A EXPECTED OUTPUT: { - "details": { + "reason": "Data was analyzed with summary stats, but the user explicitly requested charts and a report. Only text descriptions were provided.", + "properties": { "user_requests": ["Analyze Q3 sales data", "Create a report with charts", "Revenue by region chart", "Monthly trend line chart"], "delivered_outcome": "Text-based summary of data analysis without actual charts or formatted report", "completion_gaps": "User requested charts but agent only provided text: 'Revenue by region shows North America at 45%...' — no visual charts or formatted report were generated." }, - "explanation": "Data was analyzed with summary stats, but the user explicitly requested charts and a report. Only text descriptions were provided.", - "success": FALSE + "score": 0, + "status": "completed" } KEY PRINCIPLES diff --git a/assets/evaluators/tests/common/base_evaluator_runner.py b/assets/evaluators/tests/common/base_evaluator_runner.py index 3ceb288d32..0d59900909 100644 --- a/assets/evaluators/tests/common/base_evaluator_runner.py +++ b/assets/evaluators/tests/common/base_evaluator_runner.py @@ -7,6 +7,7 @@ Provides common interfaces and assertion helpers shared by both prompty-based and code-based evaluators. """ +import copy from abc import ABC from typing import Any, Dict, List, Optional, Tuple, Type from unittest.mock import MagicMock @@ -106,6 +107,10 @@ def _run_evaluation_and_return_mocked_flow(self, **kwargs) -> Tuple[Dict[str, An evaluator = self._init_evaluator(**constructor_kwargs) + # Deep-copy call_kwargs to prevent evaluator preprocessing (e.g., + # _normalize_function_call_types) from mutating shared test data. 
+ call_kwargs = copy.deepcopy(call_kwargs) + # Mock the flow only for behavioral tests flow_mock = None if self.use_mocking: diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py index e7723befce..51fa4c4686 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py @@ -19,6 +19,8 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation """ # region Expected flow inputs for each test + # Fluency calls reformat_agent_response() which extracts text-only content + # from assistant messages, so expected inputs are the reformatted strings. test_function_tool_local_calls_expected_flow_inputs = { "response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE, } diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py index 5ff80fb47f..059c318a98 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py @@ -126,7 +126,30 @@ class TestTaskCompletionEvaluatorBehavior(BaseToolsEvaluatorBehaviorTest, BaseTo MINIMAL_RESPONSE = BaseToolsEvaluatorBehaviorTest.email_tool_call_and_assistant_response - _additional_expected_field_suffixes = ["details"] + _additional_expected_field_suffixes = ["status", "properties", "score", "passed"] + + @property + def expected_result_fields(self) -> List[str]: + """Get expected result fields — metadata now lives inside properties, not as top-level keys.""" + return [ + f"{self._result_prefix}", + f"{self._result_prefix}_reason", + f"{self._result_prefix}_threshold", + 
f"{self._result_prefix}_result", + ] + [f"{self._result_prefix}_{suffix}" for suffix in self._additional_expected_field_suffixes] + + def assert_not_applicable(self, result_data): + """Assert a not-applicable (not_applicable) result for TaskCompletionEvaluator. + + Task completion returns score=None and label='not_applicable' for intermediate/not-applicable + responses, unlike the base class which expects a passing score. + """ + assert result_data["label"] == "not_applicable", \ + f"Expected 'not_applicable' but got '{result_data['label']}'" + assert result_data["score"] is None, \ + f"Expected score to be None for not-applicable result, got '{result_data['score']}'" + assert "Not applicable" in result_data.get("reason", ""), \ + f"Expected reason to contain 'Not applicable' but got '{result_data.get('reason')}'" def _create_mocked_evaluator(): @@ -192,8 +215,11 @@ def test_messages_valid_input(self): assert "task_completion" in result assert "task_completion_result" in result assert "task_completion_reason" in result - assert "task_completion_details" in result + assert "task_completion_properties" in result + assert "task_completion_status" in result assert "task_completion_threshold" in result + assert "task_completion_score" in result + assert "task_completion_passed" in result assert result["task_completion"] in (0, 1) def test_messages_with_tool_definitions(self):