From 51e0c7b01d112c0d31598060661def5f81dbcb4f Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Sun, 19 Apr 2026 15:59:59 +0200 Subject: [PATCH 1/5] add changes to tc output schema --- .../evaluator/_task_completion.py | 122 ++++++++++++------ .../evaluator/task_completion.prompty | 83 +++++++----- .../task_completion_multi_turn.prompty | 36 ++++-- .../builtin/task_completion/spec.yaml | 2 +- .../tests/common/base_evaluator_runner.py | 5 + .../common_tool_test_data.py | 92 ++++++++++++- .../test_coherence_evaluator_behavior.py | 2 +- .../test_fluency_evaluator_behavior.py | 32 ++--- ...test_task_completion_evaluator_behavior.py | 16 ++- .../test_tool_selection_evaluator_behavior.py | 18 +++ 10 files changed, 295 insertions(+), 113 deletions(-) diff --git a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py index 27186900ee..f9c930953f 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py +++ b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py @@ -1205,25 +1205,60 @@ def _should_use_conversation_level(self, eval_input: Dict) -> bool: return False # Auto-detect (_evaluation_level is None) return eval_input.get("messages") is not None + + + def _build_result( + self, + score: Optional[int], + result: str, + reason: str, + status: str, + details: Dict, + prompty_output_dict: Optional[Dict] = None, + ) -> Dict[str, Union[str, int, float, Dict, None]]: + """Build a standardized result dictionary. + + :param score: The evaluation score (1, 0, or None). + :param result: The result label ("pass", "fail", "skipped", or "error"). + :param reason: The reasoning or explanation string. + :param status: The evaluation status ("completed", "skipped", or "error"). + :param details: The properties/details dictionary. + :param prompty_output_dict: Optional raw prompty output for extracting token metadata. + :return: The standardized result dictionary. + """ + p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {} + return { + self._result_key: score, + f"{self._result_key}_result": result, + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_reason": reason, + f"{self._result_key}_status": status, + f"{self._result_key}_details": details, + f"{self._result_key}_prompt_tokens": p.get("input_token_count", 0), + f"{self._result_key}_completion_tokens": p.get("output_token_count", 0), + f"{self._result_key}_total_tokens": p.get("total_token_count", 0), + f"{self._result_key}_finish_reason": p.get("finish_reason", ""), + f"{self._result_key}_model": p.get("model_id", ""), + f"{self._result_key}_sample_input": p.get("sample_input", ""), + f"{self._result_key}_sample_output": p.get("sample_output", ""), + } + def _not_applicable_result( self, error_message: str, threshold: Union[int, float] ) -> Dict[str, Union[str, float, Dict]]: - """Return a result indicating that the evaluation is not applicable.""" - return { - self._result_key: threshold, - f"{self._result_key}_result": "pass", - f"{self._result_key}_threshold": threshold, - f"{self._result_key}_reason": f"Not applicable: {error_message}", - f"{self._result_key}_details": {}, - f"{self._result_key}_prompt_tokens": 0, - f"{self._result_key}_completion_tokens": 0, - f"{self._result_key}_total_tokens": 0, - f"{self._result_key}_finish_reason": "", - f"{self._result_key}_model": "", - f"{self._result_key}_sample_input": "", - f"{self._result_key}_sample_output": "", - } + """Return a result indicating that the evaluation is not applicable (skipped). + + Not-applicable results have no score since the evaluator cannot make a judgment + (e.g., intermediate responses that are not final agent responses). + """ + return self._build_result( + score=None, + result="skipped", + reason=f"Not applicable: {error_message}", + status="skipped", + details={}, + ) @override async def _real_call(self, **kwargs): @@ -1322,6 +1357,7 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in """Parse the prompty output into a standardized result dictionary. Shared between single-turn and multi-turn evaluation paths. + Expects the canonical schema: score (int), reasoning (str), status (str), properties (dict|null). :param prompty_output_dict: Raw output from the prompty flow. :type prompty_output_dict: Dict @@ -1330,31 +1366,35 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in """ llm_output = prompty_output_dict.get("llm_output", prompty_output_dict) - if isinstance(llm_output, dict): - success_value = llm_output.get("success", False) - if isinstance(success_value, str): - success = 1 if success_value.lower() == "true" else 0 + if not isinstance(llm_output, dict): + score = None + result = "error" + reasoning = "Evaluator returned invalid output." + status = "error" + properties = {} + else: + status = llm_output.get("status", "completed") + reasoning = llm_output.get("reasoning", "") + properties = llm_output.get("properties") or {} + + if status == "skipped": + score = None + result = "skipped" else: - success = 1 if success_value else 0 - success_result = "pass" if success == 1 else "fail" - reason = llm_output.get("explanation", "") - return { - self._result_key: success, - f"{self._result_key}_result": success_result, - f"{self._result_key}_threshold": self._threshold, - f"{self._result_key}_reason": reason, - f"{self._result_key}_details": llm_output.get("details", {}), - f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0), - f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0), - f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0), - f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""), - f"{self._result_key}_model": prompty_output_dict.get("model_id", ""), - f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""), - f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""), - } - raise EvaluationException( - message="Evaluator returned invalid output.", - blame=ErrorBlame.SYSTEM_ERROR, - category=ErrorCategory.FAILED_EXECUTION, - target=ExtendedErrorTarget.TASK_COMPLETION_EVALUATOR, + score_value = llm_output.get("score", 0) + if isinstance(score_value, str): + score = 1 if score_value.strip() in ("1", "true") else 0 + elif isinstance(score_value, (int, float)): + score = 1 if score_value == 1 else 0 + else: + score = 1 if score_value else 0 + result = "pass" if score == 1 else "fail" + + return self._build_result( + score=score, + result=result, + reason=reasoning, + status=status, + details=properties, + prompty_output_dict=prompty_output_dict, ) diff --git a/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty b/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty index e53ff1a9c4..d8aceccd04 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty +++ b/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty @@ -66,10 +66,10 @@ C. Assess Task Completion: - **Incomplete**: No usable deliverable or major requirements unmet D. Assign a Score: - - **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for. - - **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved. + - **1**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for. + - **0**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved. - **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (TRUE) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation. + **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation. **Note on direct/factual queries:** When the user asks a straightforward factual, yes/no, or verification question (e.g., "What is the capital of France?"), a correct and direct answer fully completes the task. No additional elaboration, context, or "actionable information" beyond the accurate answer is required. @@ -77,19 +77,25 @@ OUTPUT FORMAT ============= Output a JSON object with these keys: { - "explanation": "<15-60 words explaining the completion status>", - "details": { + "reasoning": "<15-60 words explaining the completion status>", + "properties": { "task_requirements": "<15-60 words on what the user specifically requested>", "delivered_outcome": "<15-60 words on what the agent actually provided>", "completion_gaps": "<15-60 words on missing elements if task is incomplete>" }, - "success": + "score": <1 or 0>, + "status": "" } +- **reasoning**: A brief explanation of the completion status. +- **properties**: An object containing evaluation details. null when status is "skipped". +- **score**: 1 if the task was fully completed, 0 otherwise. null when status is "skipped". +- **status**: "completed" when you were able to evaluate the input. "skipped" when the input is not applicable or cannot be evaluated (e.g., no response provided, empty input). + SCORING EXAMPLES ================ -### SUCCESS: TRUE - Example A +### SCORE: 1 - Example A CONVERSATION_HISTORY: User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine. @@ -117,16 +123,17 @@ Transportation: Metro Day Pass recommended. Book Louvre tickets online in advanc EXPECTED OUTPUT: { - "explanation": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.", - "details": { + "reasoning": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.", + "properties": { "task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine", "delivered_outcome": "Detailed 3-day schedule with specific landmarks, restaurants, and practical tips", "completion_gaps": "None" }, - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: TRUE - Example B +### SCORE: 1 - Example B CONVERSATION_HISTORY: User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited. @@ -142,16 +149,17 @@ Your internet issues are resolved - update firmware and change DNS as instructed EXPECTED OUTPUT: { - "explanation": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.", - "details": { + "reasoning": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.", + "properties": { "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan", "delivered_outcome": "Diagnostic completed with specific fix instructions, data plan upgrade confirmed active", "completion_gaps": "None" }, - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: TRUE - Example C +### SCORE: 1 - Example C CONVERSATION_HISTORY: User: Which is better for a beginner, Python or JavaScript? @@ -173,16 +181,17 @@ If you're interested in data or automation, start with Python. If you want to bu EXPECTED OUTPUT: { - "explanation": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.", - "details": { + "reasoning": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.", + "properties": { "task_requirements": "Subjective comparison of Python vs JavaScript for beginners", "delivered_outcome": "Balanced pros/cons for each language with context-dependent recommendation", "completion_gaps": "None" }, - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: TRUE - Example D +### SCORE: 1 - Example D CONVERSATION_HISTORY: User: Is the boiling point of water 50°C? @@ -192,16 +201,17 @@ No, the boiling point of water is 100°C at standard atmospheric pressure, not 5 EXPECTED OUTPUT: { - "explanation": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.", - "details": { + "reasoning": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.", + "properties": { "task_requirements": "Verify whether the boiling point of water is 50°C", "delivered_outcome": "Correct answer provided: boiling point is 100°C, not 50°C", "completion_gaps": "None" }, - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: FALSE - Example A +### SCORE: 0 - Example A CONVERSATION_HISTORY: User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine. @@ -211,16 +221,17 @@ I'd recommend visiting the Louvre and Eiffel Tower. Paris has great food - try F EXPECTED OUTPUT: { - "explanation": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.", - "details": { + "reasoning": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.", + "properties": { "task_requirements": "A detailed 3-day Paris itinerary with cultural landmarks and local cuisine", "delivered_outcome": "General recommendations without structured itinerary as asked", "completion_gaps": "No day-by-day schedule, no specific restaurant recommendations, no detailed cultural landmarks" }, - "success": FALSE + "score": 0, + "status": "completed" } -### SUCCESS: FALSE - Example B +### SCORE: 0 - Example B CONVERSATION_HISTORY: User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited. @@ -232,16 +243,17 @@ I found that your router firmware needs updating. You can usually do this throug EXPECTED OUTPUT: { - "explanation": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.", - "details": { + "reasoning": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.", + "properties": { "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan", "delivered_outcome": "Identified router firmware issue but no specific fix instructions, data upgrade not completed", "completion_gaps": "No specific firmware update steps, data plan upgrade not processed" }, - "success": FALSE + "score": 0, + "status": "completed" } -### SUCCESS: FALSE - Example C +### SCORE: 0 - Example C CONVERSATION_HISTORY: User: Analyze our Q3 sales data and generate a summary report. @@ -251,13 +263,14 @@ I can help you analyze sales data! Sales analysis is important for business grow EXPECTED OUTPUT: { - "explanation": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.", - "details": { + "reasoning": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.", + "properties": { "task_requirements": "Analyze Q3 sales data and generate summary report", "delivered_outcome": "General information about sales analysis concepts", "completion_gaps": "No data analysis performed, no summary report created" }, - "success": FALSE + "score": 0, + "status": "completed" } KEY PRINCIPLES @@ -273,4 +286,4 @@ KEY PRINCIPLES Remember: A task can be understood correctly and approached properly but still fail if the final outcome doesn't meet requirements. -# Output \ No newline at end of file +# Output diff --git a/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty b/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty index 17cffe161c..54f022063c 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty +++ b/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty @@ -69,10 +69,10 @@ C. Assess Task Completion: - **Incomplete**: No usable deliverable or major requirements unmet D. Assign a Score: - - **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal by the end of the conversation. The user does not need to take further action or ask follow-up questions to get what they originally asked for. - - **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved by the end of the conversation. + - **1**: The agent delivered a complete and correct solution that accomplishes the user's entire goal by the end of the conversation. The user does not need to take further action or ask follow-up questions to get what they originally asked for. + - **0**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved by the end of the conversation. - **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question, there is no single correct answer. The task is considered **complete** (TRUE) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs. + **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question, there is no single correct answer. The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs. **Note on direct/factual queries:** When the user asks a straightforward factual, yes/no, or verification question, a correct and direct answer fully completes the task. No additional elaboration is required. @@ -82,19 +82,25 @@ OUTPUT FORMAT ============= Output a JSON object with these keys: { - "details": { + "reasoning": "<15-60 words explaining the completion status>", + "properties": { "user_requests": ["", "", ...], "delivered_outcome": "<15-60 words summarizing what the agent accomplished>", "completion_gaps": "" }, - "explanation": "<15-60 words explaining the completion status>", - "success": + "score": <1 or 0>, + "status": "" } +- **reasoning**: A brief explanation of the completion status. +- **properties**: An object containing evaluation details. null when status is "skipped". +- **score**: 1 if the task was fully completed, 0 otherwise. null when status is "skipped". +- **status**: "completed" when you were able to evaluate the input. "skipped" when the input is not applicable or cannot be evaluated (e.g., no response provided, empty conversation). + SCORING EXAMPLES ================ -### SUCCESS: TRUE - Multi-turn task with refinement +### SCORE: 1 - Multi-turn task with refinement CONVERSATION: User turn 1: Book me a flight from NYC to London for next Friday. @@ -107,16 +113,17 @@ I found a Holiday Inn 0.3 miles from ExCeL London at $120/night for 3 nights. I' EXPECTED OUTPUT: { - "details": { + "reasoning": "Both tasks completed: flight booked as requested and hotel near conference venue secured after the user added that requirement in turn 2.", + "properties": { "user_requests": ["Book a flight from NYC to London for next Friday", "Find a hotel near ExCeL London for 3 nights"], "delivered_outcome": "Flight and hotel both booked with confirmations sent, total cost provided", "completion_gaps": "None" }, - "explanation": "Both tasks completed: flight booked as requested and hotel near conference venue secured after the user added that requirement in turn 2.", - "success": TRUE + "score": 1, + "status": "completed" } -### SUCCESS: FALSE - Incomplete multi-turn conversation +### SCORE: 0 - Incomplete multi-turn conversation CONVERSATION: User turn 1: I need you to analyze our Q3 sales data and create a report with charts. @@ -129,13 +136,14 @@ Agent turn 2: Revenue by region shows North America at 45%, Europe at 30%, and A EXPECTED OUTPUT: { - "details": { + "reasoning": "Data was analyzed with summary stats, but the user explicitly requested charts and a report. Only text descriptions were provided.", + "properties": { "user_requests": ["Analyze Q3 sales data", "Create a report with charts", "Revenue by region chart", "Monthly trend line chart"], "delivered_outcome": "Text-based summary of data analysis without actual charts or formatted report", "completion_gaps": "User requested charts but agent only provided text: 'Revenue by region shows North America at 45%...' — no visual charts or formatted report were generated." }, - "explanation": "Data was analyzed with summary stats, but the user explicitly requested charts and a report. Only text descriptions were provided.", - "success": FALSE + "score": 0, + "status": "completed" } KEY PRINCIPLES diff --git a/assets/evaluators/builtin/task_completion/spec.yaml b/assets/evaluators/builtin/task_completion/spec.yaml index 90652d653f..5c248b9d57 100644 --- a/assets/evaluators/builtin/task_completion/spec.yaml +++ b/assets/evaluators/builtin/task_completion/spec.yaml @@ -1,6 +1,6 @@ type: "evaluator" name: "builtin.task_completion" -version: 9 +version: 10 displayName: "Task-Completion-Evaluator-(Preview)" description: "Evaluates whether an AI agent successfully completed the requested task end to end by analyzing the conversation history and agent response to determine if all task requirements were met, ignoring rule adherence or intent understanding. This evaluator is useful for assessing agent effectiveness in task-oriented scenarios, workflow automation, and goal-oriented AI interactions." evaluatorType: "builtin" diff --git a/assets/evaluators/tests/common/base_evaluator_runner.py b/assets/evaluators/tests/common/base_evaluator_runner.py index d084afbdae..0f02cf5381 100644 --- a/assets/evaluators/tests/common/base_evaluator_runner.py +++ b/assets/evaluators/tests/common/base_evaluator_runner.py @@ -7,6 +7,7 @@ Provides common interfaces and assertion helpers shared by both prompty-based and code-based evaluators. """ +import copy from abc import ABC from typing import Any, Dict, List, Optional, Tuple, Type from unittest.mock import MagicMock @@ -106,6 +107,10 @@ def _run_evaluation_and_return_mocked_flow(self, **kwargs) -> Tuple[Dict[str, An evaluator = self._init_evaluator(**constructor_kwargs) + # Deep-copy call_kwargs to prevent evaluator preprocessing (e.g., + # _normalize_function_call_types) from mutating shared test data. + call_kwargs = copy.deepcopy(call_kwargs) + # Mock the flow only for behavioral tests flow_mock = None if self.use_mocking: diff --git a/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py b/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py index cdcb98be7a..33545a78a3 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py +++ b/assets/evaluators/tests/test_evaluators_behavior/common_tool_test_data.py @@ -2044,8 +2044,46 @@ ) # ----- TCS expected flow response ----- -# For LOCAL_CALLS, FILE_SEARCH, IMAGE_GEN, MEMORY_SEARCH: _preprocess_messages is a no-op. -LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = LOCAL_CALLS_RESPONSE +# _preprocess_messages normalizes function_call/openapi_call types to tool_call/tool_result. +# LOCAL_CALLS_RESPONSE contains function_call types, so we provide the normalized version. +LOCAL_CALLS_TCS_EXPECTED_FLOW_RESPONSE = [ + { + "run_id": "", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX", + "name": "get_horoscope", + "arguments": {"sign": "Aquarius"}, + } + ], + }, + { + "run_id": "", + "tool_call_id": "call_ASUI6ResxjPRW7JDubafRBQX", + "role": "tool", + "content": [ + { + "type": "tool_result", + "tool_result": { + "horoscope": "Aquarius: Next Tuesday you will befriend a baby otter.", + }, + } + ], + }, + { + "role": "assistant", + "content": [ + { + "annotations": [], + "text": "Your horoscope for Aquarius is: Next Tuesday you will befriend a baby otter.", + "type": "output_text", + "logprobs": [], + } + ], + }, +] FILE_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = FILE_SEARCH_RESPONSE IMAGE_GEN_TCS_EXPECTED_FLOW_RESPONSE = IMAGE_GEN_RESPONSE MEMORY_SEARCH_TCS_EXPECTED_FLOW_RESPONSE = MEMORY_SEARCH_RESPONSE @@ -2053,6 +2091,53 @@ KB_MCP_TCS_EXPECTED_FLOW_RESPONSE = KB_MCP_RESPONSE[2:] MCP_TCS_EXPECTED_FLOW_RESPONSE = MCP_RESPONSE[2:] +# ----- Normalized OPENAPI response (openapi_call/openapi_call_output → tool_call/tool_result) ----- +# Used by evaluators that normalize types before calling the flow (e.g., coherence). +OPENAPI_NORMALIZED_RESPONSE = [ + { + "run_id": "", + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab", + "name": "weather_GetCurrentWeather", + "arguments": {"location": "Cairo", "format": "j1"}, + } + ], + }, + { + "run_id": "", + "tool_call_id": "call_cfb55a91a58c44ea9217b34174aad2ab", + "role": "tool", + "content": [{"type": "tool_result", "tool_result": ""}], + }, + { + "role": "assistant", + "content": [ + { + "annotations": [], + "text": ( + "**Current weather in Cairo:**\n\n- **Temperature:** 26°C (feels like 25°C)\n" + "- **Condition:** Sand (likely some dusty or sandy winds)\n" + "- **Humidity:** 28%\n" + "- **Cloud Cover:** 0% (clear skies)\n" + "- **Wind:** SW at 23 km/h\n" + "- **Visibility:** Moderate (4 km)\n" + "- **No precipitation**\n" + "- **UV Index:** 2\n\n" + "**Summary:** Cairo is experiencing warm, dry, and sunny weather, but there is " + "sand or dust in the air which may reduce visibility. Skies are clear and " + "it\u2019s breezy. Make sure to protect yourself from the dust if you\u2019re " + "heading outside!" + ), + "type": "output_text", + "logprobs": [], + } + ], + }, +] + # ============================================================================= # Expected flow inputs shared across multiple evaluators @@ -2734,9 +2819,6 @@ "arguments": { "sign": "Aquarius", }, - "tool_result": { - "horoscope": "Aquarius: Next Tuesday you will befriend a baby otter.", - }, }, ], }, diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py index bbf5cddd3a..fe8eb02ae9 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_coherence_evaluator_behavior.py @@ -61,7 +61,7 @@ class TestCoherenceEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluati test_openapi_expected_flow_inputs = { "query": data.OPENAPI_QUERY, - "response": data.OPENAPI_RESPONSE, + "response": data.OPENAPI_NORMALIZED_RESPONSE, } test_web_search_expected_flow_inputs = { diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py index 1cf9064743..51fa4c4686 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py @@ -19,64 +19,66 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation """ # region Expected flow inputs for each test + # Fluency calls reformat_agent_response() which extracts text-only content + # from assistant messages, so expected inputs are the reformatted strings. test_function_tool_local_calls_expected_flow_inputs = { - "response": data.LOCAL_CALLS_COHERENCE_EXPECTED_FLOW_RESPONSE, + "response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE, } test_code_interpreter_expected_flow_inputs = { - "response": data.CODE_INTERPRETER_RESPONSE, + "response": data.CODE_INTERPRETER_IR_EXPECTED_FLOW_RESPONSE, } test_bing_grounding_expected_flow_inputs = { - "response": data.BING_GROUNDING_RESPONSE, + "response": data.BING_GROUNDING_IR_EXPECTED_FLOW_RESPONSE, } test_bing_custom_search_expected_flow_inputs = { - "response": data.BING_CUSTOM_SEARCH_RESPONSE, + "response": data.BING_CUSTOM_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_file_search_expected_flow_inputs = { - "response": data.FILE_SEARCH_RESPONSE, + "response": data.FILE_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_azure_ai_search_expected_flow_inputs = { - "response": data.AZURE_AI_SEARCH_RESPONSE, + "response": data.AZURE_AI_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_sharepoint_grounding_expected_flow_inputs = { - "response": data.SHAREPOINT_RESPONSE, + "response": data.SHAREPOINT_IR_EXPECTED_FLOW_RESPONSE, } test_fabric_data_agent_expected_flow_inputs = { - "response": data.FABRIC_RESPONSE, + "response": data.FABRIC_IR_EXPECTED_FLOW_RESPONSE, } test_openapi_expected_flow_inputs = { - "response": data.OPENAPI_RESPONSE, + "response": data.OPENAPI_IR_EXPECTED_FLOW_RESPONSE, } test_web_search_expected_flow_inputs = { - "response": data.WEB_SEARCH_RESPONSE, + "response": data.WEB_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_browser_automation_expected_flow_inputs = { - "response": data.BROWSER_AUTOMATION_RESPONSE, + "response": data.BROWSER_AUTOMATION_IR_EXPECTED_FLOW_RESPONSE, } test_image_generation_expected_flow_inputs = { - "response": data.IMAGE_GEN_RESPONSE, + "response": data.IMAGE_GEN_IR_EXPECTED_FLOW_RESPONSE, } test_memory_search_expected_flow_inputs = { - "response": data.MEMORY_SEARCH_RESPONSE, + "response": data.MEMORY_SEARCH_IR_EXPECTED_FLOW_RESPONSE, } test_kb_mcp_expected_flow_inputs = { - "response": data.KB_MCP_TCS_EXPECTED_FLOW_RESPONSE, + "response": data.KB_MCP_IR_EXPECTED_FLOW_RESPONSE, } test_mcp_expected_flow_inputs = { - "response": data.MCP_TCS_EXPECTED_FLOW_RESPONSE, + "response": data.MCP_IR_EXPECTED_FLOW_RESPONSE, } # endregion diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py index fbabb754f7..3e07a7a456 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py @@ -126,7 +126,20 @@ class TestTaskCompletionEvaluatorBehavior(BaseToolsEvaluatorBehaviorTest, BaseTo MINIMAL_RESPONSE = BaseToolsEvaluatorBehaviorTest.email_tool_call_and_assistant_response - _additional_expected_field_suffixes = ["details"] + _additional_expected_field_suffixes = ["details", "status"] + + def assert_not_applicable(self, result_data): + """Assert a not-applicable (skipped) result for TaskCompletionEvaluator. + + Task completion returns score=None and label='skipped' for intermediate/not-applicable + responses, unlike the base class which expects a passing score. + """ + assert result_data["label"] == "skipped", \ + f"Expected 'skipped' but got '{result_data['label']}'" + assert result_data["score"] is None, \ + f"Expected score to be None for not-applicable result, got '{result_data['score']}'" + assert "Not applicable" in result_data.get("reason", ""), \ + f"Expected reason to contain 'Not applicable' but got '{result_data.get('reason')}'" def _create_mocked_evaluator(): @@ -193,6 +206,7 @@ def test_messages_valid_input(self): assert "task_completion_result" in result assert "task_completion_reason" in result assert "task_completion_details" in result + assert "task_completion_status" in result assert "task_completion_threshold" in result assert result["task_completion"] in (0, 1) diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py index 0e392a35a2..33417f015b 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py @@ -116,3 +116,21 @@ class TestToolSelectionEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, Base is_tool_definition_required = True evaluator_type = ToolSelectionEvaluator + + def test_openapi(self): + """OpenAPI with empty functions list - evaluator returns pass with not-applicable reason. + + The evaluator can't find matching tool definitions for openapi_call when functions + list is empty, so it returns score=1/pass but with a "Not applicable" reason. + The flow is never called. + """ + results, flow_mock = self._run_evaluation_and_return_mocked_flow( + query=data.OPENAPI_QUERY, + response=data.OPENAPI_RESPONSE, + tool_definitions=data.OPENAPI_TOOL_DEFINITIONS, + ) + result_data = self._extract_and_print_result(results, "OpenAPI") + assert result_data["label"] == "pass" + assert result_data["score"] == 1 + assert "Not applicable" in result_data["reason"] + flow_mock.assert_not_called() From 0b81bf4ae05d0779d2092b9e4ad43d4d5bc3ab7b Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Sun, 19 Apr 2026 16:09:15 +0200 Subject: [PATCH 2/5] code health --- .../evaluator/_task_completion.py | 2 -- .../test_tool_selection_evaluator_behavior.py | 18 ------------------ 2 files changed, 20 deletions(-) diff --git a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py index f9c930953f..5a9190bcbe 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py +++ b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py @@ -1205,7 +1205,6 @@ def _should_use_conversation_level(self, eval_input: Dict) -> bool: return False # Auto-detect (_evaluation_level is None) return eval_input.get("messages") is not None - def _build_result( self, @@ -1243,7 +1242,6 @@ def _build_result( f"{self._result_key}_sample_output": p.get("sample_output", ""), } - def _not_applicable_result( self, error_message: str, threshold: Union[int, float] ) -> Dict[str, Union[str, float, Dict]]: diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py index 33417f015b..0e392a35a2 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_tool_selection_evaluator_behavior.py @@ -116,21 +116,3 @@ class TestToolSelectionEvaluatorBehavior(BaseToolCallEvaluatorBehaviorTest, Base is_tool_definition_required = True evaluator_type = ToolSelectionEvaluator - - def test_openapi(self): - """OpenAPI with empty functions list - evaluator returns pass with not-applicable reason. - - The evaluator can't find matching tool definitions for openapi_call when functions - list is empty, so it returns score=1/pass but with a "Not applicable" reason. - The flow is never called. - """ - results, flow_mock = self._run_evaluation_and_return_mocked_flow( - query=data.OPENAPI_QUERY, - response=data.OPENAPI_RESPONSE, - tool_definitions=data.OPENAPI_TOOL_DEFINITIONS, - ) - result_data = self._extract_and_print_result(results, "OpenAPI") - assert result_data["label"] == "pass" - assert result_data["score"] == 1 - assert "Not applicable" in result_data["reason"] - flow_mock.assert_not_called() From b75fd89505707577e8ea672ef5e92bd0cdae94eb Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Sun, 19 Apr 2026 18:22:50 +0200 Subject: [PATCH 3/5] update prompty --- .../evaluator/_task_completion.py | 8 ++--- .../evaluator/task_completion.prompty | 31 ++++++++++--------- .../task_completion_multi_turn.prompty | 21 +++++++------ 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py index 5a9190bcbe..73c2b362ed 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py +++ b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py @@ -1355,7 +1355,7 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in """Parse the prompty output into a standardized result dictionary. Shared between single-turn and multi-turn evaluation paths. - Expects the canonical schema: score (int), reasoning (str), status (str), properties (dict|null). + Expects the canonical schema: score (int), reason (str), status (str), properties (dict|null). :param prompty_output_dict: Raw output from the prompty flow. :type prompty_output_dict: Dict @@ -1367,12 +1367,12 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in if not isinstance(llm_output, dict): score = None result = "error" - reasoning = "Evaluator returned invalid output." + reason = "Evaluator returned invalid output." status = "error" properties = {} else: status = llm_output.get("status", "completed") - reasoning = llm_output.get("reasoning", "") + reason = llm_output.get("reason", "") properties = llm_output.get("properties") or {} if status == "skipped": @@ -1391,7 +1391,7 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in return self._build_result( score=score, result=result, - reason=reasoning, + reason=reason, status=status, details=properties, prompty_output_dict=prompty_output_dict, diff --git a/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty b/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty index d8aceccd04..050b43804a 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty +++ b/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty @@ -66,8 +66,8 @@ C. Assess Task Completion: - **Incomplete**: No usable deliverable or major requirements unmet D. Assign a Score: - - **1**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for. - - **0**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved. + - **1 (TRUE)**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for. + - **0 (FALSE)**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved. **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation. @@ -77,20 +77,21 @@ OUTPUT FORMAT ============= Output a JSON object with these keys: { - "reasoning": "<15-60 words explaining the completion status>", + "reason": "<15-60 words explaining the completion status>", "properties": { "task_requirements": "<15-60 words on what the user specifically requested>", "delivered_outcome": "<15-60 words on what the agent actually provided>", "completion_gaps": "<15-60 words on missing elements if task is incomplete>" }, - "score": <1 or 0>, + "score": <1 if task completed successfully, 0 if task failed or incomplete, null if status is skipped>, "status": "" } -- **reasoning**: A brief explanation of the completion status. -- **properties**: An object containing evaluation details. null when status is "skipped". -- **score**: 1 if the task was fully completed, 0 otherwise. null when status is "skipped". -- **status**: "completed" when you were able to evaluate the input. "skipped" when the input is not applicable or cannot be evaluated (e.g., no response provided, empty input). +**Status: Skipped** +If the CONVERSATION_HISTORY or AGENT_RESPONSE is empty or not provided, return status "skipped" immediately without scoring: +```json +{"reason": "", "properties": null, "score": null, "status": "skipped"} +``` SCORING EXAMPLES ================ @@ -123,7 +124,7 @@ Transportation: Metro Day Pass recommended. Book Louvre tickets online in advanc EXPECTED OUTPUT: { - "reasoning": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.", + "reason": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.", "properties": { "task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine", "delivered_outcome": "Detailed 3-day schedule with specific landmarks, restaurants, and practical tips", @@ -149,7 +150,7 @@ Your internet issues are resolved - update firmware and change DNS as instructed EXPECTED OUTPUT: { - "reasoning": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.", + "reason": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.", "properties": { "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan", "delivered_outcome": "Diagnostic completed with specific fix instructions, data plan upgrade confirmed active", @@ -181,7 +182,7 @@ If you're interested in data or automation, start with Python. If you want to bu EXPECTED OUTPUT: { - "reasoning": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.", + "reason": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.", "properties": { "task_requirements": "Subjective comparison of Python vs JavaScript for beginners", "delivered_outcome": "Balanced pros/cons for each language with context-dependent recommendation", @@ -201,7 +202,7 @@ No, the boiling point of water is 100°C at standard atmospheric pressure, not 5 EXPECTED OUTPUT: { - "reasoning": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.", + "reason": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.", "properties": { "task_requirements": "Verify whether the boiling point of water is 50°C", "delivered_outcome": "Correct answer provided: boiling point is 100°C, not 50°C", @@ -221,7 +222,7 @@ I'd recommend visiting the Louvre and Eiffel Tower. Paris has great food - try F EXPECTED OUTPUT: { - "reasoning": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.", + "reason": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.", "properties": { "task_requirements": "A detailed 3-day Paris itinerary with cultural landmarks and local cuisine", "delivered_outcome": "General recommendations without structured itinerary as asked", @@ -243,7 +244,7 @@ I found that your router firmware needs updating. You can usually do this throug EXPECTED OUTPUT: { - "reasoning": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.", + "reason": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.", "properties": { "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan", "delivered_outcome": "Identified router firmware issue but no specific fix instructions, data upgrade not completed", @@ -263,7 +264,7 @@ I can help you analyze sales data! Sales analysis is important for business grow EXPECTED OUTPUT: { - "reasoning": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.", + "reason": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.", "properties": { "task_requirements": "Analyze Q3 sales data and generate summary report", "delivered_outcome": "General information about sales analysis concepts", diff --git a/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty b/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty index 54f022063c..559a50bee3 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty +++ b/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty @@ -69,8 +69,8 @@ C. Assess Task Completion: - **Incomplete**: No usable deliverable or major requirements unmet D. Assign a Score: - - **1**: The agent delivered a complete and correct solution that accomplishes the user's entire goal by the end of the conversation. The user does not need to take further action or ask follow-up questions to get what they originally asked for. - - **0**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved by the end of the conversation. + - **1 (TRUE)**: The agent delivered a complete and correct solution that accomplishes the user's entire goal by the end of the conversation. The user does not need to take further action or ask follow-up questions to get what they originally asked for. + - **0 (FALSE)**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved by the end of the conversation. **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question, there is no single correct answer. The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs. @@ -82,20 +82,21 @@ OUTPUT FORMAT ============= Output a JSON object with these keys: { - "reasoning": "<15-60 words explaining the completion status>", + "reason": "<15-60 words explaining the completion status>", "properties": { "user_requests": ["", "", ...], "delivered_outcome": "<15-60 words summarizing what the agent accomplished>", "completion_gaps": "" }, - "score": <1 or 0>, + "score": <1 if task completed successfully, 0 if task failed or incomplete, null if status is skipped>, "status": "" } -- **reasoning**: A brief explanation of the completion status. -- **properties**: An object containing evaluation details. null when status is "skipped". -- **score**: 1 if the task was fully completed, 0 otherwise. null when status is "skipped". -- **status**: "completed" when you were able to evaluate the input. "skipped" when the input is not applicable or cannot be evaluated (e.g., no response provided, empty conversation). +**Status: Skipped** +If the CONVERSATION is empty or not provided, or doesn't end with the agent response, return status "skipped" immediately without scoring: +```json +{"reason": "", "properties": null, "score": null, "status": "skipped"} +``` SCORING EXAMPLES ================ @@ -113,7 +114,7 @@ I found a Holiday Inn 0.3 miles from ExCeL London at $120/night for 3 nights. I' EXPECTED OUTPUT: { - "reasoning": "Both tasks completed: flight booked as requested and hotel near conference venue secured after the user added that requirement in turn 2.", + "reason": "Both tasks completed: flight booked as requested and hotel near conference venue secured after the user added that requirement in turn 2.", "properties": { "user_requests": ["Book a flight from NYC to London for next Friday", "Find a hotel near ExCeL London for 3 nights"], "delivered_outcome": "Flight and hotel both booked with confirmations sent, total cost provided", @@ -136,7 +137,7 @@ Agent turn 2: Revenue by region shows North America at 45%, Europe at 30%, and A EXPECTED OUTPUT: { - "reasoning": "Data was analyzed with summary stats, but the user explicitly requested charts and a report. Only text descriptions were provided.", + "reason": "Data was analyzed with summary stats, but the user explicitly requested charts and a report. Only text descriptions were provided.", "properties": { "user_requests": ["Analyze Q3 sales data", "Create a report with charts", "Revenue by region chart", "Monthly trend line chart"], "delivered_outcome": "Text-based summary of data analysis without actual charts or formatted report", From e50289742c46b675ff59faec0ff8b503f0a1d913 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Tue, 21 Apr 2026 13:18:04 +0200 Subject: [PATCH 4/5] rename details to properties --- .../task_completion/evaluator/_task_completion.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py index 73c2b362ed..30ce938c3b 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py +++ b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py @@ -1212,7 +1212,7 @@ def _build_result( result: str, reason: str, status: str, - details: Dict, + properties: Dict, prompty_output_dict: Optional[Dict] = None, ) -> Dict[str, Union[str, int, float, Dict, None]]: """Build a standardized result dictionary. @@ -1221,18 +1221,19 @@ def _build_result( :param result: The result label ("pass", "fail", "skipped", or "error"). :param reason: The reasoning or explanation string. :param status: The evaluation status ("completed", "skipped", or "error"). - :param details: The properties/details dictionary. + :param properties: The properties dictionary. :param prompty_output_dict: Optional raw prompty output for extracting token metadata. :return: The standardized result dictionary. """ p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {} return { self._result_key: score, + f"{self._result_key}_score": score, f"{self._result_key}_result": result, f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_reason": reason, f"{self._result_key}_status": status, - f"{self._result_key}_details": details, + f"{self._result_key}_properties": properties, f"{self._result_key}_prompt_tokens": p.get("input_token_count", 0), f"{self._result_key}_completion_tokens": p.get("output_token_count", 0), f"{self._result_key}_total_tokens": p.get("total_token_count", 0), @@ -1255,7 +1256,7 @@ def _not_applicable_result( result="skipped", reason=f"Not applicable: {error_message}", status="skipped", - details={}, + properties={}, ) @override @@ -1393,6 +1394,6 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in result=result, reason=reason, status=status, - details=properties, + properties=properties, prompty_output_dict=prompty_output_dict, ) From 3c6694053e88a723f04401a538f4e9dd0ceed686 Mon Sep 17 00:00:00 2001 From: salma-elshafey Date: Thu, 23 Apr 2026 14:49:54 +0200 Subject: [PATCH 5/5] fix tests --- .../builtin/task_completion/evaluator/_task_completion.py | 5 +++-- .../test_task_completion_evaluator_behavior.py | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py index 21c3ac6386..a9269d85f4 100644 --- a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py +++ b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py @@ -1233,6 +1233,7 @@ def _build_result( f"{self._result_key}_threshold": self._threshold, f"{self._result_key}_reason": reason, f"{self._result_key}_status": status, + f"{self._result_key}_details": properties, f"{self._result_key}_properties": properties, f"{self._result_key}_prompt_tokens": p.get("input_token_count", 0), f"{self._result_key}_completion_tokens": p.get("output_token_count", 0), @@ -1253,7 +1254,7 @@ def _not_applicable_result( """ return self._build_result( score=None, - result="skipped", + result="not_applicable", reason=f"Not applicable: {error_message}", status="skipped", properties={}, @@ -1378,7 +1379,7 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in if status == "skipped": score = None - result = "skipped" + result = "not_applicable" else: score_value = llm_output.get("score", 0) if isinstance(score_value, str): diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py index 55f903896d..9e27e5c888 100644 --- a/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py +++ b/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py @@ -129,13 +129,13 @@ class TestTaskCompletionEvaluatorBehavior(BaseToolsEvaluatorBehaviorTest, BaseTo _additional_expected_field_suffixes = ["details", "status"] def assert_not_applicable(self, result_data): - """Assert a not-applicable (skipped) result for TaskCompletionEvaluator. + """Assert a not-applicable (not_applicable) result for TaskCompletionEvaluator. - Task completion returns score=None and label='skipped' for intermediate/not-applicable + Task completion returns score=None and label='not_applicable' for intermediate/not-applicable responses, unlike the base class which expects a passing score. """ - assert result_data["label"] == "skipped", \ - f"Expected 'skipped' but got '{result_data['label']}'" + assert result_data["label"] == "not_applicable", \ + f"Expected 'not_applicable' but got '{result_data['label']}'" assert result_data["score"] is None, \ f"Expected score to be None for not-applicable result, got '{result_data['score']}'" assert "Not applicable" in result_data.get("reason", ""), \