diff --git a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py
index d0599ddea6..5e5a18dd22 100644
--- a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py
+++ b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py
@@ -1210,24 +1210,61 @@ def _should_use_conversation_level(self, eval_input: Dict) -> bool:
         # Auto-detect (_evaluation_level is None)
         return eval_input.get("messages") is not None
 
+    def _build_result(
+        self,
+        score: Optional[int],
+        result: str,
+        reason: str,
+        status: str,
+        properties: Dict,
+        prompty_output_dict: Optional[Dict] = None,
+    ) -> Dict[str, Union[str, int, float, Dict, None]]:
+        """Build a standardized result dictionary.
+
+        :param score: The evaluation score (1, 0, or None).
+        :param result: The result label ("pass", "fail", "not_applicable", or "error").
+        :param reason: The reasoning or explanation string.
+        :param status: The evaluation status ("completed", "skipped", or "error").
+        :param properties: The properties dictionary; a non-dict value is treated as empty.
+        :param prompty_output_dict: Optional raw prompty output for extracting token metadata.
+        :return: The standardized result dictionary; metadata keys override same-named properties.
+        """
+        p = prompty_output_dict if isinstance(prompty_output_dict, dict) else {}
+        metadata = {
+            "prompt_tokens": p.get("input_token_count", 0),
+            "completion_tokens": p.get("output_token_count", 0),
+            "total_tokens": p.get("total_token_count", 0),
+            "finish_reason": p.get("finish_reason", ""),
+            "model": p.get("model_id", ""),
+            "sample_input": p.get("sample_input", ""),
+            "sample_output": p.get("sample_output", ""),
+        }
+        return {
+            self._result_key: score,
+            f"{self._result_key}_score": score,
+            f"{self._result_key}_result": result,
+            f"{self._result_key}_passed": (result == "pass") if result in ("pass", "fail") else None,
+            f"{self._result_key}_threshold": self._threshold,
+            f"{self._result_key}_reason": reason,
+            f"{self._result_key}_status": status,
+            f"{self._result_key}_properties": {**(properties if isinstance(properties, dict) else {}), **metadata},
+        }
+
     def _not_applicable_result(
         self, error_message: str, threshold: Union[int, float]
-    ) -> Dict[str, Union[str, float, Dict]]:
+    ) -> Dict[str, Union[str, int, float, Dict, None]]:
-        """Return a result indicating that the evaluation is not applicable."""
-        return {
-            self._result_key: threshold,
-            f"{self._result_key}_result": "pass",
-            f"{self._result_key}_threshold": threshold,
-            f"{self._result_key}_reason": f"Not applicable: {error_message}",
-            f"{self._result_key}_details": {},
-            f"{self._result_key}_prompt_tokens": 0,
-            f"{self._result_key}_completion_tokens": 0,
-            f"{self._result_key}_total_tokens": 0,
-            f"{self._result_key}_finish_reason": "",
-            f"{self._result_key}_model": "",
-            f"{self._result_key}_sample_input": "",
-            f"{self._result_key}_sample_output": "",
-        }
+        """Return a skipped result for input the evaluator cannot judge.
+
+        Score and pass flag are None (e.g., an intermediate, non-final agent response). ``threshold``
+        is kept for call compatibility only; the reported threshold is always ``self._threshold``.
+        """
+        return self._build_result(
+            score=None,
+            result="not_applicable",
+            reason=f"Not applicable: {error_message}",
+            status="skipped",
+            properties={},
+        )
 
     @override
     async def _real_call(self, **kwargs):
@@ -1326,6 +1363,7 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in
         """Parse the prompty output into a standardized result dictionary.
 
         Shared between single-turn and multi-turn evaluation paths.
+        Expects the canonical schema: score (int), reason (str), status (str), properties (dict|null).
 
         :param prompty_output_dict: Raw output from the prompty flow.
         :type prompty_output_dict: Dict
@@ -1334,31 +1372,35 @@ def _parse_prompty_output(self, prompty_output_dict: Dict) -> Dict[str, Union[in
         """
         llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 
-        if isinstance(llm_output, dict):
-            success_value = llm_output.get("success", False)
-            if isinstance(success_value, str):
-                success = 1 if success_value.lower() == "true" else 0
+        if not isinstance(llm_output, dict):
+            score = None
+            result = "error"
+            reason = "Evaluator returned invalid output."
+            status = "error"
+            properties = {}
+        else:
+            status = llm_output.get("status") or "completed"  # an explicit JSON null also means completed
+            reason = llm_output.get("reason") or ""  # never propagate a null reason
+            properties = llm_output.get("properties") or {}
+
+            if status == "skipped":
+                score = None
+                result = "not_applicable"
             else:
-                success = 1 if success_value else 0
-            success_result = "pass" if success == 1 else "fail"
-            reason = llm_output.get("explanation", "")
-            return {
-                self._result_key: success,
-                f"{self._result_key}_result": success_result,
-                f"{self._result_key}_threshold": self._threshold,
-                f"{self._result_key}_reason": reason,
-                f"{self._result_key}_details": llm_output.get("details", {}),
-                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
-                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
-                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
-                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
-                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
-                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
-                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
-            }
-        raise EvaluationException(
-            message="Evaluator returned invalid output.",
-            blame=ErrorBlame.SYSTEM_ERROR,
-            category=ErrorCategory.FAILED_EXECUTION,
-            target=ExtendedErrorTarget.TASK_COMPLETION_EVALUATOR,
+                score_value = llm_output.get("score", 0)
+                if isinstance(score_value, str):
+                    score = 1 if score_value.strip().lower() in ("1", "true") else 0  # accept "TRUE"/"True" too
+                elif isinstance(score_value, (int, float)):
+                    score = 1 if score_value == 1 else 0
+                else:
+                    score = 1 if score_value else 0
+                result = "pass" if score == 1 else "fail"
+
+        return self._build_result(
+            score=score,
+            result=result,
+            reason=reason,
+            status=status,
+            properties=properties,
+            prompty_output_dict=prompty_output_dict,
         )
diff --git a/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty b/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty
index e53ff1a9c4..050b43804a 100644
--- a/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty
+++ b/assets/evaluators/builtin/task_completion/evaluator/task_completion.prompty
@@ -66,10 +66,10 @@ C. Assess Task Completion:
 - **Incomplete**: No usable deliverable or major requirements unmet
 
 D. Assign a Score:
-   - **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
-   - **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved.
+   - **1 (TRUE)**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
+   - **0 (FALSE)**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved.
 
-   **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (TRUE) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation.
+   **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question (e.g., "Which is better, X or Y?", "What do you think about…?"), there is no single correct answer. The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs — even if it does not give a single definitive recommendation.
 
    **Note on direct/factual queries:** When the user asks a straightforward factual, yes/no, or verification question (e.g., "What is the capital of France?"), a correct and direct answer fully completes the task. No additional elaboration, context, or "actionable information" beyond the accurate answer is required.
 
@@ -77,19 +77,26 @@ OUTPUT FORMAT
 =============
 Output a JSON object with these keys:
 {
-  "explanation": "<15-60 words explaining the completion status>",
-  "details": {
+  "reason": "<15-60 words explaining the completion status>",
+  "properties": {
     "task_requirements": "<15-60 words on what the user specifically requested>",
     "delivered_outcome": "<15-60 words on what the agent actually provided>",
     "completion_gaps": "<15-60 words on missing elements if task is incomplete>"
   },
-  "success": <TRUE or FALSE>
+  "score": <1 if task completed successfully, 0 if task failed or incomplete, null if status is skipped>,
+  "status": "<completed or skipped>"
 }
 
+**Status: Skipped**
+If the CONVERSATION_HISTORY or AGENT_RESPONSE is empty or not provided, return status "skipped" immediately without scoring:
+```json
+{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
+```
+
 SCORING EXAMPLES
 ================
 
-### SUCCESS: TRUE - Example A
+### SCORE: 1 - Example A
 
 CONVERSATION_HISTORY:
 User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine.
@@ -117,16 +124,17 @@ Transportation: Metro Day Pass recommended. Book Louvre tickets online in advanc
 
 EXPECTED OUTPUT:
 {
-  "explanation": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.",
-  "details": {
+  "reason": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.",
+  "properties": {
     "task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine",
     "delivered_outcome": "Detailed 3-day schedule with specific landmarks, restaurants, and practical tips",
     "completion_gaps": "None"
   },
-  "success": TRUE
+  "score": 1,
+  "status": "completed"
 }
 
-### SUCCESS: TRUE - Example B
+### SCORE: 1 - Example B
 
 CONVERSATION_HISTORY:
 User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
@@ -142,16 +150,17 @@ Your internet issues are resolved - update firmware and change DNS as instructed
 
 EXPECTED OUTPUT:
 {
-  "explanation": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.",
-  "details": {
+  "reason": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.",
+  "properties": {
     "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
     "delivered_outcome": "Diagnostic completed with specific fix instructions, data plan upgrade confirmed active",
     "completion_gaps": "None"
   },
-  "success": TRUE
+  "score": 1,
+  "status": "completed"
 }
 
-### SUCCESS: TRUE - Example C
+### SCORE: 1 - Example C
 
 CONVERSATION_HISTORY:
 User: Which is better for a beginner, Python or JavaScript?
@@ -173,16 +182,17 @@ If you're interested in data or automation, start with Python. If you want to bu
 
 EXPECTED OUTPUT:
 {
-  "explanation": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.",
-  "details": {
+  "reason": "The question is subjective with no single correct answer. The agent provided a balanced comparison covering syntax, use cases, and ecosystems for both languages with actionable guidance.",
+  "properties": {
     "task_requirements": "Subjective comparison of Python vs JavaScript for beginners",
     "delivered_outcome": "Balanced pros/cons for each language with context-dependent recommendation",
     "completion_gaps": "None"
   },
-  "success": TRUE
+  "score": 1,
+  "status": "completed"
 }
 
-### SUCCESS: TRUE - Example D
+### SCORE: 1 - Example D
 
 CONVERSATION_HISTORY:
 User: Is the boiling point of water 50°C?
@@ -192,16 +202,17 @@ No, the boiling point of water is 100°C at standard atmospheric pressure, not 5
 
 EXPECTED OUTPUT:
 {
-  "explanation": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.",
-  "details": {
+  "reason": "The user asked a factual verification question. The agent correctly identified the error and provided the accurate value. No further elaboration is required.",
+  "properties": {
     "task_requirements": "Verify whether the boiling point of water is 50°C",
     "delivered_outcome": "Correct answer provided: boiling point is 100°C, not 50°C",
     "completion_gaps": "None"
   },
-  "success": TRUE
+  "score": 1,
+  "status": "completed"
 }
 
-### SUCCESS: FALSE - Example A
+### SCORE: 0 - Example A
 
 CONVERSATION_HISTORY:
 User: Plan a detailed day-by-day 3-day itinerary for Paris with cultural landmarks and local cuisine.
@@ -211,16 +222,17 @@ I'd recommend visiting the Louvre and Eiffel Tower. Paris has great food - try F
 
 EXPECTED OUTPUT:
 {
-  "explanation": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.",
-  "details": {
+  "reason": "Vague suggestions provided but no actual detailed day-by-day itinerary created as user requested. Missing specific landmarks, restaurants, and detailed schedule.",
+  "properties": {
     "task_requirements": "A detailed 3-day Paris itinerary with cultural landmarks and local cuisine",
     "delivered_outcome": "General recommendations without structured itinerary as asked",
     "completion_gaps": "No day-by-day schedule, no specific restaurant recommendations, no detailed cultural landmarks"
   },
-  "success": FALSE
+  "score": 0,
+  "status": "completed"
 }
 
-### SUCCESS: FALSE - Example B
+### SCORE: 0 - Example B
 
 CONVERSATION_HISTORY:
 User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
@@ -232,16 +244,17 @@ I found that your router firmware needs updating. You can usually do this throug
 
 EXPECTED OUTPUT:
 {
-  "explanation": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.",
-  "details": {
+  "reason": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.",
+  "properties": {
     "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
     "delivered_outcome": "Identified router firmware issue but no specific fix instructions, data upgrade not completed",
     "completion_gaps": "No specific firmware update steps, data plan upgrade not processed"
   },
-  "success": FALSE
+  "score": 0,
+  "status": "completed"
 }
 
-### SUCCESS: FALSE - Example C
+### SCORE: 0 - Example C
 
 CONVERSATION_HISTORY:
 User: Analyze our Q3 sales data and generate a summary report.
@@ -251,13 +264,14 @@ I can help you analyze sales data! Sales analysis is important for business grow
 
 EXPECTED OUTPUT:
 {
-  "explanation": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.",
-  "details": {
+  "reason": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.",
+  "properties": {
     "task_requirements": "Analyze Q3 sales data and generate summary report",
     "delivered_outcome": "General information about sales analysis concepts",
     "completion_gaps": "No data analysis performed, no summary report created"
   },
-  "success": FALSE
+  "score": 0,
+  "status": "completed"
 }
 
 KEY PRINCIPLES
@@ -273,4 +287,4 @@ KEY PRINCIPLES
 
 Remember: A task can be understood correctly and approached properly but still fail if the final outcome doesn't meet requirements.
 
-# Output
\ No newline at end of file
+# Output
diff --git a/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty b/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty
index 17cffe161c..559a50bee3 100644
--- a/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty
+++ b/assets/evaluators/builtin/task_completion/evaluator/task_completion_multi_turn.prompty
@@ -69,10 +69,10 @@ C. Assess Task Completion:
 - **Incomplete**: No usable deliverable or major requirements unmet
 
 D. Assign a Score:
-   - **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal by the end of the conversation. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
-   - **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved by the end of the conversation.
+   - **1 (TRUE)**: The agent delivered a complete and correct solution that accomplishes the user's entire goal by the end of the conversation. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
+   - **0 (FALSE)**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved by the end of the conversation.
 
-   **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question, there is no single correct answer. The task is considered **complete** (TRUE) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs.
+   **Note on subjective/open-ended queries:** When the user asks a subjective, opinion-based, or comparison question, there is no single correct answer. The task is considered **complete** (1) if the agent provides a thoughtful, relevant response that addresses the question with reasonable perspectives or trade-offs.
 
    **Note on direct/factual queries:** When the user asks a straightforward factual, yes/no, or verification question, a correct and direct answer fully completes the task. No additional elaboration is required.
 
@@ -82,19 +82,26 @@ OUTPUT FORMAT
 =============
 Output a JSON object with these keys:
 {
-  "details": {
+  "reason": "<15-60 words explaining the completion status>",
+  "properties": {
     "user_requests": ["<request 1>", "<request 2>", ...],
     "delivered_outcome": "<15-60 words summarizing what the agent accomplished>",
     "completion_gaps": "<For each unmet request, quote the assistant's actual response (or note its absence) as evidence. 'None' if all requests met.>"
   },
-  "explanation": "<15-60 words explaining the completion status>",
-  "success": <TRUE or FALSE>
+  "score": <1 if task completed successfully, 0 if task failed or incomplete, null if status is skipped>,
+  "status": "<completed or skipped>"
 }
 
+**Status: Skipped**
+If the CONVERSATION is empty or not provided, or does not end with an agent response, return status "skipped" immediately without scoring:
+```json
+{"reason": "<explain why evaluation was skipped>", "properties": null, "score": null, "status": "skipped"}
+```
+
 SCORING EXAMPLES
 ================
 
-### SUCCESS: TRUE - Multi-turn task with refinement
+### SCORE: 1 - Multi-turn task with refinement
 
 CONVERSATION:
 User turn 1: Book me a flight from NYC to London for next Friday.
@@ -107,16 +114,17 @@ I found a Holiday Inn 0.3 miles from ExCeL London at $120/night for 3 nights. I'
 
 EXPECTED OUTPUT:
 {
-  "details": {
+  "reason": "Both tasks completed: flight booked as requested and hotel near conference venue secured after the user added that requirement in turn 2.",
+  "properties": {
     "user_requests": ["Book a flight from NYC to London for next Friday", "Find a hotel near ExCeL London for 3 nights"],
     "delivered_outcome": "Flight and hotel both booked with confirmations sent, total cost provided",
     "completion_gaps": "None"
   },
-  "explanation": "Both tasks completed: flight booked as requested and hotel near conference venue secured after the user added that requirement in turn 2.",
-  "success": TRUE
+  "score": 1,
+  "status": "completed"
 }
 
-### SUCCESS: FALSE - Incomplete multi-turn conversation
+### SCORE: 0 - Incomplete multi-turn conversation
 
 CONVERSATION:
 User turn 1: I need you to analyze our Q3 sales data and create a report with charts.
@@ -129,13 +137,14 @@ Agent turn 2: Revenue by region shows North America at 45%, Europe at 30%, and A
 
 EXPECTED OUTPUT:
 {
-  "details": {
+  "reason": "Data was analyzed with summary stats, but the user explicitly requested charts and a report. Only text descriptions were provided.",
+  "properties": {
     "user_requests": ["Analyze Q3 sales data", "Create a report with charts", "Revenue by region chart", "Monthly trend line chart"],
     "delivered_outcome": "Text-based summary of data analysis without actual charts or formatted report",
     "completion_gaps": "User requested charts but agent only provided text: 'Revenue by region shows North America at 45%...' — no visual charts or formatted report were generated."
   },
-  "explanation": "Data was analyzed with summary stats, but the user explicitly requested charts and a report. Only text descriptions were provided.",
-  "success": FALSE
+  "score": 0,
+  "status": "completed"
 }
 
 KEY PRINCIPLES
diff --git a/assets/evaluators/tests/common/base_evaluator_runner.py b/assets/evaluators/tests/common/base_evaluator_runner.py
index 3ceb288d32..0d59900909 100644
--- a/assets/evaluators/tests/common/base_evaluator_runner.py
+++ b/assets/evaluators/tests/common/base_evaluator_runner.py
@@ -7,6 +7,7 @@
 Provides common interfaces and assertion helpers shared by both prompty-based and code-based evaluators.
 """
 
+import copy
 from abc import ABC
 from typing import Any, Dict, List, Optional, Tuple, Type
 from unittest.mock import MagicMock
@@ -106,6 +107,10 @@ def _run_evaluation_and_return_mocked_flow(self, **kwargs) -> Tuple[Dict[str, An
 
         evaluator = self._init_evaluator(**constructor_kwargs)
 
+        # Deep-copy call_kwargs to prevent evaluator preprocessing (e.g.,
+        # _normalize_function_call_types) from mutating shared test data.
+        call_kwargs = copy.deepcopy(call_kwargs)
+
         # Mock the flow only for behavioral tests
         flow_mock = None
         if self.use_mocking:
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py
index e7723befce..51fa4c4686 100644
--- a/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py
+++ b/assets/evaluators/tests/test_evaluators_behavior/test_fluency_evaluator_behavior.py
@@ -19,6 +19,8 @@ class TestFluencyEvaluatorBehavior(BaseEvaluatorBehaviorTest, BaseToolEvaluation
     """
 
     # region Expected flow inputs for each test
+    # Fluency calls reformat_agent_response() which extracts text-only content
+    # from assistant messages, so expected inputs are the reformatted strings.
     test_function_tool_local_calls_expected_flow_inputs = {
         "response": data.LOCAL_CALLS_IR_EXPECTED_FLOW_RESPONSE,
     }
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py
index 5ff80fb47f..059c318a98 100644
--- a/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py
+++ b/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py
@@ -126,7 +126,30 @@ class TestTaskCompletionEvaluatorBehavior(BaseToolsEvaluatorBehaviorTest, BaseTo
 
     MINIMAL_RESPONSE = BaseToolsEvaluatorBehaviorTest.email_tool_call_and_assistant_response
 
-    _additional_expected_field_suffixes = ["details"]
+    _additional_expected_field_suffixes = ["status", "properties", "score", "passed"]
+
+    @property
+    def expected_result_fields(self) -> List[str]:
+        """Return the result keys this evaluator is expected to emit.
+
+        Token/model metadata is nested under ``properties`` rather than exposed as top-level keys.
+        """
+        prefix = f"{self._result_prefix}"
+        suffixes = ["reason", "threshold", "result", *self._additional_expected_field_suffixes]
+        return [prefix] + [f"{prefix}_{suffix}" for suffix in suffixes]
+
+    def assert_not_applicable(self, result_data):
+        """Verify that ``result_data`` describes a not-applicable outcome.
+
+        TaskCompletionEvaluator reports such inputs with label 'not_applicable' and score None,
+        unlike the base class, which expects a passing score.
+        """
+        label = result_data["label"]
+        assert label == "not_applicable", f"Expected 'not_applicable' but got '{label}'"
+        score = result_data["score"]
+        assert score is None, f"Expected score to be None for not-applicable result, got '{score}'"
+        assert "Not applicable" in result_data.get("reason", ""), \
+            f"Expected reason to contain 'Not applicable' but got '{result_data.get('reason')}'"
 
 
 def _create_mocked_evaluator():
@@ -192,8 +215,11 @@ def test_messages_valid_input(self):
         assert "task_completion" in result
         assert "task_completion_result" in result
         assert "task_completion_reason" in result
-        assert "task_completion_details" in result
+        assert "task_completion_properties" in result
+        assert "task_completion_status" in result
         assert "task_completion_threshold" in result
+        assert "task_completion_score" in result
+        assert "task_completion_passed" in result
         assert result["task_completion"] in (0, 1)
 
     def test_messages_with_tool_definitions(self):