ToolCallSuccess: move runtime-status short-circuit from prompt into Python

manaskawale · manaskawale · commit 6eb07a0a1890 · 2026-06-15T12:35:16.000-07:00
When any tool_call or tool_result content block carries a failed/incomplete runtime status, the evaluator now returns a deterministic fail result without invoking the LLM judge; the prompty rubric is consulted only when no block carries such a status. Drops the [STATUS] suffix from the formatted LLM input, restoring the pre-pass-through wire format. Adds the _collect_failed_tool_calls helper and the _return_short_circuit_failure_result method; removes _format_status_suffix and the [STATUS] rules and examples from the prompty; rewrites tests.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
@@ -141,6 +141,32 @@ def __call__(  # pylint: disable=docstring-missing-param
         """
         return super().__call__(*args, **kwargs)
 
+    def _return_short_circuit_failure_result(
+        self, failed_tools: List[str]
+    ) -> Dict[str, Union[str, float]]:
+        """Return a deterministic fail result without invoking the LLM judge.
+
+        Used when the runtime explicitly marks one or more tool calls as
+        failed/incomplete via the ``status`` field on a ``tool_call`` or
+        ``tool_result`` content block. The LLM call is skipped because the
+        runtime signal is authoritative.
+        """
+        failed_list = ",".join(failed_tools)
+        reason = (
+            f"Tool call(s) [{failed_list}] reported a non-success runtime status "
+            "(failed or incomplete). Short-circuited without invoking the LLM judge."
+        )
+        return {
+            self._result_key: 0.0,
+            f"{self._result_key}_score": 0.0,
+            f"{self._result_key}_passed": False,
+            f"{self._result_key}_result": "fail",
+            f"{self._result_key}_reason": reason,
+            f"{self._result_key}_status": "completed",
+            f"{self._result_key}_threshold": self._threshold,
+            f"{self._result_key}_properties": {"failed_tools": failed_list},
+        }
+
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # type: ignore[override]
         """Do Tool Call Success evaluation.
@@ -181,6 +207,16 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # t
 
         if isinstance(eval_input.get("response"), list):
             eval_input["response"] = _preprocess_messages(eval_input["response"])
+            # Short-circuit: when the runtime explicitly marks any tool_call
+            # or tool_result with status ``failed`` or ``incomplete`` there
+            # is no point asking the LLM judge to re-derive the failure from
+            # the payload -- the runtime signal is authoritative. Return a
+            # deterministic fail result and skip the LLM call entirely. The
+            # prompty rubric is now only consulted when no block carries one
+            # of those two statuses (any other status, or none at all).
+            failed_tools = _collect_failed_tool_calls(eval_input["response"])
+            if failed_tools:
+                return self._return_short_circuit_failure_result(failed_tools)
             eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
         # If response is a string, pass directly without reformatting
         elif isinstance(eval_input["response"], str):
@@ -271,34 +307,87 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
         return tool_definitions
 
 
-def _format_status_suffix(status):
-    """Build the trailing ``[STATUS] <value>`` annotation for a content block.
+_FAILED_RUNTIME_STATUSES = frozenset({"failed", "incomplete"})
+
+
+def _collect_failed_tool_calls(messages):
+    """Return ordered, unique tool names whose runtime status indicates failure.
 
-    Returns the empty string when ``status`` is absent or not a non-empty
-    string, so callers can unconditionally concatenate the return value
-    without affecting back-compat output.
+    A tool call is treated as a runtime failure when either its assistant
+    ``tool_call`` content block or its matched tool ``tool_result`` content
+    block carries a ``status`` field in ``{failed, incomplete}``. The check
+    runs in Python so the LLM judge is only invoked on the success path
+    (status ``completed`` or absent); failed/incomplete calls are short-
+    circuited deterministically.
 
-    :param status: The raw ``status`` field from a ``tool_call`` or
-        ``tool_result`` content block.
-    :type status: Any
-    :return: ``" [STATUS] <value>"`` when ``status`` is a non-empty string,
-        otherwise ``""``.
-    :rtype: str
+    When the failing block carries no resolvable function name, the
+    ``tool_call_id`` is used as a stable identifier instead so the caller
+    can still surface it in ``properties.failed_tools``.
     """
-    if isinstance(status, str) and status:
-        return f" [STATUS] {status}"
-    return ""
+    if not isinstance(messages, list):
+        return []
+
+    id_to_name = {}
+    failed_ids = []
+    failed_names_without_id = []
+
+    for msg in messages:
+        if not isinstance(msg, dict) or msg.get("role") != "assistant":
+            continue
+        for content in msg.get("content", []) or []:
+            if not isinstance(content, dict) or content.get("type") != "tool_call":
+                continue
+            if "tool_call" in content and "function" in content.get("tool_call", {}):
+                tc = content["tool_call"]
+                name = tc.get("function", {}).get("name", "") or ""
+                tcid = tc.get("id")
+            else:
+                name = content.get("name", "") or ""
+                tcid = content.get("tool_call_id")
+            if tcid is not None:
+                id_to_name[tcid] = name
+            status = content.get("status")
+            if isinstance(status, str) and status in _FAILED_RUNTIME_STATUSES:
+                if tcid is not None:
+                    failed_ids.append(tcid)
+                elif name:
+                    failed_names_without_id.append(name)
+
+    for msg in messages:
+        if not isinstance(msg, dict) or msg.get("role") != "tool":
+            continue
+        tcid = msg.get("tool_call_id")
+        for content in msg.get("content", []) or []:
+            if not isinstance(content, dict) or content.get("type") != "tool_result":
+                continue
+            status = content.get("status")
+            if isinstance(status, str) and status in _FAILED_RUNTIME_STATUSES and tcid is not None:
+                failed_ids.append(tcid)
+
+    ordered = []
+    seen = set()
+    for tcid in failed_ids:
+        label = id_to_name.get(tcid) or tcid
+        if label and label not in seen:
+            seen.add(label)
+            ordered.append(label)
+    for name in failed_names_without_id:
+        if name and name not in seen:
+            seen.add(name)
+            ordered.append(name)
+    return ordered
 
 
 def _get_tool_calls_results(agent_response_msgs):
     """Extract formatted agent tool calls and results from response.
 
-    Each emitted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line is suffixed with
-    ``[STATUS] <value>`` when the source content block carries a ``status``
-    field. The prompty rubric uses this annotation as a strong failure signal
-    (see ``tool_call_success.prompty``). When ``status`` is absent the suffix
-    is omitted and the rubric falls back to payload-only judgment, so the
-    formatted output is byte-identical to the pre-pass-through wire format.
+    The output uses the original ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line
+    format only; runtime ``status`` is no longer forwarded to the LLM judge.
+    Calls whose status is exactly ``failed`` or ``incomplete`` are
+    short-circuited in Python by :func:`_collect_failed_tool_calls` before
+    this formatter runs, so every call the LLM sees has either no status or
+    some other status value (typically ``completed``) -- the rubric judges
+    those by payload alone.
     """
     agent_response_text = []
     tool_results = {}
@@ -310,8 +399,7 @@ def _get_tool_calls_results(agent_response_msgs):
             for content in msg.get("content", []):
                 if content.get("type") == "tool_result":
                     result = content.get("tool_result")
-                    status_suffix = _format_status_suffix(content.get("status"))
-                    tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}{status_suffix}"
+                    tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
 
     # Second pass: parse assistant messages and tool calls
     for msg in agent_response_msgs:
@@ -330,8 +418,7 @@ def _get_tool_calls_results(agent_response_msgs):
                         func_name = content.get("name", "")
                         args = content.get("arguments", {})
                     args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
-                    status_suffix = _format_status_suffix(content.get("status"))
-                    call_line = f"[TOOL_CALL] {func_name}({args_str}){status_suffix}"
+                    call_line = f"[TOOL_CALL] {func_name}({args_str})"
                     agent_response_text.append(call_line)
                     if tool_call_id in tool_results:
                         agent_response_text.append(tool_results[tool_call_id])
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty
@@ -53,15 +53,13 @@ B. Examine tool result and definition for the tool being called to check whether
   1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it:
     ERROR-CASES:
     ===========
-    - The tool call or tool result line is annotated with **`[STATUS] failed`** or **`[STATUS] incomplete`**. These annotations indicate the tool call did not produce a usable result -- either because the runtime explicitly marked the call `failed` (an exception in the tool, the API surface returned an error response) or because the call was interrupted before completion (e.g. host timeout, parent-response cancellation surfaced as `incomplete`). They are strong, authoritative failure signals and override any contradictory appearance of the result payload.
     - The tool call resulted in an error or exception
     - The tool call failed to run or failed to return
     - The tool call returned a result that indicates an error or failure
     - The tool call returned an object or JSON string that has one or more of its fields indicating an error
     - The tool timed-out or returned a result that indicate a time-out
     - The tool result does not make sense, from technical perspective, not business perspective, given the definition of that tool, if the definition is present
   2. If none of the error cases apply to the tool result , it is considered **succeeded** even if the tool result itself indicates a business mistake
-  3. The `[STATUS]` annotation is **optional**. When it is absent on a tool call, judge that call by the payload-based rules above (back-compat with runtimes that do not emit a status field). When it is present and indicates success (e.g. `[STATUS] completed`), it does not by itself make a call succeed -- still apply the payload-based rules, because a runtime can report `completed` while the tool itself returned an error payload.
 C. If one or more tool result are **failed** , then you the **evaluation process** has **failed**, otherwise , the **evaluation process** has **succeeded**
 D. You are required to return your **output** in the following format:
   {
@@ -337,61 +335,6 @@ EXPECTED OUTPUT
 }
 
 
-### Example - Failed (status annotation overrides bland payload)
-
-[TOOL_CALLS]
-[TOOL_CALL] send_email(to:"alice@example.com" , body:"hi") [STATUS] failed
-[TOOL_RESULT] {} [STATUS] failed
-
-EXPECTED OUTPUT
-{
-  "reason": "send_email is annotated with [STATUS] failed on both the call and the result, which is an authoritative failure signal from the runtime even though the result body {} is otherwise inconclusive",
-  "properties": {
-    "failed_tools": "send_email"
-  },
-  "score": 0,
-  "status": "completed"
-}
-
-
-### Example - Failed (status completed but payload still indicates an error)
-
-[TOOL_CALLS]
-[TOOL_CALL] get_current_user_info() [STATUS] completed
-[TOOL_RESULT] {"UserName":"", "UserEmail":"", "Message":"failed to get current user information"} [STATUS] completed
-
-EXPECTED OUTPUT
-{
-  "reason": "The runtime reported [STATUS] completed but the result payload still indicates failure with empty fields and an explicit error message. Payload-based rules still apply when [STATUS] is completed -- this call is failed",
-  "properties": {
-    "failed_tools": "get_current_user_info"
-  },
-  "score": 0,
-  "status": "completed"
-}
-
-
-### Example - Failed (parallel calls in one turn, one annotated failed)
-
-[TOOL_CALLS]
-[TOOL_CALL] fetch_weather(city:"Seattle") [STATUS] completed
-[TOOL_RESULT] {"temp": 62} [STATUS] completed
-[TOOL_CALL] send_email(to:"x@example.com") [STATUS] failed
-[TOOL_RESULT] {} [STATUS] failed
-[TOOL_CALL] lookup_user(id:"u42") [STATUS] completed
-[TOOL_RESULT] {"user_id": "u42"} [STATUS] completed
-
-EXPECTED OUTPUT
-{
-  "reason": "send_email is annotated with [STATUS] failed; the other two parallel calls succeeded but a single failed call is sufficient to fail the overall evaluation",
-  "properties": {
-    "failed_tools": "send_email"
-  },
-  "score": 0,
-  "status": "completed"
-}
-
-
 
 Now given the **INPUT** you received generate the output 
 # Output
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py