Azure · mmkawale · Jun 5, 2026 · Jun 5, 2026 · Jun 8, 2026 · Jun 8, 2026
@@ -1,5 +1,13 @@
 # Release History
 
+## 1.17.1 (Unreleased)
+
+### Features Added
+
+- Enabled `ToolCallAccuracyEvaluator` and `_ToolInputAccuracyEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). Both evaluators grade the agent's tool selection and input arguments — neither requires the (often redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. This is achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `_ToolCallSuccessEvaluator`, `GroundednessEvaluator`, and `_ToolOutputUtilizationEvaluator` continue to reject restricted tools because their rubrics consume the tool output body.
+- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line.
+- `_ToolCallSuccessEvaluator` now forwards the per-call runtime `status` (e.g. `failed`, `error`, `incomplete`, `cancelled`, `canceled`, `completed`) to the LLM rubric as a `[STATUS] <value>` annotation appended to each emitted `[TOOL_CALL]` / `[TOOL_RESULT]` line. The prompty rubric is updated to treat the `[STATUS] failed` and `[STATUS] incomplete` annotations as a strong, authoritative failure signal that overrides a bland or otherwise-passing-looking payload, while still falling back to payload-only judgment when `status` is absent or reports success (e.g. `completed`). Output is byte-identical to the previous wire format when no `status` field is populated, so existing recorded test fixtures and customers whose converters do not emit `status` are unaffected.
+
 ## 1.17.0 (2026-06-03)
 
 ### Breaking Changes

@@ -34,6 +34,7 @@
 from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
 from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
 from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator
+from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -135,6 +136,7 @@ def lazy_import():
     "ToolCallAccuracyEvaluator",
     "_ToolOutputUtilizationEvaluator",
     "_ToolCallSuccessEvaluator",
+    "_ToolInputAccuracyEvaluator",
     "AzureOpenAIGrader",
     "AzureOpenAILabelGrader",
     "AzureOpenAIStringCheckGrader",

@@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
         # Initialize input validator
         self._validator = ToolCallsValidator(
             error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(

@@ -271,8 +271,35 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
         return tool_definitions
 
 
+def _format_status_suffix(status):
+    """Build the trailing ``[STATUS] <value>`` annotation for a content block.
+
+    Returns the empty string when ``status`` is absent or not a non-empty
+    string, so callers can unconditionally concatenate the return value
+    without affecting back-compat output.
+
+    :param status: The raw ``status`` field from a ``tool_call`` or
+        ``tool_result`` content block.
+    :type status: Any
+    :return: ``" [STATUS] <value>"`` when ``status`` is a non-empty string,
+        otherwise ``""``.
+    :rtype: str
+    """
+    if isinstance(status, str) and status:
+        return f" [STATUS] {status}"
+    return ""
+
+
 def _get_tool_calls_results(agent_response_msgs):
-    """Extract formatted agent tool calls and results from response."""
+    """Extract formatted agent tool calls and results from response.
+
+    Each emitted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line is suffixed with
+    ``[STATUS] <value>`` when the source content block carries a ``status``
+    field. The prompty rubric uses this annotation as a strong failure signal
+    (see ``tool_call_success.prompty``). When ``status`` is absent the suffix
+    is omitted and the rubric falls back to payload-only judgment, so the
+    formatted output is byte-identical to the pre-pass-through wire format.
+    """
     agent_response_text = []
     tool_results = {}
 
@@ -283,7 +310,8 @@ def _get_tool_calls_results(agent_response_msgs):
             for content in msg.get("content", []):
                 if content.get("type") == "tool_result":
                     result = content.get("tool_result")
-                    tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+                    status_suffix = _format_status_suffix(content.get("status"))
+                    tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}{status_suffix}"
 
     # Second pass: parse assistant messages and tool calls
     for msg in agent_response_msgs:
@@ -302,7 +330,8 @@ def _get_tool_calls_results(agent_response_msgs):
                         func_name = content.get("name", "")
                         args = content.get("arguments", {})
                     args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
-                    call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                    status_suffix = _format_status_suffix(content.get("status"))
+                    call_line = f"[TOOL_CALL] {func_name}({args_str}){status_suffix}"
                     agent_response_text.append(call_line)
                     if tool_call_id in tool_results:
                         agent_response_text.append(tool_results[tool_call_id])

@@ -53,13 +53,15 @@ B. Examine tool result and definition for the tool being called to check whether
   1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it:
     ERROR-CASES:
     ===========
+    - The tool call or tool result line is annotated with **`[STATUS] failed`** or **`[STATUS] incomplete`**. These annotations indicate the tool call did not produce a usable result -- either because the runtime explicitly marked the call `failed` (an exception in the tool, the API surface returned an error response) or because the call was interrupted before completion (e.g. host timeout, parent-response cancellation surfaced as `incomplete`). They are strong, authoritative failure signals and override any contradictory appearance of the result payload.
     - The tool call resulted in an error or exception
     - The tool call failed to run or failed to return
     - The tool call returned a result that indicates an error or failure
     - The tool call returned an object or JSON string that has one or more of its fields indicating an error
     - The tool timed out or returned a result that indicates a time-out
     - The tool result does not make sense, from technical perspective, not business perspective, given the definition of that tool, if the definition is present
   2. If none of the error cases apply to the tool result, it is considered **succeeded** even if the tool result itself indicates a business mistake
+  3. The `[STATUS]` annotation is **optional**. When it is absent on a tool call, judge that call by the payload-based rules above (back-compat with runtimes that do not emit a status field). When it is present and indicates success (e.g. `[STATUS] completed`), it does not by itself make a call succeed -- still apply the payload-based rules, because a runtime can report `completed` while the tool itself returned an error payload.
 C. If one or more tool results are **failed**, then the **evaluation process** has **failed**; otherwise, the **evaluation process** has **succeeded**
 D. You are required to return your **output** in the following format:
   {
@@ -335,6 +337,61 @@ EXPECTED OUTPUT
 }
 
 
+### Example - Failed (status annotation overrides bland payload)
+
+[TOOL_CALLS]
+[TOOL_CALL] send_email(to:"alice@example.com" , body:"hi") [STATUS] failed
+[TOOL_RESULT] {} [STATUS] failed
+
+EXPECTED OUTPUT
+{
+  "reason": "send_email is annotated with [STATUS] failed on both the call and the result, which is an authoritative failure signal from the runtime even though the result body {} is otherwise inconclusive",
+  "properties": {
+    "failed_tools": "send_email"
+  },
+  "score": 0,
+  "status": "completed"
+}
+
+
+### Example - Failed (status completed but payload still indicates an error)
+
+[TOOL_CALLS]
+[TOOL_CALL] get_current_user_info() [STATUS] completed
+[TOOL_RESULT] {"UserName":"", "UserEmail":"", "Message":"failed to get current user information"} [STATUS] completed
+
+EXPECTED OUTPUT
+{
+  "reason": "The runtime reported [STATUS] completed but the result payload still indicates failure with empty fields and an explicit error message. Payload-based rules still apply when [STATUS] is completed -- this call is failed",
+  "properties": {
+    "failed_tools": "get_current_user_info"
+  },
+  "score": 0,
+  "status": "completed"
+}
+
+
+### Example - Failed (parallel calls in one turn, one annotated failed)
+
+[TOOL_CALLS]
+[TOOL_CALL] fetch_weather(city:"Seattle") [STATUS] completed
+[TOOL_RESULT] {"temp": 62} [STATUS] completed
+[TOOL_CALL] send_email(to:"x@example.com") [STATUS] failed
+[TOOL_RESULT] {} [STATUS] failed
+[TOOL_CALL] lookup_user(id:"u42") [STATUS] completed
+[TOOL_RESULT] {"user_id": "u42"} [STATUS] completed
+
+EXPECTED OUTPUT
+{
+  "reason": "send_email is annotated with [STATUS] failed; the other two parallel calls succeeded but a single failed call is sufficient to fail the overall evaluation",
+  "properties": {
+    "failed_tools": "send_email"
+  },
+  "score": 0,
+  "status": "completed"
+}
+
+
 
 Now given the **INPUT** you received generate the output 
 # Output
@@ -92,7 +92,7 @@ def __init__(
         self._validator = ToolDefinitionsValidator(
             error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
             optional_tool_definitions=False,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(

@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version
 
-VERSION = "1.17.0"
+VERSION = "1.17.1"
@@ -0,0 +1,202 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""Tests for ToolCallSuccess runtime status pass-through to the LLM rubric.
+
+The evaluator's source-side preprocessing emits ``[STATUS] <value>`` annotations
+on each formatted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line whenever the source
+content block carries a ``status`` field. The prompty rubric is taught to treat
+these annotations as a strong (authoritative) failure signal when the status is
+``failed`` or ``incomplete``, and to fall back to payload-only judgment when
+``status`` is absent or reports success (e.g. ``completed``).
+
+These tests cover the source-side preprocessing only (the ``[STATUS]`` string
+emission). End-to-end rubric behavior is covered by the existing behavior
+suites that exercise the full evaluator with a mocked LLM.
+"""
+
+import pytest
+
+from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import (
+    _format_status_suffix,
+    _get_tool_calls_results,
+)
+
+
+# region helpers
+
+
+def _assistant_tool_call(tool_call_id, name, arguments, status=None):
+    """Build an assistant message carrying a single tool_call content block."""
+    block = {
+        "type": "tool_call",
+        "tool_call_id": tool_call_id,
+        "name": name,
+        "arguments": arguments,
+    }
+    if status is not None:
+        block["status"] = status
+    return {"role": "assistant", "content": [block]}
+
+
+def _tool_result(tool_call_id, result, status=None):
+    """Build a tool message carrying a single tool_result content block."""
+    block = {
+        "type": "tool_result",
+        "tool_call_id": tool_call_id,
+        "tool_result": result,
+    }
+    if status is not None:
+        block["status"] = status
+    return {
+        "role": "tool",
+        "tool_call_id": tool_call_id,
+        "content": [block],
+    }
+
+
+def _assistant_parallel_tool_calls(blocks):
+    """Build a single assistant message that emits multiple tool_call blocks in one turn.
+
+    ``blocks`` is a list of ``(tool_call_id, name, arguments, status)`` tuples.
+    This is the modern Responses-API topology for parallel function-call
+    invocation: multiple ``tool_call`` content blocks under one assistant
+    message, in contrast to one assistant message per call.
+    """
+    content = []
+    for tool_call_id, name, arguments, status in blocks:
+        block = {
+            "type": "tool_call",
+            "tool_call_id": tool_call_id,
+            "name": name,
+            "arguments": arguments,
+        }
+        if status is not None:
+            block["status"] = status
+        content.append(block)
+    return {"role": "assistant", "content": content}
+
+
+# endregion
+
+
+@pytest.mark.unittest
+class TestFormatStatusSuffix:
+    """Unit tests for the ``_format_status_suffix`` helper."""
+
+    def test_known_failure_status_emits_suffix(self):
+        """A known-failure status string produces a ``[STATUS] <value>`` suffix."""
+        assert _format_status_suffix("failed") == " [STATUS] failed"
+
+    def test_completed_status_emits_suffix(self):
+        """A success status string also emits a suffix (the rubric distinguishes the two)."""
+        assert _format_status_suffix("completed") == " [STATUS] completed"
+
+    def test_arbitrary_status_string_emits_suffix(self):
+        """Any non-empty string status emits a suffix; the rubric judges semantics, not Python."""
+        assert _format_status_suffix("rate_limited") == " [STATUS] rate_limited"
+
+    def test_none_status_emits_empty(self):
+        """Absent status (``None``) emits the empty string for back-compat."""
+        assert _format_status_suffix(None) == ""
+
+    def test_empty_string_status_emits_empty(self):
+        """Empty string status emits the empty string (treated same as absent)."""
+        assert _format_status_suffix("") == ""
+
+    def test_non_string_status_emits_empty(self):
+        """Non-string statuses (int, dict, list) are ignored rather than raised on."""
+        assert _format_status_suffix(42) == ""
+        assert _format_status_suffix({"x": 1}) == ""
+        assert _format_status_suffix(["failed"]) == ""
+
+
+@pytest.mark.unittest
+class TestGetToolCallsResultsStatusPassthrough:
+    """Integration tests for ``[STATUS]`` annotation emission via ``_get_tool_calls_results``."""
+
+    def test_status_on_tool_call_is_appended_to_tool_call_line(self):
+        """When ``status`` is set on a tool_call block, the ``[TOOL_CALL]`` line carries the annotation."""
+        msgs = [
+            _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}, status="failed"),
+            _tool_result("c1", ""),
+        ]
+        lines = _get_tool_calls_results(msgs)
+        assert lines[0] == '[TOOL_CALL] send_email(to="x@example.com") [STATUS] failed'
+        # Tool result has no status -> no suffix.
+        assert lines[1] == "[TOOL_RESULT] "
+
+    def test_status_on_tool_result_is_appended_to_tool_result_line(self):
+        """When ``status`` is set on a tool_result block, the ``[TOOL_RESULT]`` line carries the annotation."""
+        msgs = [
+            _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}),
+            _tool_result("c1", "", status="error"),
+        ]
+        lines = _get_tool_calls_results(msgs)
+        assert lines[0] == '[TOOL_CALL] send_email(to="x@example.com")'
+        assert lines[1] == "[TOOL_RESULT]  [STATUS] error"
+
+    def test_completed_status_is_passed_through_too(self):
+        """``[STATUS] completed`` is emitted alongside failure statuses; the rubric decides semantics."""
+        msgs = [
+            _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}, status="completed"),
+            _tool_result("c1", "Sunny, 72F.", status="completed"),
+        ]
+        lines = _get_tool_calls_results(msgs)
+        assert lines[0] == '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed'
+        assert lines[1] == "[TOOL_RESULT] Sunny, 72F. [STATUS] completed"
+
+    def test_absent_status_produces_no_suffix_back_compat(self):
+        """When ``status`` is absent on every block, output matches the pre-status-pass-through format exactly."""
+        msgs = [
+            _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}),
+            _tool_result("c1", "Sunny, 72F."),
+        ]
+        lines = _get_tool_calls_results(msgs)
+        assert lines == [
+            '[TOOL_CALL] fetch_weather(city="Seattle")',
+            "[TOOL_RESULT] Sunny, 72F.",
+        ]
+
+    def test_parallel_tool_calls_in_one_assistant_message_each_get_their_own_status(self):
+        """Multiple ``tool_call`` blocks in one assistant message each emit their own ``[STATUS]`` annotation.
+
+        This is the modern Responses-API topology and exercises that the
+        formatter walks into the content list rather than only processing the
+        first block per message.
+        """
+        msgs = [
+            _assistant_parallel_tool_calls([
+                ("c1", "fetch_weather", {"city": "Seattle"}, "completed"),
+                ("c2", "send_email",   {"to": "x@example.com"}, "failed"),
+                ("c3", "lookup_user",  {"id": "u42"}, "completed"),
+            ]),
+            _tool_result("c1", "Sunny, 72F.", status="completed"),
+            _tool_result("c2", "", status="failed"),
+            _tool_result("c3", {"user_id": "u42"}, status="completed"),
+        ]
+        lines = _get_tool_calls_results(msgs)
+        assert lines == [
+            '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed',
+            "[TOOL_RESULT] Sunny, 72F. [STATUS] completed",
+            '[TOOL_CALL] send_email(to="x@example.com") [STATUS] failed',
+            "[TOOL_RESULT]  [STATUS] failed",
+            '[TOOL_CALL] lookup_user(id="u42") [STATUS] completed',
+            "[TOOL_RESULT] {'user_id': 'u42'} [STATUS] completed",
+        ]
+
+    def test_mixed_status_present_and_absent_across_calls(self):
+        """A response with status on some calls and not others produces a mixed-suffix output."""
+        msgs = [
+            _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}, status="completed"),
+            _tool_result("c1", "Sunny, 72F."),
+            _assistant_tool_call("c2", "send_email", {"to": "x@example.com"}),
+            _tool_result("c2", "", status="failed"),
+        ]
+        lines = _get_tool_calls_results(msgs)
+        assert lines == [
+            '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed',
+            "[TOOL_RESULT] Sunny, 72F.",
+            '[TOOL_CALL] send_email(to="x@example.com")',
+            "[TOOL_RESULT]  [STATUS] failed",
+        ]