Skip to content

Commit 6eb07a0

Browse files
committed
ToolCallSuccess: move runtime-status short-circuit from prompt into Python
Failed/incomplete tool_call or tool_result blocks now return a deterministic fail result without invoking the LLM judge; the prompty rubric is consulted only on the success path. Drops [STATUS] suffix from the formatted LLM input (back-compat with pre-pass-through wire format). Adds _collect_failed_tool_calls helper and _return_short_circuit_failure_result method; removes _format_status_suffix; rewrites tests.
1 parent e132532 commit 6eb07a0

3 files changed

Lines changed: 296 additions & 164 deletions

File tree

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py

Lines changed: 111 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,32 @@ def __call__( # pylint: disable=docstring-missing-param
141141
"""
142142
return super().__call__(*args, **kwargs)
143143

144+
def _return_short_circuit_failure_result(
    self, failed_tools: List[str]
) -> Dict[str, Union[str, float]]:
    """Return a deterministic fail result without invoking the LLM judge.

    Used when the runtime explicitly marks one or more tool calls as
    failed/incomplete via the ``status`` field on a ``tool_call`` or
    ``tool_result`` content block. The LLM call is skipped because the
    runtime signal is authoritative.

    :param failed_tools: Identifiers (tool names, or tool call ids when no
        name is available) of the calls that reported a non-success status.
    :type failed_tools: List[str]
    :return: A result dictionary keyed by ``self._result_key`` with a zero
        score, a ``fail`` result, an explanatory reason, the configured
        threshold, and the comma-joined identifiers under
        ``properties.failed_tools``.
    :rtype: Dict[str, Union[str, float]]
    """
    # Identifiers may fall back to a tool_call_id, which is not guaranteed to
    # be a string; stringify each one so ``join`` cannot raise ``TypeError``.
    failed_list = ",".join(str(tool) for tool in failed_tools)
    reason = (
        f"Tool call(s) [{failed_list}] reported a non-success runtime status "
        "(failed or incomplete). Short-circuited without invoking the LLM judge."
    )
    return {
        self._result_key: 0.0,
        f"{self._result_key}_score": 0.0,
        f"{self._result_key}_passed": False,
        f"{self._result_key}_result": "fail",
        f"{self._result_key}_reason": reason,
        # "completed" describes the evaluation run itself, not the tool calls.
        f"{self._result_key}_status": "completed",
        f"{self._result_key}_threshold": self._threshold,
        f"{self._result_key}_properties": {"failed_tools": failed_list},
    }
169+
144170
@override
145171
async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # type: ignore[override]
146172
"""Do Tool Call Success evaluation.
@@ -181,6 +207,16 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
181207

182208
if isinstance(eval_input.get("response"), list):
183209
eval_input["response"] = _preprocess_messages(eval_input["response"])
210+
# Short-circuit: when the runtime explicitly marks any tool_call
211+
# or tool_result with a non-success status (e.g. ``failed`` or
212+
# ``incomplete``) there is no point asking the LLM judge to
213+
# re-derive the failure from the payload -- the runtime signal
214+
# is authoritative. Return a deterministic fail result and skip
215+
# the LLM call entirely. The prompty rubric is now only
216+
# consulted on the success path (status ``completed`` or absent).
217+
failed_tools = _collect_failed_tool_calls(eval_input["response"])
218+
if failed_tools:
219+
return self._return_short_circuit_failure_result(failed_tools)
184220
eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
185221
# If response is a string, pass directly without reformatting
186222
elif isinstance(eval_input["response"], str):
@@ -271,34 +307,87 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
271307
return tool_definitions
272308

273309

274-
_FAILED_RUNTIME_STATUSES = frozenset({"failed", "incomplete"})

# Label reported when a failed block exposes neither a tool name nor a
# tool_call_id, so the failure is still surfaced instead of silently dropped.
_UNKNOWN_TOOL_LABEL = "unknown_tool"


def _collect_failed_tool_calls(messages):
    """Return ordered, unique labels of tool calls whose runtime status indicates failure.

    A tool call is treated as a runtime failure when either its assistant
    ``tool_call`` content block or its matched tool ``tool_result`` content
    block carries a ``status`` field in ``{failed, incomplete}``. The check
    runs in Python so the LLM judge is only invoked on the success path
    (status ``completed`` or absent); failed/incomplete calls are short-
    circuited deterministically.

    Labels are resolved as follows:

    * the function name, when the failing call has one;
    * otherwise the ``tool_call_id`` (stringified), so the caller can still
      surface it in ``properties.failed_tools``;
    * otherwise ``_UNKNOWN_TOOL_LABEL``, so a failure that carries no
      identifier at all still yields a non-empty result.

    Failures flagged on assistant ``tool_call`` blocks are listed first,
    followed by failures flagged on ``tool_result`` blocks, followed by
    failed calls that had a name but no id.

    :param messages: The agent response messages. Anything other than a
        list yields an empty result.
    :type messages: Any
    :return: Unique, non-empty string labels of the failed tool calls.
    :rtype: List[str]
    """
    if not isinstance(messages, list):
        return []

    id_to_name = {}
    failed_ids = []
    failed_names_without_id = []
    # Set when a failed block offers neither an id nor a name to report.
    unidentified_failure = False

    def _is_failed(status):
        # The isinstance check comes first so an unhashable status value
        # never reaches the frozenset membership test.
        return isinstance(status, str) and status in _FAILED_RUNTIME_STATUSES

    # First pass: assistant tool_call blocks. Record id -> name for every
    # call (failed or not) so a failing tool_result can be named later.
    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        contents = msg.get("content")
        if not isinstance(contents, (list, tuple)):
            continue
        for content in contents:
            if not isinstance(content, dict) or content.get("type") != "tool_call":
                continue
            nested = content.get("tool_call")
            if isinstance(nested, dict) and "function" in nested:
                # Nested shape: {"tool_call": {"id": ..., "function": {"name": ...}}}
                function = nested.get("function")
                name = function.get("name") if isinstance(function, dict) else None
                tcid = nested.get("id")
            else:
                # Flat shape: {"name": ..., "tool_call_id": ...}
                name = content.get("name")
                tcid = content.get("tool_call_id")
            # Normalise to strings so labels are hashable and joinable.
            name = str(name) if name else ""
            tcid = None if tcid is None else str(tcid)
            if tcid is not None:
                id_to_name[tcid] = name
            if _is_failed(content.get("status")):
                if tcid is not None:
                    failed_ids.append(tcid)
                elif name:
                    failed_names_without_id.append(name)
                else:
                    unidentified_failure = True

    # Second pass: tool_result blocks, matched to calls via the id carried
    # on the enclosing tool message.
    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "tool":
            continue
        contents = msg.get("content")
        if not isinstance(contents, (list, tuple)):
            continue
        tcid = msg.get("tool_call_id")
        tcid = None if tcid is None else str(tcid)
        for content in contents:
            if not isinstance(content, dict) or content.get("type") != "tool_result":
                continue
            if _is_failed(content.get("status")):
                if tcid is not None:
                    failed_ids.append(tcid)
                else:
                    unidentified_failure = True

    # Prefer the function name; fall back to the id when the name is empty.
    labels = [id_to_name.get(tcid) or tcid for tcid in failed_ids]
    labels.extend(failed_names_without_id)
    if unidentified_failure or any(not label for label in labels):
        labels.append(_UNKNOWN_TOOL_LABEL)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(label for label in labels if label))
291379

292380

293381
def _get_tool_calls_results(agent_response_msgs):
294382
"""Extract formatted agent tool calls and results from response.
295383
296-
Each emitted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line is suffixed with
297-
``[STATUS] <value>`` when the source content block carries a ``status``
298-
field. The prompty rubric uses this annotation as a strong failure signal
299-
(see ``tool_call_success.prompty``). When ``status`` is absent the suffix
300-
is omitted and the rubric falls back to payload-only judgment, so the
301-
formatted output is byte-identical to the pre-pass-through wire format.
384+
The output uses the original ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line
385+
format only; runtime ``status`` is no longer forwarded to the LLM judge.
386+
Failed/incomplete tool calls are short-circuited in Python by
387+
:func:`_collect_failed_tool_calls` before this formatter runs, so by the
388+
time the LLM sees the response every remaining call has either no
389+
status or a ``completed`` status -- the rubric judges those by payload
390+
alone.
302391
"""
303392
agent_response_text = []
304393
tool_results = {}
@@ -310,8 +399,7 @@ def _get_tool_calls_results(agent_response_msgs):
310399
for content in msg.get("content", []):
311400
if content.get("type") == "tool_result":
312401
result = content.get("tool_result")
313-
status_suffix = _format_status_suffix(content.get("status"))
314-
tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}{status_suffix}"
402+
tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
315403

316404
# Second pass: parse assistant messages and tool calls
317405
for msg in agent_response_msgs:
@@ -330,8 +418,7 @@ def _get_tool_calls_results(agent_response_msgs):
330418
func_name = content.get("name", "")
331419
args = content.get("arguments", {})
332420
args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
333-
status_suffix = _format_status_suffix(content.get("status"))
334-
call_line = f"[TOOL_CALL] {func_name}({args_str}){status_suffix}"
421+
call_line = f"[TOOL_CALL] {func_name}({args_str})"
335422
agent_response_text.append(call_line)
336423
if tool_call_id in tool_results:
337424
agent_response_text.append(tool_results[tool_call_id])

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty

Lines changed: 0 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,13 @@ B. Examine tool result and definition for the tool being called to check whether
5353
1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it:
5454
ERROR-CASES:
5555
===========
56-
- The tool call or tool result line is annotated with **`[STATUS] failed`** or **`[STATUS] incomplete`**. These annotations indicate the tool call did not produce a usable result -- either because the runtime explicitly marked the call `failed` (an exception in the tool, the API surface returned an error response) or because the call was interrupted before completion (e.g. host timeout, parent-response cancellation surfaced as `incomplete`). They are strong, authoritative failure signals and override any contradictory appearance of the result payload.
5756
- The tool call resulted in an error or exception
5857
- The tool call failed to run or failed to return
5958
- The tool call returned a result that indicates an error or failure
6059
- The tool call returned an object or JSON string that has one or more of its fields indicating an error
6160
- The tool timed out or returned a result that indicates a time-out
6261
- The tool result does not make sense from a technical perspective (as opposed to a business perspective), given the definition of that tool, if the definition is present
6362
2. If none of the error cases apply to the tool result, it is considered **succeeded** even if the tool result itself indicates a business mistake
64-
3. The `[STATUS]` annotation is **optional**. When it is absent on a tool call, judge that call by the payload-based rules above (back-compat with runtimes that do not emit a status field). When it is present and indicates success (e.g. `[STATUS] completed`), it does not by itself make a call succeed -- still apply the payload-based rules, because a runtime can report `completed` while the tool itself returned an error payload.
6563
C. If one or more tool results are **failed**, then the **evaluation process** has **failed**; otherwise, the **evaluation process** has **succeeded**
6664
D. You are required to return your **output** in the following format:
6765
{
@@ -337,61 +335,6 @@ EXPECTED OUTPUT
337335
}
338336

339337

340-
### Example - Failed (status annotation overrides bland payload)
341-
342-
[TOOL_CALLS]
343-
[TOOL_CALL] send_email(to:"alice@example.com" , body:"hi") [STATUS] failed
344-
[TOOL_RESULT] {} [STATUS] failed
345-
346-
EXPECTED OUTPUT
347-
{
348-
"reason": "send_email is annotated with [STATUS] failed on both the call and the result, which is an authoritative failure signal from the runtime even though the result body {} is otherwise inconclusive",
349-
"properties": {
350-
"failed_tools": "send_email"
351-
},
352-
"score": 0,
353-
"status": "completed"
354-
}
355-
356-
357-
### Example - Failed (status completed but payload still indicates an error)
358-
359-
[TOOL_CALLS]
360-
[TOOL_CALL] get_current_user_info() [STATUS] completed
361-
[TOOL_RESULT] {"UserName":"", "UserEmail":"", "Message":"failed to get current user information"} [STATUS] completed
362-
363-
EXPECTED OUTPUT
364-
{
365-
"reason": "The runtime reported [STATUS] completed but the result payload still indicates failure with empty fields and an explicit error message. Payload-based rules still apply when [STATUS] is completed -- this call is failed",
366-
"properties": {
367-
"failed_tools": "get_current_user_info"
368-
},
369-
"score": 0,
370-
"status": "completed"
371-
}
372-
373-
374-
### Example - Failed (parallel calls in one turn, one annotated failed)
375-
376-
[TOOL_CALLS]
377-
[TOOL_CALL] fetch_weather(city:"Seattle") [STATUS] completed
378-
[TOOL_RESULT] {"temp": 62} [STATUS] completed
379-
[TOOL_CALL] send_email(to:"x@example.com") [STATUS] failed
380-
[TOOL_RESULT] {} [STATUS] failed
381-
[TOOL_CALL] lookup_user(id:"u42") [STATUS] completed
382-
[TOOL_RESULT] {"user_id": "u42"} [STATUS] completed
383-
384-
EXPECTED OUTPUT
385-
{
386-
"reason": "send_email is annotated with [STATUS] failed; the other two parallel calls succeeded but a single failed call is sufficient to fail the overall evaluation",
387-
"properties": {
388-
"failed_tools": "send_email"
389-
},
390-
"score": 0,
391-
"status": "completed"
392-
}
393-
394-
395338

396339
Now given the **INPUT** you received generate the output
397340
# Output

0 commit comments

Comments
 (0)