Skip to content

Commit d7018a9

Browse files
committed
Add tool_failure evaluator with error capture through callback handler and ATIF converter
Signed-off-by: Eric Evans <194135482+ericevans-nv@users.noreply.github.com>
1 parent f51c41c commit d7018a9

10 files changed

Lines changed: 1033 additions & 26 deletions

File tree

packages/nvidia_nat_core/src/nat/utils/atif_converter.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,29 @@ def _safe_str(value: Any) -> str:
8888
return str(value)
8989

9090

91+
def _extract_tool_error(output: Any) -> dict[str, str] | None:
92+
"""Extract error metadata from a tool output for ``step.extra["tool_errors"]``."""
93+
# TODO: return a model instead of a plain dict once ATIF spec adds error support
94+
status: str | None = getattr(output, "status", None) or (output.get("status") if isinstance(output, dict) else None)
95+
if status != "error":
96+
return None
97+
content: str = (getattr(output, "content", None) or (output.get("content") if isinstance(output, dict) else None)
98+
or _safe_str(output))
99+
error_type: str = "Unknown"
100+
error_message: str = content
101+
if ":" in content:
102+
candidate: str = content.split(":", 1)[0].strip()
103+
if candidate.isidentifier():
104+
error_type = candidate
105+
error_message = content.split(":", 1)[1].strip()
106+
return {
107+
"error": content,
108+
"error_type": error_type,
109+
"error_message": error_message,
110+
"status": "error",
111+
}
112+
113+
91114
def _extract_user_input(value: Any) -> str:
92115
"""Extract the user-facing input text from a workflow start payload.
93116
@@ -278,15 +301,26 @@ def _flush_pending() -> None:
278301
tool_name = ist.name or "unknown_tool"
279302
tool_input: dict[str, Any] = {}
280303
tool_output = ""
304+
raw_output: Any = None
305+
281306
if ist.data:
282307
tool_input = _parse_tool_arguments(ist.data.input)
283-
tool_output = _safe_str(ist.data.output)
308+
raw_output = ist.data.output
309+
tool_output = _safe_str(raw_output)
284310
call_id = f"call_{ist.UUID}"
285311
tc = ATIFToolCall(tool_call_id=call_id, function_name=tool_name, arguments=tool_input)
286312
obs = ATIFObservationResult(source_call_id=call_id, content=tool_output)
313+
tool_error: dict[str, str] | None = _extract_tool_error(raw_output)
314+
315+
if tool_error is not None:
316+
tool_error["tool"] = tool_name
317+
extra: dict[str, Any] | None = ({"tool_errors": [tool_error]} if tool_error else None)
318+
287319
if pending is not None:
288320
pending.tool_calls.append(tc)
289321
pending.observations.append(obs)
322+
if tool_error:
323+
pending.extra.setdefault("tool_errors", []).append(tool_error)
290324
else:
291325
atif_steps.append(
292326
ATIFStep(
@@ -296,6 +330,7 @@ def _flush_pending() -> None:
296330
timestamp=_epoch_to_iso(ist.event_timestamp),
297331
tool_calls=[tc],
298332
observation=ATIFObservation(results=[obs]),
333+
extra=extra,
299334
))
300335
step_id += 1
301336
continue
@@ -434,24 +469,33 @@ def push(self, ist: IntermediateStep) -> ATIFStep | None:
434469
tool_name = ist.name or "unknown_tool"
435470
tool_input: dict[str, Any] = {}
436471
tool_output = ""
472+
raw_output: Any = None
437473
if ist.data:
438474
tool_input = _parse_tool_arguments(ist.data.input)
439-
tool_output = _safe_str(ist.data.output)
475+
raw_output = ist.data.output
476+
tool_output = _safe_str(raw_output)
440477
call_id = f"call_{ist.UUID}"
441478
tc = ATIFToolCall(tool_call_id=call_id, function_name=tool_name, arguments=tool_input)
442479
obs = ATIFObservationResult(source_call_id=call_id, content=tool_output)
480+
tool_error: dict[str, str] | None = _extract_tool_error(raw_output)
481+
if tool_error is not None:
482+
tool_error["tool"] = tool_name
443483
if self._pending is not None:
444484
self._pending.tool_calls.append(tc)
445485
self._pending.observations.append(obs)
486+
if tool_error:
487+
self._pending.extra.setdefault("tool_errors", []).append(tool_error)
446488
return None
447489

490+
extra: dict[str, Any] | None = ({"tool_errors": [tool_error]} if tool_error else None)
448491
orphan_step = ATIFStep(
449492
step_id=self._step_id,
450493
source="agent",
451494
message="",
452495
timestamp=_epoch_to_iso(ist.event_timestamp),
453496
tool_calls=[tc],
454497
observation=ATIFObservation(results=[obs]),
498+
extra=extra,
455499
)
456500
self._step_id += 1
457501
self._emitted_steps.append(orphan_step)

packages/nvidia_nat_core/tests/nat/utils/test_atif_converter.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""Tests for the ATIF converter."""
1616

1717
import pytest
18+
from langchain_core.messages import ToolMessage
1819

1920
from nat.data_models.atif import ATIFTrajectory
2021
from nat.data_models.intermediate_step import IntermediateStep
@@ -505,3 +506,129 @@ def test_stream_matches_batch(
505506
assert s_step.message == b_step.message
506507
if b_step.tool_calls:
507508
assert len(s_step.tool_calls) == len(b_step.tool_calls)
509+
510+
511+
# ---------------------------------------------------------------------------
512+
# Tool error → ATIF conversion tests
513+
# ---------------------------------------------------------------------------
514+
515+
516+
@pytest.fixture(name="error_trajectory")
def fixture_error_trajectory() -> list[IntermediateStep]:
    """Build a trajectory containing one successful tool call followed by one failed tool call."""
    # The failing tool reports its error through a ToolMessage flagged with status="error".
    failed_message: ToolMessage = ToolMessage(content="ValueError: bad input",
                                              name="failing_tool",
                                              tool_call_id="failing_tool",
                                              status="error")

    workflow_start = _make_step(IntermediateStepType.WORKFLOW_START, input_data="Do something", timestamp_offset=0.0)
    llm_end = _make_step(
        IntermediateStepType.LLM_END,
        name="gpt-4",
        output_data="calling tools",
        timestamp_offset=1.0,
        usage=_make_usage(100, 20),
    )
    good_tool_end = _make_step(
        IntermediateStepType.TOOL_END,
        name="good_tool",
        input_data={"q": "hello"},
        output_data="success",
        timestamp_offset=2.0,
        step_uuid="tool-good",
    )
    failing_tool_end = _make_step(
        IntermediateStepType.TOOL_END,
        name="failing_tool",
        input_data={"q": "fail"},
        output_data=failed_message,
        timestamp_offset=3.0,
        step_uuid="tool-fail",
    )
    workflow_end = _make_step(IntermediateStepType.WORKFLOW_END, output_data="partial", timestamp_offset=4.0)

    return [workflow_start, llm_end, good_tool_end, failing_tool_end, workflow_end]
546+
547+
548+
class TestToolErrorATIFConversion:
    """Verify tool errors in IntermediateStepPayload are converted to ATIF step.extra['tool_errors']."""

    def test_error_dict_has_all_required_keys(
        self,
        batch_converter: IntermediateStepToATIFConverter,
        error_trajectory: list[IntermediateStep],
    ):
        """Each tool_errors entry contains exactly the expected keys."""
        result: ATIFTrajectory = batch_converter.convert(error_trajectory)
        agent_step = result.steps[1]
        errors: list = agent_step.extra["tool_errors"]
        assert len(errors) == 1
        assert set(errors[0].keys()) == {"tool", "error", "error_type", "error_message", "status"}

    def test_error_dict_values_are_parsed_from_content(
        self,
        batch_converter: IntermediateStepToATIFConverter,
        error_trajectory: list[IntermediateStep],
    ):
        """The error dict splits the exception type from the message and preserves the full error string."""
        result: ATIFTrajectory = batch_converter.convert(error_trajectory)
        entry: dict = result.steps[1].extra["tool_errors"][0]
        assert entry["tool"] == "failing_tool"
        assert entry["status"] == "error"
        assert entry["error"] == "ValueError: bad input"
        assert entry["error_type"] == "ValueError"
        assert entry["error_message"] == "bad input"

    def test_error_dict_falls_back_to_unknown_type(self):
        """Error content without a parseable exception type defaults to 'Unknown'."""
        error_output: ToolMessage = ToolMessage(
            content="something went wrong",
            name="broken_tool",
            tool_call_id="broken_tool",
            status="error",
        )
        trajectory: list[IntermediateStep] = [
            _make_step(IntermediateStepType.WORKFLOW_START, input_data="q", timestamp_offset=0.0),
            _make_step(IntermediateStepType.LLM_END,
                       name="gpt-4",
                       output_data="calling",
                       timestamp_offset=1.0,
                       usage=_make_usage(10, 5)),
            _make_step(IntermediateStepType.TOOL_END,
                       name="broken_tool",
                       input_data={},
                       output_data=error_output,
                       timestamp_offset=2.0,
                       step_uuid="tool-broken"),
            _make_step(IntermediateStepType.WORKFLOW_END, output_data="done", timestamp_offset=3.0),
        ]
        result: ATIFTrajectory = IntermediateStepToATIFConverter().convert(trajectory)
        entry: dict = result.steps[1].extra["tool_errors"][0]
        assert entry["error_type"] == "Unknown"
        assert entry["error_message"] == "something went wrong"

    def test_successful_tool_has_no_tool_errors(
        self,
        batch_converter: IntermediateStepToATIFConverter,
        simple_trajectory: list[IntermediateStep],
    ):
        """Successful tool calls do not produce tool_errors entries in the ATIF output."""
        result: ATIFTrajectory = batch_converter.convert(simple_trajectory)
        for step in result.steps:
            assert not (step.extra or {}).get("tool_errors")

    def test_stream_and_batch_produce_same_errors(
        self,
        batch_converter: IntermediateStepToATIFConverter,
        error_trajectory: list[IntermediateStep],
    ):
        """Both converter code paths produce identical tool_errors for the same input trajectory."""
        batch_result: ATIFTrajectory = batch_converter.convert(error_trajectory)
        stream_conv: ATIFStreamConverter = ATIFStreamConverter()
        for ist in error_trajectory:
            stream_conv.push(ist)
        stream_conv.finalize()
        stream_result: ATIFTrajectory = stream_conv.get_trajectory()

        def _collect_errors(trajectory: ATIFTrajectory) -> list[dict]:
            """Gather every tool_errors entry across all steps, in step order."""
            errors: list[dict] = []
            for step in trajectory.steps:
                errors.extend((step.extra or {}).get("tool_errors", []))
            return errors

        batch_errors: list[dict] = _collect_errors(batch_result)
        # Guard against a vacuous pass: two empty lists would compare equal
        # even if neither converter captured the failing tool call.
        assert batch_errors, "expected the batch converter to report the failing tool call"
        assert batch_errors == _collect_errors(stream_result)

packages/nvidia_nat_eval/src/nat/plugins/eval/register.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,6 @@
2222
from .dataset_loader.register import register_jsonl_dataset_loader
2323
from .dataset_loader.register import register_parquet_dataset_loader
2424
from .dataset_loader.register import register_xls_dataset_loader
25+
26+
# Evaluators
27+
from .tool_failure_evaluator.register import register_tool_failure_evaluator
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
from .evaluator import ToolFailureEvaluator
17+
from .models import ToolFailureReasoning
18+
from .models import ToolSummary
19+
from .register import ToolFailureEvaluatorConfig
20+
21+
__all__ = [
22+
"ToolFailureEvaluator",
23+
"ToolFailureEvaluatorConfig",
24+
"ToolFailureReasoning",
25+
"ToolSummary",
26+
]

0 commit comments

Comments
 (0)