1212# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313# See the License for the specific language governing permissions and
1414# limitations under the License.
15- """Unit tests for ToolFailureEvaluator model population.
16-
17- Validates that ToolFailureReasoning, ToolSummary, and _ToolCall are correctly
18- populated from both the legacy IntermediateStep lane and the ATIF lane, and
19- that error detection correctly distinguishes failures from successes.
20- """
15+ """Unit tests for ToolFailureEvaluator."""
2116
2217from __future__ import annotations
2318
3934from nat .data_models .invocation_node import InvocationNode
4035from nat .plugins .eval .evaluator .atif_evaluator import AtifEvalSample
4136from nat .plugins .eval .tool_failure_evaluator .evaluator import ToolFailureEvaluator
42- from nat .plugins .eval .tool_failure_evaluator .models import ToolFailureReasoning
37+ from nat .plugins .eval .tool_failure_evaluator .models import _ToolFailureReasoning
4338
4439_DUMMY_ANCESTRY : InvocationNode = InvocationNode (function_id = "f-0" , function_name = "test_fn" )
4540
@@ -107,18 +102,16 @@ def evaluator_fixture() -> ToolFailureEvaluator:
107102 return ToolFailureEvaluator ()
108103
109104
110- class TestLegacyLaneModelPopulation :
111- """Verify ToolFailureReasoning, ToolSummary, and _ToolCall are correctly
112- populated from legacy IntermediateStep trajectories.
113- """
105+ class TestEvaluateIntermediateStepTrajectory :
106+ """Tests for evaluating IntermediateStep trajectories."""
114107
115108 async def test_empty_trajectory_produces_default_reasoning (self , evaluator : ToolFailureEvaluator ):
116109 """An empty trajectory should yield default ToolFailureReasoning with
117110 zero counts, no failed tools, and a perfect score.
118111 """
119112 result = await evaluator .evaluate_item (_eval_input ("empty" , []))
120113
121- reasoning : ToolFailureReasoning = result .reasoning
114+ reasoning : _ToolFailureReasoning = result .reasoning
122115 assert reasoning .total_tool_calls == 0
123116 assert reasoning .failed_tool_calls == 0
124117 assert reasoning .failed_tools == []
@@ -136,7 +129,7 @@ async def test_all_failed_calls_populate_summary_with_error_details(self, evalua
136129 ]
137130 result = await evaluator .evaluate_item (_eval_input ("fail" , trajectory ))
138131
139- reasoning : ToolFailureReasoning = result .reasoning
132+ reasoning : _ToolFailureReasoning = result .reasoning
140133 assert reasoning .total_tool_calls == 2
141134 assert reasoning .failed_tool_calls == 2
142135 assert reasoning .failed_tools == ["lookup" ]
@@ -146,8 +139,8 @@ async def test_all_failed_calls_populate_summary_with_error_details(self, evalua
146139 assert summary .tool_name == "lookup"
147140 assert summary .total_calls == 2
148141 assert summary .failed_calls == 2
149- assert len (summary .attempts ) == 2
150- for attempt in summary .attempts :
142+ assert len (summary .failed_attempts ) == 2
143+ for attempt in summary .failed_attempts :
151144 assert attempt .error == "ValueError: bad input"
152145 assert attempt .output is None
153146
@@ -163,7 +156,7 @@ async def test_mixed_results_split_correctly_across_models(self, evaluator: Tool
163156 ]
164157 result = await evaluator .evaluate_item (_eval_input ("mixed" , trajectory ))
165158
166- reasoning : ToolFailureReasoning = result .reasoning
159+ reasoning : _ToolFailureReasoning = result .reasoning
167160 assert reasoning .total_tool_calls == 2
168161 assert reasoning .failed_tool_calls == 1
169162 assert reasoning .failed_tools == ["lookup" ]
@@ -173,7 +166,7 @@ async def test_mixed_results_split_correctly_across_models(self, evaluator: Tool
173166 assert reasoning .per_tool_summary [0 ].tool_name == "lookup"
174167
175168 async def test_same_tool_mixed_results_filters_attempts_to_failures_only (self , evaluator : ToolFailureEvaluator ):
176- """When a single tool has both successes and failures, ToolSummary.attempts
169+ """When a single tool has both successes and failures, ToolSummary.failed_attempts
177170 should contain only the failed _ToolCall entries while total_calls reflects all.
178171 """
179172 trajectory = [
@@ -184,13 +177,13 @@ async def test_same_tool_mixed_results_filters_attempts_to_failures_only(self, e
184177 ]
185178 result = await evaluator .evaluate_item (_eval_input ("filter" , trajectory ))
186179
187- reasoning : ToolFailureReasoning = result .reasoning
180+ reasoning : _ToolFailureReasoning = result .reasoning
188181 summary = reasoning .per_tool_summary [0 ]
189182 assert summary .total_calls == 2
190183 assert summary .failed_calls == 1
191- assert len (summary .attempts ) == 1
192- assert summary .attempts [0 ].error == "boom"
193- assert summary .attempts [0 ].input == {"q" : "bad" }
184+ assert len (summary .failed_attempts ) == 1
185+ assert summary .failed_attempts [0 ].error == "boom"
186+ assert summary .failed_attempts [0 ].input == {"q" : "bad" }
194187
195188 async def test_none_data_on_step_is_not_treated_as_error (self , evaluator : ToolFailureEvaluator ):
196189 """A TOOL_END step with data=None should count as a call but not a failure."""
@@ -220,10 +213,8 @@ async def test_missing_tool_name_recorded_as_unknown(self, evaluator: ToolFailur
220213 assert result .reasoning .per_tool_summary [0 ].tool_name == "unknown"
221214
222215
223- class TestAtifLaneModelPopulation :
224- """Verify ToolFailureReasoning, ToolSummary, and _ToolCall are correctly
225- populated from ATIF trajectories using each error detection path.
226- """
216+ class TestEvaluateAtifTrajectory :
217+ """Tests for evaluating ATIF trajectories."""
227218
228219 async def test_error_detected_via_extra_tool_errors (self , evaluator : ToolFailureEvaluator ):
229220 """Structured error metadata in step.extra['tool_errors'] should populate
@@ -242,11 +233,11 @@ async def test_error_detected_via_extra_tool_errors(self, evaluator: ToolFailure
242233 ]
243234 result = await evaluator .evaluate_atif_item (_atif_sample ("extra" , steps ))
244235
245- reasoning : ToolFailureReasoning = result .reasoning
236+ reasoning : _ToolFailureReasoning = result .reasoning
246237 assert reasoning .failed_tool_calls == 1
247238 assert reasoning .failed_tools == ["lookup" ]
248- assert reasoning .per_tool_summary [0 ].attempts [0 ].error == "ValueError: Column not found"
249- assert reasoning .per_tool_summary [0 ].attempts [0 ].input == {"query" : "q1" }
239+ assert reasoning .per_tool_summary [0 ].failed_attempts [0 ].error == "ValueError: Column not found"
240+ assert reasoning .per_tool_summary [0 ].failed_attempts [0 ].input == {"query" : "q1" }
250241
251242 async def test_error_detected_via_stringified_tool_message_dict (self , evaluator : ToolFailureEvaluator ):
252243 """A Python dict literal with status='error' in the observation content
@@ -262,7 +253,7 @@ async def test_error_detected_via_stringified_tool_message_dict(self, evaluator:
262253 result = await evaluator .evaluate_atif_item (_atif_sample ("parsed" , steps ))
263254
264255 assert result .reasoning .failed_tool_calls == 1
265- assert result .reasoning .per_tool_summary [0 ].attempts [0 ].error == "TimeoutError: timed out"
256+ assert result .reasoning .per_tool_summary [0 ].failed_attempts [0 ].error == "TimeoutError: timed out"
266257
267258 async def test_error_detected_via_raw_error_pattern (self , evaluator : ToolFailureEvaluator ):
268259 """Observation content matching 'XyzError: ...' should be detected as a
@@ -274,7 +265,7 @@ async def test_error_detected_via_raw_error_pattern(self, evaluator: ToolFailure
274265 result = await evaluator .evaluate_atif_item (_atif_sample ("pattern" , steps ))
275266
276267 assert result .reasoning .failed_tool_calls == 1
277- assert result .reasoning .per_tool_summary [0 ].attempts [0 ].error == "RuntimeError: internal failure"
268+ assert result .reasoning .per_tool_summary [0 ].failed_attempts [0 ].error == "RuntimeError: internal failure"
278269
279270 async def test_extra_tool_errors_takes_priority_over_observation_pattern (self , evaluator : ToolFailureEvaluator ):
280271 """When both extra['tool_errors'] and a raw error pattern match, the
@@ -292,7 +283,7 @@ async def test_extra_tool_errors_takes_priority_over_observation_pattern(self, e
292283 ]
293284 result = await evaluator .evaluate_atif_item (_atif_sample ("priority" , steps ))
294285
295- assert result .reasoning .per_tool_summary [0 ].attempts [0 ].error == "ValueError: from extra"
286+ assert result .reasoning .per_tool_summary [0 ].failed_attempts [0 ].error == "ValueError: from extra"
296287
297288 async def test_mixed_success_and_failure_populates_only_failing_tool (self , evaluator : ToolFailureEvaluator ):
298289 """With one successful and one failing tool, only the failing tool
0 commit comments