Skip to content

Commit ea272f0

Browse files
committed
Add status to prompty
1 parent b90b3e6 commit ea272f0

4 files changed

Lines changed: 62 additions & 18 deletions

File tree

assets/evaluators/builtin/tool_call_accuracy/evaluator/_tool_call_accuracy.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
974974
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
975975

976976
if isinstance(llm_output, dict):
977+
# Handle skipped status from LLM
978+
llm_status = llm_output.get("status", "completed")
979+
if llm_status == "skipped":
980+
reason = llm_output.get("reasoning", "")
981+
return self._not_applicable_result(reason, self.threshold)
982+
977983
score = llm_output.get(self._LLM_SCORE_KEY, None)
978984
if not score or not check_score_is_valid(
979985
score,
@@ -1004,12 +1010,13 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
10041010
"sample_output": prompty_output_dict.get("sample_output", ""),
10051011
})
10061012
response_dict = {
1007-
f"{self.result_key}_score": score,
1008-
f"{self.result_key}_result": score_result,
1009-
f"{self.result_key}_reasoning": reason,
1010-
f"{self.result_key}_status": "completed",
1011-
f"{self.result_key}_threshold": self._threshold,
1012-
f"{self.result_key}_properties": llm_properties,
1013+
f"{self._result_key}_score": score,
1014+
f"{self._result_key}_result": score_result,
1015+
f"{self._result_key}_passed": score_result == "pass",
1016+
f"{self._result_key}_reasoning": reason,
1017+
f"{self._result_key}_status": "completed",
1018+
f"{self._result_key}_threshold": self._threshold,
1019+
f"{self._result_key}_properties": llm_properties,
10131020
}
10141021
return response_dict
10151022

@@ -1054,23 +1061,24 @@ async def _real_call(self, **kwargs):
10541061

10551062
def _not_applicable_result(
10561063
self, error_message: str, threshold: Union[int, float]
1057-
) -> Dict[str, Union[str, float, Dict]]:
1064+
) -> Dict[str, Union[str, float, Dict, None]]:
10581065
"""Return a result indicating that the tool call is not applicable for evaluation.
10591066
10601067
:param error_message: The error message indicating why the evaluation is not applicable.
10611068
:type error_message: str
10621069
:param threshold: The threshold value for the evaluation.
10631070
:type threshold: Union[int, float]
10641071
:return: A dictionary containing the result of the evaluation.
1065-
:rtype: Dict[str, Union[str, float]]
1072+
:rtype: Dict[str, Union[str, float, None]]
10661073
"""
10671074
return {
1068-
f"{self.result_key}_score": None,
1069-
f"{self.result_key}_result": "not_applicable",
1070-
f"{self.result_key}_reasoning": f"Not applicable: {error_message}",
1071-
f"{self.result_key}_status": "skipped",
1072-
f"{self.result_key}_threshold": threshold,
1073-
f"{self.result_key}_properties": {},
1075+
f"{self._result_key}_score": None,
1076+
f"{self._result_key}_result": "not_applicable",
1077+
f"{self._result_key}_passed": None,
1078+
f"{self._result_key}_reasoning": f"Not applicable: {error_message}",
1079+
f"{self._result_key}_status": "skipped",
1080+
f"{self._result_key}_threshold": threshold,
1081+
f"{self._result_key}_properties": {},
10741082
}
10751083

10761084
def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):

assets/evaluators/builtin/tool_call_accuracy/evaluator/tool_call_accuracy.prompty

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,16 @@ Evaluate based on these factors:
5454

5555
**Tool Assessment**: Focus solely on appropriate use of available tools, not on capabilities beyond what tools can provide.
5656

57+
## Status: Skipped
58+
Before performing any evaluation, check the following conditions. If ANY of them is true, return `status: "skipped"` immediately without scoring:
59+
1. **No tool calls to evaluate**: The TOOL CALLS TO BE EVALUATED section is empty (tool calls appearing only in the CONVERSATION section do not count).
60+
2. **Missing tool definitions**: Any tool call in TOOL CALLS TO BE EVALUATED references a tool that is not present in the TOOL DEFINITIONS.
61+
62+
When skipped, return:
63+
```json
64+
{"score": null, "reasoning": "<explain why evaluation was skipped>", "status": "skipped", "properties": null}
65+
```
66+
5767

5868
# Ratings
5969
## [Tool Call Accuracy: 1] (Irrelevant)
@@ -140,8 +150,11 @@ TOOL DEFINITIONS: {{tool_definitions}}
140150
# Tasks
141151
## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
142152
Your output should consist only of a JSON object that has the following keys:
143-
- score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
144-
- reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'.
153+
- score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped".
154+
- status: a string indicating the evaluation status. Must be one of:
155+
- "completed": tool calls were present, tool definitions were available, and evaluation was performed.
156+
- "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null.
157+
- reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. When status is "completed", start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped instead.
145158
- properties: a dictionary that contains the following keys:
146159
- tool_calls_made_by_agent: total number of tool calls made by the agent
147160
- correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent

assets/evaluators/tests/common/base_evaluator_runner.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ def _extract_and_print_result(self, results: Dict[str, Any], test_label: str) ->
156156
Dictionary with standardized result fields.
157157
"""
158158
score = results.get(self.result_key)
159+
if score is None:
160+
score = results.get(f"{self.result_key}_score")
159161

160162
if f"{self.result_key}_error_message" not in results and score != "not applicable":
161163
for field in self.expected_result_fields:
@@ -246,15 +248,23 @@ def assert_pass(self, result_data: Dict[str, Any]):
246248
self._assert_pass_result(result_data)
247249

248250
def assert_not_applicable(self, result_data: Dict[str, Any]):
249-
"""Assert a not-applicable result (intermediate response).
251+
"""Assert a not-applicable result (intermediate response or skipped evaluation).
250252
251253
Args:
252254
result_data: Dictionary containing evaluation result data.
253255
254256
Raises:
255257
AssertionError: If the result is not a valid not-applicable result.
256258
"""
257-
self._assert_pass_result(result_data)
259+
label_key = "label"
260+
score_key = "score"
261+
if result_data[label_key] == "not_applicable":
262+
assert result_data[label_key] == "not_applicable", \
263+
f"Expected 'not_applicable' but got '{result_data[label_key]}'"
264+
assert result_data[score_key] is None, \
265+
f"Expected score to be None for not-applicable result but got '{result_data[score_key]}'"
266+
else:
267+
self._assert_pass_result(result_data)
258268
assert "Not applicable" in result_data.get("reason", ""), \
259269
f"Expected reason to contain 'Not applicable' but got '{result_data.get('reason')}'"
260270

assets/evaluators/tests/test_evaluators_quality/test_tool_call_accuracy_evaluator_quality.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""Quality tests for Tool Call Accuracy Evaluator with real flow execution."""
55

66
import pytest
7+
from typing import List
78
from ..common.base_quality_evaluator_runner import BaseQualityEvaluatorRunner, ExpectedResult
89
from ...builtin.tool_call_accuracy.evaluator._tool_call_accuracy import ToolCallAccuracyEvaluator
910

@@ -16,6 +17,18 @@ class TestToolCallAccuracyEvaluatorQuality(BaseQualityEvaluatorRunner):
1617
Tests actual LLM evaluation with real flow execution (no mocking).
1718
"""
1819

20+
@property
21+
def expected_result_fields(self) -> List[str]:
22+
"""Get the expected result fields for tools evaluators."""
23+
return [
24+
f"{self.result_key}_score",
25+
f"{self.result_key}_reasoning",
26+
f"{self.result_key}_status",
27+
f"{self.result_key}_threshold",
28+
f"{self.result_key}_result",
29+
f"{self.result_key}_properties",
30+
]
31+
1932
evaluator_type = ToolCallAccuracyEvaluator
2033

2134
def test_pass_single_call(self) -> None:

0 commit comments

Comments
 (0)