
Commit 3eb40a8

Update Tool Call Accuracy to output unified format
1 parent bec9bd8 commit 3eb40a8

3 files changed

Lines changed: 63 additions & 17 deletions


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -438,3 +438,26 @@ def _not_applicable_result(
         result[f"{self._result_key}_details"] = {}
 
         return result
+
+    # TODO: After all evaluator outputs are updated, we can remove the _not_applicable_result method and replace calls to it with _return_not_applicable_result, which returns a "skipped" status instead of "pass" to avoid confusion.
+    def _return_not_applicable_result(
+        self, error_message: str, threshold: Union[int, float]
+    ) -> Dict[str, Union[str, float, Dict, None]]:
+        """Return a result indicating that the tool call is not applicable for evaluation.
+
+        :param error_message: The error message indicating why the evaluation is not applicable.
+        :type error_message: str
+        :param threshold: The threshold value for the evaluation.
+        :type threshold: Union[int, float]
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float, Dict, None]]
+        """
+        return {
+            f"{self._result_key}_score": None,
+            f"{self._result_key}_result": "not_applicable",
+            f"{self._result_key}_passed": None,
+            f"{self._result_key}_reasoning": f"Not applicable: {error_message}",
+            f"{self._result_key}_status": "skipped",
+            f"{self._result_key}_threshold": threshold,
+            f"{self._result_key}_properties": {},
+        }
```
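
For orientation, here is a minimal standalone sketch of what this helper emits. The `tool_call_accuracy` result key and the threshold of 3 are hypothetical values for illustration, not taken from this commit:

```python
from typing import Dict, Union

def return_not_applicable_result(
    result_key: str, error_message: str, threshold: Union[int, float]
) -> Dict[str, Union[str, float, Dict, None]]:
    # Mirrors the new helper: null score, "skipped" status, empty properties.
    return {
        f"{result_key}_score": None,
        f"{result_key}_result": "not_applicable",
        f"{result_key}_passed": None,
        f"{result_key}_reasoning": f"Not applicable: {error_message}",
        f"{result_key}_status": "skipped",
        f"{result_key}_threshold": threshold,
        f"{result_key}_properties": {},
    }

# Hypothetical usage; the message reuses _TOOL_DEFINITIONS_MISSING_MESSAGE
# from the evaluator class in the next file's diff.
print(return_not_applicable_result(
    "tool_call_accuracy",
    "Tool definitions for all tool calls must be provided.",
    threshold=3,
))
```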

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 23 additions & 13 deletions
```diff
@@ -86,7 +86,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
     _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
 
-    _LLM_SCORE_KEY = "tool_calls_success_level"
+    _LLM_SCORE_KEY = "score"
 
     _validator: ValidatorInterface
 
@@ -256,6 +256,12 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
         if isinstance(llm_output, dict):
+            # Handle skipped status from LLM
+            llm_status = llm_output.get("status", "completed")
+            if llm_status == "skipped":
+                reason = llm_output.get("reasoning", "")
+                return self._return_not_applicable_result(reason, self.threshold)
+
             score = llm_output.get(self._LLM_SCORE_KEY, None)
             if not score or not check_score_is_valid(
                 score,
@@ -271,23 +277,27 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 )
 
             # Format the output
-            reason = llm_output.get("chain_of_thought", "")
+            reason = llm_output.get("reasoning", "")
             score = float(score)
             score_result = "pass" if score >= self.threshold else "fail"
+            llm_properties = llm_output.get("properties", {})
+            llm_properties.update({
+                "prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                "completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                "total_tokens": prompty_output_dict.get("total_token_count", 0),
+                "finish_reason": prompty_output_dict.get("finish_reason", ""),
+                "model": prompty_output_dict.get("model_id", ""),
+                "sample_input": prompty_output_dict.get("sample_input", ""),
+                "sample_output": prompty_output_dict.get("sample_output", ""),
+            })
             response_dict = {
-                self._result_key: score,
-                f"gpt_{self._result_key}": score,
+                f"{self._result_key}_score": score,
                 f"{self._result_key}_result": score_result,
+                f"{self._result_key}_passed": score_result == "pass",
+                f"{self._result_key}_reasoning": reason,
+                f"{self._result_key}_status": "completed",
                 f"{self._result_key}_threshold": self._threshold,
-                f"{self._result_key}_reason": reason,
-                f"{self._result_key}_details": llm_output.get("details", {}),
-                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
-                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
-                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
-                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
-                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
-                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
-                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+                f"{self._result_key}_properties": llm_properties,
             }
             return response_dict
 
```
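
To make the key changes concrete, the sketch below shows the unified "completed" payload this method now assembles. All values are fabricated; the `tool_call_accuracy` result key and threshold of 3 are assumptions for illustration:

```python
# Illustrative only: metadata that used to live in flat top-level keys such as
# "<key>_prompt_tokens" and "<key>_model" now nests under "<key>_properties".
completed_result = {
    "tool_call_accuracy_score": 4.0,
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_passed": True,
    "tool_call_accuracy_reasoning": "Let's think step by step: ...",
    "tool_call_accuracy_status": "completed",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_properties": {
        "tool_calls_made_by_agent": 2,          # copied from the LLM's "properties"
        "correct_tool_calls_made_by_agent": 2,
        "per_tool_call_details": [],
        "prompt_tokens": 512,                   # merged in from prompty_output_dict
        "completion_tokens": 64,
        "total_tokens": 576,
        "finish_reason": "stop",
        "model": "example-model-id",            # hypothetical model id
        "sample_input": "",
        "sample_output": "",
    },
}

# The pass/fail fields stay consistent with the score and threshold.
assert completed_result["tool_call_accuracy_passed"] == (
    completed_result["tool_call_accuracy_score"]
    >= completed_result["tool_call_accuracy_threshold"]
)
```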

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty

Lines changed: 17 additions & 4 deletions
````diff
@@ -54,6 +54,16 @@ Evaluate based on these factors:
 
 **Tool Assessment**: Focus solely on appropriate use of available tools, not on capabilities beyond what tools can provide.
 
+## Status: Skipped
+Before performing any evaluation, check for the following conditions. If ANY are true, return `status: "skipped"` immediately without scoring:
+1. **No tool calls to evaluate**: The TOOL CALLS TO BE EVALUATED section is empty (tool calls appearing only in the CONVERSATION section do not count).
+2. **Missing tool definitions**: Any tool call in TOOL CALLS TO BE EVALUATED references a tool that is not present in the TOOL DEFINITIONS.
+
+When skipped, return:
+```json
+{"score": null, "reasoning": "<explain why evaluation was skipped>", "status": "skipped", "properties": null}
+```
+
 
 # Ratings
 ## [Tool Call Accuracy: 1] (Irrelevant)
@@ -139,10 +149,13 @@ TOOL DEFINITIONS: {{tool_definitions}}
 
 # Tasks
 ## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
-Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
-- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'.
-- tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
-- details: a dictionary that contains the following keys:
+Your output should consist only of a JSON object that has the following keys:
+- score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped".
+- status: a string indicating the evaluation status. Must be one of:
+  - "completed": tool calls were present, tool definitions were available, and evaluation was performed.
+  - "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null.
+- reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped.
+- properties: a dictionary that contains the following keys:
   - tool_calls_made_by_agent: total number of tool calls made by the agent
   - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
   - per_tool_call_details: a list of dictionaries, each containing:
````
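
As a sanity check on this contract, here is a small sketch that parses a model reply and branches on `status` the way the updated `_do_eval` does; the payload is fabricated for illustration:

```python
import json

# Fabricated reply following the "skipped" contract defined in the prompty.
raw = (
    '{"score": null, '
    '"reasoning": "Let\'s think step by step: there are no tool calls to evaluate.", '
    '"status": "skipped", "properties": null}'
)

llm_output = json.loads(raw)
if llm_output.get("status", "completed") == "skipped":
    # Short-circuit, as _do_eval does via _return_not_applicable_result.
    print("skipped:", llm_output.get("reasoning", ""))
else:
    score = int(llm_output["score"])  # an integer from 1 to 5 per the rating scale
    print("score:", score)
```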
