Skip to content

Commit aa848fe

Browse files
Copilot and m7md7sien authored
Rename tool_call_accuracy reasoning output to reason and update skipped properties handling (#46355)
Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/89b3b528-f2ac-4284-88fb-c484d4c0cce1 Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
1 parent f454ee3 commit aa848fe

5 files changed

Lines changed: 21 additions & 21 deletions

File tree

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -456,8 +456,8 @@ def _return_not_applicable_result(
456456
f"{self._result_key}_score": None,
457457
f"{self._result_key}_result": "not_applicable",
458458
f"{self._result_key}_passed": None,
459-
f"{self._result_key}_reasoning": f"Not applicable: {error_message}",
459+
f"{self._result_key}_reason": f"Not applicable: {error_message}",
460460
f"{self._result_key}_status": "skipped",
461461
f"{self._result_key}_threshold": threshold,
462-
f"{self._result_key}_properties": {},
462+
f"{self._result_key}_properties": None,
463463
}

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -258,7 +258,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
258258
# Handle skipped status from LLM
259259
llm_status = llm_output.get("status", "completed")
260260
if llm_status == "skipped":
261-
reason = llm_output.get("reasoning", "")
261+
reason = llm_output.get("reason", "")
262262
return self._return_not_applicable_result(reason, self.threshold)
263263

264264
score = llm_output.get(self._LLM_SCORE_KEY, None)
@@ -276,10 +276,10 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
276276
)
277277

278278
# Format the output
279-
reason = llm_output.get("reasoning", "")
279+
reason = llm_output.get("reason", "")
280280
score = float(score)
281281
score_result = "pass" if score >= self.threshold else "fail"
282-
llm_properties = llm_output.get("properties", {})
282+
llm_properties = llm_output.get("properties", {}) or {}
283283
llm_properties.update(
284284
{
285285
"prompt_tokens": prompty_output_dict.get("input_token_count", 0),
@@ -295,7 +295,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
295295
f"{self._result_key}_score": score,
296296
f"{self._result_key}_result": score_result,
297297
f"{self._result_key}_passed": score_result == "pass",
298-
f"{self._result_key}_reasoning": reason,
298+
f"{self._result_key}_reason": reason,
299299
f"{self._result_key}_status": "completed",
300300
f"{self._result_key}_threshold": self._threshold,
301301
f"{self._result_key}_properties": llm_properties,

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ Before performing any evaluation, check for the following conditions. If ANY are
6161

6262
When skipped, return:
6363
```json
64-
{"score": null, "reasoning": "<explain why evaluation was skipped>", "status": "skipped", "properties": null}
64+
{"reason": "<explain why evaluation was skipped>", "score": null, "status": "skipped", "properties": null}
6565
```
6666

6767

@@ -150,11 +150,11 @@ TOOL DEFINITIONS: {{tool_definitions}}
150150
# Tasks
151151
## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
152152
Your output should consist only of a JSON object that has the following keys:
153+
- reason: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped.
153154
- score: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level. Set to null when status is "skipped".
154155
- status: a string indicating the evaluation status. Must be one of:
155156
- "completed": tool calls were present, tool definitions were available, and evaluation was performed.
156157
- "skipped": evaluation was not performed because there were no tool calls to evaluate, or tool definitions were missing for the tool calls. When skipped, set score to null and properties to null.
157-
- reasoning: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'. When status is "skipped", explain why the evaluation was skipped.
158158
- properties: a dictionary that contains the following keys:
159159
- tool_calls_made_by_agent: total number of tool calls made by the agent
160160
- correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
@@ -176,4 +176,4 @@ Your output should consist only of a JSON object that has the following keys:
176176
- tool_name: name of the tool
177177
- missing_count: number of missing calls for this query
178178

179-
# Output
179+
# Output

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
7070
assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_result"] == "not_applicable"
7171
assert result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_status"] == "skipped"
7272
assert (
73-
"not applicable" in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reasoning"].lower()
73+
"not applicable" in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"].lower()
7474
and ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
75-
in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reasoning"]
75+
in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
7676
)

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ async def flow_side_effect(timeout, **kwargs):
4848
# Return a non-numeric score to trigger an exception in the evaluator's check_score_is_valid
4949
return {
5050
"llm_output": {
51-
"reasoning": "The tool calls were very correct that I returned a huge number!",
51+
"reason": "The tool calls were very correct that I returned a huge number!",
5252
"score": 25,
5353
"properties": {},
5454
}
@@ -63,7 +63,7 @@ async def flow_side_effect(timeout, **kwargs):
6363

6464
return {
6565
"llm_output": {
66-
"reasoning": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
66+
"reason": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
6767
"score": score,
6868
"properties": {
6969
"tool_calls_made_by_agent": total_calls,
@@ -134,8 +134,8 @@ def test_evaluate_tools_valid1(self, mock_model_config):
134134
assert result[f"{key}_score"] == 3.0 # Mixed good/bad gets score 3
135135
assert result[f"{key}_result"] == "pass"
136136
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
137-
assert f"{key}_reasoning" in result
138-
assert result[f"{key}_reasoning"] == "Evaluated 2 tool calls with 1 correct calls."
137+
assert f"{key}_reason" in result
138+
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
139139
assert f"{key}_properties" in result
140140

141141
def test_evaluate_tools_valid2(self, mock_model_config):
@@ -196,8 +196,8 @@ def test_evaluate_tools_valid2(self, mock_model_config):
196196
assert result[f"{key}_score"] == 1.0 # All bad gets score 1
197197
assert result[f"{key}_result"] == "fail"
198198
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
199-
assert f"{key}_reasoning" in result
200-
assert result[f"{key}_reasoning"] == "Evaluated 2 tool calls with 0 correct calls."
199+
assert f"{key}_reason" in result
200+
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
201201
assert f"{key}_properties" in result
202202

203203
def test_evaluate_tools_valid3(self, mock_model_config):
@@ -258,8 +258,8 @@ def test_evaluate_tools_valid3(self, mock_model_config):
258258
assert result[f"{key}_score"] == 5.0 # All good gets score 5
259259
assert result[f"{key}_result"] == "pass"
260260
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
261-
assert f"{key}_reasoning" in result
262-
assert result[f"{key}_reasoning"] == "Evaluated 2 tool calls with 2 correct calls."
261+
assert f"{key}_reason" in result
262+
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
263263
assert f"{key}_properties" in result
264264

265265
def test_evaluate_tools_one_eval_fails(self, mock_model_config):
@@ -383,8 +383,8 @@ def test_evaluate_tools_built_in_tool_definition(self, mock_model_config):
383383
assert result[f"{key}_score"] == 5.0 # All good gets score 5
384384
assert result[f"{key}_result"] == "pass"
385385
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
386-
assert f"{key}_reasoning" in result
387-
assert result[f"{key}_reasoning"] == "Evaluated 1 tool calls with 1 correct calls."
386+
assert f"{key}_reason" in result
387+
assert result[f"{key}_reason"] == "Evaluated 1 tool calls with 1 correct calls."
388388
assert f"{key}_properties" in result
389389

390390
def test_evaluate_tools_no_tools(self, mock_model_config):

0 commit comments

Comments
 (0)