Skip to content

Commit d3c4092

Browse files
committed
Update tests
1 parent 3eb40a8 commit d3c4092

1 file changed

Lines changed: 34 additions & 34 deletions

File tree

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@ async def flow_side_effect(timeout, **kwargs):
4848
# Return a non-numeric score to trigger an exception in the evaluator's check_score_is_valid
4949
return {
5050
"llm_output": {
51-
"chain_of_thought": "The tool calls were very correct that I returned a huge number!",
52-
"tool_calls_success_level": 25,
53-
"details": {},
51+
"reasoning": "The tool calls were very correct that I returned a huge number!",
52+
"score": 25,
53+
"properties": {},
5454
}
5555
}
5656

@@ -63,9 +63,9 @@ async def flow_side_effect(timeout, **kwargs):
6363

6464
return {
6565
"llm_output": {
66-
"chain_of_thought": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
67-
"tool_calls_success_level": score,
68-
"details": {
66+
"reasoning": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
67+
"score": score,
68+
"properties": {
6969
"tool_calls_made_by_agent": total_calls,
7070
"correct_tool_calls_made_by_agent": total_good_calls,
7171
},
@@ -130,13 +130,13 @@ def test_evaluate_tools_valid1(self, mock_model_config):
130130

131131
key = ToolCallAccuracyEvaluator._RESULT_KEY
132132
assert result is not None
133-
assert key in result and f"{key}_result" in result and f"{key}_threshold" in result
134-
assert result[key] == 3.0 # Mixed good/bad gets score 3
133+
assert f"{key}_score" in result and f"{key}_result" in result and f"{key}_threshold" in result
134+
assert result[f"{key}_score"] == 3.0 # Mixed good/bad gets score 3
135135
assert result[f"{key}_result"] == "pass"
136136
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
137-
assert f"{key}_reason" in result
138-
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
139-
assert f"{key}_details" in result
137+
assert f"{key}_reasoning" in result
138+
assert result[f"{key}_reasoning"] == "Evaluated 2 tool calls with 1 correct calls."
139+
assert f"{key}_properties" in result
140140

141141
def test_evaluate_tools_valid2(self, mock_model_config):
142142
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -192,13 +192,13 @@ def test_evaluate_tools_valid2(self, mock_model_config):
192192

193193
key = ToolCallAccuracyEvaluator._RESULT_KEY
194194
assert result is not None
195-
assert key in result and f"{key}_result" in result and f"{key}_threshold" in result
196-
assert result[key] == 1.0 # All bad gets score 1
195+
assert f"{key}_score" in result and f"{key}_result" in result and f"{key}_threshold" in result
196+
assert result[f"{key}_score"] == 1.0 # All bad gets score 1
197197
assert result[f"{key}_result"] == "fail"
198198
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
199-
assert f"{key}_reason" in result
200-
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
201-
assert f"{key}_details" in result
199+
assert f"{key}_reasoning" in result
200+
assert result[f"{key}_reasoning"] == "Evaluated 2 tool calls with 0 correct calls."
201+
assert f"{key}_properties" in result
202202

203203
def test_evaluate_tools_valid3(self, mock_model_config):
204204
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -254,13 +254,13 @@ def test_evaluate_tools_valid3(self, mock_model_config):
254254

255255
key = ToolCallAccuracyEvaluator._RESULT_KEY
256256
assert result is not None
257-
assert key in result and f"{key}_result" in result and f"{key}_threshold" in result
258-
assert result[key] == 5.0 # All good gets score 5
257+
assert f"{key}_score" in result and f"{key}_result" in result and f"{key}_threshold" in result
258+
assert result[f"{key}_score"] == 5.0 # All good gets score 5
259259
assert result[f"{key}_result"] == "pass"
260260
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
261-
assert f"{key}_reason" in result
262-
assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
263-
assert f"{key}_details" in result
261+
assert f"{key}_reasoning" in result
262+
assert result[f"{key}_reasoning"] == "Evaluated 2 tool calls with 2 correct calls."
263+
assert f"{key}_properties" in result
264264

265265
def test_evaluate_tools_one_eval_fails(self, mock_model_config):
266266
with pytest.raises(EvaluationException) as exc_info:
@@ -379,13 +379,13 @@ def test_evaluate_tools_built_in_tool_definition(self, mock_model_config):
379379

380380
key = ToolCallAccuracyEvaluator._RESULT_KEY
381381
assert result is not None
382-
assert key in result and f"{key}_result" in result and f"{key}_threshold" in result
383-
assert result[key] == 5.0 # All good gets score 5
382+
assert f"{key}_score" in result and f"{key}_result" in result and f"{key}_threshold" in result
383+
assert result[f"{key}_score"] == 5.0 # All good gets score 5
384384
assert result[f"{key}_result"] == "pass"
385385
assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
386-
assert f"{key}_reason" in result
387-
assert result[f"{key}_reason"] == "Evaluated 1 tool calls with 1 correct calls."
388-
assert f"{key}_details" in result
386+
assert f"{key}_reasoning" in result
387+
assert result[f"{key}_reasoning"] == "Evaluated 1 tool calls with 1 correct calls."
388+
assert f"{key}_properties" in result
389389

390390
def test_evaluate_tools_no_tools(self, mock_model_config):
391391
evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -444,7 +444,7 @@ def test_evaluate_bing_custom_search(self, mock_model_config):
444444

445445
key = ToolCallAccuracyEvaluator._RESULT_KEY
446446
assert result is not None
447-
assert result[key] == 5.0
447+
assert result[f"{key}_score"] == 5.0
448448
assert result[f"{key}_result"] == "pass"
449449

450450
def test_evaluate_bing_grounding(self, mock_model_config):
@@ -476,7 +476,7 @@ def test_evaluate_bing_grounding(self, mock_model_config):
476476

477477
key = ToolCallAccuracyEvaluator._RESULT_KEY
478478
assert result is not None
479-
assert result[key] == 5.0
479+
assert result[f"{key}_score"] == 5.0
480480
assert result[f"{key}_result"] == "pass"
481481

482482
def test_evaluate_file_search(self, mock_model_config):
@@ -506,7 +506,7 @@ def test_evaluate_file_search(self, mock_model_config):
506506

507507
key = ToolCallAccuracyEvaluator._RESULT_KEY
508508
assert result is not None
509-
assert result[key] == 5.0
509+
assert result[f"{key}_score"] == 5.0
510510
assert result[f"{key}_result"] == "pass"
511511

512512
def test_evaluate_azure_ai_search(self, mock_model_config):
@@ -536,7 +536,7 @@ def test_evaluate_azure_ai_search(self, mock_model_config):
536536

537537
key = ToolCallAccuracyEvaluator._RESULT_KEY
538538
assert result is not None
539-
assert result[key] == 5.0
539+
assert result[f"{key}_score"] == 5.0
540540
assert result[f"{key}_result"] == "pass"
541541

542542
def test_evaluate_fabric_dataagent(self, mock_model_config):
@@ -566,7 +566,7 @@ def test_evaluate_fabric_dataagent(self, mock_model_config):
566566

567567
key = ToolCallAccuracyEvaluator._RESULT_KEY
568568
assert result is not None
569-
assert result[key] == 5.0
569+
assert result[f"{key}_score"] == 5.0
570570
assert result[f"{key}_result"] == "pass"
571571

572572
def test_evaluate_code_interpreter(self, mock_model_config):
@@ -598,7 +598,7 @@ def test_evaluate_code_interpreter(self, mock_model_config):
598598

599599
key = ToolCallAccuracyEvaluator._RESULT_KEY
600600
assert result is not None
601-
assert result[key] == 5.0
601+
assert result[f"{key}_score"] == 5.0
602602
assert result[f"{key}_result"] == "pass"
603603

604604
def test_evaluate_sharepoint_grounding(self, mock_model_config):
@@ -628,7 +628,7 @@ def test_evaluate_sharepoint_grounding(self, mock_model_config):
628628

629629
key = ToolCallAccuracyEvaluator._RESULT_KEY
630630
assert result is not None
631-
assert result[key] == 5.0
631+
assert result[f"{key}_score"] == 5.0
632632
assert result[f"{key}_result"] == "pass"
633633

634634
def test_evaluate_open_api(self, mock_model_config):
@@ -729,7 +729,7 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config):
729729

730730
key = ToolCallAccuracyEvaluator._RESULT_KEY
731731
assert result is not None
732-
assert result[key] == 5.0
732+
assert result[f"{key}_score"] == 5.0
733733
assert result[f"{key}_result"] == "pass"
734734

735735
def test_evaluate_missing_query(self, mock_model_config):

0 commit comments

Comments (0)