@@ -48,9 +48,9 @@ async def flow_side_effect(timeout, **kwargs):
4848 # Return a non-numeric score to trigger an exception in the evaluator's check_score_is_valid
4949 return {
5050 "llm_output" : {
51-	 "chain_of_thought" : "The tool calls were very correct that I returned a huge number!" ,
52-	 "tool_calls_success_level" : 25 ,
53-	 "details" : {},
51+	 "reasoning" : "The tool calls were very correct that I returned a huge number!" ,
52+	 "score" : 25 ,
53+	 "properties" : {},
5454 }
5555 }
5656
@@ -63,9 +63,9 @@ async def flow_side_effect(timeout, **kwargs):
6363
6464 return {
6565 "llm_output" : {
66-	 "chain_of_thought" : f"Evaluated { total_calls } tool calls with { total_good_calls } correct calls." ,
67-	 "tool_calls_success_level" : score ,
68-	 "details" : {
66+	 "reasoning" : f"Evaluated { total_calls } tool calls with { total_good_calls } correct calls." ,
67+	 "score" : score ,
68+	 "properties" : {
6969 "tool_calls_made_by_agent" : total_calls ,
7070 "correct_tool_calls_made_by_agent" : total_good_calls ,
7171 },
@@ -130,13 +130,13 @@ def test_evaluate_tools_valid1(self, mock_model_config):
130130
131131 key = ToolCallAccuracyEvaluator ._RESULT_KEY
132132 assert result is not None
133-	 assert key in result and f"{ key } _result" in result and f"{ key } _threshold" in result
134-	 assert result [key ] == 3.0 # Mixed good/bad gets score 3
133+	 assert f"{ key } _score" in result and f"{ key } _result" in result and f"{ key } _threshold" in result
134+	 assert result [f"{ key } _score" ] == 3.0 # Mixed good/bad gets score 3
135135	 assert result [f"{ key } _result" ] == "pass"
136136	 assert result [f"{ key } _threshold" ] == ToolCallAccuracyEvaluator ._DEFAULT_TOOL_CALL_ACCURACY_SCORE
137-	 assert f"{ key } _reason" in result
138-	 assert result [f"{ key } _reason" ] == "Evaluated 2 tool calls with 1 correct calls."
139-	 assert f"{ key } _details" in result
137+	 assert f"{ key } _reasoning" in result
138+	 assert result [f"{ key } _reasoning" ] == "Evaluated 2 tool calls with 1 correct calls."
139+	 assert f"{ key } _properties" in result
140140
141141 def test_evaluate_tools_valid2 (self , mock_model_config ):
142142 evaluator = ToolCallAccuracyEvaluator (model_config = mock_model_config )
@@ -192,13 +192,13 @@ def test_evaluate_tools_valid2(self, mock_model_config):
192192
193193 key = ToolCallAccuracyEvaluator ._RESULT_KEY
194194 assert result is not None
195-	 assert key in result and f"{ key } _result" in result and f"{ key } _threshold" in result
196-	 assert result [key ] == 1.0 # All bad gets score 1
195+	 assert f"{ key } _score" in result and f"{ key } _result" in result and f"{ key } _threshold" in result
196+	 assert result [f"{ key } _score" ] == 1.0 # All bad gets score 1
197197	 assert result [f"{ key } _result" ] == "fail"
198198	 assert result [f"{ key } _threshold" ] == ToolCallAccuracyEvaluator ._DEFAULT_TOOL_CALL_ACCURACY_SCORE
199-	 assert f"{ key } _reason" in result
200-	 assert result [f"{ key } _reason" ] == "Evaluated 2 tool calls with 0 correct calls."
201-	 assert f"{ key } _details" in result
199+	 assert f"{ key } _reasoning" in result
200+	 assert result [f"{ key } _reasoning" ] == "Evaluated 2 tool calls with 0 correct calls."
201+	 assert f"{ key } _properties" in result
202202
203203 def test_evaluate_tools_valid3 (self , mock_model_config ):
204204 evaluator = ToolCallAccuracyEvaluator (model_config = mock_model_config )
@@ -254,13 +254,13 @@ def test_evaluate_tools_valid3(self, mock_model_config):
254254
255255 key = ToolCallAccuracyEvaluator ._RESULT_KEY
256256 assert result is not None
257-	 assert key in result and f"{ key } _result" in result and f"{ key } _threshold" in result
258-	 assert result [key ] == 5.0 # All good gets score 5
257+	 assert f"{ key } _score" in result and f"{ key } _result" in result and f"{ key } _threshold" in result
258+	 assert result [f"{ key } _score" ] == 5.0 # All good gets score 5
259259	 assert result [f"{ key } _result" ] == "pass"
260260	 assert result [f"{ key } _threshold" ] == ToolCallAccuracyEvaluator ._DEFAULT_TOOL_CALL_ACCURACY_SCORE
261-	 assert f"{ key } _reason" in result
262-	 assert result [f"{ key } _reason" ] == "Evaluated 2 tool calls with 2 correct calls."
263-	 assert f"{ key } _details" in result
261+	 assert f"{ key } _reasoning" in result
262+	 assert result [f"{ key } _reasoning" ] == "Evaluated 2 tool calls with 2 correct calls."
263+	 assert f"{ key } _properties" in result
264264
265265 def test_evaluate_tools_one_eval_fails (self , mock_model_config ):
266266 with pytest .raises (EvaluationException ) as exc_info :
@@ -379,13 +379,13 @@ def test_evaluate_tools_built_in_tool_definition(self, mock_model_config):
379379
380380 key = ToolCallAccuracyEvaluator ._RESULT_KEY
381381 assert result is not None
382-	 assert key in result and f"{ key } _result" in result and f"{ key } _threshold" in result
383-	 assert result [key ] == 5.0 # All good gets score 5
382+	 assert f"{ key } _score" in result and f"{ key } _result" in result and f"{ key } _threshold" in result
383+	 assert result [f"{ key } _score" ] == 5.0 # All good gets score 5
384384	 assert result [f"{ key } _result" ] == "pass"
385385	 assert result [f"{ key } _threshold" ] == ToolCallAccuracyEvaluator ._DEFAULT_TOOL_CALL_ACCURACY_SCORE
386-	 assert f"{ key } _reason" in result
387-	 assert result [f"{ key } _reason" ] == "Evaluated 1 tool calls with 1 correct calls."
388-	 assert f"{ key } _details" in result
386+	 assert f"{ key } _reasoning" in result
387+	 assert result [f"{ key } _reasoning" ] == "Evaluated 1 tool calls with 1 correct calls."
388+	 assert f"{ key } _properties" in result
389389
390390 def test_evaluate_tools_no_tools (self , mock_model_config ):
391391 evaluator = ToolCallAccuracyEvaluator (model_config = mock_model_config )
@@ -444,7 +444,7 @@ def test_evaluate_bing_custom_search(self, mock_model_config):
444444
445445 key = ToolCallAccuracyEvaluator ._RESULT_KEY
446446 assert result is not None
447- assert result [key ] == 5.0
447+	 assert result [f"{ key } _score" ] == 5.0
448448 assert result [f"{ key } _result" ] == "pass"
449449
450450 def test_evaluate_bing_grounding (self , mock_model_config ):
@@ -476,7 +476,7 @@ def test_evaluate_bing_grounding(self, mock_model_config):
476476
477477 key = ToolCallAccuracyEvaluator ._RESULT_KEY
478478 assert result is not None
479- assert result [key ] == 5.0
479+	 assert result [f"{ key } _score" ] == 5.0
480480 assert result [f"{ key } _result" ] == "pass"
481481
482482 def test_evaluate_file_search (self , mock_model_config ):
@@ -506,7 +506,7 @@ def test_evaluate_file_search(self, mock_model_config):
506506
507507 key = ToolCallAccuracyEvaluator ._RESULT_KEY
508508 assert result is not None
509- assert result [key ] == 5.0
509+	 assert result [f"{ key } _score" ] == 5.0
510510 assert result [f"{ key } _result" ] == "pass"
511511
512512 def test_evaluate_azure_ai_search (self , mock_model_config ):
@@ -536,7 +536,7 @@ def test_evaluate_azure_ai_search(self, mock_model_config):
536536
537537 key = ToolCallAccuracyEvaluator ._RESULT_KEY
538538 assert result is not None
539- assert result [key ] == 5.0
539+	 assert result [f"{ key } _score" ] == 5.0
540540 assert result [f"{ key } _result" ] == "pass"
541541
542542 def test_evaluate_fabric_dataagent (self , mock_model_config ):
@@ -566,7 +566,7 @@ def test_evaluate_fabric_dataagent(self, mock_model_config):
566566
567567 key = ToolCallAccuracyEvaluator ._RESULT_KEY
568568 assert result is not None
569- assert result [key ] == 5.0
569+	 assert result [f"{ key } _score" ] == 5.0
570570 assert result [f"{ key } _result" ] == "pass"
571571
572572 def test_evaluate_code_interpreter (self , mock_model_config ):
@@ -598,7 +598,7 @@ def test_evaluate_code_interpreter(self, mock_model_config):
598598
599599 key = ToolCallAccuracyEvaluator ._RESULT_KEY
600600 assert result is not None
601- assert result [key ] == 5.0
601+	 assert result [f"{ key } _score" ] == 5.0
602602 assert result [f"{ key } _result" ] == "pass"
603603
604604 def test_evaluate_sharepoint_grounding (self , mock_model_config ):
@@ -628,7 +628,7 @@ def test_evaluate_sharepoint_grounding(self, mock_model_config):
628628
629629 key = ToolCallAccuracyEvaluator ._RESULT_KEY
630630 assert result is not None
631- assert result [key ] == 5.0
631+	 assert result [f"{ key } _score" ] == 5.0
632632 assert result [f"{ key } _result" ] == "pass"
633633
634634 def test_evaluate_open_api (self , mock_model_config ):
@@ -729,7 +729,7 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config):
729729
730730 key = ToolCallAccuracyEvaluator ._RESULT_KEY
731731 assert result is not None
732- assert result [key ] == 5.0
732+	 assert result [f"{ key } _score" ] == 5.0
733733 assert result [f"{ key } _result" ] == "pass"
734734
735735 def test_evaluate_missing_query (self , mock_model_config ):
0 commit comments