Skip to content

Commit fe55928

Browse files
authored
[Agentic Evaluators]: Accept input string as is (#45159)
1 parent b884dc6 commit fe55928

7 files changed

Lines changed: 133 additions & 111 deletions

File tree

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,11 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
375375
category=ErrorCategory.MISSING_FIELD,
376376
target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
377377
)
378+
379+
# If response is a string, we can skip the context extraction and just return the eval input
380+
if response and isinstance(response, str):
381+
return super()._convert_kwargs_to_eval_input(query=query, response=response, context=response)
382+
378383
context = self._get_context_from_agent_response(response, tool_definitions)
379384

380385
if not self._validate_context(context) and self._is_single_entry(response) and self._is_single_entry(query):

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -178,25 +178,32 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
178178
tool_calls = parsed_tool_calls
179179

180180
if not tool_calls:
181-
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
181+
# If no tool calls provided and response is string, use response string as tool calls as is
182+
if response and isinstance(response, str):
183+
tool_calls = response
184+
else:
185+
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
182186

183-
if not isinstance(tool_calls, list):
187+
if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
184188
tool_calls = [tool_calls]
185-
if not isinstance(tool_definitions, list):
189+
if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
186190
tool_definitions = [tool_definitions] if tool_definitions else []
187191

188-
try:
189-
needed_tool_definitions = self._extract_needed_tool_definitions(
190-
tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
191-
)
192-
except EvaluationException as e:
193-
# Check if this is because no tool definitions were provided at all
194-
if len(tool_definitions) == 0:
195-
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
196-
else:
197-
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
198-
199-
if len(needed_tool_definitions) == 0:
192+
if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
193+
needed_tool_definitions = tool_definitions
194+
else:
195+
try:
196+
needed_tool_definitions = self._extract_needed_tool_definitions(
197+
tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
198+
)
199+
except EvaluationException as e:
200+
# Check if this is because no tool definitions were provided at all
201+
if len(tool_definitions) == 0:
202+
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
203+
else:
204+
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
205+
206+
if not needed_tool_definitions:
200207
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
201208

202209
return {

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
165165
category=ErrorCategory.MISSING_FIELD,
166166
target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
167167
)
168+
if _is_intermediate_response(eval_input.get("response")):
169+
return self._not_applicable_result(
170+
"Intermediate response. Please provide the agent's final response for evaluation.",
171+
self._threshold,
172+
)
168173
if eval_input["response"] is None or eval_input["response"] == []:
169174
raise EvaluationException(
170175
message="response cannot be None or empty for the Tool Call Success evaluator.",
@@ -174,29 +179,34 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
174179
target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
175180
)
176181

177-
# Check for intermediate response
178-
if _is_intermediate_response(eval_input.get("response")):
179-
return self._not_applicable_result(
180-
"Intermediate response. Please provide the agent's final response for evaluation.",
181-
self._threshold,
182-
)
183-
184-
# Preprocess messages if they are lists
185182
if isinstance(eval_input.get("response"), list):
186183
eval_input["response"] = _preprocess_messages(eval_input["response"])
184+
eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
185+
# If response is a string, pass directly without reformatting
186+
elif isinstance(eval_input["response"], str):
187+
eval_input["tool_calls"] = eval_input["response"]
188+
else:
189+
raise EvaluationException(
190+
message="response must be either a list of messages or a string.",
191+
blame=ErrorBlame.USER_ERROR,
192+
category=ErrorCategory.INVALID_VALUE,
193+
target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
194+
)
195+
187196
if isinstance(eval_input.get("query"), list):
188197
eval_input["query"] = _preprocess_messages(eval_input["query"])
189198

190-
eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
191-
192-
if "tool_definitions" in eval_input:
199+
# If tool definitions are string, pass directly without reformatting, else format it.
200+
if "tool_definitions" in eval_input and not isinstance(eval_input["tool_definitions"], str):
193201
tool_definitions = eval_input["tool_definitions"]
194-
filtered_tool_definitions = _filter_to_used_tools(
195-
tool_definitions=tool_definitions,
196-
msgs_list=eval_input["response"],
197-
logger=logger,
198-
)
199-
eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)
202+
# Only if response is not a string, we filter tool definitions to only tools needed.
203+
if not isinstance(eval_input["response"], str):
204+
tool_definitions = _filter_to_used_tools(
205+
tool_definitions=tool_definitions,
206+
msgs_list=eval_input["response"],
207+
logger=logger,
208+
)
209+
eval_input["tool_definitions"] = _reformat_tool_definitions(tool_definitions, logger)
200210

201211
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
202212
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -117,37 +117,50 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
117117
query = kwargs.get("query")
118118
response = kwargs.get("response")
119119

120-
# Extract tool calls from response
121120
if not response:
122121
return {"error_message": "Response parameter is required to extract tool calls."}
123122

123+
# Try to parse tool calls from response
124124
tool_calls = self._parse_tools_from_response(response)
125+
125126
if not tool_calls:
126-
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
127+
# If no tool calls found and response is string, use response string as tool calls as is
128+
if isinstance(response, str):
129+
tool_calls = response
130+
else:
131+
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
127132

128-
if not isinstance(tool_calls, list):
133+
# Normalize tool_calls and tool_definitions (skip for strings)
134+
if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
129135
tool_calls = [tool_calls]
130-
if not isinstance(tool_definitions, list):
136+
if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
131137
tool_definitions = [tool_definitions] if tool_definitions else []
132138

133-
try:
134-
# Type cast to satisfy static type checker
135-
tool_calls_typed = cast(List[Dict], tool_calls)
136-
needed_tool_definitions = self._extract_needed_tool_definitions(
137-
tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
138-
)
139-
except EvaluationException as e:
140-
# Check if this is because no tool definitions were provided at all
141-
if len(tool_definitions) == 0:
142-
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
143-
else:
144-
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
145-
146-
if len(needed_tool_definitions) == 0:
139+
# Cross-validation (skip when either is string)
140+
if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
141+
needed_tool_definitions = tool_definitions
142+
else:
143+
try:
144+
# Type cast to satisfy static type checker
145+
tool_calls_typed = cast(List[Dict], tool_calls)
146+
needed_tool_definitions = self._extract_needed_tool_definitions(
147+
tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
148+
)
149+
except EvaluationException:
150+
# Check if this is because no tool definitions were provided at all
151+
if len(tool_definitions) == 0:
152+
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
153+
else:
154+
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
155+
156+
if not needed_tool_definitions:
147157
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
148158

149-
# Reformat agent response with tool calls and results using reformat_agent_response
150-
agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)
159+
# Reformat response for LLM (skip for strings - already a string)
160+
if isinstance(tool_calls, str):
161+
agent_response_with_tools = tool_calls
162+
else:
163+
agent_response_with_tools = reformat_agent_response(response, include_tool_messages=True)
151164

152165
return {
153166
"query": query,

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -199,21 +199,29 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
199199
if isinstance(eval_input.get("query"), list):
200200
eval_input["query"] = _preprocess_messages(eval_input["query"])
201201

202+
# If response or tool_definitions are strings, pass directly without reformatting
203+
# Process each parameter individually - strings pass through, dicts get reformatted
202204
tool_definitions = eval_input["tool_definitions"]
203-
filtered_tool_definitions = filter_to_used_tools(
204-
tool_definitions=tool_definitions,
205-
msgs_lists=[eval_input["query"], eval_input["response"]],
206-
logger=logger,
207-
)
208-
eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
209-
210-
eval_input["query"] = reformat_conversation_history(
211-
eval_input["query"],
212-
logger,
213-
include_system_messages=True,
214-
include_tool_messages=True,
215-
)
216-
eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
205+
if not isinstance(tool_definitions, str):
206+
if not isinstance(eval_input.get("query"), str) and not isinstance(eval_input.get("response"), str):
207+
filtered_tool_definitions = filter_to_used_tools(
208+
tool_definitions=tool_definitions,
209+
msgs_lists=[eval_input["query"], eval_input["response"]],
210+
logger=logger,
211+
)
212+
else:
213+
filtered_tool_definitions = tool_definitions
214+
eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
215+
216+
if not isinstance(eval_input.get("query"), str):
217+
eval_input["query"] = reformat_conversation_history(
218+
eval_input["query"],
219+
logger,
220+
include_system_messages=True,
221+
include_tool_messages=True,
222+
)
223+
if not isinstance(eval_input.get("response"), str):
224+
eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
217225

218226
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
219227
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -143,29 +143,36 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
143143
tool_calls = parsed_tool_calls
144144

145145
if not tool_calls:
146-
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
146+
# If no tool calls provided and response is string, use response string as tool calls as is
147+
if response and isinstance(response, str):
148+
tool_calls = response
149+
else:
150+
return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
147151

148-
if not isinstance(tool_calls, list):
152+
if not isinstance(tool_calls, list) and not isinstance(tool_calls, str):
149153
tool_calls = [tool_calls]
150-
if not isinstance(tool_definitions, list):
154+
if not isinstance(tool_definitions, list) and not isinstance(tool_definitions, str):
151155
tool_definitions = [tool_definitions] if tool_definitions else []
152156

153-
try:
154-
needed_tool_definitions = self._extract_needed_tool_definitions(
155-
tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
156-
)
157-
except EvaluationException as e:
158-
# Check if this is because no tool definitions were provided at all
159-
if len(tool_definitions) == 0:
160-
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
161-
else:
162-
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
163-
164-
if len(needed_tool_definitions) == 0:
157+
if isinstance(tool_calls, str) or isinstance(tool_definitions, str):
158+
needed_tool_definitions = tool_definitions
159+
else:
160+
try:
161+
needed_tool_definitions = self._extract_needed_tool_definitions(
162+
tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
163+
)
164+
except EvaluationException:
165+
# Check if this is because no tool definitions were provided at all
166+
if len(tool_definitions) == 0:
167+
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
168+
else:
169+
return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
170+
171+
if not needed_tool_definitions:
165172
return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
166173

167-
# Extract only tool names from tool calls, removing parameters and results
168-
tool_names = self._extract_tool_names_from_calls(tool_calls)
174+
# Extract only tool names from tool calls, removing parameters and results (skip for strings)
175+
tool_names = tool_calls if isinstance(tool_calls, str) else self._extract_tool_names_from_calls(tool_calls)
169176

170177
return {
171178
"query": query,

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -46,34 +46,6 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
4646
)
4747
assert ToolCallAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in str(exc_info.value)
4848

49-
# Test with response that has no tool calls
50-
result = tool_call_accuracy(
51-
query="Where is the Eiffel Tower?",
52-
response="The Eiffel Tower is in Paris.",
53-
tool_definitions=[
54-
{
55-
"name": "fetch_weather",
56-
"description": "Fetches the weather information for the specified location.",
57-
"parameters": {
58-
"type": "object",
59-
"properties": {
60-
"location": {
61-
"type": "string",
62-
"description": "The location to fetch weather for.",
63-
}
64-
},
65-
},
66-
}
67-
],
68-
)
69-
assert (
70-
result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
71-
)
72-
assert (
73-
ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
74-
in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
75-
)
76-
7749
# Test with tool call for which definition is not provided
7850
result = tool_call_accuracy(
7951
query="Where is the Eiffel Tower?",

0 commit comments

Comments (0)