Skip to content

Commit 4d09e4b

Browse files
authored
Handle flow dictionary direct output in evaluators (#45163)
1 parent c879eed commit 4d09e4b

12 files changed

Lines changed: 72 additions & 21 deletions

File tree

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,11 @@
55
from typing import Dict, List, Optional, Union, Any, Tuple
66

77
from typing_extensions import overload, override
8-
from azure.ai.evaluation._legacy.prompty import AsyncPrompty
8+
9+
if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
10+
from promptflow.core._flow import AsyncPrompty
11+
else:
12+
from azure.ai.evaluation._legacy.prompty import AsyncPrompty
913

1014
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
1115
from azure.ai.evaluation._evaluators._common._validators import ConversationValidator, ValidatorInterface

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -189,8 +189,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
189189
eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
190190

191191
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
192-
llm_output = prompty_output_dict["llm_output"]
193-
# llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
192+
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
194193
score = math.nan
195194
if isinstance(llm_output, dict):
196195
score = llm_output.get("score", math.nan)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/_relevance.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -216,7 +216,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
216216
if not isinstance(eval_input["response"], str):
217217
eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
218218
result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
219-
llm_output = result.get("llm_output")
219+
llm_output = result.get("llm_output", result)
220220
score = math.nan
221221

222222
if isinstance(llm_output, dict):

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

Lines changed: 16 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -181,7 +181,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
181181
eval_input["ground_truth"] = _preprocess_messages(eval_input["ground_truth"])
182182

183183
result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
184-
llm_output = result.get("llm_output") if isinstance(result, dict) else result
184+
llm_output = result.get("llm_output", result) if isinstance(result, dict) else result
185185

186186
score = math.nan
187187
llm_output_is_dict = isinstance(llm_output, dict)
@@ -195,19 +195,27 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
195195

196196
binary_result = self._get_binary_result(score)
197197

198+
input_token_count = result.get("input_token_count", 0) if isinstance(result, dict) else 0
199+
output_token_count = result.get("output_token_count", 0) if isinstance(result, dict) else 0
200+
total_token_count = result.get("total_token_count", 0) if isinstance(result, dict) else 0
201+
finish_reason = result.get("finish_reason", "") if isinstance(result, dict) else ""
202+
model_id = result.get("model_id", "") if isinstance(result, dict) else ""
203+
sample_input = result.get("sample_input", "") if isinstance(result, dict) else ""
204+
sample_output = result.get("sample_output", "") if isinstance(result, dict) else ""
205+
198206
# updating the result key and threshold to int based on the schema
199207
return {
200208
f"{self._result_key}": int(score),
201209
f"{self._result_key}_result": binary_result,
202210
f"{self._result_key}_threshold": int(self._threshold),
203211
f"{self._result_key}_reason": reason,
204-
f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
205-
f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
206-
f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
207-
f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
208-
f"{self._result_key}_model": result.get("model_id", ""),
209-
f"{self._result_key}_sample_input": result.get("sample_input", ""),
210-
f"{self._result_key}_sample_output": result.get("sample_output", ""),
212+
f"{self._result_key}_prompt_tokens": input_token_count,
213+
f"{self._result_key}_completion_tokens": output_token_count,
214+
f"{self._result_key}_total_tokens": total_token_count,
215+
f"{self._result_key}_finish_reason": finish_reason,
216+
f"{self._result_key}_model": model_id,
217+
f"{self._result_key}_sample_input": sample_input,
218+
f"{self._result_key}_sample_output": sample_output,
211219
}
212220

213221
raise EvaluationException(

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
from typing_extensions import overload, override
99

1010
from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
11+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
1112

1213

1314
class SimilarityEvaluator(PromptyEvaluatorBase):
@@ -134,3 +135,41 @@ def __call__( # pylint: disable=docstring-missing-param
134135
:rtype: Dict[str, float]
135136
"""
136137
return super().__call__(*args, **kwargs)
138+
139+
@override
140+
def _convert_kwargs_to_eval_input(self, **kwargs):
141+
"""Convert keyword arguments to evaluation input, with validation."""
142+
conversation = kwargs.get("conversation")
143+
if conversation is not None:
144+
return super()._convert_kwargs_to_eval_input(**kwargs)
145+
146+
query = kwargs.get("query")
147+
response = kwargs.get("response")
148+
ground_truth = kwargs.get("ground_truth")
149+
150+
# Validate required fields are not None
151+
if query is None:
152+
raise EvaluationException(
153+
message="Either 'conversation' or individual inputs must be provided. 'query' is missing.",
154+
blame=ErrorBlame.USER_ERROR,
155+
category=ErrorCategory.MISSING_FIELD,
156+
target=ErrorTarget.SIMILARITY_EVALUATOR,
157+
)
158+
159+
if response is None:
160+
raise EvaluationException(
161+
message="Either 'conversation' or individual inputs must be provided. 'response' is missing.",
162+
blame=ErrorBlame.USER_ERROR,
163+
category=ErrorCategory.MISSING_FIELD,
164+
target=ErrorTarget.SIMILARITY_EVALUATOR,
165+
)
166+
167+
if ground_truth is None:
168+
raise EvaluationException(
169+
message="Either 'conversation' or individual inputs must be provided. 'ground_truth' is missing.",
170+
blame=ErrorBlame.USER_ERROR,
171+
category=ErrorCategory.MISSING_FIELD,
172+
target=ErrorTarget.SIMILARITY_EVALUATOR,
173+
)
174+
175+
return super()._convert_kwargs_to_eval_input(**kwargs)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -237,7 +237,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
237237
}
238238

239239
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_input)
240-
llm_output = prompty_output_dict["llm_output"]
240+
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
241241

242242
if isinstance(llm_output, dict):
243243
flagged = llm_output.get("flagged", False)
@@ -249,6 +249,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
249249
return {
250250
f"{self._result_key}": score,
251251
f"{self._result_key}_result": score_result,
252+
f"{self._result_key}_threshold": self._threshold,
252253
f"{self._result_key}_reason": reasoning,
253254
f"{self._result_key}_details": llm_output.get("details", ""),
254255
f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -187,11 +187,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
187187

188188
eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
189189
eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
190-
if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
190+
if "tool_definitions" in eval_input and eval_input["tool_definitions"]:
191191
eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
192192

193193
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
194-
llm_output = prompty_output_dict.get("llm_output", {})
194+
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
195195

196196
if isinstance(llm_output, dict):
197197
success_value = llm_output.get("success", False)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -247,7 +247,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
247247

248248
# Single LLM call for all tool calls
249249
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
250-
llm_output = prompty_output_dict.get("llm_output", {})
250+
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
251251
if isinstance(llm_output, dict):
252252
score = llm_output.get(self._LLM_SCORE_KEY, None)
253253
if not score or not check_score_is_valid(

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
199199
eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)
200200

201201
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
202-
llm_output = prompty_output_dict.get("llm_output", "")
202+
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
203203

204204
if isinstance(llm_output, dict):
205205
success = llm_output.get("success", False)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
201201

202202
# Call the LLM to evaluate
203203
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
204-
llm_output = prompty_output_dict.get("llm_output", {})
204+
llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
205205

206206
if isinstance(llm_output, dict):
207207
result = llm_output.get("result", None)

0 commit comments

Comments
 (0)