Skip to content

Commit e8538bf

Browse files
AliMahmoudzadeh and Ali Mahmoudzadeh authored
Amah/end with user (#5063)
* removed the last message check
* bumped the version
* simplified the condition check
* updated the message

---------

Co-authored-by: Ali Mahmoudzadeh <amah@microsoft.com>
1 parent 3b96685 commit e8538bf

14 files changed

Lines changed: 26 additions & 112 deletions

File tree

assets/evaluators/builtin/coherence/evaluator/_coherence.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -644,16 +644,7 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
644644
category=ErrorCategory.INVALID_VALUE,
645645
target=self.error_target,
646646
)
647-
if messages[-1]["role"] != MessageRole.ASSISTANT:
648-
raise EvaluationException(
649-
message=(
650-
f"The last message must have role 'assistant', "
651-
f"but found role '{messages[-1]['role']}'."
652-
),
653-
blame=ErrorBlame.USER_ERROR,
654-
category=ErrorCategory.INVALID_VALUE,
655-
target=self.error_target,
656-
)
647+
657648
# The final assistant message must contain text
658649
last_content = messages[-1].get("content", "")
659650
if isinstance(last_content, list):
@@ -665,7 +656,7 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
665656
if not has_text:
666657
raise EvaluationException(
667658
message=(
668-
"The last assistant message must contain text content, "
659+
"The last message must contain text content, "
669660
"not only tool calls. The conversation appears to be "
670661
"mid-execution — provide the agent's final text response."
671662
),
@@ -919,7 +910,9 @@ def __init__(self, model_config, *, threshold=3, credential=None, evaluation_lev
919910
)
920911

921912
# Initialize input validator (supports both query/response and messages)
922-
self._validator = MessagesOrQueryResponseInputValidator(error_target=ErrorTarget.COHERENCE_EVALUATOR)
913+
self._validator = MessagesOrQueryResponseInputValidator(
914+
error_target=ErrorTarget.COHERENCE_EVALUATOR,
915+
)
923916

924917
super().__init__(
925918
model_config=model_config,
@@ -1177,6 +1170,7 @@ async def _real_call(self, **kwargs):
11771170
query_messages, response_messages = _split_messages_at_latest_user(kwargs["messages"])
11781171
kwargs["query"] = query_messages
11791172
kwargs["response"] = response_messages
1173+
kwargs.pop("messages", None)
11801174

11811175
# Validate input before processing
11821176
self._validator.validate_eval_input(kwargs)

assets/evaluators/builtin/coherence/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.coherence"
3-
version: 7
3+
version: 8
44
displayName: "Coherence-Evaluator"
55
description: "Evaluates how logically connected and consistent the response is. Ensures ideas flow naturally and make sense together. It’s best used for generative business writing such as summarizing meeting notes, creating marketing materials, and drafting emails."
66
evaluatorType: "builtin"

assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -622,16 +622,7 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
622622
category=ErrorCategory.INVALID_VALUE,
623623
target=self.error_target,
624624
)
625-
if messages[-1]["role"] != MessageRole.ASSISTANT:
626-
raise EvaluationException(
627-
message=(
628-
f"The last message must have role 'assistant', "
629-
f"but found role '{messages[-1]['role']}'."
630-
),
631-
blame=ErrorBlame.USER_ERROR,
632-
category=ErrorCategory.INVALID_VALUE,
633-
target=self.error_target,
634-
)
625+
635626
# The final assistant message must contain text
636627
last_content = messages[-1].get("content", "")
637628
if isinstance(last_content, list):
@@ -643,7 +634,7 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
643634
if not has_text:
644635
raise EvaluationException(
645636
message=(
646-
"The last assistant message must contain text content, "
637+
"The last message must contain text content, "
647638
"not only tool calls. The conversation appears to be "
648639
"mid-execution — provide the agent's final text response."
649640
),
@@ -950,7 +941,7 @@ def __init__(self, model_config, *, credential=None, threshold=3, evaluation_lev
950941
# Initialize input validator
951942
self._validator = ConversationValidator(
952943
error_target=ExtendedErrorTarget.CUSTOMER_SATISFACTION_EVALUATOR,
953-
requires_query=True
944+
requires_query=True,
954945
)
955946

956947
super().__init__(
@@ -1110,6 +1101,7 @@ async def _real_call(self, **kwargs):
11101101
query_messages, response_messages = _split_messages_at_latest_user(kwargs["messages"])
11111102
kwargs["query"] = query_messages
11121103
kwargs["response"] = response_messages
1104+
kwargs.pop("messages", None)
11131105

11141106
self._validator.validate_eval_input(kwargs)
11151107

assets/evaluators/builtin/customer_satisfaction/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.customer_satisfaction"
3-
version: 10
3+
version: 11
44
displayName: "Customer-Satisfaction-Evaluator"
55
description: "Evaluates the predicted customer satisfaction level of an AI agent interaction on a 1-5 Likert scale. This evaluator assesses whether the agent's response would likely result in a satisfied customer based on helpfulness, completeness, tone, and resolution of the user's needs. Useful for measuring customer support quality, chatbot effectiveness, and overall user experience."
66
evaluatorType: "builtin"

assets/evaluators/builtin/groundedness/evaluator/_groundedness.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -650,16 +650,6 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
650650
category=ErrorCategory.INVALID_VALUE,
651651
target=self.error_target,
652652
)
653-
if messages[-1]["role"] != MessageRole.ASSISTANT:
654-
raise EvaluationException(
655-
message=(
656-
f"The last message must have role 'assistant', "
657-
f"but found role '{messages[-1]['role']}'."
658-
),
659-
blame=ErrorBlame.USER_ERROR,
660-
category=ErrorCategory.INVALID_VALUE,
661-
target=self.error_target,
662-
)
663653
# The final assistant message must contain text
664654
last_content = messages[-1].get("content", "")
665655
if isinstance(last_content, list):
@@ -671,7 +661,7 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
671661
if not has_text:
672662
raise EvaluationException(
673663
message=(
674-
"The last assistant message must contain text content, "
664+
"The last message must contain text content, "
675665
"not only tool calls. The conversation appears to be "
676666
"mid-execution — provide the agent's final text response."
677667
),
@@ -1064,7 +1054,7 @@ def __init__(self, model_config, *, threshold=3, credential=None, evaluation_lev
10641054
self._validator_messages = MessagesOrQueryResponseInputValidator(
10651055
error_target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
10661056
requires_query=False,
1067-
check_for_unsupported_tools=False
1057+
check_for_unsupported_tools=False,
10681058
)
10691059

10701060
super().__init__(
@@ -1560,6 +1550,7 @@ async def _real_call(self, **kwargs):
15601550
query_messages, response_messages = _split_messages_at_latest_user(kwargs["messages"])
15611551
kwargs["query"] = query_messages
15621552
kwargs["response"] = response_messages
1553+
kwargs.pop("messages", None)
15631554

15641555
# Validate input before processing
15651556
if kwargs.get("messages"):

assets/evaluators/builtin/groundedness/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.groundedness"
3-
version: 12
3+
version: 13
44
displayName: "Groundedness-Evaluator"
55
description: "Assesses whether the response stays true to the given context in a retrieval-augmented generation scenario. It’s best used for retrieval-augmented generation (RAG) scenarios, including question and answering and summarization. Use the groundedness metric when you need to verify that AI-generated responses align with and are validated by the provided context."
66
evaluatorType: "builtin"

assets/evaluators/builtin/task_adherence/evaluator/_task_adherence.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -697,17 +697,6 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
697697
category=ErrorCategory.INVALID_VALUE,
698698
target=self.error_target,
699699
)
700-
if messages[-1]["role"] != MessageRole.ASSISTANT:
701-
raise EvaluationException(
702-
message=(
703-
f"The last message must have role 'assistant', "
704-
f"but found role '{messages[-1]['role']}'."
705-
),
706-
blame=ErrorBlame.USER_ERROR,
707-
category=ErrorCategory.INVALID_VALUE,
708-
target=self.error_target,
709-
)
710-
711700
last_content = messages[-1].get("content", "")
712701
if isinstance(last_content, list):
713702
has_text = any(
@@ -725,7 +714,7 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
725714
if not has_text:
726715
raise EvaluationException(
727716
message=(
728-
"The last assistant message must contain text content, "
717+
"The last message must contain text content, "
729718
"not only tool calls. The conversation appears to be "
730719
"mid-execution — provide the agent's final text response."
731720
),
@@ -978,7 +967,7 @@ def __init__(
978967
)
979968

980969
self._validator = MessagesOrQueryResponseInputValidator(
981-
error_target=ErrorTarget.TASK_ADHERENCE_EVALUATOR
970+
error_target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
982971
)
983972

984973
super().__init__(
@@ -1185,6 +1174,7 @@ async def _real_call(self, **kwargs):
11851174
query_messages, response_messages = _split_messages_at_latest_user(kwargs["messages"])
11861175
kwargs["query"] = query_messages
11871176
kwargs["response"] = response_messages
1177+
kwargs.pop("messages", None)
11881178

11891179
self._validator.validate_eval_input(kwargs)
11901180

assets/evaluators/builtin/task_adherence/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.task_adherence"
3-
version: 11
3+
version: 12
44
displayName: "Task-Adherence-Evaluator-(Preview)"
55
description: "Evaluates whether the agent completed the task within the confines of the instructions given to the agentic system. Higher scores indicate better compliance with the instructions. This evaluator is useful for end-to-end system-level task evaluation for agents. Example outputs include actions such as updating a database and textual responses such as writing a report."
66
evaluatorType: "builtin"

assets/evaluators/builtin/task_completion/evaluator/_task_completion.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -714,16 +714,6 @@ def validate_eval_input(self, eval_input: Dict[str, Any]) -> bool:
714714
category=ErrorCategory.INVALID_VALUE,
715715
target=self.error_target,
716716
)
717-
if messages[-1]["role"] != MessageRole.ASSISTANT:
718-
raise EvaluationException(
719-
message=(
720-
f"The last message must have role 'assistant', "
721-
f"but found role '{messages[-1]['role']}'."
722-
),
723-
blame=ErrorBlame.USER_ERROR,
724-
category=ErrorCategory.INVALID_VALUE,
725-
target=self.error_target,
726-
)
727717
# The final assistant message must contain text
728718
last_content = messages[-1].get("content", "")
729719
if isinstance(last_content, list):
@@ -1034,7 +1024,7 @@ def __init__(self, model_config, *, credential=None, evaluation_level=None, **kw
10341024

10351025
# Initialize input validator (supports both query/response and messages)
10361026
self._validator = MessagesOrQueryResponseInputValidator(
1037-
error_target=ExtendedErrorTarget.TASK_COMPLETION_EVALUATOR
1027+
error_target=ExtendedErrorTarget.TASK_COMPLETION_EVALUATOR,
10381028
)
10391029

10401030
super().__init__(
@@ -1309,6 +1299,7 @@ async def _real_call(self, **kwargs):
13091299
query_messages, response_messages = _split_messages_at_latest_user(kwargs["messages"])
13101300
kwargs["query"] = query_messages
13111301
kwargs["response"] = response_messages
1302+
kwargs.pop("messages", None)
13121303

13131304
self._validator.validate_eval_input(kwargs)
13141305

assets/evaluators/builtin/task_completion/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.task_completion"
3-
version: 14
3+
version: 15
44
displayName: "Task-Completion-Evaluator-(Preview)"
55
description: "Evaluates whether an AI agent successfully completed the requested task end to end by analyzing the conversation history and agent response to determine if all task requirements were met, ignoring rule adherence or intent understanding. This evaluator is useful for assessing agent effectiveness in task-oriented scenarios, workflow automation, and goal-oriented AI interactions."
66
evaluatorType: "builtin"

0 commit comments

Comments
 (0)