Skip to content

Commit 4dc4bb0

Browse files
Handle empty evaluation_level for backwards compatibility (#4983)
* Update TaskCompletion to support multi-turn eval
* run code health
* Update multi-turn validation, change session term to conversation, add tests and comments
* Add optional evaluation_mode parameter
* update tests
* code health
* update conversation parameter to messages
* add supported_evaluation_level
* pop messages in single-turn path
* add messages to the optional params
* Add preview tag, update test
* Rename supported evaluation level to evaluation level
* add reformatting tests
* remove preview tag, update spec
* update tests
* Update supported_evaluation_level, use constants for roles
* handle empty string evaluation_level
* csat handle empty evaluation_level
* groundedness handle empty evaluation_level
* TA eval handle empty evaluation_level
* add tests for empty evaluation_level
* code health
1 parent a5f4ca4 commit 4dc4bb0

12 files changed

Lines changed: 103 additions & 8 deletions

File tree

assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def _resolve_evaluation_level(
9696
:rtype: Optional[EvaluationLevel]
9797
"""
9898
valid = [level.value for level in EvaluationLevel]
99-
if evaluation_level is None:
99+
if evaluation_level is None or evaluation_level == '':
100100
return None
101101
if isinstance(evaluation_level, EvaluationLevel):
102102
return evaluation_level

assets/evaluators/builtin/customer_satisfaction/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.customer_satisfaction"
3-
version: 6
3+
version: 7
44
displayName: "Customer-Satisfaction-Evaluator"
55
description: "Evaluates the predicted customer satisfaction level of an AI agent interaction on a 1-5 Likert scale. This evaluator assesses whether the agent's response would likely result in a satisfied customer based on helpfulness, completeness, tone, and resolution of the user's needs. Useful for measuring customer support quality, chatbot effectiveness, and overall user experience."
66
evaluatorType: "builtin"

assets/evaluators/builtin/groundedness/evaluator/_groundedness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def _resolve_evaluation_level(
9595
:rtype: Optional[EvaluationLevel]
9696
"""
9797
valid = [level.value for level in EvaluationLevel]
98-
if evaluation_level is None:
98+
if evaluation_level is None or evaluation_level == '':
9999
return None
100100
if isinstance(evaluation_level, EvaluationLevel):
101101
return evaluation_level

assets/evaluators/builtin/groundedness/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.groundedness"
3-
version: 9
3+
version: 10
44
displayName: "Groundedness-Evaluator"
55
description: "Assesses whether the response stays true to the given context in a retrieval-augmented generation scenario. It’s best used for retrieval-augmented generation (RAG) scenarios, including question and answering and summarization. Use the groundedness metric when you need to verify that AI-generated responses align with and are validated by the provided context."
66
evaluatorType: "builtin"

assets/evaluators/builtin/task_adherence/evaluator/_task_adherence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def _resolve_evaluation_level(
9898
) -> Optional[EvaluationLevel]:
9999
"""Validate and normalize the evaluation_level parameter."""
100100
valid = [level.value for level in EvaluationLevel]
101-
if evaluation_level is None:
101+
if evaluation_level is None or evaluation_level == '':
102102
return None
103103
if isinstance(evaluation_level, EvaluationLevel):
104104
return evaluation_level

assets/evaluators/builtin/task_adherence/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.task_adherence"
3-
version: 8
3+
version: 9
44
displayName: "Task-Adherence-Evaluator-(Preview)"
55
description: "Evaluates whether the agent completed the task within the confines of the instructions given to the agentic system. Higher scores indicate better compliance with the instructions. This evaluator is useful for end-to-end system-level task evaluation for agents. Example outputs include actions such as updating a database and textual responses such as writing a report."
66
evaluatorType: "builtin"

assets/evaluators/builtin/task_completion/evaluator/_task_completion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def _resolve_evaluation_level(
9494
:rtype: Optional[EvaluationLevel]
9595
"""
9696
valid = [level.value for level in EvaluationLevel]
97-
if evaluation_level is None:
97+
if evaluation_level is None or evaluation_level == '':
9898
return None
9999
if isinstance(evaluation_level, EvaluationLevel):
100100
return evaluation_level

assets/evaluators/builtin/task_completion/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.task_completion"
3-
version: 11
3+
version: 12
44
displayName: "Task-Completion-Evaluator-(Preview)"
55
description: "Evaluates whether an AI agent successfully completed the requested task end to end by analyzing the conversation history and agent response to determine if all task requirements were met, ignoring rule adherence or intent understanding. This evaluator is useful for assessing agent effectiveness in task-oriented scenarios, workflow automation, and goal-oriented AI interactions."
66
evaluatorType: "builtin"

assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,3 +353,54 @@ def test_messages_allows_developer_role(self):
353353

354354

355355
# endregion
356+
357+
358+
# region evaluation_level tests
359+
360+
def _create_mocked_evaluator_with_level(evaluation_level=None):
361+
"""Create a CustomerSatisfactionEvaluator with evaluation_level and mocked flows."""
362+
model_config = AzureOpenAIModelConfiguration(
363+
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "https://Sanitized.api.cognitive.microsoft.com"),
364+
azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT", "aoai-deployment"),
365+
)
366+
evaluator = CustomerSatisfactionEvaluator(
367+
model_config=model_config,
368+
evaluation_level=evaluation_level,
369+
)
370+
mock_side_effect = get_flow_side_effect_for_evaluator("customer_satisfaction")
371+
evaluator._flow = MagicMock(side_effect=mock_side_effect)
372+
evaluator._multi_turn_flow = MagicMock(side_effect=mock_side_effect)
373+
return evaluator
374+
375+
376+
@pytest.mark.unittest
377+
class TestCustomerSatisfactionEvaluationLevel:
378+
"""Tests for the evaluation_level parameter."""
379+
380+
def test_empty_string_level_defaults_to_auto_detect_messages(self):
381+
"""Empty string evaluation_level is treated as None (auto-detect) and uses multi-turn for messages."""
382+
evaluator = _create_mocked_evaluator_with_level(evaluation_level="")
383+
result = evaluator(messages=VALID_MESSAGES)
384+
evaluator._multi_turn_flow.assert_called_once()
385+
evaluator._flow.assert_not_called()
386+
assert "customer_satisfaction" in result
387+
388+
def test_empty_string_level_defaults_to_auto_detect_query_response(self):
389+
"""Empty string evaluation_level is treated as None (auto-detect) and uses single-turn for query/response."""
390+
evaluator = _create_mocked_evaluator_with_level(evaluation_level="")
391+
evaluator(query="How do I return an item?", response="You can return within 30 days.")
392+
evaluator._flow.assert_called_once()
393+
evaluator._multi_turn_flow.assert_not_called()
394+
395+
def test_invalid_string_level_raises(self):
396+
"""Invalid string evaluation_level raises at init time."""
397+
with pytest.raises(EvaluationException, match="Invalid evaluation_level"):
398+
_create_mocked_evaluator_with_level(evaluation_level="batch")
399+
400+
def test_invalid_type_level_raises(self):
401+
"""Non-string/non-enum evaluation_level raises at init time."""
402+
with pytest.raises(EvaluationException, match="Invalid evaluation_level"):
403+
_create_mocked_evaluator_with_level(evaluation_level=42)
404+
405+
406+
# endregion

assets/evaluators/tests/test_evaluators_behavior/test_groundedness_evaluator_behavior.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,20 @@ def test_string_level_turn(self):
482482
evaluator._flow.assert_called_once()
483483
evaluator._multi_turn_flow.assert_not_called()
484484

485+
def test_empty_string_level_defaults_to_auto_detect_messages(self):
486+
"""Empty string evaluation_level is treated as None (auto-detect) and uses multi-turn for messages."""
487+
evaluator = _create_mocked_groundedness_evaluator_with_level(evaluation_level="")
488+
evaluator(messages=VALID_GROUNDEDNESS_MESSAGES)
489+
evaluator._multi_turn_flow.assert_called_once()
490+
evaluator._flow.assert_not_called()
491+
492+
def test_empty_string_level_defaults_to_auto_detect_response_context(self):
493+
"""Empty string evaluation_level is treated as None (auto-detect) and uses single-turn for response/context."""
494+
evaluator = _create_mocked_groundedness_evaluator_with_level(evaluation_level="")
495+
evaluator(response="The sky is blue.", context="The sky is blue due to Rayleigh scattering.")
496+
evaluator._flow.assert_called_once()
497+
evaluator._multi_turn_flow.assert_not_called()
498+
485499
def test_invalid_string_level_raises(self):
486500
"""Invalid string evaluation_level raises at init time."""
487501
with pytest.raises(EvaluationException, match="Invalid evaluation_level"):

0 commit comments

Comments
 (0)