Handle empty evaluation_level for backwards compatibility (#4983)

salma-elshafey · web-flow · commit 4dc4bb02893e · 2026-04-30T22:03:07.000+03:00
* Update TaskCompletion to support multi-turn eval

* run code health

* Update multi-turn validation, change session term to conversation, add tests and comments

* Add optional evaluation_mode parameter

* update tests

* code health

* update conversation parameter to messages

* add supported_evaluation_level

* pop messages in single-turn path

* add messages to the optional params

* Add preview tag, update test

* Rename supported evaluation level to evaluation level

* add reformatting tests

* remove preview tag, update spec

* update tests

* Update supported_evaluation_level, use constants for roles

* handle empty string evaluation_level

* Customer satisfaction (CSAT): handle empty evaluation_level

* Groundedness: handle empty evaluation_level

* Task adherence (TA) evaluator: handle empty evaluation_level

* add tests for empty evaluation_level

* code health
diff --git a/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py b/assets/evaluators/builtin/customer_satisfaction/evaluator/_customer_satisfaction.py
@@ -96,7 +96,7 @@ def _resolve_evaluation_level(
     :rtype: Optional[EvaluationLevel]
     """
     valid = [level.value for level in EvaluationLevel]
-    if evaluation_level is None:
+    if evaluation_level is None or evaluation_level == '':
         return None
     if isinstance(evaluation_level, EvaluationLevel):
         return evaluation_level
diff --git a/assets/evaluators/builtin/customer_satisfaction/spec.yaml b/assets/evaluators/builtin/customer_satisfaction/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.customer_satisfaction"
-version: 6
+version: 7
 displayName: "Customer-Satisfaction-Evaluator"
 description: "Evaluates the predicted customer satisfaction level of an AI agent interaction on a 1-5 Likert scale. This evaluator assesses whether the agent's response would likely result in a satisfied customer based on helpfulness, completeness, tone, and resolution of the user's needs. Useful for measuring customer support quality, chatbot effectiveness, and overall user experience."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py b/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py
@@ -95,7 +95,7 @@ def _resolve_evaluation_level(
     :rtype: Optional[EvaluationLevel]
     """
     valid = [level.value for level in EvaluationLevel]
-    if evaluation_level is None:
+    if evaluation_level is None or evaluation_level == '':
         return None
     if isinstance(evaluation_level, EvaluationLevel):
         return evaluation_level
diff --git a/assets/evaluators/builtin/groundedness/spec.yaml b/assets/evaluators/builtin/groundedness/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.groundedness"
-version: 9
+version: 10
 displayName: "Groundedness-Evaluator"
 description: "Assesses whether the response stays true to the given context in a retrieval-augmented generation scenario. It’s best used for retrieval-augmented generation (RAG) scenarios, including question and answering and summarization. Use the groundedness metric when you need to verify that ai-generated responses align with and are validated by the provided context."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/task_adherence/evaluator/_task_adherence.py b/assets/evaluators/builtin/task_adherence/evaluator/_task_adherence.py
@@ -98,7 +98,7 @@ def _resolve_evaluation_level(
 ) -> Optional[EvaluationLevel]:
     """Validate and normalize the evaluation_level parameter."""
     valid = [level.value for level in EvaluationLevel]
-    if evaluation_level is None:
+    if evaluation_level is None or evaluation_level == '':
         return None
     if isinstance(evaluation_level, EvaluationLevel):
         return evaluation_level
diff --git a/assets/evaluators/builtin/task_adherence/spec.yaml b/assets/evaluators/builtin/task_adherence/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.task_adherence"
-version: 8
+version: 9
 displayName: "Task-Adherence-Evaluator-(Preview)"
 description: "Evaluates whether the agent completed the task within the confines of the instructions given to the agentic system. Higher scores indicate better compliance with the instructions. This evaluator is useful when useful for end-to-end system-level task evaluation for agents. Example outputs include actions such as updating a database and textual responses such as writing a report."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py b/assets/evaluators/builtin/task_completion/evaluator/_task_completion.py
@@ -94,7 +94,7 @@ def _resolve_evaluation_level(
     :rtype: Optional[EvaluationLevel]
     """
     valid = [level.value for level in EvaluationLevel]
-    if evaluation_level is None:
+    if evaluation_level is None or evaluation_level == '':
         return None
     if isinstance(evaluation_level, EvaluationLevel):
         return evaluation_level
diff --git a/assets/evaluators/builtin/task_completion/spec.yaml b/assets/evaluators/builtin/task_completion/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.task_completion"
-version: 11
+version: 12
 displayName: "Task-Completion-Evaluator-(Preview)"
 description: "Evaluates whether an AI agent successfully completed the requested task end to end by analyzing the conversation history and agent response to determine if all task requirements were met, ignoring rule adherence or intent understanding. This evaluator is useful for assessing agent effectiveness in task-oriented scenarios, workflow automation, and goal-oriented AI interactions."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_customer_satisfaction_evaluator_behavior.py
@@ -353,3 +353,54 @@ def test_messages_allows_developer_role(self):
 
 
 # endregion
+
+
+# region evaluation_level tests
+
+def _create_mocked_evaluator_with_level(evaluation_level=None):
+    """Create a CustomerSatisfactionEvaluator with evaluation_level and mocked flows."""
+    model_config = AzureOpenAIModelConfiguration(
+        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "https://Sanitized.api.cognitive.microsoft.com"),
+        azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT", "aoai-deployment"),
+    )
+    evaluator = CustomerSatisfactionEvaluator(
+        model_config=model_config,
+        evaluation_level=evaluation_level,
+    )
+    mock_side_effect = get_flow_side_effect_for_evaluator("customer_satisfaction")
+    evaluator._flow = MagicMock(side_effect=mock_side_effect)
+    evaluator._multi_turn_flow = MagicMock(side_effect=mock_side_effect)
+    return evaluator
+
+
+@pytest.mark.unittest
+class TestCustomerSatisfactionEvaluationLevel:
+    """Tests for the evaluation_level parameter."""
+
+    def test_empty_string_level_defaults_to_auto_detect_messages(self):
+        """Empty string evaluation_level is treated as None (auto-detect) and uses multi-turn for messages."""
+        evaluator = _create_mocked_evaluator_with_level(evaluation_level="")
+        result = evaluator(messages=VALID_MESSAGES)
+        evaluator._multi_turn_flow.assert_called_once()
+        evaluator._flow.assert_not_called()
+        assert "customer_satisfaction" in result
+
+    def test_empty_string_level_defaults_to_auto_detect_query_response(self):
+        """Empty string evaluation_level is treated as None (auto-detect) and uses single-turn for query/response."""
+        evaluator = _create_mocked_evaluator_with_level(evaluation_level="")
+        evaluator(query="How do I return an item?", response="You can return within 30 days.")
+        evaluator._flow.assert_called_once()
+        evaluator._multi_turn_flow.assert_not_called()
+
+    def test_invalid_string_level_raises(self):
+        """Invalid string evaluation_level raises at init time."""
+        with pytest.raises(EvaluationException, match="Invalid evaluation_level"):
+            _create_mocked_evaluator_with_level(evaluation_level="batch")
+
+    def test_invalid_type_level_raises(self):
+        """Non-string/non-enum evaluation_level raises at init time."""
+        with pytest.raises(EvaluationException, match="Invalid evaluation_level"):
+            _create_mocked_evaluator_with_level(evaluation_level=42)
+
+
+# endregion
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_groundedness_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_groundedness_evaluator_behavior.py
@@ -482,6 +482,20 @@ def test_string_level_turn(self):
         evaluator._flow.assert_called_once()
         evaluator._multi_turn_flow.assert_not_called()
 
+    def test_empty_string_level_defaults_to_auto_detect_messages(self):
+        """Empty string evaluation_level is treated as None (auto-detect) and uses multi-turn for messages."""
+        evaluator = _create_mocked_groundedness_evaluator_with_level(evaluation_level="")
+        evaluator(messages=VALID_GROUNDEDNESS_MESSAGES)
+        evaluator._multi_turn_flow.assert_called_once()
+        evaluator._flow.assert_not_called()
+
+    def test_empty_string_level_defaults_to_auto_detect_response_context(self):
+        """Empty string evaluation_level is treated as None (auto-detect) and uses single-turn for response/context."""
+        evaluator = _create_mocked_groundedness_evaluator_with_level(evaluation_level="")
+        evaluator(response="The sky is blue.", context="The sky is blue due to Rayleigh scattering.")
+        evaluator._flow.assert_called_once()
+        evaluator._multi_turn_flow.assert_not_called()
+
     def test_invalid_string_level_raises(self):
         """Invalid string evaluation_level raises at init time."""
         with pytest.raises(EvaluationException, match="Invalid evaluation_level"):
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_task_adherence_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_task_adherence_evaluator_behavior.py
@@ -468,6 +468,21 @@ def test_string_level_turn(self):
         evaluator._multi_turn_flow.assert_not_called()
         assert "task_adherence" in result
 
+    def test_empty_string_level_defaults_to_auto_detect_messages(self):
+        """Empty string evaluation_level is treated as None (auto-detect) and uses multi-turn for messages."""
+        evaluator = _create_mocked_evaluator_with_level(evaluation_level="")
+        result = evaluator(messages=VALID_MESSAGES)
+        evaluator._multi_turn_flow.assert_called_once()
+        evaluator._flow.assert_not_called()
+        assert "task_adherence" in result
+
+    def test_empty_string_level_defaults_to_auto_detect_query_response(self):
+        """Empty string evaluation_level is treated as None (auto-detect) and uses single-turn for query/response."""
+        evaluator = _create_mocked_evaluator_with_level(evaluation_level="")
+        evaluator(query="Plan a trip.", response="Here's your itinerary.")
+        evaluator._flow.assert_called_once()
+        evaluator._multi_turn_flow.assert_not_called()
+
     def test_invalid_string_level_raises(self):
         """Invalid string evaluation_level raises at init time."""
         with pytest.raises(EvaluationException, match="Invalid evaluation_level"):
diff --git a/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py b/assets/evaluators/tests/test_evaluators_behavior/test_task_completion_evaluator_behavior.py
@@ -634,6 +634,21 @@ def test_string_level_turn(self):
         evaluator._multi_turn_flow.assert_not_called()
         assert "task_completion" in result
 
+    def test_empty_string_level_defaults_to_auto_detect_messages(self):
+        """Empty string evaluation_level is treated as None (auto-detect) and uses multi-turn for messages."""
+        evaluator = _create_mocked_evaluator_with_level(evaluation_level="")
+        result = evaluator(messages=VALID_MESSAGES)
+        evaluator._multi_turn_flow.assert_called_once()
+        evaluator._flow.assert_not_called()
+        assert "task_completion" in result
+
+    def test_empty_string_level_defaults_to_auto_detect_query_response(self):
+        """Empty string evaluation_level is treated as None (auto-detect) and uses single-turn for query/response."""
+        evaluator = _create_mocked_evaluator_with_level(evaluation_level="")
+        evaluator(query="Plan a trip.", response="Here's your itinerary.")
+        evaluator._flow.assert_called_once()
+        evaluator._multi_turn_flow.assert_not_called()
+
     def test_invalid_string_level_raises(self):
         """Invalid string evaluation_level raises at init time."""
         with pytest.raises(EvaluationException, match="Invalid evaluation_level"):