Bump versions for affected evaluators (#5107)

vebudumu · web-flow · commit b859b7d42fbb · 2026-06-04T11:23:05.000-07:00
diff --git a/assets/evaluators/builtin/coherence/evaluator/_coherence.py b/assets/evaluators/builtin/coherence/evaluator/_coherence.py
@@ -1279,8 +1279,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
 
         result = await self._the_super_do_eval(eval_input)
 
-        # Check if base returned nan (invalid output case)
-        if math.isnan(result.get(self._result_key, 0)):
+        # Base returns NaN for invalid output; None means not-applicable/skipped, so let it through
+        _score = result.get(self._result_key, 0)
+        if _score is not None and math.isnan(_score):
             raise EvaluationException(
                 message="Evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
diff --git a/assets/evaluators/builtin/coherence/spec.yaml b/assets/evaluators/builtin/coherence/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.coherence"
-version: 9
+version: 10
 displayName: "Coherence-Evaluator"
 description: "Evaluates how logically connected and consistent the response is. Ensures ideas flow naturally and make sense together. It’s best used for generative business writing such as summarizing meeting notes, creating marketing materials, and drafting emails."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/fluency/evaluator/_fluency.py b/assets/evaluators/builtin/fluency/evaluator/_fluency.py
@@ -921,8 +921,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
 
         result = await self._the_super_do_eval(eval_input)
 
-        # Check if base returned nan (invalid output case)
-        if math.isnan(result.get(self._result_key, 0)):
+        # Base returns NaN for invalid output; None means not-applicable/skipped, so let it through
+        _score = result.get(self._result_key, 0)
+        if _score is not None and math.isnan(_score):
             raise EvaluationException(
                 message="Evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
diff --git a/assets/evaluators/builtin/fluency/spec.yaml b/assets/evaluators/builtin/fluency/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.fluency"
-version: 8
+version: 9
 displayName: "Fluency-Evaluator"
 description: "Evaluates how natural and grammatically correct the response sounds. Higher scores indicate smoother and clearer language. It’s best used for generative business writing such as summarizing meeting notes, creating marketing materials, and drafting email."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py b/assets/evaluators/builtin/groundedness/evaluator/_groundedness.py
@@ -1425,8 +1425,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
             eval_input["query"] = _preprocess_messages(eval_input["query"])
         if eval_input.get("query", None) is None:
             result = await self._the_super_do_eval(eval_input)
-            # Check if base returned nan (invalid output case)
-            if math.isnan(result.get(self._result_key, 0)):
+            # Base returns NaN for invalid output; None means not-applicable/skipped, so let it through
+            _score = result.get(self._result_key, 0)
+            if _score is not None and math.isnan(_score):
                 raise EvaluationException(
                     message="Evaluator returned invalid output.",
                     blame=ErrorBlame.SYSTEM_ERROR,
@@ -1449,8 +1450,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
 
         # Replace and call the parent method
         result = await self._the_super_do_eval(simplified_eval_input)
-        # Check if base returned nan (invalid output case)
-        if math.isnan(result.get(self._result_key, 0)):
+        # Base returns NaN for invalid output; None means not-applicable/skipped, so let it through
+        _score = result.get(self._result_key, 0)
+        if _score is not None and math.isnan(_score):
             raise EvaluationException(
                 message="Evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
diff --git a/assets/evaluators/builtin/groundedness/spec.yaml b/assets/evaluators/builtin/groundedness/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.groundedness"
-version: 14
+version: 15
 displayName: "Groundedness-Evaluator"
 description: "Assesses whether the response stays true to the given context in a retrieval-augmented generation scenario. It’s best used for retrieval-augmented generation (RAG) scenarios, including question and answering and summarization. Use the groundedness metric when you need to verify that ai-generated responses align with and are validated by the provided context."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/retrieval/evaluator/_retrieval.py b/assets/evaluators/builtin/retrieval/evaluator/_retrieval.py
@@ -374,8 +374,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
             eval_input["query"] = _preprocess_messages(eval_input["query"])
 
         result = await self._the_super_do_eval(eval_input)
-        # Check if base returned nan (invalid output case)
-        if math.isnan(result.get(self._result_key, 0)):
+        # Base returns NaN for invalid output; None means not-applicable/skipped, so let it through
+        _score = result.get(self._result_key, 0)
+        if _score is not None and math.isnan(_score):
             raise EvaluationException(
                 message="Evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
diff --git a/assets/evaluators/builtin/retrieval/spec.yaml b/assets/evaluators/builtin/retrieval/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.retrieval"
-version: 10
+version: 11
 displayName: "Retrieval-Evaluator"
 description: "Measures how effectively the system retrieves relevant data or content. Higher scores mean better recall of useful information. It’s best used for the quality of search in information retrieval and retrieval augmented generation, when you don't have ground truth for chunk retrieval rankings. Use the retrieval score when you want to assess to what extent the context chunks retrieved are highly relevant and ranked at the top for answering your users' queries."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/similarity/evaluator/_similarity.py b/assets/evaluators/builtin/similarity/evaluator/_similarity.py
@@ -389,8 +389,9 @@ async def _do_eval(self, eval_input: Dict):  # type: ignore[override]
         :rtype: Dict
         """
         result = await self._the_super_do_eval(eval_input)
-        # Check if base returned nan (invalid output case)
-        if math.isnan(result.get(self._result_key, 0)):
+        # Base returns NaN for invalid output; None means not-applicable/skipped, so let it through
+        _score = result.get(self._result_key, 0)
+        if _score is not None and math.isnan(_score):
             raise EvaluationException(
                 message="Evaluator returned invalid output.",
                 blame=ErrorBlame.SYSTEM_ERROR,
diff --git a/assets/evaluators/builtin/similarity/spec.yaml b/assets/evaluators/builtin/similarity/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.similarity"
-version: 5
+version: 6
 displayName: "Similarity-Evaluator"
 description: "Measures how closely two pieces of text resemble each other in meaning. Higher scores indicate greater semantic similarity. It’s best used for NLP tasks with a user query. Use it when you want an objective evaluation of an AI model's performance, particularly in text generation tasks where you have access to ground truth responses."
 evaluatorType: "builtin"