Skip to content

Commit b859b7d

Browse files
authored
Bump versions for affected evaluators (#5107)
1 parent f77c3e4 commit b859b7d

10 files changed

Lines changed: 23 additions & 17 deletions

File tree

assets/evaluators/builtin/coherence/evaluator/_coherence.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1279,8 +1279,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
12791279

12801280
result = await self._the_super_do_eval(eval_input)
12811281

1282-
# Check if base returned nan (invalid output case)
1283-
if math.isnan(result.get(self._result_key, 0)):
1282+
# Check if base returned nan (invalid output case); None means not-applicable/skipped
1283+
_score = result.get(self._result_key, 0)
1284+
if _score is not None and math.isnan(_score):
12841285
raise EvaluationException(
12851286
message="Evaluator returned invalid output.",
12861287
blame=ErrorBlame.SYSTEM_ERROR,

assets/evaluators/builtin/coherence/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.coherence"
3-
version: 9
3+
version: 10
44
displayName: "Coherence-Evaluator"
55
description: "Evaluates how logically connected and consistent the response is. Ensures ideas flow naturally and make sense together. It’s best used for generative business writing such as summarizing meeting notes, creating marketing materials, and drafting emails."
66
evaluatorType: "builtin"

assets/evaluators/builtin/fluency/evaluator/_fluency.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -921,8 +921,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
921921

922922
result = await self._the_super_do_eval(eval_input)
923923

924-
# Check if base returned nan (invalid output case)
925-
if math.isnan(result.get(self._result_key, 0)):
924+
# Check if base returned nan (invalid output case); None means not-applicable/skipped
925+
_score = result.get(self._result_key, 0)
926+
if _score is not None and math.isnan(_score):
926927
raise EvaluationException(
927928
message="Evaluator returned invalid output.",
928929
blame=ErrorBlame.SYSTEM_ERROR,

assets/evaluators/builtin/fluency/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.fluency"
3-
version: 8
3+
version: 9
44
displayName: "Fluency-Evaluator"
55
description: "Evaluates how natural and grammatically correct the response sounds. Higher scores indicate smoother and clearer language. It’s best used for generative business writing such as summarizing meeting notes, creating marketing materials, and drafting email."
66
evaluatorType: "builtin"

assets/evaluators/builtin/groundedness/evaluator/_groundedness.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1425,8 +1425,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
14251425
eval_input["query"] = _preprocess_messages(eval_input["query"])
14261426
if eval_input.get("query", None) is None:
14271427
result = await self._the_super_do_eval(eval_input)
1428-
# Check if base returned nan (invalid output case)
1429-
if math.isnan(result.get(self._result_key, 0)):
1428+
# Check if base returned nan (invalid output case); None means not-applicable/skipped
1429+
_score = result.get(self._result_key, 0)
1430+
if _score is not None and math.isnan(_score):
14301431
raise EvaluationException(
14311432
message="Evaluator returned invalid output.",
14321433
blame=ErrorBlame.SYSTEM_ERROR,
@@ -1449,8 +1450,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
14491450

14501451
# Replace and call the parent method
14511452
result = await self._the_super_do_eval(simplified_eval_input)
1452-
# Check if base returned nan (invalid output case)
1453-
if math.isnan(result.get(self._result_key, 0)):
1453+
# Check if base returned nan (invalid output case); None means not-applicable/skipped
1454+
_score = result.get(self._result_key, 0)
1455+
if _score is not None and math.isnan(_score):
14541456
raise EvaluationException(
14551457
message="Evaluator returned invalid output.",
14561458
blame=ErrorBlame.SYSTEM_ERROR,

assets/evaluators/builtin/groundedness/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.groundedness"
3-
version: 14
3+
version: 15
44
displayName: "Groundedness-Evaluator"
55
description: "Assesses whether the response stays true to the given context in a retrieval-augmented generation scenario. It’s best used for retrieval-augmented generation (RAG) scenarios, including question and answering and summarization. Use the groundedness metric when you need to verify that ai-generated responses align with and are validated by the provided context."
66
evaluatorType: "builtin"

assets/evaluators/builtin/retrieval/evaluator/_retrieval.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -374,8 +374,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
374374
eval_input["query"] = _preprocess_messages(eval_input["query"])
375375

376376
result = await self._the_super_do_eval(eval_input)
377-
# Check if base returned nan (invalid output case)
378-
if math.isnan(result.get(self._result_key, 0)):
377+
# Check if base returned nan (invalid output case); None means not-applicable/skipped
378+
_score = result.get(self._result_key, 0)
379+
if _score is not None and math.isnan(_score):
379380
raise EvaluationException(
380381
message="Evaluator returned invalid output.",
381382
blame=ErrorBlame.SYSTEM_ERROR,

assets/evaluators/builtin/retrieval/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.retrieval"
3-
version: 10
3+
version: 11
44
displayName: "Retrieval-Evaluator"
55
description: "Measures how effectively the system retrieves relevant data or content. Higher scores mean better recall of useful information. It’s best used for the quality of search in information retrieval and retrieval augmented generation, when you don't have ground truth for chunk retrieval rankings. Use the retrieval score when you want to assess to what extent the context chunks retrieved are highly relevant and ranked at the top for answering your users' queries."
66
evaluatorType: "builtin"

assets/evaluators/builtin/similarity/evaluator/_similarity.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -389,8 +389,9 @@ async def _do_eval(self, eval_input: Dict): # type: ignore[override]
389389
:rtype: Dict
390390
"""
391391
result = await self._the_super_do_eval(eval_input)
392-
# Check if base returned nan (invalid output case)
393-
if math.isnan(result.get(self._result_key, 0)):
392+
# Check if base returned nan (invalid output case); None means not-applicable/skipped
393+
_score = result.get(self._result_key, 0)
394+
if _score is not None and math.isnan(_score):
394395
raise EvaluationException(
395396
message="Evaluator returned invalid output.",
396397
blame=ErrorBlame.SYSTEM_ERROR,

assets/evaluators/builtin/similarity/spec.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
type: "evaluator"
22
name: "builtin.similarity"
3-
version: 5
3+
version: 6
44
displayName: "Similarity-Evaluator"
55
description: "Measures how closely two pieces of text resemble each other in meaning. Higher scores indicate greater semantic similarity. It’s best used for NLP tasks with a user query. Use it when you want an objective evaluation of an AI model's performance, particularly in text generation tasks where you have access to ground truth responses."
66
evaluatorType: "builtin"

0 commit comments

Comments
 (0)