Updated version number for evaluators (#5099)

posaninagendra · web-flow · commit 2c37dca0f7cd · 2026-05-29T21:46:52.000-07:00
diff --git a/assets/evaluators/builtin/customer_satisfaction/spec.yaml b/assets/evaluators/builtin/customer_satisfaction/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.customer_satisfaction"
-version: 11
+version: 12
 displayName: "Customer-Satisfaction-Evaluator"
 description: "Evaluates the predicted customer satisfaction level of an AI agent interaction on a 1-5 Likert scale. This evaluator assesses whether the agent's response would likely result in a satisfied customer based on helpfulness, completeness, tone, and resolution of the user's needs. Useful for measuring customer support quality, chatbot effectiveness, and overall user experience."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/fluency/spec.yaml b/assets/evaluators/builtin/fluency/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.fluency"
-version: 7
+version: 8
 displayName: "Fluency-Evaluator"
 description: "Evaluates how natural and grammatically correct the response sounds. Higher scores indicate smoother and clearer language. It’s best used for generative business writing such as summarizing meeting notes, creating marketing materials, and drafting email."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/groundedness/spec.yaml b/assets/evaluators/builtin/groundedness/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.groundedness"
-version: 13
+version: 14
 displayName: "Groundedness-Evaluator"
 description: "Assesses whether the response stays true to the given context in a retrieval-augmented generation scenario. It’s best used for retrieval-augmented generation (RAG) scenarios, including question and answering and summarization. Use the groundedness metric when you need to verify that ai-generated responses align with and are validated by the provided context."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/intent_resolution/spec.yaml b/assets/evaluators/builtin/intent_resolution/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.intent_resolution"
-version: 6
+version: 7
 displayName: "Intent-Resolution-Evaluator-(Preview)"
 description: "Checks whether the model correctly interprets and resolves user intent. Ensures the response aligns with what the user asked. Use this metric in conversational AI assistants, and customer support bots where understanding user intent is essential."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/relevance/spec.yaml b/assets/evaluators/builtin/relevance/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.relevance"
-version: 9
+version: 10
 displayName: "Relevance-Evaluator"
 description: "Assesses how well the response matches the user’s intent or question. Higher scores mean better alignment with the prompt. It’s best used for generative business writing such as summarizing meeting notes, creating marketing materials, and drafting email."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/response_completeness/spec.yaml b/assets/evaluators/builtin/response_completeness/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.response_completeness"
-version: 7
+version: 8
 displayName: "Response-Completeness-Evaluator-(Preview)"
 description: "Assesses whether the response covers all key aspects of the question. Higher scores indicate more thorough and complete answers. This evaluator is useful when evaluating chatbots, virtual assistants, and QA systems where full and informative responses are critical."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/retrieval/spec.yaml b/assets/evaluators/builtin/retrieval/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.retrieval"
-version: 9
+version: 10
 displayName: "Retrieval-Evaluator"
 description: "Measures how effectively the system retrieves relevant data or content. Higher scores mean better recall of useful information. It’s best used for the quality of search in information retrieval and retrieval augmented generation, when you don't have ground truth for chunk retrieval rankings. Use the retrieval score when you want to assess to what extent the context chunks retrieved are highly relevant and ranked at the top for answering your users' queries."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/similarity/spec.yaml b/assets/evaluators/builtin/similarity/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.similarity"
-version: 4
+version: 5
 displayName: "Similarity-Evaluator"
 description: "Measures how closely two pieces of text resemble each other in meaning. Higher scores indicate greater semantic similarity. It’s best used for NLP tasks with a user query. Use it when you want an objective evaluation of an AI model's performance, particularly in text generation tasks where you have access to ground truth responses."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/task_adherence/spec.yaml b/assets/evaluators/builtin/task_adherence/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.task_adherence"
-version: 12
+version: 13
 displayName: "Task-Adherence-Evaluator-(Preview)"
 description: "Evaluates whether the agent completed the task within the confines of the instructions given to the agentic system. Higher scores indicate better compliance with the instructions. This evaluator is useful when useful for end-to-end system-level task evaluation for agents. Example outputs include actions such as updating a database and textual responses such as writing a report."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/task_completion/spec.yaml b/assets/evaluators/builtin/task_completion/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.task_completion"
-version: 15
+version: 16
 displayName: "Task-Completion-Evaluator-(Preview)"
 description: "Evaluates whether an AI agent successfully completed the requested task end to end by analyzing the conversation history and agent response to determine if all task requirements were met, ignoring rule adherence or intent understanding. This evaluator is useful for assessing agent effectiveness in task-oriented scenarios, workflow automation, and goal-oriented AI interactions."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/tool_call_success/spec.yaml b/assets/evaluators/builtin/tool_call_success/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_call_success"
-version: 6
+version: 7
 displayName: "Tool-Call-Success-Evaluator"
 description: "Evaluates whether all tool calls were successful or not. It checks all tool calls to determine if any of these resulted in technical failure like exception, error or timeout. This evaluator is useful for when you want to evaluate the tool calls generated by an AI agent for being successful."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/tool_input_accuracy/spec.yaml b/assets/evaluators/builtin/tool_input_accuracy/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_input_accuracy"
-version: 11
+version: 12
 displayName: "Tool-Input-Accuracy-Evaluator"
 description: "A binary evaluator (0 or 1) that checks whether all parameters in an agent’s tool call are correct, validating grounding, type, format, completeness, and contextual appropriateness using LLM-based analysis. Use it to verify agent tool usage, API integration tests, or to ensure tool call parameters are fully correct in AI workflows."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/tool_output_utilization/spec.yaml b/assets/evaluators/builtin/tool_output_utilization/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_output_utilization"
-version: 5
+version: 6
 displayName: "Tool-Output-Utilization-Evaluator"
 description: "Checks if an agent correctly interprets and contextually uses the outputs returned by invoked tools (e.g., APIs, DB queries, search results) without fabrication or omission. Use it to validate that agents accurately reuse and represent tool outputs in their responses across tool-dependent systems."
 evaluatorType: "builtin"
diff --git a/assets/evaluators/builtin/tool_selection/spec.yaml b/assets/evaluators/builtin/tool_selection/spec.yaml
@@ -1,6 +1,6 @@
 type: "evaluator"
 name: "builtin.tool_selection"
-version: 9
+version: 10
 displayName: "Tool-Selection-Evaluator"
 description: "Evaluates whether an AI agent selected the most appropriate and efficient tools for a given task, avoiding redundancy or missing essentials. Use it to assess tool choice quality in agent-based systems, orchestration platforms, and AI assistants that must pick the right tools from available options."
 evaluatorType: "builtin"