dataset: Add elastic kb retrieval (#4487)

emilia-elastic · KennethEnevoldsen · web-flow · commit e792bce8df77 · 2026-04-30T11:10:53.000Z
* add: elastic-kb-retrieval fix: config add: results fix: remove eval results, add prompt and fix sample_creation - Remove local evaluation results (not part of task PR) - Add query prompt for instruction-tuned models - Change sample_creation to "found and created" (mix of real chat queries and synthetic) fix: clarify description for real-world vs synthetic query grounding add: baseline results for ElasticKBRetrieval * update dataset card * update: is_public, sample creation and eval_splits * update: descriptive stats on added eval_splits * update reference * add: contributed by * Update mteb/tasks/retrieval/eng/elastic_kb_retrieval.py Co-authored-by: Kenneth Enevoldsen <kenevoldsen@pm.me> * update: clarify that documents are real documents * update: license * Apply suggestion from @KennethEnevoldsen * Apply suggestion from @KennethEnevoldsen --------- Co-authored-by: Kenneth Enevoldsen <kenevoldsen@pm.me>
diff --git a/mteb/descriptive_stats/Retrieval/ElasticKBRetrieval.json b/mteb/descriptive_stats/Retrieval/ElasticKBRetrieval.json
@@ -0,0 +1,126 @@
+{
+    "synthetic_test": {
+        "num_samples": 9942,
+        "number_of_characters": 21726999,
+        "documents_text_statistics": {
+            "total_text_length": 21696425,
+            "min_text_length": 97,
+            "average_text_length": 2224.3618002870617,
+            "max_text_length": 75820,
+            "unique_texts": 9687
+        },
+        "documents_image_statistics": null,
+        "documents_audio_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 30574,
+            "min_text_length": 96,
+            "average_text_length": 162.62765957446808,
+            "max_text_length": 269,
+            "unique_texts": 188
+        },
+        "queries_image_statistics": null,
+        "queries_audio_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 3100,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 16.48936170212766,
+            "max_relevant_docs_per_query": 319,
+            "unique_relevant_docs": 1710
+        },
+        "top_ranked_statistics": null,
+        "hf_subset_descriptive_stats": {
+            "en": {
+                "num_samples": 9942,
+                "number_of_characters": 21726999,
+                "documents_text_statistics": {
+                    "total_text_length": 21696425,
+                    "min_text_length": 97,
+                    "average_text_length": 2224.3618002870617,
+                    "max_text_length": 75820,
+                    "unique_texts": 9687
+                },
+                "documents_image_statistics": null,
+                "documents_audio_statistics": null,
+                "queries_text_statistics": {
+                    "total_text_length": 30574,
+                    "min_text_length": 96,
+                    "average_text_length": 162.62765957446808,
+                    "max_text_length": 269,
+                    "unique_texts": 188
+                },
+                "queries_image_statistics": null,
+                "queries_audio_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 3100,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 16.48936170212766,
+                    "max_relevant_docs_per_query": 319,
+                    "unique_relevant_docs": 1710
+                },
+                "top_ranked_statistics": null
+            }
+        }
+    },
+    "real_chat_test": {
+        "num_samples": 9986,
+        "number_of_characters": 21736626,
+        "documents_text_statistics": {
+            "total_text_length": 21696425,
+            "min_text_length": 97,
+            "average_text_length": 2224.3618002870617,
+            "max_text_length": 75820,
+            "unique_texts": 9687
+        },
+        "documents_image_statistics": null,
+        "documents_audio_statistics": null,
+        "queries_text_statistics": {
+            "total_text_length": 40201,
+            "min_text_length": 7,
+            "average_text_length": 173.2801724137931,
+            "max_text_length": 5247,
+            "unique_texts": 232
+        },
+        "queries_image_statistics": null,
+        "queries_audio_statistics": null,
+        "relevant_docs_statistics": {
+            "num_relevant_docs": 39016,
+            "min_relevant_docs_per_query": 1,
+            "average_relevant_docs_per_query": 168.17241379310346,
+            "max_relevant_docs_per_query": 2729,
+            "unique_relevant_docs": 7419
+        },
+        "top_ranked_statistics": null,
+        "hf_subset_descriptive_stats": {
+            "en": {
+                "num_samples": 9986,
+                "number_of_characters": 21736626,
+                "documents_text_statistics": {
+                    "total_text_length": 21696425,
+                    "min_text_length": 97,
+                    "average_text_length": 2224.3618002870617,
+                    "max_text_length": 75820,
+                    "unique_texts": 9687
+                },
+                "documents_image_statistics": null,
+                "documents_audio_statistics": null,
+                "queries_text_statistics": {
+                    "total_text_length": 40201,
+                    "min_text_length": 7,
+                    "average_text_length": 173.2801724137931,
+                    "max_text_length": 5247,
+                    "unique_texts": 232
+                },
+                "queries_image_statistics": null,
+                "queries_audio_statistics": null,
+                "relevant_docs_statistics": {
+                    "num_relevant_docs": 39016,
+                    "min_relevant_docs_per_query": 1,
+                    "average_relevant_docs_per_query": 168.17241379310346,
+                    "max_relevant_docs_per_query": 2729,
+                    "unique_relevant_docs": 7419
+                },
+                "top_ranked_statistics": null
+            }
+        }
+    }
+}
diff --git a/mteb/tasks/retrieval/eng/__init__.py b/mteb/tasks/retrieval/eng/__init__.py
@@ -121,6 +121,7 @@
     DiDeMoVT2ARetrieval,
 )
 from .edis_t2it_retrieval import EDIST2ITRetrieval
+from .elastic_kb_retrieval import ElasticKBRetrieval
 from .emo_vdb import EmoVDBA2TRetrieval, EmoVDBT2ARetrieval
 from .encyclopedia_vqa_it2it_retrieval import EncyclopediaVQAIT2ITRetrieval
 from .english_finance1_retrieval import EnglishFinance1Retrieval
@@ -491,6 +492,7 @@
     "DiDeMoVA2TRetrieval",
     "DiDeMoVT2ARetrieval",
     "EDIST2ITRetrieval",
+    "ElasticKBRetrieval",
     "EmoVDBA2TRetrieval",
     "EmoVDBT2ARetrieval",
     "EncyclopediaVQAIT2ITRetrieval",
diff --git a/mteb/tasks/retrieval/eng/elastic_kb_retrieval.py b/mteb/tasks/retrieval/eng/elastic_kb_retrieval.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from mteb.abstasks.retrieval import AbsTaskRetrieval
+from mteb.abstasks.task_metadata import TaskMetadata
+
+
+class ElasticKBRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="ElasticKBRetrieval",
+        description=(
+            "Retrieval benchmark built from the Elastic support knowledge base. "
+            "Contains 9,754 documents (real documents from the Elastic support knowledge base) and 420 queries (232 from real-world support "
+            "chat sessions, 188 synthetic queries generated from KB articles). "
+            "Relevance judgments are augmented labels produced by exhaustive "
+            "all-pairs LLM annotation using strict comparison to original doc "
+            "(grounding doc that lead to self-served ticket for real-world queries "
+            "and generating doc for synthetic queries)."
+        ),
+        reference="https://huggingface.co/blog/rteb",  # private set
+        dataset={
+            "path": "mteb-private/elastic-kb-retrieval",
+            "revision": "21bdf1e024bf7c9f46720017559ce2f8c6116507",
+        },
+        type="Retrieval",
+        category="t2t",
+        modalities=["text"],
+        eval_splits=["synthetic_test", "real_chat_test"],
+        eval_langs={"en": ["eng-Latn"]},
+        main_score="ndcg_at_10",
+        is_public=False,
+        date=("2015-01-01", "2026-04-01"),
+        domains=["Written", "Engineering"],
+        task_subtypes=["Question answering", "Conversational retrieval"],
+        license="not specified",  # shared as an evaluation dataset, results can be shared, and the dataset is allowed to be sent to embedding APIs
+        annotations_creators="LM-generated",
+        dialect=[],
+        prompt={
+            "query": "Given a support question, retrieve knowledge base articles that answer the question"
+        },
+        sample_creation="multiple",  # see description
+        bibtex_citation="",
+        contributed_by="Jina by Elastic",
+    )
diff --git a/tests/test_abstasks/test_private_tasks.py b/tests/test_abstasks/test_private_tasks.py
@@ -20,6 +20,7 @@
     "Vidore3TelecomRetrieval.v2",
     "Vidore3NuclearRetrieval.v2",
     "LexRetrieval.v1",
+    "ElasticKBRetrieval",
     # Add task names here that are allowed to be private
 ]
 

Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@`
`20`	`20`	`"Vidore3TelecomRetrieval.v2",`
`21`	`21`	`"Vidore3NuclearRetrieval.v2",`
`22`	`22`	`"LexRetrieval.v1",`
	`23`	`+ "ElasticKBRetrieval",`
`23`	`24`	`# Add task names here that are allowed to be private`
`24`	`25`	`]`
`25`	`26`