Skip to content

Commit e792bce

Browse files
dataset: Add elastic kb retrieval (#4487)
* add: elastic-kb-retrieval fix: config add: results fix: remove eval results, add prompt and fix sample_creation - Remove local evaluation results (not part of task PR) - Add query prompt for instruction-tuned models - Change sample_creation to "found and created" (mix of real chat queries and synthetic) fix: clarify description for real-world vs synthetic query grounding add: baseline results for ElasticKBRetrieval * update dataset card * update: is_public, sample creation and eval_splits * update: descriptive stats on added eval_splits * update reference * add: contributed by * Update mteb/tasks/retrieval/eng/elastic_kb_retrieval.py Co-authored-by: Kenneth Enevoldsen <kenevoldsen@pm.me> * update: clarify that documents are real documents * update: license * Apply suggestion from @KennethEnevoldsen * Apply suggestion from @KennethEnevoldsen --------- Co-authored-by: Kenneth Enevoldsen <kenevoldsen@pm.me>
1 parent b81f9b0 commit e792bce

4 files changed

Lines changed: 172 additions & 0 deletions

File tree

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
{
2+
"synthetic_test": {
3+
"num_samples": 9942,
4+
"number_of_characters": 21726999,
5+
"documents_text_statistics": {
6+
"total_text_length": 21696425,
7+
"min_text_length": 97,
8+
"average_text_length": 2224.3618002870617,
9+
"max_text_length": 75820,
10+
"unique_texts": 9687
11+
},
12+
"documents_image_statistics": null,
13+
"documents_audio_statistics": null,
14+
"queries_text_statistics": {
15+
"total_text_length": 30574,
16+
"min_text_length": 96,
17+
"average_text_length": 162.62765957446808,
18+
"max_text_length": 269,
19+
"unique_texts": 188
20+
},
21+
"queries_image_statistics": null,
22+
"queries_audio_statistics": null,
23+
"relevant_docs_statistics": {
24+
"num_relevant_docs": 3100,
25+
"min_relevant_docs_per_query": 1,
26+
"average_relevant_docs_per_query": 16.48936170212766,
27+
"max_relevant_docs_per_query": 319,
28+
"unique_relevant_docs": 1710
29+
},
30+
"top_ranked_statistics": null,
31+
"hf_subset_descriptive_stats": {
32+
"en": {
33+
"num_samples": 9942,
34+
"number_of_characters": 21726999,
35+
"documents_text_statistics": {
36+
"total_text_length": 21696425,
37+
"min_text_length": 97,
38+
"average_text_length": 2224.3618002870617,
39+
"max_text_length": 75820,
40+
"unique_texts": 9687
41+
},
42+
"documents_image_statistics": null,
43+
"documents_audio_statistics": null,
44+
"queries_text_statistics": {
45+
"total_text_length": 30574,
46+
"min_text_length": 96,
47+
"average_text_length": 162.62765957446808,
48+
"max_text_length": 269,
49+
"unique_texts": 188
50+
},
51+
"queries_image_statistics": null,
52+
"queries_audio_statistics": null,
53+
"relevant_docs_statistics": {
54+
"num_relevant_docs": 3100,
55+
"min_relevant_docs_per_query": 1,
56+
"average_relevant_docs_per_query": 16.48936170212766,
57+
"max_relevant_docs_per_query": 319,
58+
"unique_relevant_docs": 1710
59+
},
60+
"top_ranked_statistics": null
61+
}
62+
}
63+
},
64+
"real_chat_test": {
65+
"num_samples": 9986,
66+
"number_of_characters": 21736626,
67+
"documents_text_statistics": {
68+
"total_text_length": 21696425,
69+
"min_text_length": 97,
70+
"average_text_length": 2224.3618002870617,
71+
"max_text_length": 75820,
72+
"unique_texts": 9687
73+
},
74+
"documents_image_statistics": null,
75+
"documents_audio_statistics": null,
76+
"queries_text_statistics": {
77+
"total_text_length": 40201,
78+
"min_text_length": 7,
79+
"average_text_length": 173.2801724137931,
80+
"max_text_length": 5247,
81+
"unique_texts": 232
82+
},
83+
"queries_image_statistics": null,
84+
"queries_audio_statistics": null,
85+
"relevant_docs_statistics": {
86+
"num_relevant_docs": 39016,
87+
"min_relevant_docs_per_query": 1,
88+
"average_relevant_docs_per_query": 168.17241379310346,
89+
"max_relevant_docs_per_query": 2729,
90+
"unique_relevant_docs": 7419
91+
},
92+
"top_ranked_statistics": null,
93+
"hf_subset_descriptive_stats": {
94+
"en": {
95+
"num_samples": 9986,
96+
"number_of_characters": 21736626,
97+
"documents_text_statistics": {
98+
"total_text_length": 21696425,
99+
"min_text_length": 97,
100+
"average_text_length": 2224.3618002870617,
101+
"max_text_length": 75820,
102+
"unique_texts": 9687
103+
},
104+
"documents_image_statistics": null,
105+
"documents_audio_statistics": null,
106+
"queries_text_statistics": {
107+
"total_text_length": 40201,
108+
"min_text_length": 7,
109+
"average_text_length": 173.2801724137931,
110+
"max_text_length": 5247,
111+
"unique_texts": 232
112+
},
113+
"queries_image_statistics": null,
114+
"queries_audio_statistics": null,
115+
"relevant_docs_statistics": {
116+
"num_relevant_docs": 39016,
117+
"min_relevant_docs_per_query": 1,
118+
"average_relevant_docs_per_query": 168.17241379310346,
119+
"max_relevant_docs_per_query": 2729,
120+
"unique_relevant_docs": 7419
121+
},
122+
"top_ranked_statistics": null
123+
}
124+
}
125+
}
126+
}

mteb/tasks/retrieval/eng/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@
121121
DiDeMoVT2ARetrieval,
122122
)
123123
from .edis_t2it_retrieval import EDIST2ITRetrieval
124+
from .elastic_kb_retrieval import ElasticKBRetrieval
124125
from .emo_vdb import EmoVDBA2TRetrieval, EmoVDBT2ARetrieval
125126
from .encyclopedia_vqa_it2it_retrieval import EncyclopediaVQAIT2ITRetrieval
126127
from .english_finance1_retrieval import EnglishFinance1Retrieval
@@ -491,6 +492,7 @@
491492
"DiDeMoVA2TRetrieval",
492493
"DiDeMoVT2ARetrieval",
493494
"EDIST2ITRetrieval",
495+
"ElasticKBRetrieval",
494496
"EmoVDBA2TRetrieval",
495497
"EmoVDBT2ARetrieval",
496498
"EncyclopediaVQAIT2ITRetrieval",
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from __future__ import annotations
2+
3+
from mteb.abstasks.retrieval import AbsTaskRetrieval
4+
from mteb.abstasks.task_metadata import TaskMetadata
5+
6+
7+
class ElasticKBRetrieval(AbsTaskRetrieval):
    """Text-to-text retrieval task over the Elastic support knowledge base.

    The task is evaluated on two splits, ``synthetic_test`` and
    ``real_chat_test``, and is registered as a private dataset
    (``is_public=False``). All configuration lives in ``metadata``; this
    class adds no behavior of its own on top of ``AbsTaskRetrieval``.
    """

    metadata = TaskMetadata(
        name="ElasticKBRetrieval",
        description=(
            "Retrieval benchmark built from the Elastic support knowledge base. "
            "Contains 9,754 documents (real documents from the Elastic support "
            "knowledge base) and 420 queries (232 from real-world support "
            "chat sessions, 188 synthetic queries generated from KB articles). "
            "Relevance judgments are augmented labels produced by exhaustive "
            "all-pairs LLM annotation using strict comparison to original doc "
            "(grounding doc that led to self-served ticket for real-world queries "
            "and generating doc for synthetic queries)."
        ),
        reference="https://huggingface.co/blog/rteb",  # private set
        dataset={
            "path": "mteb-private/elastic-kb-retrieval",
            "revision": "21bdf1e024bf7c9f46720017559ce2f8c6116507",
        },
        type="Retrieval",
        category="t2t",
        modalities=["text"],
        # One split per query source: synthetic queries and real chat queries.
        eval_splits=["synthetic_test", "real_chat_test"],
        eval_langs={"en": ["eng-Latn"]},
        main_score="ndcg_at_10",
        is_public=False,
        date=("2015-01-01", "2026-04-01"),
        domains=["Written", "Engineering"],
        task_subtypes=["Question answering", "Conversational retrieval"],
        # Shared as an evaluation dataset: results can be shared, and the
        # dataset is allowed to be sent to embedding APIs.
        license="not specified",
        annotations_creators="LM-generated",
        dialect=[],
        prompt={
            "query": "Given a support question, retrieve knowledge base articles that answer the question"
        },
        sample_creation="multiple",  # see description
        bibtex_citation="",
        contributed_by="Jina by Elastic",
    )

tests/test_abstasks/test_private_tasks.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"Vidore3TelecomRetrieval.v2",
2121
"Vidore3NuclearRetrieval.v2",
2222
"LexRetrieval.v1",
23+
"ElasticKBRetrieval",
2324
# Add task names here that are allowed to be private
2425
]
2526

0 commit comments

Comments
 (0)