Skip to content

Commit 40bac77

Browse files
committed
feat: move mteb_version to individual subsets in result files
Store mteb_version per subset in score dicts so that results evaluated with different MTEB versions can be merged without erasing existing subset scores. The top-level mteb_version is now computed as the latest version across all subsets.

- Add mteb_version to each subset's score dict in from_task_results()
- Remove mteb_version from the default merge criteria (is_mergeable/merge)
- Compute the top-level mteb_version from per-subset versions after merge
- Fix a bug in aggregated_task.py where mteb_version was always overwritten regardless of the version-mismatch condition

Closes #4354
1 parent e722b76 commit 40bac77

3 files changed

Lines changed: 136 additions & 8 deletions

File tree

mteb/abstasks/aggregated_task.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,9 @@ def combine_task_results(self, task_results: list[TaskResult]) -> TaskResult:
126126
msg = f"All tasks of {self.metadata.name} is not run using the same version. different versions found are: {mteb_versions}"
127127
logger.warning(msg)
128128
warnings.warn(msg)
129-
task_res.mteb_version = None
130-
task_res.mteb_version = task_results[0].mteb_version
129+
task_res.mteb_version = TaskResult._compute_top_level_mteb_version(
130+
task_res.scores
131+
)
131132
return task_res
132133

133134
def evaluate(

mteb/results/task_result.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ def from_task_results(
194194
"""
195195
task_meta = task.metadata
196196
subset2langscripts = task_meta.hf_subsets_to_langscripts
197+
mteb_ver = version("mteb")
197198
flat_scores = defaultdict(list)
198199
for split, hf_subset_scores in scores.items():
199200
for hf_subset, hf_scores in hf_subset_scores.items():
@@ -202,13 +203,14 @@ def from_task_results(
202203
**hf_scores,
203204
"hf_subset": hf_subset,
204205
"languages": eval_langs,
206+
"mteb_version": mteb_ver,
205207
}
206208
flat_scores[split].append(_scores)
207209

208210
return TaskResult(
209211
dataset_revision=task.metadata.revision,
210212
task_name=task.metadata.name,
211-
mteb_version=version("mteb"),
213+
mteb_version=mteb_ver,
212214
scores=flat_scores,
213215
evaluation_time=evaluation_time,
214216
kg_co2_emissions=kg_co2_emissions,
@@ -730,7 +732,6 @@ def is_mergeable(
730732
self,
731733
result: TaskResult | AbsTask,
732734
criteria: list[str] | list[Criteria] = [
733-
"mteb_version",
734735
"dataset_revision",
735736
],
736737
raise_error: bool = False,
@@ -739,7 +740,7 @@ def is_mergeable(
739740
740741
Args:
741742
result: The TaskResult or Task object to check against.
742-
criteria: Additional criteria to check for merging. Can be "mteb_version" or "dataset_revision".
743+
criteria: Additional criteria to check for merging. Can be "dataset_revision" or "mteb_version" (opt-in).
743744
It will always check that the task name match.
744745
raise_error: If True, raises an error if the objects cannot be merged. If False, returns False.
745746
@@ -789,7 +790,6 @@ def merge(
789790
self,
790791
new_results: TaskResult,
791792
criteria: list[str] | list[Criteria] = [
792-
"mteb_version",
793793
"dataset_revision",
794794
],
795795
) -> TaskResult:
@@ -831,10 +831,12 @@ def merge(
831831
date = self.date
832832
if new_results.date is not None and (date is None or new_results.date > date):
833833
date = new_results.date
834+
mteb_ver = self._compute_top_level_mteb_version(merged_scores)
835+
834836
merged_results = TaskResult(
835837
dataset_revision=new_results.dataset_revision,
836838
task_name=new_results.task_name,
837-
mteb_version=new_results.mteb_version,
839+
mteb_version=mteb_ver,
838840
scores=merged_scores,
839841
evaluation_time=merged_evaluation_time,
840842
kg_co2_emissions=merged_kg_co2_emissions,
@@ -843,6 +845,25 @@ def merge(
843845

844846
return merged_results
845847

848+
@staticmethod
849+
def _compute_top_level_mteb_version(
850+
scores: dict[SplitName, list[ScoresDict]],
851+
) -> str | None:
852+
"""Compute the top-level mteb_version from per-subset versions.
853+
854+
Returns the latest version found across all subsets, or None if no
855+
per-subset versions are present.
856+
"""
857+
versions: set[str] = set()
858+
for split_scores in scores.values():
859+
for subset_scores in split_scores:
860+
v = subset_scores.get("mteb_version")
861+
if v is not None:
862+
versions.add(v)
863+
if not versions:
864+
return None
865+
return str(max(Version(v) for v in versions))
866+
846867
@staticmethod
847868
def _merge_split_scores(
848869
existing_scores: list[ScoresDict], new_scores: list[ScoresDict]

tests/test_results/test_task_results.py

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,11 @@ def test_task_results_get_score(task_result: TaskResult):
8383

8484

8585
def test_task_results_to_dict(task_result: TaskResult):
86+
mteb_ver = version("mteb")
8687
dict_repr = {
8788
"dataset_revision": "1.0",
8889
"task_name": "dummy_task",
89-
"mteb_version": version("mteb"),
90+
"mteb_version": mteb_ver,
9091
"evaluation_time": 100,
9192
"date": None,
9293
"kg_co2_emissions": None,
@@ -96,11 +97,13 @@ def test_task_results_to_dict(task_result: TaskResult):
9697
"main_score": 0.5,
9798
"hf_subset": "en-de",
9899
"languages": ["eng-Latn", "deu-Latn"],
100+
"mteb_version": mteb_ver,
99101
},
100102
{
101103
"main_score": 0.6,
102104
"hf_subset": "en-fr",
103105
"languages": ["eng-Latn", "fra-Latn"],
106+
"mteb_version": mteb_ver,
104107
},
105108
]
106109
},
@@ -153,6 +156,109 @@ def test_task_results_validate_and_filter():
153156
assert res3.get_score() == (0.5 + 0.3) / 2 # only en-de scores
154157

155158

159+
def test_per_subset_mteb_version(task_result: TaskResult):
160+
"""Test that each subset score dict includes mteb_version."""
161+
mteb_ver = version("mteb")
162+
for split_scores in task_result.scores.values():
163+
for subset_scores in split_scores:
164+
assert "mteb_version" in subset_scores
165+
assert subset_scores["mteb_version"] == mteb_ver
166+
167+
168+
def test_merge_across_mteb_versions():
169+
"""Test that results from different MTEB versions can be merged."""
170+
existing = TaskResult(
171+
dataset_revision="1.0",
172+
task_name="dummy_task",
173+
mteb_version="2.12.4",
174+
scores={
175+
"train": [
176+
{
177+
"main_score": 0.5,
178+
"hf_subset": "en-de",
179+
"languages": ["eng-Latn", "deu-Latn"],
180+
"mteb_version": "2.12.4",
181+
},
182+
]
183+
},
184+
evaluation_time=50,
185+
)
186+
187+
new = TaskResult(
188+
dataset_revision="1.0",
189+
task_name="dummy_task",
190+
mteb_version="2.20.1",
191+
scores={
192+
"train": [
193+
{
194+
"main_score": 0.6,
195+
"hf_subset": "en-fr",
196+
"languages": ["eng-Latn", "fra-Latn"],
197+
"mteb_version": "2.20.1",
198+
},
199+
]
200+
},
201+
evaluation_time=60,
202+
)
203+
204+
assert existing.is_mergeable(new)
205+
merged = existing.merge(new)
206+
207+
# Both subsets should be present
208+
assert len(merged.scores["train"]) == 2
209+
subsets = {s["hf_subset"]: s for s in merged.scores["train"]}
210+
assert subsets["en-de"]["mteb_version"] == "2.12.4"
211+
assert subsets["en-fr"]["mteb_version"] == "2.20.1"
212+
213+
# Top-level version should be the latest
214+
assert merged.mteb_version == "2.20.1"
215+
216+
217+
def test_merge_without_per_subset_version():
218+
"""Test merging old results that lack per-subset mteb_version."""
219+
existing = TaskResult(
220+
dataset_revision="1.0",
221+
task_name="dummy_task",
222+
mteb_version="2.12.4",
223+
scores={
224+
"train": [
225+
{
226+
"main_score": 0.5,
227+
"hf_subset": "en-de",
228+
"languages": ["eng-Latn", "deu-Latn"],
229+
},
230+
]
231+
},
232+
evaluation_time=50,
233+
)
234+
235+
new = TaskResult(
236+
dataset_revision="1.0",
237+
task_name="dummy_task",
238+
mteb_version="2.20.1",
239+
scores={
240+
"train": [
241+
{
242+
"main_score": 0.6,
243+
"hf_subset": "en-fr",
244+
"languages": ["eng-Latn", "fra-Latn"],
245+
"mteb_version": "2.20.1",
246+
},
247+
]
248+
},
249+
evaluation_time=60,
250+
)
251+
252+
merged = existing.merge(new)
253+
# Old subset has no per-subset version, new one does
254+
subsets = {s["hf_subset"]: s for s in merged.scores["train"]}
255+
assert "mteb_version" not in subsets["en-de"]
256+
assert subsets["en-fr"]["mteb_version"] == "2.20.1"
257+
258+
# Top-level should be the latest found across subsets
259+
assert merged.mteb_version == "2.20.1"
260+
261+
156262
@pytest.mark.parametrize(
157263
"path", list((tests_folder / "historic_results").glob("*.json"))
158264
)

0 commit comments

Comments
 (0)