Skip to content

Commit cb55fea

Browse files
committed
feat: move mteb_version to individual subsets in result files
Store mteb_version per subset in score dicts so that results evaluated with different MTEB versions can be merged without erasing existing subset scores. The top-level mteb_version is now computed as the latest version across all subsets.

- Add mteb_version to each subset's score dict in from_task_results()
- Remove mteb_version from default merge criteria (is_mergeable/merge)
- Compute top-level mteb_version from per-subset versions after merge
- Fix bug in aggregated_task.py where mteb_version was always overwritten regardless of version mismatch condition

Closes #4354
1 parent e722b76 commit cb55fea

3 files changed

Lines changed: 136 additions & 17 deletions

File tree

mteb/abstasks/aggregated_task.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,9 @@ def combine_task_results(self, task_results: list[TaskResult]) -> TaskResult:
126126
msg = f"All tasks of {self.metadata.name} is not run using the same version. different versions found are: {mteb_versions}"
127127
logger.warning(msg)
128128
warnings.warn(msg)
129-
task_res.mteb_version = None
130-
task_res.mteb_version = task_results[0].mteb_version
129+
task_res.mteb_version = TaskResult._compute_top_level_mteb_version(
130+
task_res.scores
131+
)
131132
return task_res
132133

133134
def evaluate(

mteb/results/task_result.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ def from_task_results(
194194
"""
195195
task_meta = task.metadata
196196
subset2langscripts = task_meta.hf_subsets_to_langscripts
197+
mteb_ver = version("mteb")
197198
flat_scores = defaultdict(list)
198199
for split, hf_subset_scores in scores.items():
199200
for hf_subset, hf_scores in hf_subset_scores.items():
@@ -202,13 +203,14 @@ def from_task_results(
202203
**hf_scores,
203204
"hf_subset": hf_subset,
204205
"languages": eval_langs,
206+
"mteb_version": mteb_ver,
205207
}
206208
flat_scores[split].append(_scores)
207209

208210
return TaskResult(
209211
dataset_revision=task.metadata.revision,
210212
task_name=task.metadata.name,
211-
mteb_version=version("mteb"),
213+
mteb_version=mteb_ver,
212214
scores=flat_scores,
213215
evaluation_time=evaluation_time,
214216
kg_co2_emissions=kg_co2_emissions,
@@ -730,7 +732,6 @@ def is_mergeable(
730732
self,
731733
result: TaskResult | AbsTask,
732734
criteria: list[str] | list[Criteria] = [
733-
"mteb_version",
734735
"dataset_revision",
735736
],
736737
raise_error: bool = False,
@@ -739,7 +740,7 @@ def is_mergeable(
739740
740741
Args:
741742
result: The TaskResult or Task object to check against.
742-
criteria: Additional criteria to check for merging. Can be "mteb_version" or "dataset_revision".
743+
criteria: Additional criteria to check for merging. Can be "dataset_revision".
743744
It will always check that the task name match.
744745
raise_error: If True, raises an error if the objects cannot be merged. If False, returns False.
745746
@@ -750,9 +751,7 @@ def is_mergeable(
750751
if isinstance(result, TaskResult):
751752
name = result.task_name
752753
revision = result.dataset_revision
753-
mteb_version = result.mteb_version
754754
elif isinstance(result, AbsTask):
755-
mteb_version = version("mteb")
756755
name = result.metadata.name
757756
revision = result.metadata.revision
758757
else:
@@ -769,13 +768,6 @@ def is_mergeable(
769768
logger.debug(msg)
770769
return False
771770

772-
if Criteria.MTEB_VERSION in criteria and self.mteb_version != mteb_version:
773-
msg = f"Cannot merge TaskResult objects as they are derived from different MTEB versions ({self.mteb_version} (loaded) and {mteb_version} (current))"
774-
if raise_error:
775-
raise ValueError(msg)
776-
logger.debug(msg)
777-
return False
778-
779771
if Criteria.DATASET_REVISION in criteria and self.dataset_revision != revision:
780772
msg = f"Cannot merge TaskResult objects as they are derived from different dataset revisions ({self.dataset_revision} and {revision})"
781773
if raise_error:
@@ -789,7 +781,6 @@ def merge(
789781
self,
790782
new_results: TaskResult,
791783
criteria: list[str] | list[Criteria] = [
792-
"mteb_version",
793784
"dataset_revision",
794785
],
795786
) -> TaskResult:
@@ -831,10 +822,12 @@ def merge(
831822
date = self.date
832823
if new_results.date is not None and (date is None or new_results.date > date):
833824
date = new_results.date
825+
mteb_ver = self._compute_top_level_mteb_version(merged_scores)
826+
834827
merged_results = TaskResult(
835828
dataset_revision=new_results.dataset_revision,
836829
task_name=new_results.task_name,
837-
mteb_version=new_results.mteb_version,
830+
mteb_version=mteb_ver,
838831
scores=merged_scores,
839832
evaluation_time=merged_evaluation_time,
840833
kg_co2_emissions=merged_kg_co2_emissions,
@@ -843,6 +836,25 @@ def merge(
843836

844837
return merged_results
845838

839+
@staticmethod
840+
def _compute_top_level_mteb_version(
841+
scores: dict[SplitName, list[ScoresDict]],
842+
) -> str | None:
843+
"""Compute the top-level mteb_version from per-subset versions.
844+
845+
Returns the latest version found across all subsets, or None if no
846+
per-subset versions are present.
847+
"""
848+
versions: set[str] = set()
849+
for split_scores in scores.values():
850+
for subset_scores in split_scores:
851+
v = subset_scores.get("mteb_version")
852+
if v is not None:
853+
versions.add(v)
854+
if not versions:
855+
return None
856+
return str(max(Version(v) for v in versions))
857+
846858
@staticmethod
847859
def _merge_split_scores(
848860
existing_scores: list[ScoresDict], new_scores: list[ScoresDict]

tests/test_results/test_task_results.py

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,11 @@ def test_task_results_get_score(task_result: TaskResult):
8383

8484

8585
def test_task_results_to_dict(task_result: TaskResult):
86+
mteb_ver = version("mteb")
8687
dict_repr = {
8788
"dataset_revision": "1.0",
8889
"task_name": "dummy_task",
89-
"mteb_version": version("mteb"),
90+
"mteb_version": mteb_ver,
9091
"evaluation_time": 100,
9192
"date": None,
9293
"kg_co2_emissions": None,
@@ -96,11 +97,13 @@ def test_task_results_to_dict(task_result: TaskResult):
9697
"main_score": 0.5,
9798
"hf_subset": "en-de",
9899
"languages": ["eng-Latn", "deu-Latn"],
100+
"mteb_version": mteb_ver,
99101
},
100102
{
101103
"main_score": 0.6,
102104
"hf_subset": "en-fr",
103105
"languages": ["eng-Latn", "fra-Latn"],
106+
"mteb_version": mteb_ver,
104107
},
105108
]
106109
},
@@ -153,6 +156,109 @@ def test_task_results_validate_and_filter():
153156
assert res3.get_score() == (0.5 + 0.3) / 2 # only en-de scores
154157

155158

159+
def test_per_subset_mteb_version(task_result: TaskResult):
160+
"""Test that each subset score dict includes mteb_version."""
161+
mteb_ver = version("mteb")
162+
for split_scores in task_result.scores.values():
163+
for subset_scores in split_scores:
164+
assert "mteb_version" in subset_scores
165+
assert subset_scores["mteb_version"] == mteb_ver
166+
167+
168+
def test_merge_across_mteb_versions():
169+
"""Test that results from different MTEB versions can be merged."""
170+
existing = TaskResult(
171+
dataset_revision="1.0",
172+
task_name="dummy_task",
173+
mteb_version="2.12.4",
174+
scores={
175+
"train": [
176+
{
177+
"main_score": 0.5,
178+
"hf_subset": "en-de",
179+
"languages": ["eng-Latn", "deu-Latn"],
180+
"mteb_version": "2.12.4",
181+
},
182+
]
183+
},
184+
evaluation_time=50,
185+
)
186+
187+
new = TaskResult(
188+
dataset_revision="1.0",
189+
task_name="dummy_task",
190+
mteb_version="2.20.1",
191+
scores={
192+
"train": [
193+
{
194+
"main_score": 0.6,
195+
"hf_subset": "en-fr",
196+
"languages": ["eng-Latn", "fra-Latn"],
197+
"mteb_version": "2.20.1",
198+
},
199+
]
200+
},
201+
evaluation_time=60,
202+
)
203+
204+
assert existing.is_mergeable(new)
205+
merged = existing.merge(new)
206+
207+
# Both subsets should be present
208+
assert len(merged.scores["train"]) == 2
209+
subsets = {s["hf_subset"]: s for s in merged.scores["train"]}
210+
assert subsets["en-de"]["mteb_version"] == "2.12.4"
211+
assert subsets["en-fr"]["mteb_version"] == "2.20.1"
212+
213+
# Top-level version should be the latest
214+
assert merged.mteb_version == "2.20.1"
215+
216+
217+
def test_merge_without_per_subset_version():
218+
"""Test merging old results that lack per-subset mteb_version."""
219+
existing = TaskResult(
220+
dataset_revision="1.0",
221+
task_name="dummy_task",
222+
mteb_version="2.12.4",
223+
scores={
224+
"train": [
225+
{
226+
"main_score": 0.5,
227+
"hf_subset": "en-de",
228+
"languages": ["eng-Latn", "deu-Latn"],
229+
},
230+
]
231+
},
232+
evaluation_time=50,
233+
)
234+
235+
new = TaskResult(
236+
dataset_revision="1.0",
237+
task_name="dummy_task",
238+
mteb_version="2.20.1",
239+
scores={
240+
"train": [
241+
{
242+
"main_score": 0.6,
243+
"hf_subset": "en-fr",
244+
"languages": ["eng-Latn", "fra-Latn"],
245+
"mteb_version": "2.20.1",
246+
},
247+
]
248+
},
249+
evaluation_time=60,
250+
)
251+
252+
merged = existing.merge(new)
253+
# Old subset has no per-subset version, new one does
254+
subsets = {s["hf_subset"]: s for s in merged.scores["train"]}
255+
assert "mteb_version" not in subsets["en-de"]
256+
assert subsets["en-fr"]["mteb_version"] == "2.20.1"
257+
258+
# Top-level should be the latest found across subsets
259+
assert merged.mteb_version == "2.20.1"
260+
261+
156262
@pytest.mark.parametrize(
157263
"path", list((tests_folder / "historic_results").glob("*.json"))
158264
)

0 commit comments

Comments
 (0)