5 changes: 3 additions & 2 deletions mteb/abstasks/aggregated_task.py
@@ -126,8 +126,9 @@ def combine_task_results(self, task_results: list[TaskResult]) -> TaskResult:
             msg = f"All tasks of {self.metadata.name} is not run using the same version. different versions found are: {mteb_versions}"
             logger.warning(msg)
             warnings.warn(msg)
-            task_res.mteb_version = None
-        task_res.mteb_version = task_results[0].mteb_version
+        task_res.mteb_version = TaskResult._compute_top_level_mteb_version(
+            task_res.scores
+        )
         return task_res

     def evaluate(
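With this change, an aggregated task derives its `mteb_version` from the per-subset versions stored in the combined scores, rather than pinning it to the first subtask result (or `None` on mismatch). A minimal sketch of the resolution rule, assuming the import path `mteb.results.task_result` matches the file layout shown below and calling the helper defined later in this diff:

```python
from mteb.results.task_result import TaskResult  # path assumed from this diff

scores = {
    "test": [
        {"main_score": 0.5, "hf_subset": "en-de", "mteb_version": "2.12.4"},
        {"main_score": 0.6, "hf_subset": "en-fr", "mteb_version": "2.20.1"},
    ]
}
# Mixed versions collapse to a range; a single version stays as-is;
# no recorded versions yield None (see _compute_top_level_mteb_version below).
assert TaskResult._compute_top_level_mteb_version(scores) == "2.12.4-2.20.1"
assert TaskResult._compute_top_level_mteb_version({"test": []}) is None
```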
49 changes: 43 additions & 6 deletions mteb/results/task_result.py
@@ -12,7 +12,7 @@
 import numpy as np
 from huggingface_hub import EvalResult
 from packaging.version import Version
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, field_validator, model_validator
 from typing_extensions import deprecated

 from mteb._helpful_enum import HelpfulStrEnum
@@ -194,6 +194,7 @@ def from_task_results(
         """
         task_meta = task.metadata
         subset2langscripts = task_meta.hf_subsets_to_langscripts
+        mteb_ver = version("mteb")
         flat_scores = defaultdict(list)
         for split, hf_subset_scores in scores.items():
             for hf_subset, hf_scores in hf_subset_scores.items():
@@ -202,13 +203,14 @@
                     **hf_scores,
                     "hf_subset": hf_subset,
                     "languages": eval_langs,
+                    "mteb_version": mteb_ver,
                 }
                 flat_scores[split].append(_scores)

         return TaskResult(
             dataset_revision=task.metadata.revision,
             task_name=task.metadata.name,
-            mteb_version=version("mteb"),
+            mteb_version=mteb_ver,
             scores=flat_scores,
             evaluation_time=evaluation_time,
             kg_co2_emissions=kg_co2_emissions,
@@ -227,6 +229,17 @@ def _validate_scores(
                 cls._validate_scores_dict(hf_subset_score)
         return v

+    @model_validator(mode="after")
+    def _backfill_per_subset_mteb_version(self) -> Self:
+        """Backfill mteb_version from top-level into subsets that lack it."""
+        if self.mteb_version is None:
+            return self
+        for split_scores in self.scores.values():
+            for subset_scores in split_scores:
+                if "mteb_version" not in subset_scores:
+                    subset_scores["mteb_version"] = self.mteb_version  # type: ignore[index]
+        return self
+
     @staticmethod
     def _validate_scores_dict(scores: ScoresDict) -> None:
         if "main_score" not in scores:
@@ -730,7 +743,6 @@ def is_mergeable(
         self,
         result: TaskResult | AbsTask,
         criteria: list[str] | list[Criteria] = [
-            "mteb_version",
"dataset_revision",
],
raise_error: bool = False,
@@ -739,7 +751,7 @@
 
         Args:
             result: The TaskResult or Task object to check against.
-            criteria: Additional criteria to check for merging. Can be "mteb_version" or "dataset_revision".
+            criteria: Additional criteria to check for merging. Can be "dataset_revision" or "mteb_version" (opt-in).
                 It will always check that the task name match.
             raise_error: If True, raises an error if the objects cannot be merged. If False, returns False.
Expand Down Expand Up @@ -789,7 +801,6 @@ def merge(
self,
new_results: TaskResult,
criteria: list[str] | list[Criteria] = [
"mteb_version",
"dataset_revision",
],
) -> TaskResult:
@@ -831,10 +842,12 @@ def merge(
         date = self.date
         if new_results.date is not None and (date is None or new_results.date > date):
             date = new_results.date
+        mteb_ver = self._compute_top_level_mteb_version(merged_scores)
+
         merged_results = TaskResult(
             dataset_revision=new_results.dataset_revision,
             task_name=new_results.task_name,
-            mteb_version=new_results.mteb_version,
+            mteb_version=mteb_ver,
             scores=merged_scores,
             evaluation_time=merged_evaluation_time,
             kg_co2_emissions=merged_kg_co2_emissions,
@@ -843,6 +856,30 @@
 
         return merged_results

+    @staticmethod
+    def _compute_top_level_mteb_version(
+        scores: dict[SplitName, list[ScoresDict]],
+    ) -> str | None:
+        """Compute the top-level mteb_version from per-subset versions.
+
+        Returns a version range (e.g. "2.12.0-2.12.19") if subsets were
+        evaluated with different versions, a single version if all match,
+        or None if no per-subset versions are present.
+        """
+        versions: set[str] = set()
+        for split_scores in scores.values():
+            for subset_scores in split_scores:
+                v = subset_scores.get("mteb_version")
+                if v is not None:
+                    versions.add(v)
+        if not versions:
+            return None
+        min_ver = str(min(Version(v) for v in versions))
+        max_ver = str(max(Version(v) for v in versions))
+        if min_ver == max_ver:
+            return min_ver
+        return f"{min_ver}-{max_ver}"
+
     @staticmethod
     def _merge_split_scores(
         existing_scores: list[ScoresDict], new_scores: list[ScoresDict]
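Note that the range endpoints are ordered with `packaging.version.Version`, not lexicographically; a string comparison would mis-order releases such as "2.9.0" and "2.12.0". A standalone sketch of the same rule:

```python
from packaging.version import Version

subset_versions = ["2.12.0", "2.9.0", "2.12.19"]
ordered = sorted(subset_versions, key=Version)  # semantic, not string, order
assert ordered == ["2.9.0", "2.12.0", "2.12.19"]
assert f"{ordered[0]}-{ordered[-1]}" == "2.9.0-2.12.19"
# A plain string sort would put "2.12.19" before "2.9.0" (since "1" < "9")
# and produce the nonsensical range "2.12.0-2.9.0".
```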
108 changes: 107 additions & 1 deletion tests/test_results/test_task_results.py
@@ -83,10 +83,11 @@ def test_task_results_get_score(task_result: TaskResult):
 
 
 def test_task_results_to_dict(task_result: TaskResult):
+    mteb_ver = version("mteb")
     dict_repr = {
         "dataset_revision": "1.0",
         "task_name": "dummy_task",
-        "mteb_version": version("mteb"),
+        "mteb_version": mteb_ver,
         "evaluation_time": 100,
         "date": None,
         "kg_co2_emissions": None,
@@ -96,11 +97,13 @@ def test_task_results_to_dict(task_result: TaskResult):
                 "main_score": 0.5,
                 "hf_subset": "en-de",
                 "languages": ["eng-Latn", "deu-Latn"],
+                "mteb_version": mteb_ver,
             },
             {
                 "main_score": 0.6,
                 "hf_subset": "en-fr",
                 "languages": ["eng-Latn", "fra-Latn"],
+                "mteb_version": mteb_ver,
             },
         ]
     },
@@ -153,6 +156,109 @@ def test_task_results_validate_and_filter():
     assert res3.get_score() == (0.5 + 0.3) / 2  # only en-de scores
 
 
+def test_per_subset_mteb_version(task_result: TaskResult):
+    """Test that each subset score dict includes mteb_version."""
+    mteb_ver = version("mteb")
+    for split_scores in task_result.scores.values():
+        for subset_scores in split_scores:
+            assert "mteb_version" in subset_scores
+            assert subset_scores["mteb_version"] == mteb_ver
+
+
+def test_merge_across_mteb_versions():
+    """Test that results from different MTEB versions can be merged."""
+    existing = TaskResult(
+        dataset_revision="1.0",
+        task_name="dummy_task",
+        mteb_version="2.12.4",
+        scores={
+            "train": [
+                {
+                    "main_score": 0.5,
+                    "hf_subset": "en-de",
+                    "languages": ["eng-Latn", "deu-Latn"],
+                    "mteb_version": "2.12.4",
+                },
+            ]
+        },
+        evaluation_time=50,
+    )
+
+    new = TaskResult(
+        dataset_revision="1.0",
+        task_name="dummy_task",
+        mteb_version="2.20.1",
+        scores={
+            "train": [
+                {
+                    "main_score": 0.6,
+                    "hf_subset": "en-fr",
+                    "languages": ["eng-Latn", "fra-Latn"],
+                    "mteb_version": "2.20.1",
+                },
+            ]
+        },
+        evaluation_time=60,
+    )
+
+    assert existing.is_mergeable(new)
+    merged = existing.merge(new)
+
+    # Both subsets should be present
+    assert len(merged.scores["train"]) == 2
+    subsets = {s["hf_subset"]: s for s in merged.scores["train"]}
+    assert subsets["en-de"]["mteb_version"] == "2.12.4"
+    assert subsets["en-fr"]["mteb_version"] == "2.20.1"
+
+    # Top-level version should be a range when subsets differ
+    assert merged.mteb_version == "2.12.4-2.20.1"
+
+
+def test_merge_without_per_subset_version():
+    """Test merging old results that lack per-subset mteb_version."""
+    existing = TaskResult(
+        dataset_revision="1.0",
+        task_name="dummy_task",
+        mteb_version="2.12.4",
+        scores={
+            "train": [
+                {
+                    "main_score": 0.5,
+                    "hf_subset": "en-de",
+                    "languages": ["eng-Latn", "deu-Latn"],
+                },
+            ]
+        },
+        evaluation_time=50,
+    )
+
+    new = TaskResult(
+        dataset_revision="1.0",
+        task_name="dummy_task",
+        mteb_version="2.20.1",
+        scores={
+            "train": [
+                {
+                    "main_score": 0.6,
+                    "hf_subset": "en-fr",
+                    "languages": ["eng-Latn", "fra-Latn"],
+                    "mteb_version": "2.20.1",
+                },
+            ]
+        },
+        evaluation_time=60,
+    )
+
+    merged = existing.merge(new)
+    # Old subset has no per-subset version — backfilled from top-level
+    subsets = {s["hf_subset"]: s for s in merged.scores["train"]}
+    assert subsets["en-de"]["mteb_version"] == "2.12.4"
+    assert subsets["en-fr"]["mteb_version"] == "2.20.1"
+
+    # Top-level should be a range across all subset versions
+    assert merged.mteb_version == "2.12.4-2.20.1"
+
+
 @pytest.mark.parametrize(
     "path", list((tests_folder / "historic_results").glob("*.json"))
 )