diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py
index 92c2c574a..d88a3adda 100644
--- a/src/lighteval/metrics/metrics_corpus.py
+++ b/src/lighteval/metrics/metrics_corpus.py
@@ -28,6 +28,7 @@
 import logging
 import math
 from abc import ABC, abstractmethod
+from itertools import zip_longest
 from typing import Literal
 
 import numpy as np
@@ -142,7 +143,7 @@ def get_metric(self):
     def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
         """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation."""
         metric = self.get_metric()
-        golds = [i.golds for i in items]
+        golds = [as_list(i.golds) for i in items]
         preds = []
         for i in items:
             pred = as_list(i.preds)
@@ -153,9 +154,12 @@ def compute_corpus(self, items: list[GenerativeCorpusMetricInput]) -> float:
             preds.append(pred[0])
 
         if self.metric_type == "bleu":
-            golds = [[gold[0] for gold in golds]]
+            references = [[gold[0] for gold in golds]]
+        else:
+            # SacreBLEU expects references as [reference_id][sample_id].
+            references = [list(ref_group) for ref_group in zip_longest(*golds, fillvalue=None)]
 
-        corpus_score = metric.corpus_score(hypotheses=preds, references=golds)
+        corpus_score = metric.corpus_score(hypotheses=preds, references=references)
         score = corpus_score.score
         results = float(score)
         return results
diff --git a/tests/unit/metrics/test_cases/chrf.json b/tests/unit/metrics/test_cases/chrf.json
index f55028674..1d11195d8 100644
--- a/tests/unit/metrics/test_cases/chrf.json
+++ b/tests/unit/metrics/test_cases/chrf.json
@@ -79,9 +79,9 @@
                     "text": ["Das Wetter ist schön"]
                 }
             ],
-            "expected_output": 100.0,
+            "expected_output": 89.1577277178731,
             "tolerance": 0.1,
-            "description": "High similarity - minor character differences (CHRF ≈ 88.0)"
+            "description": "High similarity - minor character differences (CHRF ≈ 89.16)"
         },
         {
             "name": "CHRF - Word Order Changes",
@@ -119,9 +119,9 @@
                     "text": ["Lernen Maschinelles"]
                 }
             ],
-            "expected_output": 78.84,
+            "expected_output": 82.21013454114589,
             "tolerance": 0.1,
-            "description": "Word order changes - same characters, different order (CHRF ≈ 75.0)"
+            "description": "Word order changes - same characters, different order (CHRF ≈ 82.21)"
         },
         {
             "name": "CHRF - Moderate Similarity",
@@ -159,9 +159,9 @@
                     "text": ["Die Sterne"]
                 }
             ],
-            "expected_output": 37.68,
+            "expected_output": 34.847705825542,
             "tolerance": 0.1,
-            "description": "Moderate similarity - partial character overlap (CHRF ≈ 50.0)"
+            "description": "Moderate similarity - partial character overlap (CHRF ≈ 34.85)"
         },
         {
             "name": "CHRF - Low Similarity",
@@ -199,9 +199,9 @@
                     "text": ["Es sehr heiß"]
                 }
             ],
-            "expected_output": 7.7,
+            "expected_output": 6.82996024778865,
             "tolerance": 0.1,
-            "description": "Low similarity - minimal character overlap (CHRF ≈ 20.0)"
+            "description": "Low similarity - minimal character overlap (CHRF ≈ 6.83)"
         }
     ]
 }
diff --git a/tests/unit/metrics/test_cases/chrf_plus.json b/tests/unit/metrics/test_cases/chrf_plus.json
index 29c45720d..89061449e 100644
--- a/tests/unit/metrics/test_cases/chrf_plus.json
+++ b/tests/unit/metrics/test_cases/chrf_plus.json
@@ -79,9 +79,9 @@
                     "text": ["Das Wetter ist schön"]
                 }
             ],
-            "expected_output": 100.0,
+            "expected_output": 88.06512778139042,
             "tolerance": 0.1,
-            "description": "High similarity - minor character differences (CHRF++ ≈ 85.0)"
+            "description": "High similarity - minor character differences (CHRF++ ≈ 88.07)"
         },
         {
             "name": "CHRF Plus - Moderate Similarity",
@@ -119,9 +119,9 @@
                     "text": ["ML"]
                 }
             ],
-            "expected_output": 58.82,
+            "expected_output": 24.450573381865112,
             "tolerance": 0.1,
-            "description": "Moderate similarity - significant character omissions (CHRF++ ≈ 45.0)"
+            "description": "Moderate similarity - significant character omissions (CHRF++ ≈ 24.45)"
         },
         {
             "name": "CHRF Plus - Low Similarity",
@@ -159,9 +159,9 @@
                     "text": ["Es heiß"]
                 }
             ],
-            "expected_output": 15.0,
-            "tolerance": 10.0,
-            "description": "Low similarity - minimal character overlap (CHRF++ ≈ 15.0)"
+            "expected_output": 2.733143823910822,
+            "tolerance": 0.1,
+            "description": "Low similarity - minimal character overlap (CHRF++ ≈ 2.73)"
         }
     ]
 }
diff --git a/tests/unit/metrics/test_cases/ter.json b/tests/unit/metrics/test_cases/ter.json
index 39b671b0f..2986df5c7 100644
--- a/tests/unit/metrics/test_cases/ter.json
+++ b/tests/unit/metrics/test_cases/ter.json
@@ -79,9 +79,9 @@
                     "text": ["Das Wetter ist schön"]
                 }
             ],
-            "expected_output": 0.0,
+            "expected_output": 9.090909090909092,
             "tolerance": 0.05,
-            "description": "Minor edits - small word differences"
+            "description": "Minor edits - small word differences (TER ≈ 9.09)"
         },
         {
             "name": "TER - Major Edits",
@@ -159,9 +159,9 @@
                     "text": ["Es ist sehr heiß"]
                 }
             ],
-            "expected_output": 80.0,
+            "expected_output": 100.0,
             "tolerance": 0.1,
-            "description": "Completely different translations - maximum edit distance"
+            "description": "Completely different translations - maximum edit distance (TER = 100.0)"
         }
     ]
 }
diff --git a/tests/unit/metrics/test_corpus_translation_metrics.py b/tests/unit/metrics/test_corpus_translation_metrics.py
new file mode 100644
index 000000000..cfada215f
--- /dev/null
+++ b/tests/unit/metrics/test_corpus_translation_metrics.py
@@ -0,0 +1,75 @@
+# MIT License
+#
+# Copyright (c) 2024 The HuggingFace Team
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from itertools import zip_longest
+
+import pytest
+
+from lighteval.metrics.metrics_corpus import CorpusLevelTranslationMetric
+from lighteval.metrics.sample_preparator import GenerativeCorpusMetricInput
+from lighteval.utils.utils import as_list
+
+
+def _transpose_references(items: list[GenerativeCorpusMetricInput]) -> list[list[str | None]]:
+    per_sample_references = [as_list(item.golds) for item in items]
+    return [list(ref_group) for ref_group in zip_longest(*per_sample_references, fillvalue=None)]
+
+
+def _first_prediction_per_sample(items: list[GenerativeCorpusMetricInput]) -> list[str]:
+    return [as_list(item.preds)[0] for item in items]
+
+
+@pytest.mark.parametrize("metric_type", ["chrf", "chrf++", "ter"])
+def test_translation_metrics_use_all_hypotheses(metric_type: str):
+    items = [
+        GenerativeCorpusMetricInput(golds=["GOOD"], preds=["GOOD"]),
+        GenerativeCorpusMetricInput(golds=["REF2"], preds=["PRED2"]),
+    ]
+    metric = CorpusLevelTranslationMetric(metric_type=metric_type)
+
+    hypotheses = _first_prediction_per_sample(items)
+    expected_references = _transpose_references(items)
+    wrong_orientation_references = [item.golds for item in items]
+
+    expected_score = metric.get_metric().corpus_score(hypotheses=hypotheses, references=expected_references).score
+    wrong_score = (
+        metric.get_metric().corpus_score(hypotheses=hypotheses, references=wrong_orientation_references).score
+    )
+    actual_score = metric.compute_corpus(items)
+
+    assert actual_score == pytest.approx(expected_score)
+    assert wrong_score != pytest.approx(expected_score)
+
+
+@pytest.mark.parametrize("metric_type", ["chrf", "chrf++", "ter"])
+def test_translation_metrics_support_variable_reference_counts(metric_type: str):
+    items = [
+        GenerativeCorpusMetricInput(golds=["the cat sits", "cat is sitting"], preds=["the cat sits"]),
+        GenerativeCorpusMetricInput(golds=["goodbye"], preds=["hello"]),
+    ]
+    metric = CorpusLevelTranslationMetric(metric_type=metric_type)
+
+    hypotheses = _first_prediction_per_sample(items)
+    references = _transpose_references(items)
+    expected_score = metric.get_metric().corpus_score(hypotheses=hypotheses, references=references).score
+
+    assert metric.compute_corpus(items) == pytest.approx(expected_score)
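For context, a minimal standalone sketch (not part of the patch) of the reshaping `compute_corpus` now performs before handing data to sacrebleu. The example sentences are invented; it assumes sacrebleu >= 2.0, whose `corpus_score` accepts `None` placeholders so samples may carry different numbers of references:

```python
from itertools import zip_longest

from sacrebleu.metrics import CHRF

# Per-sample golds, as lighteval collects them: golds[sample_id] -> list of references.
golds = [["the cat sits", "cat is sitting"], ["goodbye"]]
preds = ["the cat sits", "hello"]

# SacreBLEU wants the transpose, references[reference_id][sample_id].
# zip_longest pads samples that have fewer references with None, which
# sacrebleu skips when it extracts reference statistics.
references = [list(ref_group) for ref_group in zip_longest(*golds, fillvalue=None)]
assert references == [["the cat sits", "goodbye"], ["cat is sitting", None]]

print(CHRF().corpus_score(hypotheses=preds, references=references).score)
```

Passing the untransposed `golds` instead would make sacrebleu read each sample's gold list as an entire reference stream, which is the orientation bug the new tests pin down.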