From dbcfbabd1714df5cc4d1d2a99c300fc9c6a7c414 Mon Sep 17 00:00:00 2001
From: Wojciech Prazuch
Date: Wed, 22 Apr 2026 14:20:57 +0200
Subject: [PATCH] feat(scoring): add NEL-NMP integration contracts (Approach 3)

Defines the abstract interface between NEL and downstream metric
providers (e.g., NMP's nemo_evaluator_sdk) in a new module:
src/nemo_evaluator/scoring/contracts.py (~250 LOC).

Provides:
- Metric protocols (runtime_checkable, structural):
  * Metric -- core object-style metric contract
    (type, metric, compute_scores, score_names)
  * CorpusMetric -- corpus-level aggregation
  * MetricWithSecrets -- secret resolution protocol
  * MetricWithPreflight -- one-time preflight setup
- Pydantic result types (no heavy deps -- no pyarrow, no pandas):
  * MetricScore, MetricResult
  * ScoreStats, RubricScoreValue, RubricScoreStat
- SecretRefLike Protocol (abstract secret reference)
- SecretResolver type alias
- metric_as_scorer() helper: bridges object-style Metric to NEL's
  function-style (ScorerInput) -> dict scorer protocol

Design rationale (Approach 3):
- NMP's concrete metric implementations (BLEU, ROUGE, F1, RAGAS,
  LLMJudge, etc.) stay in nemo_evaluator_sdk with their external deps
  (sacrebleu, rouge_score, openai, ragas).
- NEL owns only the contract surface. NMP SDK will be updated in a
  follow-up PR to depend on NEL for these types, dropping its own
  copies and becoming thinner.
- Dependency direction is strictly one-way: SDK -> NEL. NEL never
  imports from any provider.
- Contracts are dep-light: pure Pydantic models + typing.Protocol, no
  pyarrow/pandas/openai/etc. leaking into NEL.

Verified:
- 14 new unit tests pass
- SDK's ExactMatchMetric structurally satisfies the Metric Protocol
  with zero code changes (isinstance check passes)
- All 162 SDK + contracts tests pass together

This commit is stacked on wprazuch/sdk-onboarding-approach1 so the
contracts land alongside the verbatim SDK copy for easier review. The
follow-up plan is:
1. Merge Approach 1 (verbatim copy) to dev/0.3.0
2. Merge this PR (contracts) to dev/0.3.0
3. Open a matching PR on NMP's nemo_evaluator_sdk to depend on NEL's
   contracts and drop its redundant abstraction copies

Signed-off-by: Wojciech Prazuch
---
 src/nemo_evaluator/scoring/__init__.py  |  14 ++
 src/nemo_evaluator/scoring/contracts.py | 288 ++++++++++++++++++++++++
 tests/test_scoring/test_contracts.py    | 195 ++++++++++++++++
 3 files changed, 497 insertions(+)
 create mode 100644 src/nemo_evaluator/scoring/contracts.py
 create mode 100644 tests/test_scoring/test_contracts.py

diff --git a/src/nemo_evaluator/scoring/__init__.py b/src/nemo_evaluator/scoring/__init__.py
index 3ca193cf7..292621f7d 100644
--- a/src/nemo_evaluator/scoring/__init__.py
+++ b/src/nemo_evaluator/scoring/__init__.py
@@ -22,6 +22,20 @@
 
 from typing import Callable
 
+from nemo_evaluator.scoring.contracts import (
+    CorpusMetric,
+    Metric,
+    MetricResult,
+    MetricScore,
+    MetricWithPreflight,
+    MetricWithSecrets,
+    RubricScoreStat,
+    RubricScoreValue,
+    ScoreStats,
+    SecretRefLike,
+    SecretResolver,
+    metric_as_scorer,
+)
 from nemo_evaluator.scoring.judge import (
     JudgeScoringConfig,
     build_judge_prompt,
diff --git a/src/nemo_evaluator/scoring/contracts.py b/src/nemo_evaluator/scoring/contracts.py
new file mode 100644
index 000000000..5ad2a7b00
--- /dev/null
+++ b/src/nemo_evaluator/scoring/contracts.py
@@ -0,0 +1,288 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contracts between NeMo Evaluator and downstream metric providers.
+
+This module defines the abstract interface that NMP's ``nemo_evaluator_sdk``
+and other metric providers satisfy. Concrete metric implementations (BLEU,
+ROUGE, LLM judge, RAGAS, etc.) live **outside** NEL — they are runtime
+code with external dependencies (sacrebleu, openai, ragas, etc.). NEL
+only owns the contract:
+
+- **Metric protocols** describing how NEL calls a metric
+- **Result types** describing what NEL receives back
+
+The design mirrors Python's ``typing.Protocol`` approach: concrete classes
+satisfy the contract structurally, no inheritance required. A metric
+provider library depends on NEL for these contracts; NEL never depends on
+any provider.
+
+See also ``nemo_evaluator.scoring.types.ScorerInput`` — the companion
+contract for function-style scorers ``(ScorerInput) -> dict``. Object-style
+metrics (this module) and function-style scorers coexist; ``metric_as_scorer``
+bridges the two.
+"""
+
+from __future__ import annotations
+
+import math
+from typing import Any, Awaitable, Callable, Protocol, runtime_checkable
+
+from pydantic import BaseModel, Field, field_serializer, field_validator
+
+
+__all__ = [
+    "CorpusMetric",
+    "Metric",
+    "MetricResult",
+    "MetricScore",
+    "MetricWithPreflight",
+    "MetricWithSecrets",
+    "RubricScoreStat",
+    "RubricScoreValue",
+    "ScoreStats",
+    "SecretRefLike",
+    "SecretResolver",
+    "metric_as_scorer",
+]
+
+
+# -----------------------------------------------------------------------------
+# Score value types
+# -----------------------------------------------------------------------------
+
+
+class RubricScoreValue(BaseModel):
+    """Rubric-based score definition for grading criteria."""
+
+    label: str = Field(description="Label for this rubric level.")
+    description: str | None = Field(
+        default=None,
+        description="Semantic meaning of the rubric level.",
+    )
+    value: float | int = Field(description="Score value assigned to this rubric level.")
+
+
+class RubricScoreStat(RubricScoreValue):
+    """Rubric score with sample-count statistics."""
+
+    count: int = Field(default=0, description="Number of samples at this rubric level.")
+
+
+class ScoreStats(BaseModel):
+    """Stats for a score. NaN floats serialize as the string ``"NaN"`` for JSON portability."""
+
+    count: int | None = None
+    sum: float | None = None
+    sum_squared: float | None = None
+    min: float | None = None
+    max: float | None = None
+    mean: float | None = None
+    variance: float | None = None
+    stddev: float | None = None
+    stderr: float | None = None
+    nan_count: int | None = None
+    rubric_distribution: list[RubricScoreStat] | None = None
+
+    @field_serializer("sum", "sum_squared", "min", "max", "mean", "variance", "stddev", "stderr")
+    def _serialize_nan(self, v: float | None) -> float | str | None:
+        if v is None:
+            return None
+        if isinstance(v, float) and math.isnan(v):
+            return "NaN"
+        return v
+
+
+class MetricScore(BaseModel):
+    """One named score emitted by a metric call."""
+
+    name: str
+    value: float
+    stats: ScoreStats | None = Field(
+        default=None,
+        description="Aggregate statistics for this score, if any.",
+    )
+
+    @field_validator("value", mode="before")
+    @classmethod
+    def _parse_value(cls, v: Any) -> Any:
+        if isinstance(v, str):
+            if v.strip().lower() == "nan":
+                return float("nan")
+            raise ValueError("The only string value allowed for 'value' is NaN")
+        return v
+
+    @field_serializer("value")
+    def _serialize_value(self, v: float) -> float | str:
+        if isinstance(v, float) and math.isnan(v):
+            return "NaN"
+        return v
+
+
+class MetricResult(BaseModel):
+    """Result of one metric call: one or more named scores."""
+
+    scores: list[MetricScore]
+
+
+# -----------------------------------------------------------------------------
+# Secret references (abstract)
+# -----------------------------------------------------------------------------
+
+
+@runtime_checkable
+class SecretRefLike(Protocol):
+    """Abstract secret reference.
+
+    Concrete ``SecretRef`` types (with Pydantic root validation, etc.) live in
+    NMP's ``nemo_evaluator_sdk.values.common``. NEL only needs the ``root``
+    accessor to describe the environment-variable name.
+    """
+
+    @property
+    def root(self) -> str: ...
+
+
+SecretResolver = Callable[[str], Awaitable[str | None]]
+
+
+# -----------------------------------------------------------------------------
+# Metric protocols
+# -----------------------------------------------------------------------------
+
+
+@runtime_checkable
+class Metric(Protocol):
+    """Structural contract for object-style metrics.
+
+    Concrete implementations (BLEU, ROUGE, F1, ExactMatch, StringCheck,
+    NumberCheck, ToolCalling, LLMJudge, Remote, RAGAS variants) live in
+    ``nemo_evaluator_sdk`` and other provider libraries. They satisfy this
+    Protocol by having the four members below — no base class required.
+
+    ``type`` is a public string identifier. Providers may implement it with a
+    plain string, a ``str``-backed ``Enum``, or a ``StrEnum``; consumers must
+    treat it as a string and not rely on enum-only APIs (``.value`` etc.).
+    """
+
+    @property
+    def type(self) -> str:
+        """Public string identifier for this metric."""
+        ...
+
+    def metric(self, item: dict, sample: dict, trace: Any = None) -> float | bool:
+        """Compute a single raw score for an (item, sample) pair."""
+        ...
+
+    async def compute_scores(self, item: dict, sample: dict) -> MetricResult:
+        """Compute structured scores for an (item, sample) pair."""
+        ...
+
+    def score_names(self) -> list[str]:
+        """Return the canonical score names this metric emits."""
+        ...
+
+
+@runtime_checkable
+class CorpusMetric(Protocol):
+    """Metrics that also emit corpus-level scores (e.g., BLEU corpus)."""
+
+    async def compute_corpus_scores(
+        self, items: list[dict], samples: list[dict]
+    ) -> MetricResult | None:
+        """Compute corpus-level scores across all evaluated rows."""
+        ...
+
+
+@runtime_checkable
+class MetricWithSecrets(Protocol):
+    """Metrics that depend on secrets (e.g., API keys for remote judges)."""
+
+    def secrets(self) -> dict[str, SecretRefLike]:
+        """Environment-variable names mapped to secret references."""
+        ...
+
+    async def resolve_secrets(self, resolver: SecretResolver) -> None:
+        """Resolve declared secrets via the provided resolver before use."""
+        ...
+
+
+@runtime_checkable
+class MetricWithPreflight(Protocol):
+    """Metrics that need one-time setup before parallel evaluation starts."""
+
+    async def preflight(self) -> None:
+        """Run one-time preflight (capability detection, warm-up, etc.)."""
+        ...
+
+
+# -----------------------------------------------------------------------------
+# Bridge: Metric -> NEL scorer callable
+# -----------------------------------------------------------------------------
+
+
+def _coerce_score(score: Any) -> float:
+    """Convert a raw ``Metric.metric`` return value to a plain ``float``.
+
+    ``bool``, ``int`` and ``float`` convert as usual, and so does any other
+    object that supports ``float()`` (NumPy scalars, ``Decimal``,
+    ``Fraction``). ``None``, strings, bytes and anything ``float()`` rejects
+    map to ``0.0`` -- the bridge is best-effort for non-numeric results.
+    """
+    if score is None or isinstance(score, (str, bytes, bytearray)):
+        return 0.0
+    try:
+        return float(score)
+    except (TypeError, ValueError):
+        return 0.0
+
+
+def metric_as_scorer(metric: Metric) -> Callable[[Any], dict]:
+    """Adapt a :class:`Metric` to NEL's function-style scorer protocol.
+
+    NEL's scoring registry (``nemo_evaluator.scoring._SCORER_REGISTRY``)
+    holds callables ``(ScorerInput) -> dict``. This helper wraps an object-
+    style ``Metric`` so it can register there without any glue code.
+
+    Mapping performed:
+
+    - ``item = {"reference": scorer_input.target, **scorer_input.metadata}``
+    - ``sample = {"output_text": scorer_input.response, "response": scorer_input.response}``
+    - ``score = metric.metric(item, sample)``
+
+    Metadata is merged after ``reference``, so a ``reference`` key in the
+    metadata overrides ``scorer_input.target``.
+
+    Providers whose metric templates reference these keys (e.g.
+    ``reference="{{ reference }}"``) plug in unchanged.
+
+    The returned scorer yields ``{"score": float, "metric_type": ...}``.
+    Any real-number-like score (including NumPy scalars and ``Decimal``) is
+    converted with ``float()``; non-numeric results fall back to ``0.0``.
+    """
+    from nemo_evaluator.scoring.types import ScorerInput  # local: avoid cycles
+
+    def _scorer(scorer_input: ScorerInput) -> dict:
+        item: dict = {"reference": scorer_input.target, **(scorer_input.metadata or {})}
+        sample: dict = {
+            "output_text": scorer_input.response,
+            "response": scorer_input.response,
+        }
+        return {
+            "score": _coerce_score(metric.metric(item, sample)),
+            "metric_type": getattr(metric, "type", type(metric).__name__),
+        }
+
+    return _scorer
diff --git a/tests/test_scoring/test_contracts.py b/tests/test_scoring/test_contracts.py
new file mode 100644
index 000000000..7c2faa4be
--- /dev/null
+++ b/tests/test_scoring/test_contracts.py
@@ -0,0 +1,195 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for nemo_evaluator.scoring.contracts.
+
+The contracts module is the Approach-3 integration boundary between NEL
+and NMP's nemo_evaluator_sdk. These tests verify the Protocols are
+runtime-checkable and that SDK-style concrete classes satisfy them.
+"""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+
+from nemo_evaluator.scoring.contracts import (
+    CorpusMetric,
+    Metric,
+    MetricResult,
+    MetricScore,
+    MetricWithPreflight,
+    MetricWithSecrets,
+    ScoreStats,
+    metric_as_scorer,
+)
+from nemo_evaluator.scoring.types import ScorerInput
+
+
+# -----------------------------------------------------------------------------
+# Pydantic result types
+# -----------------------------------------------------------------------------
+
+
+def test_metric_score_basic():
+    s = MetricScore(name="bleu", value=0.42)
+    assert s.name == "bleu"
+    assert s.value == pytest.approx(0.42)
+    assert s.stats is None
+
+
+def test_metric_score_nan_serialization():
+    """NaN values round-trip as the string 'NaN' for JSON portability."""
+    s = MetricScore(name="bleu", value=float("nan"))
+    dumped = s.model_dump()
+    assert dumped["value"] == "NaN"
+
+    restored = MetricScore.model_validate(dumped)
+    assert math.isnan(restored.value)
+
+
+def test_metric_score_rejects_non_nan_strings():
+    with pytest.raises(ValueError, match="NaN"):
+        MetricScore(name="bleu", value="not-a-number")
+
+
+def test_metric_result_holds_multiple_scores():
+    r = MetricResult(scores=[
+        MetricScore(name="bleu", value=0.3),
+        MetricScore(name="bleu-1", value=0.5),
+    ])
+    assert len(r.scores) == 2
+
+
+def test_score_stats_serializes_nan():
+    stats = ScoreStats(mean=float("nan"), stddev=0.1, count=10)
+    dumped = stats.model_dump()
+    assert dumped["mean"] == "NaN"
+    assert dumped["stddev"] == pytest.approx(0.1)
+    assert dumped["count"] == 10
+
+
+# -----------------------------------------------------------------------------
+# Protocol satisfaction (SDK-style concrete class)
+# -----------------------------------------------------------------------------
+
+
+class _DuckMetric:
+    """Fake metric class with SDK-style surface. No base class."""
+
+    type = "duck"
+
+    def metric(self, item: dict, sample: dict, trace=None) -> float:
+        return 1.0 if item.get("reference") == sample.get("output_text") else 0.0
+
+    async def compute_scores(self, item: dict, sample: dict) -> MetricResult:
+        return MetricResult(scores=[MetricScore(name="duck", value=self.metric(item, sample))])
+
+    def score_names(self) -> list[str]:
+        return ["duck"]
+
+
+class _DuckCorpus(_DuckMetric):
+    async def compute_corpus_scores(self, items, samples):
+        return MetricResult(scores=[MetricScore(name="duck_corpus", value=0.5)])
+
+
+class _DuckSecrets(_DuckMetric):
+    def secrets(self) -> dict:
+        return {"API_KEY": _SecretRef(root="API_KEY")}
+
+    async def resolve_secrets(self, resolver) -> None:
+        pass
+
+
+class _DuckPreflight(_DuckMetric):
+    async def preflight(self) -> None:
+        pass
+
+
+class _SecretRef:
+    def __init__(self, root: str):
+        self._root = root
+
+    @property
+    def root(self) -> str:
+        return self._root
+
+
+def test_duck_class_satisfies_metric_protocol():
+    """An SDK-style class satisfies the Metric Protocol structurally (no inheritance)."""
+    assert isinstance(_DuckMetric(), Metric)
+
+
+def test_corpus_metric_protocol():
+    assert isinstance(_DuckCorpus(), CorpusMetric)
+
+
+def test_metric_with_secrets_protocol():
+    assert isinstance(_DuckSecrets(), MetricWithSecrets)
+
+
+def test_metric_with_preflight_protocol():
+    assert isinstance(_DuckPreflight(), MetricWithPreflight)
+
+
+def test_plain_class_not_satisfying_protocol():
+    class NotAMetric:
+        pass
+
+    assert not isinstance(NotAMetric(), Metric)
+
+
+# -----------------------------------------------------------------------------
+# Bridge: metric_as_scorer
+# -----------------------------------------------------------------------------
+
+
+def test_metric_as_scorer_positive_match():
+    scorer = metric_as_scorer(_DuckMetric())
+    result = scorer(ScorerInput(response="hello", target="hello"))
+
+    assert isinstance(result, dict)
+    assert result["score"] == 1.0
+    assert result["metric_type"] == "duck"
+
+
+def test_metric_as_scorer_negative_match():
+    scorer = metric_as_scorer(_DuckMetric())
+    result = scorer(ScorerInput(response="hello", target="goodbye"))
+
+    assert result["score"] == 0.0
+
+
+def test_metric_as_scorer_preserves_metadata():
+    """Metadata from ScorerInput flows into item dict."""
+
+    class _InspectMetric(_DuckMetric):
+        captured_item: dict = {}
+
+        def metric(self, item, sample, trace=None):
+            _InspectMetric.captured_item = item
+            return 0.0
+
+    m = _InspectMetric()
+    scorer = metric_as_scorer(m)
+    scorer(ScorerInput(response="x", target="y", metadata={"problem_id": 7}))
+
+    assert _InspectMetric.captured_item["reference"] == "y"
+    assert _InspectMetric.captured_item["problem_id"] == 7
+
+
+# -----------------------------------------------------------------------------
+# Re-export from scoring package
+# -----------------------------------------------------------------------------
+
+
+def test_contracts_available_from_scoring_package():
+    from nemo_evaluator.scoring import Metric as ImportedMetric
+    from nemo_evaluator.scoring import MetricResult as ImportedResult
+    from nemo_evaluator.scoring import metric_as_scorer as imported_bridge
+
+    assert ImportedMetric is Metric
+    assert ImportedResult is MetricResult
+    assert imported_bridge is metric_as_scorer