From dbcfbabd1714df5cc4d1d2a99c300fc9c6a7c414 Mon Sep 17 00:00:00 2001
From: Wojciech Prazuch <wprazuch@nvidia.com>
Date: Wed, 22 Apr 2026 14:20:57 +0200
Subject: [PATCH] feat(scoring): add NEL-NMP integration contracts (Approach 3)

Defines the abstract interface between NEL and downstream metric
providers (e.g., NMP's nemo_evaluator_sdk) in a new module:
src/nemo_evaluator/scoring/contracts.py (267 lines).

Provides:
- Metric protocols (runtime_checkable, structural):
  * Metric -- core object-style metric contract (type, metric,
    compute_scores, score_names)
  * CorpusMetric -- corpus-level aggregation
  * MetricWithSecrets -- secret resolution protocol
  * MetricWithPreflight -- one-time preflight setup
- Pydantic result types (no heavy deps -- no pyarrow, no pandas):
  * MetricScore, MetricResult
  * ScoreStats, RubricScoreValue, RubricScoreStat
- SecretRefLike Protocol (abstract secret reference)
- SecretResolver type alias
- metric_as_scorer() helper: bridges object-style Metric to NEL's
  function-style (ScorerInput) -> dict scorer protocol

Design rationale (Approach 3):
- NMP's concrete metric implementations (BLEU, ROUGE, F1, RAGAS,
  LLMJudge, etc.) stay in nemo_evaluator_sdk with their external deps
  (sacrebleu, rouge_score, openai, ragas).
- NEL owns only the contract surface. NMP SDK will be updated in a
  follow-up PR to depend on NEL for these types, dropping its own
  copies and becoming thinner.
- Dependency direction is strictly one-way: SDK -> NEL. NEL never
  imports from any provider.
- Contracts are dep-light: pure Pydantic models + typing.Protocol,
  no pyarrow/pandas/openai/etc. leaking into NEL.

Verified:
- 14 new unit tests pass
- SDK's ExactMatchMetric structurally satisfies the Metric Protocol
  with zero code changes (isinstance check passes)
- All 162 SDK + contracts tests pass together

This commit is stacked on wprazuch/sdk-onboarding-approach1 so the
contracts land alongside the verbatim SDK copy for easier review. The
follow-up plan is:
1. Merge Approach 1 (verbatim copy) to dev/0.3.0
2. Merge this PR (contracts) to dev/0.3.0
3. Open a matching PR on NMP's nemo_evaluator_sdk to depend on NEL's
   contracts and drop its redundant abstraction copies

Signed-off-by: Wojciech Prazuch <wprazuch@nvidia.com>
---
 src/nemo_evaluator/scoring/__init__.py  |  14 ++
 src/nemo_evaluator/scoring/contracts.py | 267 ++++++++++++++++++++++++
 tests/test_scoring/test_contracts.py    | 195 +++++++++++++++++
 3 files changed, 476 insertions(+)
 create mode 100644 src/nemo_evaluator/scoring/contracts.py
 create mode 100644 tests/test_scoring/test_contracts.py

diff --git a/src/nemo_evaluator/scoring/__init__.py b/src/nemo_evaluator/scoring/__init__.py
index 3ca193cf7..292621f7d 100644
--- a/src/nemo_evaluator/scoring/__init__.py
+++ b/src/nemo_evaluator/scoring/__init__.py
@@ -22,6 +22,20 @@
 
 from typing import Callable
 
+from nemo_evaluator.scoring.contracts import (
+    CorpusMetric,
+    Metric,
+    MetricResult,
+    MetricScore,
+    MetricWithPreflight,
+    MetricWithSecrets,
+    RubricScoreStat,
+    RubricScoreValue,
+    ScoreStats,
+    SecretRefLike,
+    SecretResolver,
+    metric_as_scorer,
+)
 from nemo_evaluator.scoring.judge import (
     JudgeScoringConfig,
     build_judge_prompt,
diff --git a/src/nemo_evaluator/scoring/contracts.py b/src/nemo_evaluator/scoring/contracts.py
new file mode 100644
index 000000000..5ad2a7b00
--- /dev/null
+++ b/src/nemo_evaluator/scoring/contracts.py
@@ -0,0 +1,267 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Contracts between NeMo Evaluator and downstream metric providers.
+
+This module defines the abstract interface that NMP's ``nemo_evaluator_sdk``
+and other metric providers satisfy. Concrete metric implementations (BLEU,
+ROUGE, LLM judge, RAGAS, etc.) live **outside** NEL — they are runtime
+code with external dependencies (sacrebleu, openai, ragas, etc.). NEL
+only owns the contract:
+
+- **Metric protocols** describing how NEL calls a metric
+- **Result types** describing what NEL receives back
+
+The contracts are ``typing.Protocol`` classes: concrete implementations
+satisfy them structurally, no inheritance required. A metric
+provider library depends on NEL for these contracts; NEL never depends on
+any provider.
+
+See also ``nemo_evaluator.scoring.types.ScorerInput`` — the companion
+contract for function-style scorers ``(ScorerInput) -> dict``. Object-style
+metrics (this module) and function-style scorers coexist; ``metric_as_scorer``
+bridges the two.
+"""
+
+from __future__ import annotations
+
+import math
+from typing import Any, Awaitable, Callable, Protocol, runtime_checkable
+
+from pydantic import BaseModel, Field, field_serializer, field_validator
+
+
+__all__ = [
+    "CorpusMetric",
+    "Metric",
+    "MetricResult",
+    "MetricScore",
+    "MetricWithPreflight",
+    "MetricWithSecrets",
+    "RubricScoreStat",
+    "RubricScoreValue",
+    "ScoreStats",
+    "SecretRefLike",
+    "SecretResolver",
+    "metric_as_scorer",
+]
+
+
+# -----------------------------------------------------------------------------
+# Score value types
+# -----------------------------------------------------------------------------
+
+
+class RubricScoreValue(BaseModel):
+    """One level of a grading rubric: a label, an optional description, and a numeric value."""
+
+    label: str = Field(description="Label for this rubric level.")
+    description: str | None = Field(
+        default=None,
+        description="Semantic meaning of the rubric level.",
+    )
+    value: float | int = Field(description="Score value assigned to this rubric level.")
+
+
+class RubricScoreStat(RubricScoreValue):
+    """A rubric level plus ``count``, the number of samples at that level (default 0)."""
+
+    count: int = Field(default=0, description="Number of samples at this rubric level.")
+
+
+class ScoreStats(BaseModel):
+    """Optional aggregate stats for one score. NaN floats serialize as the string ``"NaN"``."""
+
+    count: int | None = None  # every stat defaults to None (unset)
+    sum: float | None = None
+    sum_squared: float | None = None
+    min: float | None = None
+    max: float | None = None
+    mean: float | None = None
+    variance: float | None = None
+    stddev: float | None = None
+    stderr: float | None = None
+    nan_count: int | None = None
+    rubric_distribution: list[RubricScoreStat] | None = None
+
+    @field_serializer("sum", "sum_squared", "min", "max", "mean", "variance", "stddev", "stderr")
+    def _serialize_nan(self, v: float | None) -> float | str | None:  # float stats only
+        if v is None:
+            return None  # unset stats stay null
+        if isinstance(v, float) and math.isnan(v):
+            return "NaN"  # JSON portability: strict JSON has no NaN literal
+        return v
+
+
+class MetricScore(BaseModel):
+    """One named score from a metric call; NaN is accepted and dumped as the string ``"NaN"``."""
+
+    name: str
+    value: float
+    stats: ScoreStats | None = Field(
+        default=None,
+        description="Aggregate statistics for this score, if any.",
+    )
+
+    @field_validator("value", mode="before")
+    @classmethod
+    def _parse_value(cls, v: Any) -> Any:
+        if isinstance(v, str):  # the only string accepted is a NaN spelling
+            if v.strip().lower() == "nan":  # ignores case and surrounding whitespace
+                return float("nan")
+            raise ValueError("The only string value allowed for 'value' is NaN")
+        return v  # non-strings continue to the regular float validation
+
+    @field_serializer("value")
+    def _serialize_value(self, v: float) -> float | str:
+        if isinstance(v, float) and math.isnan(v):
+            return "NaN"  # the same spelling _parse_value accepts on input
+        return v
+
+
+class MetricResult(BaseModel):
+    """Result of one metric call: a list of named scores (emptiness is not validated)."""
+
+    scores: list[MetricScore]
+
+
+# -----------------------------------------------------------------------------
+# Secret references (abstract)
+# -----------------------------------------------------------------------------
+
+
+@runtime_checkable
+class SecretRefLike(Protocol):
+    """Abstract secret reference: any object exposing a string ``root`` property.
+
+    Concrete ``SecretRef`` types (with Pydantic root validation, etc.) live in
+    NMP's ``nemo_evaluator_sdk.values.common``. NEL only needs the ``root``
+    accessor to describe the environment-variable name.
+    """
+
+    @property
+    def root(self) -> str: ...
+
+
+SecretResolver = Callable[[str], Awaitable[str | None]]  # runtime expression: ``str | None`` needs Python >= 3.10
+
+
+# -----------------------------------------------------------------------------
+# Metric protocols
+# -----------------------------------------------------------------------------
+
+
+@runtime_checkable
+class Metric(Protocol):
+    """Structural contract for object-style metrics.
+
+    Concrete implementations (BLEU, ROUGE, F1, ExactMatch, StringCheck,
+    NumberCheck, ToolCalling, LLMJudge, Remote, RAGAS variants) live in
+    ``nemo_evaluator_sdk`` and other provider libraries. They satisfy this
+    Protocol by having the four members below — no base class required.
+    ``isinstance`` only checks that the members exist, not their signatures.
+
+    ``type`` is a string identifier (plain ``str``, ``str``-backed ``Enum``,
+    or ``StrEnum``); consumers must not rely on enum-only APIs (``.value``).
+    """
+
+    @property
+    def type(self) -> str:
+        """Public string identifier for this metric."""
+        ...
+
+    def metric(self, item: dict, sample: dict, trace: Any = None) -> float | bool:
+        """Compute a single raw score for an (item, sample) pair."""
+        ...
+
+    async def compute_scores(self, item: dict, sample: dict) -> MetricResult:
+        """Compute structured scores for an (item, sample) pair."""
+        ...
+
+    def score_names(self) -> list[str]:
+        """Return the canonical score names this metric emits."""
+        ...
+
+
+@runtime_checkable
+class CorpusMetric(Protocol):
+    """Metrics that also emit corpus-level scores (e.g., BLEU corpus)."""
+
+    async def compute_corpus_scores(
+        self, items: list[dict], samples: list[dict]
+    ) -> MetricResult | None:
+        """Compute corpus-level scores over all evaluated rows; may return ``None``."""
+        ...
+
+
+@runtime_checkable
+class MetricWithSecrets(Protocol):
+    """Metrics that depend on secrets (e.g., API keys); both methods below are required."""
+
+    def secrets(self) -> dict[str, SecretRefLike]:
+        """Environment-variable names mapped to secret references."""
+        ...
+
+    async def resolve_secrets(self, resolver: SecretResolver) -> None:
+        """Resolve declared secrets via the provided async resolver before use."""
+        ...
+
+
+@runtime_checkable
+class MetricWithPreflight(Protocol):
+    """Metrics that need one-time setup before parallel evaluation starts."""
+
+    async def preflight(self) -> None:
+        """Run the one-time async preflight (capability detection, warm-up, etc.)."""
+        ...
+
+
+# -----------------------------------------------------------------------------
+# Bridge: Metric -> NEL scorer callable
+# -----------------------------------------------------------------------------
+
+
+def metric_as_scorer(metric: Metric) -> Callable[[Any], dict]:
+    """Adapt a :class:`Metric` to NEL's function-style scorer protocol.
+
+    NEL's scoring registry (``nemo_evaluator.scoring._SCORER_REGISTRY``)
+    holds callables ``(ScorerInput) -> dict``. This helper wraps an object-
+    style ``Metric`` so it can register there without any glue code.
+
+    Mapping performed (metadata keys, if any, override ``reference``):
+
+    - ``item   = {"reference": scorer_input.target, **scorer_input.metadata}``
+    - ``sample = {"output_text": scorer_input.response, "response": scorer_input.response}``
+    - ``score  = float(metric.metric(item, sample))``, or ``0.0`` if not a bool/int/float
+
+    The scorer returns ``{"score": score, "metric_type": metric.type}``. Provider
+    templates that reference these keys (``"{{ reference }}"``) plug in unchanged.
+    """
+    from nemo_evaluator.scoring.types import ScorerInput  # local: avoid cycles
+
+    def _scorer(scorer_input: ScorerInput) -> dict:
+        item: dict = {"reference": scorer_input.target, **(scorer_input.metadata or {})}
+        sample: dict = {
+            "output_text": scorer_input.response,
+            "response": scorer_input.response,
+        }
+        score = metric.metric(item, sample)
+        score_value = float(score) if isinstance(score, (bool, int, float)) else 0.0  # non-numeric -> 0.0
+        return {
+            "score": score_value,
+            "metric_type": getattr(metric, "type", type(metric).__name__),  # class name if no ``type``
+        }
+
+    return _scorer
diff --git a/tests/test_scoring/test_contracts.py b/tests/test_scoring/test_contracts.py
new file mode 100644
index 000000000..7c2faa4be
--- /dev/null
+++ b/tests/test_scoring/test_contracts.py
@@ -0,0 +1,195 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for nemo_evaluator.scoring.contracts.
+
+The contracts module is the Approach-3 integration boundary between NEL
+and NMP's nemo_evaluator_sdk. These tests verify the Protocols are
+runtime-checkable and that SDK-style concrete classes satisfy them.
+"""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+
+from nemo_evaluator.scoring.contracts import (
+    CorpusMetric,
+    Metric,
+    MetricResult,
+    MetricScore,
+    MetricWithPreflight,
+    MetricWithSecrets,
+    ScoreStats,
+    metric_as_scorer,
+)
+from nemo_evaluator.scoring.types import ScorerInput
+
+
+# -----------------------------------------------------------------------------
+# Pydantic result types
+# -----------------------------------------------------------------------------
+
+
+def test_metric_score_basic():
+    s = MetricScore(name="bleu", value=0.42)
+    assert s.name == "bleu"
+    assert s.value == pytest.approx(0.42)
+    assert s.stats is None  # stats is optional and defaults to None
+
+
+def test_metric_score_nan_serialization():
+    """A NaN value dumps as the string 'NaN' and validates back to a float NaN."""
+    s = MetricScore(name="bleu", value=float("nan"))
+    dumped = s.model_dump()
+    assert dumped["value"] == "NaN"
+
+    restored = MetricScore.model_validate(dumped)
+    assert math.isnan(restored.value)
+
+
+def test_metric_score_rejects_non_nan_strings():
+    with pytest.raises(ValueError, match="NaN"):  # pydantic's ValidationError is a ValueError
+        MetricScore(name="bleu", value="not-a-number")
+
+
+def test_metric_result_holds_multiple_scores():
+    r = MetricResult(scores=[
+        MetricScore(name="bleu", value=0.3),
+        MetricScore(name="bleu-1", value=0.5),
+    ])
+    assert len(r.scores) == 2  # both entries are kept
+
+
+def test_score_stats_serializes_nan():
+    stats = ScoreStats(mean=float("nan"), stddev=0.1, count=10)
+    dumped = stats.model_dump()
+    assert dumped["mean"] == "NaN"  # NaN float is dumped as a string
+    assert dumped["stddev"] == pytest.approx(0.1)  # finite floats pass through
+    assert dumped["count"] == 10  # int field, not covered by the NaN serializer
+
+
+# -----------------------------------------------------------------------------
+# Protocol satisfaction (SDK-style concrete class)
+# -----------------------------------------------------------------------------
+
+
+class _DuckMetric:
+    """Fake metric with the four Metric members; scores 1.0 on an exact reference match."""
+
+    type = "duck"  # plain class attribute stands in for the ``type`` property
+
+    def metric(self, item: dict, sample: dict, trace=None) -> float:
+        return 1.0 if item.get("reference") == sample.get("output_text") else 0.0
+
+    async def compute_scores(self, item: dict, sample: dict) -> MetricResult:
+        return MetricResult(scores=[MetricScore(name="duck", value=self.metric(item, sample))])
+
+    def score_names(self) -> list[str]:
+        return ["duck"]
+
+
+class _DuckCorpus(_DuckMetric):  # adds the single CorpusMetric member
+    async def compute_corpus_scores(self, items, samples):
+        return MetricResult(scores=[MetricScore(name="duck_corpus", value=0.5)])
+
+
+class _DuckSecrets(_DuckMetric):  # adds both MetricWithSecrets members
+    def secrets(self) -> dict:
+        return {"API_KEY": _SecretRef(root="API_KEY")}
+
+    async def resolve_secrets(self, resolver) -> None:
+        pass  # no-op: only the method's presence matters for the Protocol check
+
+
+class _DuckPreflight(_DuckMetric):  # adds the single MetricWithPreflight member
+    async def preflight(self) -> None:
+        pass
+
+
+class _SecretRef:  # minimal object exposing a read-only ``root`` property
+    def __init__(self, root: str):
+        self._root = root
+
+    @property
+    def root(self) -> str:
+        return self._root
+
+
+def test_duck_class_satisfies_metric_protocol():
+    """A class with the four Metric members passes ``isinstance`` without inheriting from it."""
+    assert isinstance(_DuckMetric(), Metric)
+
+
+def test_corpus_metric_protocol():
+    assert isinstance(_DuckCorpus(), CorpusMetric)  # matched via compute_corpus_scores
+
+
+def test_metric_with_secrets_protocol():
+    assert isinstance(_DuckSecrets(), MetricWithSecrets)  # matched via secrets + resolve_secrets
+
+
+def test_metric_with_preflight_protocol():
+    assert isinstance(_DuckPreflight(), MetricWithPreflight)  # matched via preflight
+
+
+def test_plain_class_not_satisfying_protocol():
+    class NotAMetric:  # has none of the four Metric members
+        pass
+
+    assert not isinstance(NotAMetric(), Metric)
+
+
+# -----------------------------------------------------------------------------
+# Bridge: metric_as_scorer
+# -----------------------------------------------------------------------------
+
+
+def test_metric_as_scorer_positive_match():
+    scorer = metric_as_scorer(_DuckMetric())
+    result = scorer(ScorerInput(response="hello", target="hello"))
+
+    assert isinstance(result, dict)
+    assert result["score"] == 1.0  # response equals target
+    assert result["metric_type"] == "duck"  # taken from the metric's ``type`` attribute
+
+
+def test_metric_as_scorer_negative_match():
+    scorer = metric_as_scorer(_DuckMetric())
+    result = scorer(ScorerInput(response="hello", target="goodbye"))
+
+    assert result["score"] == 0.0  # response differs from target
+
+
+def test_metric_as_scorer_preserves_metadata():
+    """ScorerInput metadata is merged into the item dict next to ``reference``."""
+
+    class _InspectMetric(_DuckMetric):
+        captured_item: dict = {}  # class-level slot the overriding metric() writes into
+
+        def metric(self, item, sample, trace=None):
+            _InspectMetric.captured_item = item
+            return 0.0
+
+    m = _InspectMetric()
+    scorer = metric_as_scorer(m)
+    scorer(ScorerInput(response="x", target="y", metadata={"problem_id": 7}))
+
+    assert _InspectMetric.captured_item["reference"] == "y"
+    assert _InspectMetric.captured_item["problem_id"] == 7
+
+
+# -----------------------------------------------------------------------------
+# Re-export from scoring package
+# -----------------------------------------------------------------------------
+
+
+def test_contracts_available_from_scoring_package():
+    from nemo_evaluator.scoring import Metric as ImportedMetric
+    from nemo_evaluator.scoring import MetricResult as ImportedResult
+    from nemo_evaluator.scoring import metric_as_scorer as imported_bridge
+
+    assert ImportedMetric is Metric  # the package re-exports the same objects, not copies
+    assert ImportedResult is MetricResult
+    assert imported_bridge is metric_as_scorer