From 78bccb8c1106a9256cde37b427448e1b17474716 Mon Sep 17 00:00:00 2001
From: Wojciech Prazuch
Date: Thu, 23 Apr 2026 09:47:41 +0200
Subject: [PATCH] feat(scoring): add ergonomics helpers for TemplateMetric authoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Completes the ~20-30 LOC authoring promise by giving TemplateMetric
subclasses ready-made helpers. Stacked on wprazuch/metric-abstractions.

New:
- TemplateMetric._render(template, input) — Jinja2 rendering with both
  NEL-native ({{ response }}, {{ target }}, {{ metadata.* }}) and
  SDK-native ({{ output_text }}, {{ reference }}) variable names, so
  templates authored against either vocabulary work unchanged. Strict
  undefined-variable handling raises instead of silently rendering empty.
- CorpusTemplateMetric(TemplateMetric) — base class for metrics with both
  row-level and corpus-level scores. Subclasses implement _score() and
  _corpus_score(); defaults wrap each in a MetricResult. score_names()
  includes both '{type}' and '{type}_corpus'. Empty inputs -> None.
- SecretsMixin — mixin that satisfies MetricWithSecrets protocol.
  Subclasses declare secret_env_vars: ClassVar[tuple[str, ...]]. Secrets
  are eagerly loaded from os.environ at construction, with async
  resolve_secrets() as a fallback path (NMP Platform flow). Resolved
  values are stored as SecretStr private attrs; get_secret(env_var)
  returns the plaintext value or None.

Tests (+18 new, 42 total):
- _render: NEL-native names, SDK-native aliases, metadata/config access,
  StrictUndefined raises on missing variables.
- CorpusTemplateMetric: satisfies both Metric + CorpusMetric protocols,
  row-level default, corpus-level default, empty-inputs, score_names
  includes both.
- SecretsMixin: satisfies MetricWithSecrets, declares env vars, reads env
  at construction, async resolver fills gaps, resolver is skipped when
  already loaded.
- Target ergonomics proof: _TinyLengthMetric in ~15 LOC of user code
  demonstrates the authoring pattern.

NMP can pick these up as they land. The contract API is stable — helpers
are additive (subclassing-based), so SDK concrete metrics can adopt them
incrementally without breakage.

Signed-off-by: Wojciech Prazuch
---
 src/nemo_evaluator/scoring/__init__.py        |   2 +
 src/nemo_evaluator/scoring/contracts.py       | 157 ++++++++++-
 .../test_scoring/test_contracts_ergonomics.py | 244 ++++++++++++++++++
 3 files changed, 401 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_scoring/test_contracts_ergonomics.py

diff --git a/src/nemo_evaluator/scoring/__init__.py b/src/nemo_evaluator/scoring/__init__.py
index 225326236..86908a3fe 100644
--- a/src/nemo_evaluator/scoring/__init__.py
+++ b/src/nemo_evaluator/scoring/__init__.py
@@ -24,6 +24,7 @@
 
 from nemo_evaluator.scoring.contracts import (
     CorpusMetric,
+    CorpusTemplateMetric,
     Metric,
     MetricInput,
     MetricOutput,
@@ -33,6 +34,7 @@
     Scorer,
     SecretRefLike,
     SecretResolver,
+    SecretsMixin,
     TemplateMetric,
     get_metric,
     list_metrics,
diff --git a/src/nemo_evaluator/scoring/contracts.py b/src/nemo_evaluator/scoring/contracts.py
index fab86a266..880c402fd 100644
--- a/src/nemo_evaluator/scoring/contracts.py
+++ b/src/nemo_evaluator/scoring/contracts.py
@@ -73,10 +73,12 @@ def _score(self, input: MetricInput) -> float:
 from __future__ import annotations
 
 import math
+import os
 from abc import abstractmethod
 from typing import Any, Awaitable, Callable, ClassVar, Protocol, runtime_checkable
 
-from pydantic import BaseModel, Field, field_serializer, field_validator
+from jinja2 import Environment, StrictUndefined
+from pydantic import BaseModel, Field, PrivateAttr, SecretStr, field_serializer, field_validator
 
 from nemo_evaluator.scoring.types import ScorerInput
 
@@ -94,8 +96,11 @@ def _score(self, input: MetricInput) -> float:
     "Scorer",
     "SecretRefLike",
     "SecretResolver",
-    # Base class for easy authoring
+    # Base classes for easy authoring
     "TemplateMetric",
+    "CorpusTemplateMetric",
+    # Mixins
+    "SecretsMixin",
     # Bridge
     "metric_as_scorer",
     # Registry
@@ -287,6 +292,13 @@ def _score(self, input: MetricInput) -> float:
 
     type: str = Field(description="Public string identifier for this metric.")
 
+    _jinja_env: ClassVar[Environment] = Environment(
+        undefined=StrictUndefined,
+        trim_blocks=False,
+        lstrip_blocks=False,
+        autoescape=False,
+    )
+
     @abstractmethod
     def _score(self, input: MetricInput) -> float:
         """Compute one raw score value. Subclass implements this."""
@@ -300,6 +312,147 @@ def score_names(self) -> list[str]:
         """Default implementation: a single score named after ``self.type``."""
         return [self.type]
 
+    def _render(self, template: str, input: MetricInput) -> str:
+        """Render a Jinja2 template against a :class:`MetricInput`.
+
+        The rendering context exposes both NEL-native names and SDK-native
+        aliases, so templates authored against either vocabulary work::
+
+            # NEL-native
+            "{{ response }}", "{{ target }}", "{{ metadata.* }}"
+            # SDK-native aliases
+            "{{ output_text }}" (== response)
+            "{{ reference }}" (== target)
+
+        Raises :class:`jinja2.exceptions.UndefinedError` if the template
+        references a variable not in the context (strict mode).
+        """
+        ctx: dict[str, Any] = {
+            "response": input.response,
+            "target": input.target,
+            "output_text": input.response,
+            "reference": input.target,
+            "metadata": dict(input.metadata or {}),
+            "config": dict(input.config or {}),
+        }
+        return self._jinja_env.from_string(template).render(**ctx)
+
+
+# ============================================================================
+# CorpusTemplateMetric — base for metrics with corpus-level scoring
+# ============================================================================
+
+
+class CorpusTemplateMetric(TemplateMetric):
+    """Base class for metrics that emit both row-level and corpus-level scores.
+
+    Subclasses implement :meth:`_score` (row) and :meth:`_corpus_score`
+    (aggregate). Default :meth:`compute_corpus_scores` wraps ``_corpus_score``
+    in a single-score MetricResult named ``{type}_corpus``.
+
+    Example::
+
+        @register_metric
+        class Accuracy(CorpusTemplateMetric):
+            type: Literal["accuracy"] = "accuracy"
+
+            def _score(self, input):
+                return 1.0 if input.response == input.target else 0.0
+
+            def _corpus_score(self, inputs):
+                return sum(self._score(i) for i in inputs) / len(inputs)
+    """
+
+    @abstractmethod
+    def _corpus_score(self, inputs: list[MetricInput]) -> float:
+        """Compute one corpus-level score across inputs. Subclass implements this."""
+        ...
+
+    async def compute_corpus_scores(
+        self, inputs: list[MetricInput]
+    ) -> MetricResult | None:
+        """Default implementation: wrap ``_corpus_score`` in a single-score MetricResult."""
+        if not inputs:
+            return None
+        corpus_name = f"{self.type}_corpus"
+        return MetricResult(
+            scores=[MetricScore(name=corpus_name, value=self._corpus_score(inputs))]
+        )
+
+    def score_names(self) -> list[str]:
+        """Includes both the row-level and corpus-level score names."""
+        return [self.type, f"{self.type}_corpus"]
+
+
+# ============================================================================
+# SecretsMixin — ergonomics for metrics that need API keys
+# ============================================================================
+
+
+class _SimpleSecretRef(BaseModel):
+    """Plain SecretRef satisfying :class:`SecretRefLike` — NMP SDK has its own."""
+
+    root: str
+
+    @property
+    def env(self) -> str:
+        return self.root
+
+
+class SecretsMixin(BaseModel):
+    """Mixin for metrics that need secrets (API keys for remote judges, etc.).
+
+    Subclasses declare one or more env-var names via ``secret_env_vars``.
+    The mixin implements :meth:`secrets` and :meth:`resolve_secrets` so the
+    class satisfies :class:`MetricWithSecrets`.
+
+    Resolved values are stored in ``_resolved_secrets`` (a dict of env-var
+    name -> ``SecretStr``). Access via :meth:`get_secret`.
+
+    Example::
+
+        from typing import Literal, ClassVar
+        from nemo_evaluator.scoring import TemplateMetric, SecretsMixin, register_metric
+
+        @register_metric
+        class JudgeMetric(TemplateMetric, SecretsMixin):
+            type: Literal["judge"] = "judge"
+            secret_env_vars: ClassVar[tuple[str, ...]] = ("JUDGE_API_KEY",)
+
+            def _score(self, input):
+                key = self.get_secret("JUDGE_API_KEY")
+                # ... call remote judge with key ...
+                return 0.0
+    """
+
+    secret_env_vars: ClassVar[tuple[str, ...]] = ()
+    _resolved_secrets: dict[str, SecretStr] = PrivateAttr(default_factory=dict)
+
+    def model_post_init(self, __context: Any) -> None:
+        """Eagerly load any secrets already in os.environ at construction time."""
+        for var in self.secret_env_vars:
+            value = os.environ.get(var)
+            if value:
+                self._resolved_secrets[var] = SecretStr(value)
+
+    def secrets(self) -> dict[str, SecretRefLike]:
+        """Declared secret env-vars mapped to :class:`SecretRefLike` references."""
+        return {var: _SimpleSecretRef(root=var) for var in self.secret_env_vars}
+
+    async def resolve_secrets(self, resolver: SecretResolver) -> None:
+        """Resolve declared secrets via the provided async resolver."""
+        for var in self.secret_env_vars:
+            if var in self._resolved_secrets:
+                continue  # already populated from env
+            value = await resolver(var)
+            if value:
+                self._resolved_secrets[var] = SecretStr(value)
+
+    def get_secret(self, env_var: str) -> str | None:
+        """Return the resolved secret value for ``env_var``, or ``None``."""
+        secret = self._resolved_secrets.get(env_var)
+        return secret.get_secret_value() if secret is not None else None
+
 
 # ============================================================================
 # Bridge: Metric -> function-style Scorer
diff --git a/tests/test_scoring/test_contracts_ergonomics.py b/tests/test_scoring/test_contracts_ergonomics.py
new file mode 100644
index 000000000..30246e22d
--- /dev/null
+++ b/tests/test_scoring/test_contracts_ergonomics.py
@@ -0,0 +1,244 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Ergonomics tests for the metric-abstractions helpers:
+
+- ``TemplateMetric._render`` — Jinja2 helper
+- :class:`CorpusTemplateMetric` — row + corpus pattern
+- :class:`SecretsMixin` — API-key resolution
+"""
+
+from __future__ import annotations
+
+from typing import ClassVar, Literal
+
+import pytest
+from jinja2.exceptions import UndefinedError
+
+from nemo_evaluator.scoring.contracts import (
+    CorpusTemplateMetric,
+    Metric,
+    MetricInput,
+    MetricResult,
+    MetricWithSecrets,
+    SecretsMixin,
+    TemplateMetric,
+)
+
+
+# ---------------------------------------------------------------------------
+# TemplateMetric._render
+# ---------------------------------------------------------------------------
+
+
+class _RenderProbe(TemplateMetric):
+    type: Literal["probe"] = "probe"
+
+    def _score(self, input):
+        return 0.0
+
+
+def test_render_simple_substitution_nel_native_names():
+    m = _RenderProbe()
+    out = m._render("{{ response }} vs {{ target }}", MetricInput(response="hi", target="hello"))
+    assert out == "hi vs hello"
+
+
+def test_render_sdk_native_aliases():
+    """SDK-style templates using 'output_text' and 'reference' render correctly."""
+    m = _RenderProbe()
+    out = m._render("{{ output_text }} :: {{ reference }}", MetricInput(response="a", target="b"))
+    assert out == "a :: b"
+
+
+def test_render_metadata_access():
+    m = _RenderProbe()
+    out = m._render(
+        "problem={{ metadata.problem_id }}",
+        MetricInput(response="", target="", metadata={"problem_id": 42}),
+    )
+    assert out == "problem=42"
+
+
+def test_render_config_access():
+    m = _RenderProbe()
+    out = m._render(
+        "k={{ config.k }}",
+        MetricInput(response="", target="", config={"k": 10}),
+    )
+    assert out == "k=10"
+
+
+def test_render_missing_variable_raises_strict_undefined():
+    """Undefined variables should raise, not silently render empty."""
+    m = _RenderProbe()
+    with pytest.raises(UndefinedError):
+        m._render("{{ not_a_field }}", MetricInput(response="x", target="y"))
+
+
+# ---------------------------------------------------------------------------
+# CorpusTemplateMetric
+# ---------------------------------------------------------------------------
+
+
+class _Accuracy(CorpusTemplateMetric):
+    type: Literal["accuracy"] = "accuracy"
+
+    def _score(self, input: MetricInput) -> float:
+        return 1.0 if input.response == input.target else 0.0
+
+    def _corpus_score(self, inputs: list[MetricInput]) -> float:
+        return sum(self._score(i) for i in inputs) / len(inputs)
+
+
+def test_corpus_metric_satisfies_both_protocols():
+    from nemo_evaluator.scoring.contracts import CorpusMetric
+
+    m = _Accuracy()
+    assert isinstance(m, Metric)
+    assert isinstance(m, CorpusMetric)
+
+
+async def test_corpus_metric_row_score_default_wrapper():
+    m = _Accuracy()
+    out = await m.compute_scores(MetricInput(response="x", target="x"))
+    assert out.scores[0].name == "accuracy"
+    assert out.scores[0].value == 1.0
+
+
+async def test_corpus_metric_compute_corpus_scores():
+    m = _Accuracy()
+    inputs = [
+        MetricInput(response="a", target="a"),
+        MetricInput(response="b", target="b"),
+        MetricInput(response="c", target="wrong"),
+    ]
+    out = await m.compute_corpus_scores(inputs)
+    assert out is not None
+    assert out.scores[0].name == "accuracy_corpus"
+    assert out.scores[0].value == pytest.approx(2 / 3)
+
+
+async def test_corpus_metric_empty_inputs_returns_none():
+    m = _Accuracy()
+    out = await m.compute_corpus_scores([])
+    assert out is None
+
+
+def test_corpus_metric_score_names_includes_both():
+    m = _Accuracy()
+    names = m.score_names()
+    assert "accuracy" in names
+    assert "accuracy_corpus" in names
+
+
+# ---------------------------------------------------------------------------
+# SecretsMixin
+# ---------------------------------------------------------------------------
+
+
+class _JudgeMetric(TemplateMetric, SecretsMixin):
+    type: Literal["judge"] = "judge"
+    secret_env_vars: ClassVar[tuple[str, ...]] = ("JUDGE_API_KEY",)
+
+    def _score(self, input):
+        # Real metrics would call the judge endpoint with self.get_secret(...)
+        return 0.0
+
+
+def test_secrets_mixin_satisfies_metric_with_secrets_protocol():
+    m = _JudgeMetric()
+    assert isinstance(m, MetricWithSecrets)
+
+
+def test_secrets_mixin_declares_expected_secrets():
+    m = _JudgeMetric()
+    declared = m.secrets()
+    assert set(declared) == {"JUDGE_API_KEY"}
+    assert declared["JUDGE_API_KEY"].root == "JUDGE_API_KEY"
+
+
+def test_secrets_mixin_reads_env_at_construction(monkeypatch):
+    """If env var is set, construction eagerly loads it."""
+    monkeypatch.setenv("JUDGE_API_KEY", "test-key-123")
+    m = _JudgeMetric()
+    assert m.get_secret("JUDGE_API_KEY") == "test-key-123"
+
+
+def test_secrets_mixin_missing_env_returns_none(monkeypatch):
+    """With the env var unset, the secret stays unresolved after construction."""
+    monkeypatch.delenv("JUDGE_API_KEY", raising=False)
+    m = _JudgeMetric()
+    assert m.get_secret("JUDGE_API_KEY") is None
+
+
+async def test_secrets_mixin_resolve_via_resolver(monkeypatch):
+    """Resolver is called for unresolved secrets and populates them."""
+    monkeypatch.delenv("JUDGE_API_KEY", raising=False)
+    m = _JudgeMetric()
+    assert m.get_secret("JUDGE_API_KEY") is None
+
+    async def resolver(name: str) -> str | None:
+        return "resolved-value" if name == "JUDGE_API_KEY" else None
+
+    await m.resolve_secrets(resolver)
+    assert m.get_secret("JUDGE_API_KEY") == "resolved-value"
+
+
+async def test_secrets_mixin_resolver_skips_already_loaded(monkeypatch):
+    """If env-loaded already, resolver is not consulted."""
+    monkeypatch.setenv("JUDGE_API_KEY", "from-env")
+    m = _JudgeMetric()
+
+    resolver_calls: list[str] = []
+
+    async def resolver(name: str) -> str | None:
+        resolver_calls.append(name)
+        return "from-resolver"
+
+    await m.resolve_secrets(resolver)
+
+    assert resolver_calls == []  # resolver not consulted
+    assert m.get_secret("JUDGE_API_KEY") == "from-env"
+
+
+def test_secrets_mixin_default_is_empty_tuple():
+    """A class without ``secret_env_vars`` has no secrets to declare."""
+
+    class NoSecrets(TemplateMetric, SecretsMixin):
+        type: Literal["none"] = "none"
+
+        def _score(self, input):
+            return 0.0
+
+    m = NoSecrets()
+    assert m.secrets() == {}
+
+
+# ---------------------------------------------------------------------------
+# Target ergonomics: ~20-30 LOC real-looking metric
+# ---------------------------------------------------------------------------
+
+
+class _TinyLengthMetric(TemplateMetric):
+    """Demonstration: len-matching metric in ~15 LOC of user code."""
+
+    type: Literal["length-match"] = "length-match"
+
+    tolerance: int = 5
+
+    def _score(self, input: MetricInput) -> float:
+        # compare response length to target length within tolerance
+        expected = len(str(input.target))
+        actual = len(str(input.response))
+        return 1.0 if abs(expected - actual) <= self.tolerance else 0.0
+
+
+async def test_tiny_length_metric_demonstrates_20_loc_target():
+    """Proves a real-looking metric fits in ~15 LOC with TemplateMetric."""
+    m = _TinyLengthMetric(tolerance=2)
+    out_close = await m.compute_scores(MetricInput(response="hello", target="world"))
+    assert out_close.scores[0].value == 1.0
+
+    out_far = await m.compute_scores(MetricInput(response="x", target="much much longer"))
+    assert out_far.scores[0].value == 0.0