Skip to content

Commit e5f90a3

Browse files
committed
feat(scoring): add ergonomics helpers for TemplateMetric authoring
Completes the ~20-30 LOC authoring promise by giving TemplateMetric subclasses ready-made helpers. Stacked on wprazuch/metric-abstractions.

New:
- TemplateMetric._render(template, input) — Jinja2 rendering with both NEL-native ({{ response }}, {{ target }}, {{ metadata.* }}) and SDK-native ({{ output_text }}, {{ reference }}) variable names, so templates authored against either vocabulary work unchanged. Strict undefined-variable handling raises instead of silently rendering empty.
- CorpusTemplateMetric(TemplateMetric) — base class for metrics with both row-level and corpus-level scores. Subclasses implement _score() and _corpus_score(); defaults wrap each in a MetricResult. score_names() includes both '<type>' and '<type>_corpus'. Empty inputs -> None.
- SecretsMixin — mixin that satisfies the MetricWithSecrets protocol. Subclasses declare secret_env_vars: ClassVar[tuple[str, ...]]. Secrets are eagerly loaded from os.environ at construction, with async resolve_secrets() as a fallback path (NMP Platform flow). Resolved values are stored as SecretStr private attrs; get_secret(env_var) returns the plaintext value or None.

Tests (+18 new, 42 total):
- _render: NEL-native names, SDK-native aliases, metadata/config access, StrictUndefined raises on missing variables.
- CorpusTemplateMetric: satisfies both Metric + CorpusMetric protocols, row-level default, corpus-level default, empty-inputs, score_names includes both.
- SecretsMixin: satisfies MetricWithSecrets, declares env vars, reads env at construction, async resolver fills gaps, resolver is skipped when already loaded.
- Target ergonomics proof: _TinyLengthMetric in ~15 LOC of user code demonstrates the authoring pattern.

NMP can pick these up as they land. The contract API is stable — helpers are additive (subclassing-based), so SDK concrete metrics can adopt them incrementally without breakage.

Signed-off-by: Wojciech Prazuch <wprazuch@nvidia.com>
1 parent a394a42 commit e5f90a3

3 files changed

Lines changed: 404 additions & 2 deletions

File tree

src/nemo_evaluator/scoring/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
from nemo_evaluator.scoring.contracts import (
2626
CorpusMetric,
27+
CorpusTemplateMetric,
2728
Metric,
2829
MetricInput,
2930
MetricOutput,
@@ -37,6 +38,7 @@
3738
ScoreStats,
3839
SecretRefLike,
3940
SecretResolver,
41+
SecretsMixin,
4042
TemplateMetric,
4143
get_metric,
4244
list_metrics,

src/nemo_evaluator/scoring/contracts.py

Lines changed: 155 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,12 @@ def _score(self, input: MetricInput) -> float:
7373
from __future__ import annotations
7474

7575
import math
76+
import os
7677
from abc import abstractmethod
7778
from typing import Any, Awaitable, Callable, ClassVar, Protocol, runtime_checkable
7879

79-
from pydantic import BaseModel, Field, field_serializer, field_validator
80+
from jinja2 import Environment, StrictUndefined
81+
from pydantic import BaseModel, Field, PrivateAttr, SecretStr, field_serializer, field_validator
8082

8183
from nemo_evaluator.scoring.types import ScorerInput
8284

@@ -98,8 +100,11 @@ def _score(self, input: MetricInput) -> float:
98100
"Scorer",
99101
"SecretRefLike",
100102
"SecretResolver",
101-
# Base class for easy authoring
103+
# Base classes for easy authoring
102104
"TemplateMetric",
105+
"CorpusTemplateMetric",
106+
# Mixins
107+
"SecretsMixin",
103108
# Bridge
104109
"metric_as_scorer",
105110
# Registry
@@ -338,6 +343,13 @@ def _score(self, input: MetricInput) -> float:
338343

339344
type: str = Field(description="Public string identifier for this metric.")
340345

346+
_jinja_env: ClassVar[Environment] = Environment(
347+
undefined=StrictUndefined,
348+
trim_blocks=False,
349+
lstrip_blocks=False,
350+
autoescape=False,
351+
)
352+
341353
@abstractmethod
342354
def _score(self, input: MetricInput) -> float:
343355
"""Compute one raw score value. Subclass implements this."""
@@ -351,6 +363,147 @@ def score_names(self) -> list[str]:
351363
"""Default implementation: a single score named after ``self.type``."""
352364
return [self.type]
353365

366+
def _render(self, template: str, input: MetricInput) -> str:
    """Expand a Jinja2 ``template`` with values taken from a :class:`MetricInput`.

    Two naming vocabularies resolve to the same data, so a template written
    against either one renders without modification:

    * NEL-native: ``{{ response }}``, ``{{ target }}``, ``{{ metadata.<key> }}``
    * SDK-native aliases: ``{{ output_text }}`` (same value as ``response``)
      and ``{{ reference }}`` (same value as ``target``)

    ``{{ config.<key> }}`` is available as well. ``metadata`` and ``config``
    are shallow-copied into plain dicts, and a falsy value becomes ``{}``.

    Args:
        template: Jinja2 template source text.
        input: Row whose ``response``, ``target``, ``metadata`` and
            ``config`` attributes populate the rendering context.

    Returns:
        The rendered text.

    Raises:
        jinja2.exceptions.UndefinedError: If the template uses a variable
            that is not part of the context (strict mode).
    """
    # Read each value once; the alias keys reuse the very same objects.
    answer = input.response
    expected = input.target
    context: dict[str, Any] = dict(
        response=answer,
        target=expected,
        output_text=answer,
        reference=expected,
        metadata=dict(input.metadata or {}),
        config=dict(input.config or {}),
    )
    compiled = self._jinja_env.from_string(template)
    return compiled.render(context)
390+
391+
392+
# ============================================================================
393+
# CorpusTemplateMetric — base for metrics with corpus-level scoring
394+
# ============================================================================
395+
396+
397+
class CorpusTemplateMetric(TemplateMetric):
    """Template-metric base that reports a per-row score and a corpus-wide score.

    A subclass supplies two hooks: :meth:`_score` for a single row and
    :meth:`_corpus_score` for the whole list of inputs. The stock
    :meth:`compute_corpus_scores` packages the latter as a
    :class:`MetricResult` holding one score called ``{type}_corpus``.

    Example::

        @register_metric
        class Accuracy(CorpusTemplateMetric):
            type: Literal["accuracy"] = "accuracy"

            def _score(self, input):
                return 1.0 if input.response == input.target else 0.0

            def _corpus_score(self, inputs):
                return sum(self._score(i) for i in inputs) / len(inputs)
    """

    @abstractmethod
    def _corpus_score(self, inputs: list[MetricInput]) -> float:
        """Return the single aggregate score for ``inputs``. Subclasses override this."""
        ...

    async def compute_corpus_scores(
        self, inputs: list[MetricInput]
    ) -> MetricResult | None:
        """Wrap :meth:`_corpus_score` in a one-score :class:`MetricResult`.

        Returns ``None`` without calling ``_corpus_score`` when ``inputs`` is
        empty, so the hook never sees an empty list from this method.
        """
        if inputs:
            label = f"{self.type}_corpus"
            aggregate = MetricScore(name=label, value=self._corpus_score(inputs))
            return MetricResult(scores=[aggregate])
        return None

    def score_names(self) -> list[str]:
        """List the row-level score name followed by the corpus-level one."""
        names = [self.type]
        names.append(f"{self.type}_corpus")
        return names
436+
437+
438+
# ============================================================================
439+
# SecretsMixin — ergonomics for metrics that need API keys
440+
# ============================================================================
441+
442+
443+
class _SimpleSecretRef(BaseModel):
    """Minimal secret reference meant to satisfy :class:`SecretRefLike`.

    The NMP SDK ships its own reference type; this plain model is the
    in-package equivalent. It stores one string in ``root`` and exposes the
    same string again through the read-only ``env`` property.
    """

    # The stored reference string, surfaced unchanged by ``env``.
    root: str

    @property
    def env(self) -> str:
        """Return the string held in ``root``."""
        env_name = self.root
        return env_name
451+
452+
453+
class SecretsMixin(BaseModel):
    """Mixin for metrics that need secrets (API keys for remote judges, etc.).

    Subclasses declare one or more env-var names via ``secret_env_vars``.
    The mixin implements :meth:`secrets` and :meth:`resolve_secrets` so the
    class satisfies :class:`MetricWithSecrets`.

    Resolved values are stored in ``_resolved_secrets`` (a dict of env-var
    name -> ``SecretStr``). Access via :meth:`get_secret`.

    Secrets are looked up in two stages: ``os.environ`` is read when the
    instance is constructed, and :meth:`resolve_secrets` later fills in only
    the names that are still missing.

    Example::

        from typing import Literal, ClassVar
        from nemo_evaluator.scoring import TemplateMetric, SecretsMixin, register_metric

        @register_metric
        class JudgeMetric(TemplateMetric, SecretsMixin):
            type: Literal["judge"] = "judge"
            secret_env_vars: ClassVar[tuple[str, ...]] = ("JUDGE_API_KEY",)

            def _score(self, input):
                key = self.get_secret("JUDGE_API_KEY")
                # ... call remote judge with key ...
                return 0.0
    """

    # Env-var names this metric needs; subclasses override with their own tuple.
    secret_env_vars: ClassVar[tuple[str, ...]] = ()
    # Per-instance store of resolved values, keyed by env-var name.
    _resolved_secrets: dict[str, SecretStr] = PrivateAttr(default_factory=dict)

    def model_post_init(self, __context: Any) -> None:
        """Eagerly load any secrets already in os.environ at construction time.

        Variables that are unset or set to an empty string are skipped and
        left for :meth:`resolve_secrets`.
        """
        # Chain to the next class in the MRO first: this is a mixin, so another
        # base may define its own post-init hook that must not be dropped.
        super().model_post_init(__context)
        for var in self.secret_env_vars:
            value = os.environ.get(var)
            if value:
                self._resolved_secrets[var] = SecretStr(value)

    def secrets(self) -> dict[str, SecretRefLike]:
        """Declared secret env-vars mapped to :class:`SecretRefLike` references."""
        return {var: _SimpleSecretRef(root=var) for var in self.secret_env_vars}

    async def resolve_secrets(self, resolver: SecretResolver) -> None:
        """Resolve declared secrets via the provided async resolver.

        The resolver is awaited once per declared name that has no stored
        value yet, in declaration order. Names already stored are not
        re-resolved, and a falsy resolver result leaves the name unresolved.
        """
        for var in self.secret_env_vars:
            if var in self._resolved_secrets:
                continue  # already populated from env
            value = await resolver(var)
            if value:
                self._resolved_secrets[var] = SecretStr(value)

    def get_secret(self, env_var: str) -> str | None:
        """Return the resolved secret value for ``env_var``, or ``None``.

        ``None`` means no value has been stored for that name, either because
        it was never declared or because it has not been resolved.
        """
        secret = self._resolved_secrets.get(env_var)
        # Test for absence explicitly rather than relying on the truthiness of
        # the SecretStr wrapper.
        return None if secret is None else secret.get_secret_value()
506+
354507

355508
# ============================================================================
356509
# Bridge: Metric -> function-style Scorer

0 commit comments

Comments
 (0)