Skip to content

Commit 1a72e1a

Browse files
SandyChapmanclaude
andcommitted
fix(evaluator): address PR #438 review feedback
- api/schemas.py: require a non-empty `type` on the inline wire payload so malformed inline metrics are rejected at the API boundary instead of failing at execution (mirrors the runtime InlineMetricPayload validator). The metric body stays an open object; concrete shape is still validated on hydration. - docs/evaluator/test_doc_examples.py: stub the executor with a sentinel in the built-in-submit test instead of `except Exception: pass`, so the test proves packaging resolved (no packager required) without swallowing unrelated errors. - docs/evaluator/metrics/llm-as-a-judge.mdx: replace the `{{ end_of_text }}` Fern template token with a literal `<end_of_text>` per docs guidelines. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Signed-off-by: Sandy Chapman <schapman@nvidia.com>
1 parent 4bc8623 commit 1a72e1a

3 files changed

Lines changed: 26 additions & 12 deletions

File tree

docs/evaluator/metrics/llm-as-a-judge.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,7 @@ prompt_template = {
645645
"temperature": 0.1, # Lower for more consistent scoring
646646
"max_tokens": 1024, # Increase if judge needs more space
647647
"timeout": 30, # Request timeout in seconds
648-
"stop": ["<{{ end_of_text }}>"], # Stop sequences
648+
"stop": ["<end_of_text>"], # Stop sequences
649649
}
650650
```
651651

docs/evaluator/test_doc_examples.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -105,23 +105,23 @@ def test_packager_param_is_submit_only() -> None:
105105
def test_builtin_submit_does_not_require_a_packager() -> None:
106106
"""Built-in metrics bundle inline, so docs omit the packager on ``submit()``.
107107
108-
Submitting reaches the executor (which then needs a live service); the point
109-
is only that no packager-policy error is raised for a built-in metric.
108+
Packager resolution happens before delegating to the executor, so we stub the
109+
executor with a sentinel: reaching it (rather than raising a packager-policy
110+
error) proves the built-in metric bundled inline with no packager required —
111+
without depending on a live service or swallowing unrelated failures.
110112
"""
113+
from unittest.mock import patch
114+
111115
from nemo_evaluator_sdk import ExactMatchMetric
112116

113117
evaluator = _evaluator()
114118
metric = ExactMatchMetric(reference="{{item.expected}}", candidate="{{item.output}}")
115119
dataset = [{"expected": "Paris", "output": "Paris"}]
120+
sentinel = RuntimeError("reached executor.submit (packaging resolved without a packager)")
116121

117-
try:
118-
evaluator.submit(metric=metric, dataset=dataset)
119-
except MetricBundlePackagerPolicyError as error: # pragma: no cover - regression guard
120-
pytest.fail(f"built-in submit should not require a packager: {error}")
121-
except Exception:
122-
# Any other failure (e.g. connection refused with no live service) is fine;
123-
# it means the built-in metric was bundled inline and reached execution.
124-
pass
122+
with patch.object(evaluator._executor, "submit", side_effect=sentinel):
123+
with pytest.raises(RuntimeError, match="reached executor.submit"):
124+
evaluator.submit(metric=metric, dataset=dataset)
125125

126126

127127
def test_custom_submit_requires_an_explicit_packager() -> None:

plugins/nemo-evaluator/src/nemo_evaluator/api/schemas.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
)
1616
from nemo_evaluator_sdk.values.common import SecretRef
1717
from nemo_platform_plugin.schema import DatetimeFilter, Filter
18-
from pydantic import BaseModel, ConfigDict, Field
18+
from pydantic import BaseModel, ConfigDict, Field, field_validator
1919

2020

2121
class CloudpickleMetricPayload(BaseModel):
@@ -60,6 +60,20 @@ class InlineMetricPayload(BaseModel):
6060
description="SHA-256 digest of the canonical metric JSON. Informational; recomputed server-side.",
6161
)
6262

63+
@field_validator("metric")
64+
@classmethod
65+
def _metric_must_declare_type(cls, value: dict[str, Any]) -> dict[str, Any]:
66+
"""Reject payloads without a metric ``type`` discriminator at the API boundary.
67+
68+
The metric body stays an open object (the concrete shape is validated when
69+
the bundle is hydrated against the metric type union), but a non-empty
70+
``type`` is required so malformed payloads fail fast rather than at execution.
71+
"""
72+
metric_type = value.get("type")
73+
if not isinstance(metric_type, str) or not metric_type:
74+
raise ValueError("inline metric payload must include a non-empty 'type'")
75+
return value
76+
6377

6478
# Discriminated on ``kind`` so additional payload formats can join the union
6579
# without changing the field type.

0 commit comments

Comments
 (0)