Skip to content

Commit 06f060f

Browse files
SandyChapman and claude authored
feat(evaluator): add Trial→Intake boundary mapping module (D8) (#443)
Adds plugins/nemo-evaluator/src/nemo_evaluator/intake/mapping.py: the single pure layer that translates Evaluator vocabulary (AgentEvalTrial, AgentEvalTaskScore, MetricOutput) into the platform SDK's typed Intake request params, so the D3/D4/D5 write-adapters share one source of request shapes.

- trial_to_atif_ingest -> AtifCreateParams (minimal single-step trajectory until D2 trace normalization; defaults agent.version per design §3.9 #6).
- score_to_evaluator_results -> list[EvaluatorResultCreateParams], one row per MetricOutput, name='{metric_type}.{output}', span_id supplied by the caller (resolved post-ingest; the adapter owns that orchestration).
- run_task_to_experiment_context -> ExperimentContextParam (lean {experiment_id, test_case_id}).

Returns the generated nemo-platform-sdk *CreateParams TypedDicts (runtime dicts, statically checked against the real schema) rather than hand-shaped dicts; imports the SDK client types, never the Intake service (nmp.intake.*). CATEGORICAL coercion is intentionally deferred (strings -> TEXT) until a real signal exists. Includes unit tests for all coercions + the .root unwrap and an import-hygiene guardrail.

Refs: AALGO-289
Signed-off-by: Sandy Chapman <schapman@nvidia.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent b4473e8 commit 06f060f

3 files changed

Lines changed: 395 additions & 0 deletions

File tree

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Boundary mapping: Evaluator vocabulary -> Intake/Experiments wire shapes.
5+
6+
This is the single place where Evaluator domain objects (``AgentEvalTrial``,
7+
``AgentEvalTaskScore``, ``MetricOutput``) become the request bodies Intake and
8+
the Experiments API expect. The Intake write-adapter tickets (D3/D4/D5) obtain
9+
their request shapes and field names *only* from here, so a later rename is a
10+
one-file change.
11+
12+
Design constraints (see AALGO-289):
13+
14+
* **Pure.** Every function reads SDK types and returns request params. No HTTP,
15+
no platform client, no imports from the Intake *service* (``nmp.intake.*``).
16+
* **Typed at the boundary.** The returned values are the generated platform
17+
SDK's ``TypedDict`` params (``AtifCreateParams`` / ``EvaluatorResultCreateParams``).
18+
At runtime they are plain dicts the adapter splats into the client
19+
(``client.intake.ingest.atif.create(**body)``); statically, ``ty`` checks our
20+
field names, literals, and nested shapes against the real generated schema, so
21+
an API change that regenerates the SDK surfaces here as a type error instead of
22+
drifting silently. We depend on the client SDK (already a plugin dependency),
23+
never on the Intake service package.
24+
* The well-known evidence-key constants (``initial_state``/``trace``/``logs``/
25+
``final_state``/``verifier_logs``) belong with the SDK evidence work (D1,
26+
AALGO-281). Until D1 lands, this module references them as string literals so
27+
it stays unblocked.
28+
"""
29+
30+
from __future__ import annotations
31+
32+
from typing import Literal
33+
34+
from nemo_evaluator_sdk.agent_eval.scores import AgentEvalTaskScore
35+
from nemo_evaluator_sdk.agent_eval.trials import AgentEvalTrial
36+
from nemo_platform.types.intake.evaluator_result_create_params import EvaluatorResultCreateParams
37+
from nemo_platform.types.intake.evaluator_result_data_type import EvaluatorResultDataType
38+
from nemo_platform.types.intake.experiment_context_param import ExperimentContextParam
39+
from nemo_platform.types.intake.ingest.atif_agent_param import AtifAgentParam
40+
from nemo_platform.types.intake.ingest.atif_create_params import AtifCreateParams
41+
from nemo_platform.types.intake.ingest.atif_final_metrics_param import AtifFinalMetricsParam
42+
from nemo_platform.types.intake.ingest.atif_step_agent_param import AtifStepAgentParam
43+
44+
# --- Shared conventions -----------------------------------------------------
45+
46+
#: ATIF schema version the adapter emits. Typed as a ``Literal`` so the value
#: is checked against the schema-version literal the SDK params expect.
ATIF_SCHEMA_VERSION: Literal["ATIF-v1.7"] = "ATIF-v1.7"

#: Default ``agent.version`` when the run target carries none. Neither Model nor
#: Agent has a version field today, and ATIF requires one (design doc §3.9 #6).
#: Used as the default for ``trial_to_atif_ingest(agent_version=...)``.
DEFAULT_AGENT_VERSION = "unknown"

# Evidence-descriptor keys. ATIF is carried as a ``format`` on ``kind="trace"``,
# *not* as a distinct ``kind``. These are string literals until D1 (AALGO-281)
# promotes them to shared descriptor-key constants on the SDK evidence types.
# NOTE(review): no function in this module reads these two constants yet;
# presumably they are consumed by the write-adapters — confirm against callers.
EVIDENCE_KIND_TRACE = "trace"
TRACE_FORMAT_ATIF = "atif"
59+
60+
def session_id_for(run_id: str, trial_id: str) -> str:
    """Mint the adapter-owned session id for one trial.

    The id is the run id and the trial id joined by a single colon. Using one
    session id per Trial keeps ATIF ingest idempotent and lets per-metric
    scores be attached to the same trajectory afterward. Callers must obtain
    the id from this function rather than formatting it themselves, so the
    convention lives in exactly one place.
    """
    separator = ":"
    return f"{run_id}{separator}{trial_id}"
68+
69+
70+
def run_task_to_experiment_context(trial: AgentEvalTrial, *, experiment_id: str) -> ExperimentContextParam:
    """Return the per-ingest ``experiment_context`` for ``trial``.

    The context is deliberately lean: it carries the caller-supplied
    ``experiment_id`` and the trial's ``task_id`` (as ``test_case_id``) and
    nothing else. Dataset, group, and free-form metadata belong on the
    Experiment entity (created separately via the platform Experiments SDK),
    not on the per-ingest context.
    """
    context: ExperimentContextParam = {
        "experiment_id": experiment_id,
        "test_case_id": trial.task_id,
    }
    return context
78+
79+
80+
def trial_to_atif_ingest(
    trial: AgentEvalTrial,
    *,
    run_id: str,
    experiment_id: str,
    agent_name: str,
    agent_version: str = DEFAULT_AGENT_VERSION,
    model_name: str | None = None,
    final_metrics: AtifFinalMetricsParam | None = None,
) -> AtifCreateParams:
    """Translate a single Trial into ATIF ingest params.

    The trajectory is a placeholder: exactly one agent step whose message is
    the trial's final output text, or the empty string when the trial has no
    output or no output text. That is enough for the session/score path to
    work end to end until ATIF normalization of trace evidence lands (D2,
    AALGO-282), at which point real ``steps[]`` are reconstructed from
    ``trial.evidence``.

    ``model_name`` and ``final_metrics`` are optional and are left out of the
    result entirely (not sent as ``None``) when not provided.
    """
    # Resolve the step message first; a missing or empty output becomes "".
    message = ""
    if trial.output is not None and trial.output.output_text:
        message = trial.output.output_text
    only_step: AtifStepAgentParam = {"source": "agent", "step_id": 1, "message": message}

    agent_param: AtifAgentParam = {"name": agent_name, "version": agent_version}
    if model_name is not None:
        agent_param["model_name"] = model_name

    params: AtifCreateParams = {
        "schema_version": ATIF_SCHEMA_VERSION,
        "session_id": session_id_for(run_id, trial.id),
        "agent": agent_param,
        "steps": [only_step],
        "experiment_context": run_task_to_experiment_context(trial, experiment_id=experiment_id),
    }
    if final_metrics is not None:
        params["final_metrics"] = final_metrics
    return params
113+
114+
115+
def score_to_evaluator_results(
    score: AgentEvalTaskScore,
    *,
    session_id: str,
    span_id: str,
) -> list[EvaluatorResultCreateParams]:
    """Expand an ``AgentEvalTaskScore`` into one evaluator-result param per output.

    Each row is named ``"{metric_type}.{output}"`` (matching the SDK summary's
    aggregate naming) and carries the output's value coerced into its
    ``data_type``, so exactly one of ``value`` / ``string_value`` is set.

    When the score has diagnostics, the message of the *first* one is attached
    to every row as ``comment``; otherwise the key is omitted.

    ``session_id`` and ``span_id`` come from the caller: the trajectory span id
    is resolved at publish time (the adapter's concern) and cannot be derived
    from the score alone.
    """
    if score.diagnostics:
        shared_comment = score.diagnostics[0].message
    else:
        shared_comment = None

    results: list[EvaluatorResultCreateParams] = []
    for metric_output in score.outputs:
        kind, numeric, text = _coerce_metric_value(metric_output.value)
        result: EvaluatorResultCreateParams = {
            "session_id": session_id,
            "span_id": span_id,
            "name": f"{score.metric_type}.{metric_output.name}",
            "data_type": kind,
        }
        # Optional keys are omitted rather than sent as ``None``.
        if numeric is not None:
            result["value"] = numeric
        if text is not None:
            result["string_value"] = text
        if shared_comment is not None:
            result["comment"] = shared_comment
        results.append(result)
    return results
148+
149+
150+
def _coerce_metric_value(value: object) -> tuple[EvaluatorResultDataType, float | None, str | None]:
    """Map a metric output value onto ``(data_type, value, string_value)``.

    If ``value`` exposes a ``root`` attribute (as a Pydantic ``RootModel``
    does), that attribute is classified instead of the wrapper. Then:

    * ``bool`` -> ``BOOLEAN`` with value 1.0/0.0;
    * ``int``/``float`` -> ``NUMERIC`` with the value as a ``float``;
    * anything else (strings, labels) -> ``TEXT`` via ``str()``.

    CATEGORICAL is intentionally never produced: a category and free text are
    indistinguishable at the value level today (both arrive as ``str``/``Label``),
    so everything string-valued maps to TEXT until a real signal exists.
    """
    payload = getattr(value, "root", value)

    # ``bool`` is a subclass of ``int``, so it has to be tested before the
    # numeric branch or booleans would be reported as NUMERIC.
    if isinstance(payload, bool):
        return "BOOLEAN", float(payload), None

    if isinstance(payload, (int, float)):
        return "NUMERIC", float(payload), None

    return "TEXT", None, str(payload)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Guardrail: the intake mapping module must not import the Intake service.
5+
6+
The mapping is pure boundary code: it reads SDK types and returns plain dicts
7+
shaped for Intake's requests, but it must not depend on the Intake service
8+
(``nmp.intake.*``), an HTTP client, or the platform client. This keeps the
9+
translation isolated so D3/D4/D5 can build the wire calls on top of it without
10+
the mapping itself pulling in the service.
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import re
16+
from pathlib import Path
17+
18+
import nemo_evaluator.intake as intake
19+
20+
# Directory scanned by the guardrail: the first entry of the
# ``nemo_evaluator.intake`` package path, resolved to an absolute path.
INTAKE_ROOT = Path(next(iter(intake.__path__))).resolve()

# Imports that would couple the pure mapping to the Intake service or transport.
# The pattern is anchored per line (``re.MULTILINE``) and captures the offending
# module prefix in group 1. Two properties worth knowing when reading matches:
#   * the leading ``\s*`` also matches newlines, so a match can begin on blank
#     lines that precede the import statement;
#   * there is no boundary after the module name, so any module whose name
#     merely starts with one of these prefixes is flagged too.
_FORBIDDEN = re.compile(
    r"^\s*(?:from|import)\s+(nmp\.intake|nmp_intake|httpx)",
    re.MULTILINE,
)
27+
28+
29+
def test_intake_mapping_has_no_service_imports() -> None:
    """Fail if any module under ``nemo_evaluator.intake`` imports a forbidden package.

    Every ``*.py`` file below ``INTAKE_ROOT`` is scanned with ``_FORBIDDEN``.
    Each hit is reported as ``<relative path>:<line>: <matched import>`` and all
    hits are listed together in the assertion message.
    """
    offenders: list[str] = []
    for path in sorted(INTAKE_ROOT.rglob("*.py")):
        text = path.read_text(encoding="utf-8")
        for match in _FORBIDDEN.finditer(text):
            # The pattern's leading whitespace run can swallow blank lines that
            # precede the import, so ``match.start()`` may sit several lines
            # above the statement. Count newlines up to the captured module
            # name (group 1) so the reported line is the import's own line.
            line_no = text.count("\n", 0, match.start(1)) + 1
            offenders.append(f"{path.relative_to(INTAKE_ROOT)}:{line_no}: {match.group(0).strip()}")

    assert not offenders, "nemo_evaluator.intake must not import the Intake service / transport:\n" + "\n".join(
        offenders
    )
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Unit tests for the Evaluator -> Intake boundary mapping module."""
5+
6+
from __future__ import annotations
7+
8+
import pytest
9+
from nemo_evaluator.intake.mapping import (
10+
ATIF_SCHEMA_VERSION,
11+
DEFAULT_AGENT_VERSION,
12+
run_task_to_experiment_context,
13+
score_to_evaluator_results,
14+
session_id_for,
15+
trial_to_atif_ingest,
16+
)
17+
from nemo_evaluator_sdk.agent_eval.scores import (
18+
AgentEvalDiagnostic,
19+
AgentEvalDiagnosticSeverity,
20+
AgentEvalScoreStatus,
21+
AgentEvalTaskScore,
22+
)
23+
from nemo_evaluator_sdk.agent_eval.trials import AgentEvalTrial, AgentEvalTrialStatus, AgentOutput
24+
from nemo_evaluator_sdk.metrics.protocol import (
25+
BooleanValue,
26+
ContinuousScore,
27+
DiscreteScore,
28+
Label,
29+
MetricOutput,
30+
)
31+
32+
33+
def _trial(*, trial_id: str = "trial-1", task_id: str = "task-1", output_text: str | None = "hello") -> AgentEvalTrial:
    """Build a minimal trial fixture.

    With ``output_text=None`` the trial has no output and is marked FAILED;
    otherwise it is COMPLETED and wraps the text in an ``AgentOutput``.
    """
    if output_text is None:
        return AgentEvalTrial(id=trial_id, task_id=task_id, status=AgentEvalTrialStatus.FAILED, output=None)
    return AgentEvalTrial(
        id=trial_id,
        task_id=task_id,
        status=AgentEvalTrialStatus.COMPLETED,
        output=AgentOutput(output_text=output_text),
    )
37+
38+
39+
def _score(*, outputs: list[MetricOutput], diagnostics: list[AgentEvalDiagnostic] | None = None) -> AgentEvalTaskScore:
    """Build a COMPLETED ``accuracy`` score fixture with fixed ids.

    ``diagnostics`` defaults to an empty list when omitted (or empty).
    """
    resolved_diagnostics = diagnostics if diagnostics else []
    fixed_fields = {
        "id": "score-1",
        "run_id": "run-1",
        "task_id": "task-1",
        "trial_id": "trial-1",
        "metric_type": "accuracy",
        "status": AgentEvalScoreStatus.COMPLETED,
    }
    return AgentEvalTaskScore(**fixed_fields, outputs=outputs, diagnostics=resolved_diagnostics)
50+
51+
52+
# --- session_id_for ---------------------------------------------------------
53+
54+
55+
def test_session_id_is_stable_per_trial() -> None:
    """The session id is the run id and trial id joined by a colon."""
    minted = session_id_for("run-1", "trial-1")
    assert minted == "run-1:trial-1"
57+
58+
59+
# --- run_task_to_experiment_context -----------------------------------------
60+
61+
62+
def test_experiment_context_is_lean() -> None:
    """The context holds only ``experiment_id`` and ``test_case_id``."""
    trial = _trial(task_id="task-42")
    expected = {"experiment_id": "bench-x-variant", "test_case_id": "task-42"}
    assert run_task_to_experiment_context(trial, experiment_id="bench-x-variant") == expected
65+
66+
67+
# --- trial_to_atif_ingest ---------------------------------------------------
68+
69+
70+
def test_trial_to_atif_ingest_shape() -> None:
    """A fully specified trial maps to the expected ATIF body, field by field."""
    trial = _trial(trial_id="t-1", task_id="task-1", output_text="final answer")
    body = trial_to_atif_ingest(
        trial,
        run_id="run-1",
        experiment_id="exp-1",
        agent_name="my-agent",
        model_name="gpt-4o",
    )

    expected_agent = {"name": "my-agent", "version": DEFAULT_AGENT_VERSION, "model_name": "gpt-4o"}
    expected_step = {"source": "agent", "step_id": 1, "message": "final answer"}
    expected_context = {"experiment_id": "exp-1", "test_case_id": "task-1"}

    assert body["schema_version"] == ATIF_SCHEMA_VERSION
    assert body["session_id"] == "run-1:t-1"
    assert body["agent"] == expected_agent
    assert body["steps"] == [expected_step]
    assert body["experiment_context"] == expected_context
    assert "final_metrics" not in body
84+
85+
86+
def test_trial_to_atif_ingest_defaults_version_and_omits_model_name() -> None:
    """Without overrides the agent gets version ``"unknown"`` and no ``model_name`` key."""
    agent = trial_to_atif_ingest(_trial(), run_id="run-1", experiment_id="exp-1", agent_name="a")["agent"]
    assert agent == {"name": "a", "version": "unknown"}
    assert "model_name" not in agent
90+
91+
92+
def test_trial_to_atif_ingest_handles_missing_output() -> None:
    """A trial with no output still yields one step, with an empty message."""
    trial_without_output = _trial(output_text=None)
    body = trial_to_atif_ingest(trial_without_output, run_id="run-1", experiment_id="exp-1", agent_name="a")
    expected_steps = [{"source": "agent", "step_id": 1, "message": ""}]
    assert body["steps"] == expected_steps
95+
96+
97+
def test_trial_to_atif_ingest_includes_final_metrics_when_given() -> None:
    """Supplied ``final_metrics`` are passed through onto the body."""
    common_kwargs = {"run_id": "run-1", "experiment_id": "exp-1", "agent_name": "a"}
    body = trial_to_atif_ingest(_trial(), final_metrics={"total_prompt_tokens": 10}, **common_kwargs)
    assert body["final_metrics"] == {"total_prompt_tokens": 10}
106+
107+
108+
# --- score_to_evaluator_results: data_type coercions ------------------------
109+
110+
111+
def test_score_row_naming_and_targeting() -> None:
    """A single output yields one row named ``metric_type.output`` on the given session/span."""
    score = _score(outputs=[MetricOutput(name="score", value=0.5)])
    rows = score_to_evaluator_results(score, session_id="run-1:trial-1", span_id="span-abc")

    assert len(rows) == 1
    only_row = rows[0]
    assert only_row["name"] == "accuracy.score"
    assert only_row["session_id"] == "run-1:trial-1"
    assert only_row["span_id"] == "span-abc"
121+
122+
123+
def test_one_row_per_output() -> None:
    """Two outputs produce two rows, in output order."""
    outputs = [MetricOutput(name="a", value=1.0), MetricOutput(name="b", value=2.0)]
    rows = score_to_evaluator_results(_score(outputs=outputs), session_id="s", span_id="span")
    names = [row["name"] for row in rows]
    assert names == ["accuracy.a", "accuracy.b"]
130+
131+
132+
@pytest.mark.parametrize("value", [True, BooleanValue(True)])
def test_boolean_coercion_true(value: object) -> None:
    """A true boolean, bare or wrapped, becomes BOOLEAN with value 1.0."""
    score = _score(outputs=[MetricOutput(name="passed", value=value)])
    rows = score_to_evaluator_results(score, session_id="s", span_id="sp")
    row = rows[0]
    assert row["data_type"] == "BOOLEAN"
    assert row["value"] == 1.0
    assert "string_value" not in row
140+
141+
142+
@pytest.mark.parametrize("value", [False, BooleanValue(False)])
def test_boolean_coercion_false(value: object) -> None:
    """A false boolean, bare or wrapped, becomes BOOLEAN with value 0.0."""
    score = _score(outputs=[MetricOutput(name="passed", value=value)])
    rows = score_to_evaluator_results(score, session_id="s", span_id="sp")
    row = rows[0]
    assert row["data_type"] == "BOOLEAN"
    assert row["value"] == 0.0
149+
150+
151+
@pytest.mark.parametrize("value", [0.87, 3, ContinuousScore(0.87), DiscreteScore(3)])
def test_numeric_coercion(value: object) -> None:
    """Ints and floats, bare or wrapped, become NUMERIC with a float value."""
    score = _score(outputs=[MetricOutput(name="m", value=value)])
    rows = score_to_evaluator_results(score, session_id="s", span_id="sp")
    row = rows[0]
    assert row["data_type"] == "NUMERIC"
    assert isinstance(row["value"], float)
    assert "string_value" not in row
159+
160+
161+
@pytest.mark.parametrize("value", ["PASS", Label("PASS")])
def test_text_coercion(value: object) -> None:
    """Strings and labels become TEXT, carried in ``string_value`` only."""
    score = _score(outputs=[MetricOutput(name="verdict", value=value)])
    rows = score_to_evaluator_results(score, session_id="s", span_id="sp")
    row = rows[0]
    assert row["data_type"] == "TEXT"
    assert row["string_value"] == "PASS"
    assert "value" not in row
169+
170+
171+
def test_comment_taken_from_first_diagnostic() -> None:
    """With several diagnostics, the row comment is the first one's message."""
    first = AgentEvalDiagnostic(severity=AgentEvalDiagnosticSeverity.WARNING, message="first")
    second = AgentEvalDiagnostic(severity=AgentEvalDiagnosticSeverity.INFO, message="second")
    score = _score(outputs=[MetricOutput(name="score", value=1.0)], diagnostics=[first, second])

    rows = score_to_evaluator_results(score, session_id="s", span_id="sp")
    assert rows[0]["comment"] == "first"
181+
182+
183+
def test_comment_absent_without_diagnostics() -> None:
    """A score with no diagnostics yields rows without a ``comment`` key."""
    score = _score(outputs=[MetricOutput(name="score", value=1.0)])
    rows = score_to_evaluator_results(score, session_id="s", span_id="sp")
    assert "comment" not in rows[0]

0 commit comments

Comments
 (0)