Skip to content

Commit 2da7a67

Browse files
committed
feat(evaluator): add ATIF trace and log read handles for evidence
Add the ATIF (Agent Trace Interchange Format) model and normalizers for NAT trajectories, OpenTelemetry GenAI spans, and OpenInference spans, plus the TraceHandle and LogHandle read handles exposed via CandidateEvidence.trace() and CandidateEvidence.logs(). Includes the example-only inefficient_retry_loop metric that scores over the normalized trace evidence. Vendored mirror synced via make vendor. Signed-off-by: Arpit Singh (SW-CLOUD) <arpsingh@nvidia.com>
1 parent e1ac31b commit 2da7a67

13 files changed

Lines changed: 2237 additions & 25 deletions

File tree

packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,22 @@
33

44
"""Reference metrics-over-evidence for this example (not SDK API).
55
6-
These show how to score from the SDK's filesystem evidence handle instead of a
7-
stamped verifier reward:
6+
These show how to score from the SDK's evidence handles instead of a stamped
7+
verifier reward:
88
99
* :class:`TestsPassMetric` runs a command against ``final_state`` filesystem
1010
evidence (in a throwaway overlay) and scores on exit 0.
1111
* :class:`NoTestCheatingMetric` diffs ``initial_state`` against ``final_state``
1212
and fails if the agent touched protected (e.g. test) paths.
13+
* :class:`InefficientRetryLoopMetric` reads the normalized ``trace`` and fails
14+
when the same tool call repeats past a threshold.
1315
"""
1416

1517
from __future__ import annotations
1618

1719
from collections.abc import Sequence
1820

21+
from nemo_evaluator_sdk.agent_eval.trials import EVIDENCE_FINAL_STATE, EVIDENCE_INITIAL_STATE, EVIDENCE_TRACE
1922
from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
2023

2124

@@ -26,7 +29,7 @@ def __init__(
2629
self,
2730
command: Sequence[str],
2831
*,
29-
evidence_name: str = "final_state",
32+
evidence_name: str = EVIDENCE_FINAL_STATE,
3033
cwd: str = ".",
3134
timeout_s: float = 300.0,
3235
) -> None:
@@ -60,8 +63,8 @@ def __init__(
6063
*,
6164
protected: Sequence[str] = ("tests/",),
6265
change_types: Sequence[str] = ("added", "modified", "deleted"),
63-
initial_name: str = "initial_state",
64-
final_name: str = "final_state",
66+
initial_name: str = EVIDENCE_INITIAL_STATE,
67+
final_name: str = EVIDENCE_FINAL_STATE,
6568
) -> None:
6669
self._protected = tuple(protected)
6770
self._change_types = set(change_types)
@@ -87,3 +90,38 @@ async def compute_scores(self, input: MetricInput) -> MetricResult:
8790
]
8891
clean = not violations
8992
return MetricResult(outputs=[MetricOutput(name="no_test_cheating", value=clean)])
93+
94+
95+
class InefficientRetryLoopMetric:
    """Score ``False`` when the same tool call repeats more than ``threshold`` times.

    Two tool calls count as "the same" when they share a ``function_name`` and
    their ``arguments`` are equal. Arguments are compared through a canonical
    JSON rendering so that key order, including inside nested mappings, does
    not make otherwise identical calls look different.

    Outputs:
        * ``efficient_tool_use``: ``True`` while the most-repeated call occurs
          at most ``threshold`` times (also ``True`` when no trace evidence is
          available, since nothing was observed to repeat).
        * ``max_repeated_tool_calls``: occurrence count of the most-repeated
          call, ``0`` when there are no calls or no trace evidence.
    """

    def __init__(self, *, threshold: int = 2, evidence_name: str = EVIDENCE_TRACE) -> None:
        """Configure the metric.

        Args:
            threshold: Highest number of identical tool calls still scored as
                efficient.
            evidence_name: Evidence key under which the trace is looked up.
        """
        self._threshold = threshold
        self._evidence_name = evidence_name

    @property
    def type(self) -> str:
        """Return the metric type identifier."""
        return "inefficient_retry_loop"

    def output_spec(self) -> list[MetricOutputSpec]:
        """Declare the two outputs emitted by :meth:`compute_scores`."""
        return [
            MetricOutputSpec.boolean("efficient_tool_use"),
            MetricOutputSpec.discrete_score("max_repeated_tool_calls"),
        ]

    @staticmethod
    def _call_signature(call) -> str:
        """Return a string that is equal for tool calls with equal name and arguments."""
        # Function-scope import keeps this block self-contained.
        import json

        arguments = call.arguments or {}
        try:
            # sort_keys canonicalizes nested mappings too; default=str keeps
            # values that are not JSON-native from aborting the comparison.
            rendered = json.dumps(arguments, sort_keys=True, default=str)
        except (TypeError, ValueError):
            # Keys JSON cannot represent or order: fall back to a top-level
            # sorted-items rendering.
            rendered = str(sorted(arguments.items()))
        return f"{call.function_name}:{rendered}"

    async def compute_scores(self, input: MetricInput) -> MetricResult:
        """Count identical tool calls in the trace and score against the threshold.

        Args:
            input: Metric input whose ``candidate.evidence`` may carry the
                trace under the configured evidence name.

        Returns:
            A ``MetricResult`` with ``efficient_tool_use`` and
            ``max_repeated_tool_calls``.
        """
        max_repeats = 0
        evidence = input.candidate.evidence
        # Missing evidence (or a missing trace entry) is scored as "no repeats"
        # rather than raised, so the metric stays usable on trace-less trials.
        if evidence is not None and evidence.get(self._evidence_name) is not None:
            trace = await evidence.trace(self._evidence_name)
            counts: dict[str, int] = {}
            for call in await trace.tool_calls():
                signature = self._call_signature(call)
                counts[signature] = counts.get(signature, 0) + 1
            max_repeats = max(counts.values(), default=0)
        return MetricResult(
            outputs=[
                MetricOutput(name="efficient_tool_use", value=max_repeats <= self._threshold),
                MetricOutput(name="max_repeated_tool_calls", value=max_repeats),
            ]
        )

packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/evaluator.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
from nemo_evaluator_sdk.metrics.protocol import Metric, validate_metric_result
4545
from nemo_evaluator_sdk.metrics.utils import metric_type_name
4646
from nemo_evaluator_sdk.values import Agent, Model, RunConfig, RunConfigOnline, RunConfigOnlineModel
47-
from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
47+
from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, normalize_trace_descriptor
4848
from openai import AsyncOpenAI
4949

5050
log = getLogger(__name__)
@@ -327,7 +327,9 @@ def _trial_from_sample(task: AgentEvalTask, target: Model | Agent, sample: dict[
327327
# trial stays scorable instead of being dropped as empty output.
328328
output_text = _reasoning_content_fallback(sample.get("response"))
329329
if "trajectory" in sample:
330-
trace = EvidenceDescriptor(kind="trace", format="json", data=sample["trajectory"])
330+
# Normalize to ATIF before the trial is persisted so the stored shape is
331+
# source-agnostic (sources in, ATIF out); TraceHandle then reads it uniformly.
332+
trace = normalize_trace_descriptor(EvidenceDescriptor(kind="trace", format="json", data=sample["trajectory"]))
331333
else:
332334
trace = EvidenceDescriptor(kind="sdk_online_generation", data={"task_id": task.id, "target": target.name})
333335

packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/agent_eval/trials.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from nemo_evaluator_sdk.agent_eval.tasks import AgentEvalRunConfig, AgentEvalTask
1616
from nemo_evaluator_sdk.values import Agent, Model
17-
from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
17+
from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, normalize_trace_descriptor
1818
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
1919

2020
# Well-known evidence keys produced by ``standard_evidence_descriptors``. Harness
@@ -158,10 +158,14 @@ def standard_evidence_descriptors(
158158
if trace_path is not None:
159159
trace_name = Path(trace_path).name.lower()
160160
is_atif = trace_name.startswith("atif") or ".atif." in trace_name
161-
descriptors[EVIDENCE_TRACE] = EvidenceDescriptor(
162-
kind="trace",
163-
format="atif" if is_atif else "json",
164-
ref=str(trace_path),
161+
# Normalize the source trace into a sibling ATIF file before persistence so the
162+
# stored descriptor is ATIF regardless of producer (no-op if already ATIF/missing).
163+
descriptors[EVIDENCE_TRACE] = normalize_trace_descriptor(
164+
EvidenceDescriptor(
165+
kind="trace",
166+
format="atif" if is_atif else "json",
167+
ref=str(trace_path),
168+
)
165169
)
166170

167171
logs_metadata = {"primary_log": primary_log} if primary_log else {}

packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,15 @@
44
"""Public value types for evaluator SDK runtime."""
55

66
from nemo_evaluator_sdk.values.agents import Agent
7+
from nemo_evaluator_sdk.values.atif import (
8+
FinalMetrics,
9+
Metrics,
10+
Observation,
11+
ObservationResult,
12+
Step,
13+
ToolCall,
14+
Trajectory,
15+
)
716
from nemo_evaluator_sdk.values.common import SecretRef, SupportedJobTypes
817
from nemo_evaluator_sdk.values.dataset_schemas import (
918
FieldMapping,
@@ -17,6 +26,11 @@
1726
FilesystemDiff,
1827
FilesystemEntry,
1928
LocalFilesystemEvidence,
29+
LogHandle,
30+
TraceHandle,
31+
WellKnownEvidenceKey,
32+
normalize_candidate_evidence,
33+
normalize_trace_descriptor,
2034
)
2135
from nemo_evaluator_sdk.values.metrics import (
2236
BLEU,
@@ -109,6 +123,18 @@
109123
"ContinuousScore",
110124
"FilesystemDiff",
111125
"FilesystemEntry",
126+
"FinalMetrics",
127+
"LogHandle",
128+
"Metrics",
129+
"Observation",
130+
"ObservationResult",
131+
"Step",
132+
"ToolCall",
133+
"Trajectory",
134+
"TraceHandle",
135+
"WellKnownEvidenceKey",
136+
"normalize_candidate_evidence",
137+
"normalize_trace_descriptor",
112138
"DatasetRow",
113139
"DatasetRows",
114140
"DefaultAggregateFieldName",

0 commit comments

Comments
 (0)