Skip to content

Commit 55ce862

Browse files
committed
feat(evaluator): add ATIF trace and log read handles for evidence
Add the ATIF (Agent Trace Interchange Format) model and normalizers for NAT trajectories, OpenTelemetry GenAI spans, and OpenInference spans, plus the TraceHandle and LogHandle read handles exposed via CandidateEvidence.trace() and CandidateEvidence.logs(). Includes the example-only inefficient_retry_loop metric that scores over the normalized trace evidence. Vendored mirror synced via make vendor. Signed-off-by: Arpit Singh (SW-CLOUD) <arpsingh@nvidia.com>
1 parent 4897b31 commit 55ce862

7 files changed

Lines changed: 709 additions & 6 deletions

File tree

packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py

Lines changed: 39 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,13 +3,15 @@
33

44
"""Reference metrics-over-evidence for this example (not SDK API).
55
6-
These show how to score from the SDK's filesystem evidence handle instead of a
7-
stamped verifier reward:
6+
These show how to score from the SDK's evidence handles instead of a stamped
7+
verifier reward:
88
99
* :class:`TestsPassMetric` runs a command against ``final_state`` filesystem
1010
evidence (in a throwaway overlay) and scores on exit 0.
1111
* :class:`NoTestCheatingMetric` diffs ``initial_state`` against ``final_state``
1212
and fails if the agent touched protected (e.g. test) paths.
13+
* :class:`InefficientRetryLoopMetric` reads the normalized ``trace`` and fails
14+
when the same tool call repeats past a threshold.
1315
"""
1416

1517
from __future__ import annotations
@@ -87,3 +89,38 @@ async def compute_scores(self, input: MetricInput) -> MetricResult:
8789
]
8890
clean = not violations
8991
return MetricResult(outputs=[MetricOutput(name="no_test_cheating", value=clean)])
92+
93+
94+
class InefficientRetryLoopMetric:
    """Score ``False`` when one identical tool call is issued more than ``threshold`` times.

    Two calls are treated as identical when they share the same ``name`` and the
    same ``arguments``. The metric reads tool calls from the trace evidence
    registered under ``evidence_name`` and emits two outputs:

    * ``efficient_tool_use`` -- ``True`` while the most frequent identical call
      occurs at most ``threshold`` times.
    * ``max_repeated_tool_calls`` -- the occurrence count of the most frequent
      identical call (``0`` when there is no trace evidence or no tool calls).
    """

    def __init__(self, *, threshold: int = 2, evidence_name: str = "trace") -> None:
        """Configure the metric.

        Args:
            threshold: Highest number of occurrences of one identical tool call
                that still counts as efficient.
            evidence_name: Key under which the trace evidence is looked up.
        """
        self._threshold = threshold
        self._evidence_name = evidence_name

    @property
    def type(self) -> str:
        """Return the identifier of this metric type."""
        return "inefficient_retry_loop"

    def output_spec(self) -> list[MetricOutputSpec]:
        """Describe the two outputs produced by :meth:`compute_scores`."""
        return [
            MetricOutputSpec.boolean("efficient_tool_use"),
            MetricOutputSpec.discrete_score("max_repeated_tool_calls"),
        ]

    @staticmethod
    def _call_signature(name: object, arguments: object) -> str:
        """Build an order-insensitive identity string for one tool call.

        Arguments are serialized as JSON with sorted keys so that mappings which
        are equal but were built in a different key order (at any nesting depth)
        compare equal. Missing or empty arguments are treated as ``{}``.
        """
        # Function-local import: the file's import block is not guaranteed to
        # provide ``json``.
        import json

        try:
            # ``default=str`` keeps non-JSON values (paths, datetimes, ...) from
            # aborting the whole metric.
            canonical = json.dumps(arguments or {}, sort_keys=True, default=str)
        except (TypeError, ValueError):
            # Keys that cannot be ordered or circular structures: fall back to
            # a best-effort textual form rather than failing the evaluation.
            canonical = repr(arguments)
        return f"{name}:{canonical}"

    async def compute_scores(self, input: MetricInput) -> MetricResult:
        """Count identical tool calls in the trace and score against the threshold.

        Args:
            input: Metric input whose ``candidate.evidence`` may hold the trace.

        Returns:
            A result with ``efficient_tool_use`` and ``max_repeated_tool_calls``.
            When the candidate has no evidence, or no entry named
            ``evidence_name``, the count is ``0`` and the run scores as efficient.
        """
        from collections import Counter

        max_repeats = 0
        evidence = input.candidate.evidence
        if evidence is not None and evidence.get(self._evidence_name) is not None:
            calls = await evidence.trace(self._evidence_name).tool_calls()
            counts = Counter(
                self._call_signature(call.name, call.arguments) for call in calls
            )
            # ``default=0`` covers a trace that contains no tool calls at all.
            max_repeats = max(counts.values(), default=0)
        return MetricResult(
            outputs=[
                MetricOutput(name="efficient_tool_use", value=max_repeats <= self._threshold),
                MetricOutput(name="max_repeated_tool_calls", value=max_repeats),
            ]
        )

packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,12 +11,18 @@
1111
)
1212
from nemo_evaluator_sdk.values.datasets import DatasetInput, DatasetRows
1313
from nemo_evaluator_sdk.values.evidence import (
14+
AtifEvent,
15+
AtifEventType,
16+
AtifTokenUsage,
17+
AtifTrace,
1418
CandidateEvidence,
1519
CommandResult,
1620
EvidenceDescriptor,
1721
FilesystemDiff,
1822
FilesystemEntry,
1923
LocalFilesystemEvidence,
24+
LogHandle,
25+
TraceHandle,
2026
)
2127
from nemo_evaluator_sdk.values.metrics import (
2228
BLEU,
@@ -102,13 +108,19 @@
102108
"AggregateRubricScore",
103109
"AggregateScore",
104110
"AggregateScoreBase",
111+
"AtifEvent",
112+
"AtifEventType",
113+
"AtifTokenUsage",
114+
"AtifTrace",
105115
"BooleanValue",
106116
"CandidateEvidence",
107117
"CandidateOutput",
108118
"CommandResult",
109119
"ContinuousScore",
110120
"FilesystemDiff",
111121
"FilesystemEntry",
122+
"LogHandle",
123+
"TraceHandle",
112124
"DatasetRow",
113125
"DatasetRows",
114126
"DefaultAggregateFieldName",

0 commit comments

Comments
 (0)