NVIDIA-NeMo
diff --git a/‎packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py‎
Lines changed: 39 additions & 2 deletions b/‎packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py‎
Lines changed: 39 additions & 2 deletions
diff --git a/‎packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py‎
Lines changed: 12 additions & 0 deletions b/‎packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py‎
Lines changed: 12 additions & 0 deletions
@@ -3,13 +3,15 @@
 
 """Reference metrics-over-evidence for this example (not SDK API).
 
-These show how to score from the SDK's filesystem evidence handle instead of a
-stamped verifier reward:
+These show how to score from the SDK's evidence handles instead of a stamped
+verifier reward:
 
 * :class:`TestsPassMetric` runs a command against ``final_state`` filesystem
   evidence (in a throwaway overlay) and scores on exit 0.
 * :class:`NoTestCheatingMetric` diffs ``initial_state`` against ``final_state``
   and fails if the agent touched protected (e.g. test) paths.
+* :class:`InefficientRetryLoopMetric` reads the normalized ``trace`` and fails
+  when the same tool call repeats past a threshold.
 """
 
 from __future__ import annotations
@@ -87,3 +89,38 @@ async def compute_scores(self, input: MetricInput) -> MetricResult:
             ]
             clean = not violations
         return MetricResult(outputs=[MetricOutput(name="no_test_cheating", value=clean)])
+
+
+class InefficientRetryLoopMetric:
+    """Flag agents that keep re-issuing an identical tool call.
+
+    Two calls count as identical when they share a tool name and their
+    arguments serialize to the same canonical JSON. The metric emits:
+
+    * ``efficient_tool_use`` -- ``True`` while the most-repeated call occurs at
+      most ``threshold`` times, ``False`` once it occurs more often.
+    * ``max_repeated_tool_calls`` -- occurrence count of the most-repeated call
+      (``0`` when the named evidence is absent or the trace has no tool calls).
+    """
+
+    def __init__(self, *, threshold: int = 2, evidence_name: str = "trace") -> None:
+        """Configure the metric.
+
+        Args:
+            threshold: Largest occurrence count of one identical call that
+                still scores as efficient.
+            evidence_name: Name of the trace evidence to read from the
+                candidate.
+        """
+        self._threshold = threshold
+        self._evidence_name = evidence_name
+
+    @property
+    def type(self) -> str:
+        """Return the identifier string for this metric."""
+        return "inefficient_retry_loop"
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        """Declare the two outputs that :meth:`compute_scores` emits."""
+        return [
+            MetricOutputSpec.boolean("efficient_tool_use"),
+            MetricOutputSpec.discrete_score("max_repeated_tool_calls"),
+        ]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        """Count identical tool calls in the trace and score the worst repeat.
+
+        When the candidate has no evidence, or none under the configured
+        name, the repeat count is reported as ``0``.
+        """
+        # Function-scope import: the module's import block is not touched.
+        import json
+
+        max_repeats = 0
+        evidence = input.candidate.evidence
+        if evidence is not None and evidence.get(self._evidence_name) is not None:
+            calls = await evidence.trace(self._evidence_name).tool_calls()
+            counts: dict[str, int] = {}
+            for call in calls:
+                # Canonical JSON sorts keys at every depth, so nested argument
+                # dicts built in a different order still compare equal;
+                # ``default=repr`` keeps non-JSON values from raising.
+                args_key = json.dumps(call.arguments or {}, sort_keys=True, default=repr)
+                key = f"{call.name}:{args_key}"
+                counts[key] = counts.get(key, 0) + 1
+            max_repeats = max(counts.values(), default=0)
+        return MetricResult(
+            outputs=[
+                MetricOutput(name="efficient_tool_use", value=max_repeats <= self._threshold),
+                MetricOutput(name="max_repeated_tool_calls", value=max_repeats),
+            ]
+        )
@@ -11,12 +11,18 @@
 )
 from nemo_evaluator_sdk.values.datasets import DatasetInput, DatasetRows
 from nemo_evaluator_sdk.values.evidence import (
+    AtifEvent,
+    AtifEventType,
+    AtifTokenUsage,
+    AtifTrace,
     CandidateEvidence,
     CommandResult,
     EvidenceDescriptor,
     FilesystemDiff,
     FilesystemEntry,
     LocalFilesystemEvidence,
+    LogHandle,
+    TraceHandle,
 )
 from nemo_evaluator_sdk.values.metrics import (
     BLEU,
@@ -102,13 +108,19 @@
     "AggregateRubricScore",
     "AggregateScore",
     "AggregateScoreBase",
+    "AtifEvent",
+    "AtifEventType",
+    "AtifTokenUsage",
+    "AtifTrace",
     "BooleanValue",
     "CandidateEvidence",
     "CandidateOutput",
     "CommandResult",
     "ContinuousScore",
     "FilesystemDiff",
     "FilesystemEntry",
+    "LogHandle",
+    "TraceHandle",
     "DatasetRow",
     "DatasetRows",
     "DefaultAggregateFieldName",