fix: Support autoevals inline score typing (#403)

j13huang · web-flow · commit 5f309d1cc89a · 2026-05-12T14:34:54.000-04:00
I noticed that passing autoevals scorers inline to `Eval` fails type checking. Codex says this is because the autoevals scorers return a similar-but-different `Score` class: https://github.com/braintrustdata/autoevals/blob/main/py/autoevals/score.py#L10
diff --git a/py/pyproject.toml b/py/pyproject.toml
@@ -177,6 +177,7 @@ test-cli = [
 
 test-types = [
     {include-group = "test"},
+    "autoevals==0.2.0",
     "pyright==1.1.408",
     "mypy==1.20.0",
 ]
diff --git a/py/src/braintrust/framework.py b/py/src/braintrust/framework.py
@@ -48,7 +48,7 @@
     validate_parameters,
 )
 from .resource_manager import ResourceManager
-from .score import Classification, ClassificationItem, Score, is_classification, is_score, is_scorer
+from .score import Classification, ClassificationItem, Score, ScoreLike, is_classification, is_score, is_scorer
 from .serializable_data_class import SerializableDataClass
 from .span_types import SpanTypeAttribute
 from .types._eval import EvalCaseDict, EvalCaseDictNoOutput, ExperimentDatasetEvent
@@ -216,7 +216,7 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output, Expected]):
     metadata: Metadata | None = None
 
 
-OneOrMoreScores = float | int | bool | None | Score | list[Score]
+OneOrMoreScores = float | int | bool | None | ScoreLike | list[ScoreLike]
 OneOrMoreClassifications = None | Classification | Mapping[str, Any] | list[Classification | Mapping[str, Any]]
 
 
@@ -1286,7 +1286,7 @@ def _classifier_name(classifier, classifier_idx):
     return _callable_name(classifier, classifier_idx, "classifier")
 
 
-def _build_span_metadata(results: list[Score] | list[Classification]) -> Metadata | None:
+def _build_span_metadata(results: list[ScoreLike] | list[Classification]) -> Metadata | None:
     if not results:
         return None
     if len(results) == 1:
diff --git a/py/src/braintrust/score.py b/py/src/braintrust/score.py
@@ -2,9 +2,10 @@
 import inspect
 import warnings
 from abc import ABC, abstractmethod
-from typing import Any, TypedDict
+from collections.abc import Mapping
+from typing import Any, Protocol, TypedDict
 
-from typing_extensions import NotRequired
+from typing_extensions import NotRequired, TypeGuard
 
 from .serializable_data_class import SerializableDataClass
 from .types import Metadata
@@ -53,6 +54,19 @@ def __post_init__(self):
             )
 
 
+class ScoreLike(Protocol):
+    """Structural type for score objects returned by scorers.
+
+    Any object exposing ``name``, ``score``, ``metadata`` and ``as_dict()``
+    satisfies this protocol without inheriting from ``Score``, so a
+    similar-but-different score class from another package (e.g. autoevals)
+    is accepted by the type checker.
+    """
+
+    @property
+    def name(self) -> str: ...
+
+    @property
+    def score(self) -> float | None: ...
+
+    @property
+    def metadata(self) -> Metadata: ...
+
+    def as_dict(self) -> Mapping[str, Any]: ...
+
+
 class ClassificationItem(TypedDict):
     id: str
     label: NotRequired[str]
@@ -76,7 +90,7 @@ class Classification(SerializableDataClass):
     """Optional metadata attached to the classification result."""
 
     def as_dict(self):
-        result = {"id": self.id}
+        result: Mapping[str, Any] = {"id": self.id}
         if self.name is not None:
             result["name"] = self.name
         if self.label is not None:
@@ -102,7 +116,7 @@ def __post_init__(self):
             raise ValueError("classification label must be a string when provided")
 
 
-def is_score(obj):
+def is_score(obj: object) -> TypeGuard[ScoreLike]:
+    """Return True when ``obj`` has the four members declared by ``ScoreLike``.
+
+    This is a duck-typed check: only the presence of ``name``, ``score``,
+    ``metadata`` and ``as_dict`` is tested, not their types, so the
+    ``TypeGuard`` narrowing is a best-effort assertion to the type checker.
+    """
     return hasattr(obj, "name") and hasattr(obj, "score") and hasattr(obj, "metadata") and hasattr(obj, "as_dict")
 
 
@@ -151,6 +165,7 @@ def is_scorer(obj):
     "Classification",
     "ClassificationItem",
     "Score",
+    "ScoreLike",
     "Scorer",
     "is_classification",
     "is_score",
diff --git a/py/src/braintrust/type_tests/test_autoevals_scorers.py b/py/src/braintrust/type_tests/test_autoevals_scorers.py
@@ -0,0 +1,131 @@
+"""Type-check and runtime tests for autoevals scorers in Eval."""
+
+import pytest
+from autoevals import Levenshtein  # type: ignore[import-untyped]
+from braintrust.framework import Eval, EvalAsync, EvalCase, EvalScorer
+
+
+def accepts_autoevals_scorer(
+    scorer: EvalScorer[str, str, str],
+) -> EvalScorer[str, str, str]:
+    """Return ``scorer`` unchanged.
+
+    The function exists for its annotations: calling it makes the type checker
+    verify that the argument is assignable to ``EvalScorer[str, str, str]``.
+    """
+    return scorer
+
+
+def autoevals_data():
+    """Return a fresh iterator over a single ``EvalCase`` expecting "hello world"."""
+    return iter([EvalCase(input="query", expected="hello world")])
+
+
+def autoevals_task(input: str) -> str:
+    """Synchronous task stub: ignore ``input`` and return the expected string."""
+    return "hello world"
+
+
+async def autoevals_task_async(input: str) -> str:
+    """Async task stub: ignore ``input`` and return the expected string."""
+    return "hello world"
+
+
+# Annotated list: each entry is routed through accepts_autoevals_scorer so the
+# type checker must accept the instance, class, and partial forms as EvalScorer.
+autoevals_scores: list[EvalScorer[str, str, str]] = [
+    accepts_autoevals_scorer(Levenshtein()),
+    accepts_autoevals_scorer(Levenshtein),
+    accepts_autoevals_scorer(Levenshtein.partial(foo="bar")),
+]
+
+# The same three scorer forms with no annotation, leaving the element type to inference.
+autoevals_scores_untyped = [
+    Levenshtein(),
+    Levenshtein,
+    Levenshtein.partial(foo="bar"),
+]
+
+
+def test_eval_accepts_autoevals_scorers_typed():
+    """Run ``Eval`` with the annotated scorer list and expect a positive Levenshtein score."""
+    result = Eval(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=autoevals_scores,
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
+
+
+def test_eval_accepts_autoevals_scorers_untyped():
+    """Run ``Eval`` with the unannotated scorer list and expect a positive Levenshtein score."""
+    # Pass the untyped list: the annotated list is already covered by the typed
+    # test, and using it here would leave autoevals_scores_untyped unexercised.
+    result = Eval(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=autoevals_scores_untyped,
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
+
+
+def test_eval_accepts_autoevals_scorers_inline():
+    """Run ``Eval`` with scorers given as an inline list literal and expect a positive score."""
+    result = Eval(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task,
+        scores=[
+            Levenshtein(),
+            Levenshtein,
+            Levenshtein.partial(foo="bar"),
+        ],
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
+
+
+@pytest.mark.asyncio
+async def test_eval_async_accepts_autoevals_scorers_typed():
+    """Await ``EvalAsync`` with the annotated scorer list and expect a positive score."""
+    result = await EvalAsync(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task_async,
+        scores=autoevals_scores,
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
+
+
+@pytest.mark.asyncio
+async def test_eval_async_accepts_autoevals_scorers_untyped():
+    """Await ``EvalAsync`` with the unannotated scorer list and expect a positive score."""
+    # Pass the untyped list: the annotated list is already covered by the typed
+    # async test, and using it here would leave autoevals_scores_untyped unexercised.
+    result = await EvalAsync(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task_async,
+        scores=autoevals_scores_untyped,
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
+
+
+@pytest.mark.asyncio
+async def test_eval_async_accepts_autoevals_scorers_inline():
+    """Await ``EvalAsync`` with scorers given as an inline list literal and expect a positive score."""
+    result = await EvalAsync(
+        "test-autoevals-scorers",
+        data=autoevals_data,
+        task=autoevals_task_async,
+        scores=[
+            Levenshtein(),
+            Levenshtein,
+            Levenshtein.partial(foo="bar"),
+        ],
+        no_send_logs=True,
+    )
+
+    score = result.results[0].scores["Levenshtein"]
+    assert score is not None
+    assert score > 0
diff --git a/py/uv.lock b/py/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -177,6 +177,7 @@ test-cli = [`
`177`	`177`
`178`	`178`	`test-types = [`
`179`	`179`	`{include-group = "test"},`
	`180`	`+ "autoevals==0.2.0",`
`180`	`181`	`"pyright==1.1.408",`
`181`	`182`	`"mypy==1.20.0",`
`182`	`183`	`]`