Skip to content

Commit 5f309d1

Browse files
authored
fix: Support autoevals inline score typing (#403)
I noticed that passing autoevals scorers inline to Eval fails static type checking. Codex says this is because the autoevals scorers return a similar-but-different Score class; see https://github.com/braintrustdata/autoevals/blob/main/py/autoevals/score.py#L10
1 parent e6401e3 commit 5f309d1

5 files changed

Lines changed: 1004 additions & 970 deletions

File tree

py/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ test-cli = [
177177

178178
test-types = [
179179
{include-group = "test"},
180+
"autoevals==0.2.0",
180181
"pyright==1.1.408",
181182
"mypy==1.20.0",
182183
]

py/src/braintrust/framework.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
validate_parameters,
4949
)
5050
from .resource_manager import ResourceManager
51-
from .score import Classification, ClassificationItem, Score, is_classification, is_score, is_scorer
51+
from .score import Classification, ClassificationItem, Score, ScoreLike, is_classification, is_score, is_scorer
5252
from .serializable_data_class import SerializableDataClass
5353
from .span_types import SpanTypeAttribute
5454
from .types._eval import EvalCaseDict, EvalCaseDictNoOutput, ExperimentDatasetEvent
@@ -216,7 +216,7 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output, Expected]):
216216
metadata: Metadata | None = None
217217

218218

219-
OneOrMoreScores = float | int | bool | None | Score | list[Score]
219+
OneOrMoreScores = float | int | bool | None | ScoreLike | list[ScoreLike]
220220
OneOrMoreClassifications = None | Classification | Mapping[str, Any] | list[Classification | Mapping[str, Any]]
221221

222222

@@ -1286,7 +1286,7 @@ def _classifier_name(classifier, classifier_idx):
12861286
return _callable_name(classifier, classifier_idx, "classifier")
12871287

12881288

1289-
def _build_span_metadata(results: list[Score] | list[Classification]) -> Metadata | None:
1289+
def _build_span_metadata(results: list[ScoreLike] | list[Classification]) -> Metadata | None:
12901290
if not results:
12911291
return None
12921292
if len(results) == 1:

py/src/braintrust/score.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
import inspect
33
import warnings
44
from abc import ABC, abstractmethod
5-
from typing import Any, TypedDict
5+
from collections.abc import Mapping
6+
from typing import Any, Protocol, TypedDict
67

7-
from typing_extensions import NotRequired
8+
from typing_extensions import NotRequired, TypeGuard
89

910
from .serializable_data_class import SerializableDataClass
1011
from .types import Metadata
@@ -53,6 +54,19 @@ def __post_init__(self):
5354
)
5455

5556

57+
class ScoreLike(Protocol):
58+
@property
59+
def name(self) -> str: ...
60+
61+
@property
62+
def score(self) -> float | None: ...
63+
64+
@property
65+
def metadata(self) -> Metadata: ...
66+
67+
def as_dict(self) -> Mapping[str, Any]: ...
68+
69+
5670
class ClassificationItem(TypedDict):
5771
id: str
5872
label: NotRequired[str]
@@ -76,7 +90,7 @@ class Classification(SerializableDataClass):
7690
"""Optional metadata attached to the classification result."""
7791

7892
def as_dict(self):
79-
result = {"id": self.id}
93+
result: Mapping[str, Any] = {"id": self.id}
8094
if self.name is not None:
8195
result["name"] = self.name
8296
if self.label is not None:
@@ -102,7 +116,7 @@ def __post_init__(self):
102116
raise ValueError("classification label must be a string when provided")
103117

104118

105-
def is_score(obj):
119+
def is_score(obj: object) -> TypeGuard[ScoreLike]:
106120
return hasattr(obj, "name") and hasattr(obj, "score") and hasattr(obj, "metadata") and hasattr(obj, "as_dict")
107121

108122

@@ -151,6 +165,7 @@ def is_scorer(obj):
151165
"Classification",
152166
"ClassificationItem",
153167
"Score",
168+
"ScoreLike",
154169
"Scorer",
155170
"is_classification",
156171
"is_score",
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""Type-check and runtime tests for autoevals scorers in Eval."""
2+
3+
import pytest
4+
from autoevals import Levenshtein # type: ignore[import-untyped]
5+
from braintrust.framework import Eval, EvalAsync, EvalCase, EvalScorer
6+
7+
8+
def accepts_autoevals_scorer(
9+
scorer: EvalScorer[str, str, str],
10+
) -> EvalScorer[str, str, str]:
11+
return scorer
12+
13+
14+
def autoevals_data():
15+
return iter([EvalCase(input="query", expected="hello world")])
16+
17+
18+
def autoevals_task(input: str) -> str:
19+
return "hello world"
20+
21+
22+
async def autoevals_task_async(input: str) -> str:
23+
return "hello world"
24+
25+
26+
autoevals_scores: list[EvalScorer[str, str, str]] = [
27+
accepts_autoevals_scorer(Levenshtein()),
28+
accepts_autoevals_scorer(Levenshtein),
29+
accepts_autoevals_scorer(Levenshtein.partial(foo="bar")),
30+
]
31+
32+
autoevals_scores_untyped = [
33+
Levenshtein(),
34+
Levenshtein,
35+
Levenshtein.partial(foo="bar"),
36+
]
37+
38+
39+
def test_eval_accepts_autoevals_scorers_typed():
40+
result = Eval(
41+
"test-autoevals-scorers",
42+
data=autoevals_data,
43+
task=autoevals_task,
44+
scores=autoevals_scores,
45+
no_send_logs=True,
46+
)
47+
48+
score = result.results[0].scores["Levenshtein"]
49+
assert score is not None
50+
assert score > 0
51+
52+
53+
def test_eval_accepts_autoevals_scorers_untyped():
    """Run ``Eval`` with the unannotated module-level autoevals scorer list.

    This is the counterpart of the typed test: the scorers come from
    ``autoevals_scores_untyped``, which has no explicit ``EvalScorer``
    annotation, so the type checker must infer the list's element type
    on its own when it is passed as ``scores``.
    """
    result = Eval(
        "test-autoevals-scorers",
        data=autoevals_data,
        task=autoevals_task,
        # Must be the *untyped* list; passing ``autoevals_scores`` here would
        # merely repeat the typed test and leave this list unused.
        scores=autoevals_scores_untyped,
        no_send_logs=True,
    )

    score = result.results[0].scores["Levenshtein"]
    assert score is not None
    assert score > 0
65+
66+
67+
def test_eval_accepts_autoevals_scorers_inline():
68+
result = Eval(
69+
"test-autoevals-scorers",
70+
data=autoevals_data,
71+
task=autoevals_task,
72+
scores=[
73+
Levenshtein(),
74+
Levenshtein,
75+
Levenshtein.partial(foo="bar"),
76+
],
77+
no_send_logs=True,
78+
)
79+
80+
score = result.results[0].scores["Levenshtein"]
81+
assert score is not None
82+
assert score > 0
83+
84+
85+
@pytest.mark.asyncio
86+
async def test_eval_async_accepts_autoevals_scorers_typed():
87+
result = await EvalAsync(
88+
"test-autoevals-scorers",
89+
data=autoevals_data,
90+
task=autoevals_task_async,
91+
scores=autoevals_scores,
92+
no_send_logs=True,
93+
)
94+
95+
score = result.results[0].scores["Levenshtein"]
96+
assert score is not None
97+
assert score > 0
98+
99+
100+
@pytest.mark.asyncio
async def test_eval_async_accepts_autoevals_scorers_untyped():
    """Run ``EvalAsync`` with the unannotated module-level autoevals scorer list.

    Async counterpart of the untyped ``Eval`` test: the scorers come from
    ``autoevals_scores_untyped``, which has no explicit ``EvalScorer``
    annotation, so the type checker must infer the list's element type
    on its own when it is passed as ``scores``.
    """
    result = await EvalAsync(
        "test-autoevals-scorers",
        data=autoevals_data,
        task=autoevals_task_async,
        # Must be the *untyped* list; passing ``autoevals_scores`` here would
        # merely repeat the typed async test and leave this list unused.
        scores=autoevals_scores_untyped,
        no_send_logs=True,
    )

    score = result.results[0].scores["Levenshtein"]
    assert score is not None
    assert score > 0
113+
114+
115+
@pytest.mark.asyncio
116+
async def test_eval_async_accepts_autoevals_scorers_inline():
117+
result = await EvalAsync(
118+
"test-autoevals-scorers",
119+
data=autoevals_data,
120+
task=autoevals_task_async,
121+
scores=[
122+
Levenshtein(),
123+
Levenshtein,
124+
Levenshtein.partial(foo="bar"),
125+
],
126+
no_send_logs=True,
127+
)
128+
129+
score = result.results[0].scores["Levenshtein"]
130+
assert score is not None
131+
assert score > 0

0 commit comments

Comments
 (0)