Skip to content

Commit 71f588f

Browse files
author
Dylan Huang
committed
Merge branch 'main' into data-loader
2 parents c7fef33 + 1002941 commit 71f588f

67 files changed

Lines changed: 4672 additions & 1571 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,8 @@
1010
"editor.formatOnSave": true,
1111
"[python]": {
1212
"editor.defaultFormatter": "charliermarsh.ruff"
13+
},
14+
"[typescript]": {
15+
"editor.defaultFormatter": "esbenp.prettier-vscode"
1316
}
1417
}

README.md

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,18 @@
44

55
**The open-source toolkit for building your internal model leaderboard.**
66

7-
When you have multiple AI models to choose from—different versions, providers, or configurations—how do you know which one is best for your use case?
7+
When you have multiple AI models to choose from—different versions, providers,
8+
or configurations—how do you know which one is best for your use case?
9+
10+
## 🚀 Features
11+
12+
- **Custom Evaluations**: Write evaluations tailored to your specific business needs
13+
- **Auto-Evaluation**: Stack-rank models with LLMs as judges, using only model traces and out-of-the-box evaluators
14+
- **RL Environments via MCP**: Build reinforcement learning environments using the Model Context Protocol (MCP) to simulate user interactions and advanced evaluation scenarios
15+
- **Consistent Testing**: Test across various models and configurations with a unified framework
16+
- **Resilient Runtime**: Automatic retries for unstable LLM APIs and concurrent execution for long-running evaluations
17+
- **Rich Visualizations**: Built-in pivot tables and visualizations for result analysis
18+
- **Data-Driven Decisions**: Make informed model deployment decisions based on comprehensive evaluation results
819

920
## Quick Examples
1021

@@ -69,15 +80,6 @@ def test_math_reasoning(row: EvaluationRow) -> EvaluationRow:
6980
return row
7081
```
7182

72-
## 🚀 Features
73-
74-
- **Custom Evaluations**: Write evaluations tailored to your specific business needs
75-
- **Auto-Evaluation**: Stack-rank models using LLMs as judges with just model traces
76-
- **Model Context Protocol (MCP) Integration**: Build reinforcement learning environments and trigger user simulations for complex scenarios
77-
- **Consistent Testing**: Test across various models and configurations with a unified framework
78-
- **Resilient Runtime**: Automatic retries for unstable LLM APIs and concurrent execution for long-running evaluations
79-
- **Rich Visualizations**: Built-in pivot tables and visualizations for result analysis
80-
- **Data-Driven Decisions**: Make informed model deployment decisions based on comprehensive evaluation results
8183

8284
## 📚 Resources
8385

eval_protocol/__init__.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,19 +37,19 @@
3737
from .resources import create_llm_resource
3838
from .reward_function import RewardFunction
3939
from .typed_interface import reward_function
40-
from .quickstart import aha_judge, split_multi_turn_rows
40+
from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
4141
from .pytest import evaluation_test, SingleTurnRolloutProcessor
4242
from .pytest.parameterize import DefaultParameterIdGenerator
4343

4444
from .adapters import OpenAIResponsesAdapter
4545

4646
try:
47-
from .adapters import LangfuseAdapter
47+
from .adapters import LangfuseAdapter, create_langfuse_adapter
4848
except ImportError:
4949
LangfuseAdapter = None
5050

5151
try:
52-
from .adapters import BraintrustAdapter
52+
from .adapters import BraintrustAdapter, create_braintrust_adapter
5353
except ImportError:
5454
BraintrustAdapter = None
5555

@@ -64,12 +64,15 @@
6464
__all__ = [
6565
"DefaultParameterIdGenerator",
6666
"aha_judge",
67-
"split_multi_turn_rows",
67+
"multi_turn_assistant_to_ground_truth",
68+
"assistant_to_ground_truth",
6869
"evaluation_test",
6970
"SingleTurnRolloutProcessor",
7071
"OpenAIResponsesAdapter",
7172
"LangfuseAdapter",
73+
"create_langfuse_adapter",
7274
"BraintrustAdapter",
75+
"create_braintrust_adapter",
7376
"LangSmithAdapter",
7477
# Core interfaces
7578
"Message",

eval_protocol/adapters/base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,7 @@ def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
1919
def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
2020
"""Upload evaluation scores back to the data source for tracking and analysis."""
2121
pass
22+
23+
def upload_score(self, row: EvaluationRow, model_name: str) -> None:
24+
"""Upload evaluation score for a single row back to the data source."""
25+
pass

eval_protocol/adapters/braintrust.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,40 @@ def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score:
264264
except Exception as e:
265265
logger.warning("Failed to push scores to Braintrust: %s", e)
266266

267+
def upload_score(self, row: EvaluationRow, model_name: str) -> None:
268+
"""Upload evaluation score for a single row back to Braintrust.
269+
270+
Args:
271+
row: Single EvaluationRow with evaluation_result and session_data containing trace ID
272+
model_name: Name of the model (used as the score name in Braintrust)
273+
"""
274+
try:
275+
if (
276+
row.evaluation_result
277+
and row.evaluation_result.is_score_valid
278+
and row.input_metadata
279+
and row.input_metadata.session_data
280+
and "braintrust_trace_id" in row.input_metadata.session_data
281+
):
282+
headers = {
283+
"Authorization": f"Bearer {self.api_key}",
284+
"Content-Type": "application/json",
285+
}
286+
287+
trace_id = row.input_metadata.session_data["braintrust_trace_id"]
288+
if trace_id:
289+
feedback_items = [{"id": trace_id, "scores": {model_name: row.evaluation_result.score}}]
290+
291+
response = requests.post(
292+
f"{self.api_url}/v1/feedback",
293+
headers=headers,
294+
json={"feedback": feedback_items},
295+
timeout=30,
296+
)
297+
response.raise_for_status()
298+
except Exception as e:
299+
logger.warning("Failed to upload single score to Braintrust: %s", e)
300+
267301

268302
def create_braintrust_adapter(
269303
api_key: Optional[str] = None,

0 commit comments

Comments
 (0)