Skip to content

Commit 57f259f

Browse files
authored
Change scoring mechanism and change to pointwise (#193)
* Change scoring mechanism and change to pointwise * update * update
1 parent 446fbb1 commit 57f259f

18 files changed

Lines changed: 595 additions & 308 deletions

eval_protocol/__init__.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,19 +37,19 @@
3737
from .resources import create_llm_resource
3838
from .reward_function import RewardFunction
3939
from .typed_interface import reward_function
40-
from .quickstart import aha_judge, split_multi_turn_rows
40+
from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
4141
from .pytest import evaluation_test, SingleTurnRolloutProcessor
4242
from .pytest.parameterize import DefaultParameterIdGenerator
4343

4444
from .adapters import OpenAIResponsesAdapter
4545

4646
try:
47-
from .adapters import LangfuseAdapter
47+
from .adapters import LangfuseAdapter, create_langfuse_adapter
4848
except ImportError:
4949
LangfuseAdapter = None
5050

5151
try:
52-
from .adapters import BraintrustAdapter
52+
from .adapters import BraintrustAdapter, create_braintrust_adapter
5353
except ImportError:
5454
BraintrustAdapter = None
5555

@@ -64,12 +64,15 @@
6464
__all__ = [
6565
"DefaultParameterIdGenerator",
6666
"aha_judge",
67-
"split_multi_turn_rows",
67+
"multi_turn_assistant_to_ground_truth",
68+
"assistant_to_ground_truth",
6869
"evaluation_test",
6970
"SingleTurnRolloutProcessor",
7071
"OpenAIResponsesAdapter",
7172
"LangfuseAdapter",
73+
"create_langfuse_adapter",
7274
"BraintrustAdapter",
75+
"create_braintrust_adapter",
7376
"LangSmithAdapter",
7477
# Core interfaces
7578
"Message",

eval_protocol/adapters/base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,7 @@ def get_evaluation_rows(self, *args, **kwargs) -> List[EvaluationRow]:
1919
def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
2020
"""Upload evaluation scores back to the data source for tracking and analysis."""
2121
pass
22+
23+
def upload_score(self, row: EvaluationRow, model_name: str) -> None:
24+
"""Upload evaluation score for a single row back to the data source."""
25+
pass

eval_protocol/adapters/braintrust.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,40 @@ def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score:
264264
except Exception as e:
265265
logger.warning("Failed to push scores to Braintrust: %s", e)
266266

267+
def upload_score(self, row: EvaluationRow, model_name: str) -> None:
268+
"""Upload evaluation score for a single row back to Braintrust.
269+
270+
Args:
271+
row: Single EvaluationRow with evaluation_result and session_data containing trace ID
272+
model_name: Name of the model (used as the score name in Braintrust)
273+
"""
274+
try:
275+
if (
276+
row.evaluation_result
277+
and row.evaluation_result.is_score_valid
278+
and row.input_metadata
279+
and row.input_metadata.session_data
280+
and "braintrust_trace_id" in row.input_metadata.session_data
281+
):
282+
headers = {
283+
"Authorization": f"Bearer {self.api_key}",
284+
"Content-Type": "application/json",
285+
}
286+
287+
trace_id = row.input_metadata.session_data["braintrust_trace_id"]
288+
if trace_id:
289+
feedback_items = [{"id": trace_id, "scores": {model_name: row.evaluation_result.score}}]
290+
291+
response = requests.post(
292+
f"{self.api_url}/v1/feedback",
293+
headers=headers,
294+
json={"feedback": feedback_items},
295+
timeout=30,
296+
)
297+
response.raise_for_status()
298+
except Exception as e:
299+
logger.warning("Failed to upload single score to Braintrust: %s", e)
300+
267301

268302
def create_braintrust_adapter(
269303
api_key: Optional[str] = None,

eval_protocol/adapters/langfuse.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -445,9 +445,6 @@ def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score:
445445
rows: List of EvaluationRow objects with session_data containing trace IDs
446446
model_name: Name of the model (used as the score name in Langfuse)
447447
mean_score: The calculated mean score to push to Langfuse
448-
449-
Note:
450-
Silently handles errors if rows lack session data
451448
"""
452449
try:
453450
for trace_id in set(
@@ -464,6 +461,31 @@ def upload_scores(self, rows: List[EvaluationRow], model_name: str, mean_score:
464461
except Exception as e:
465462
logger.warning("Failed to push scores to Langfuse: %s", e)
466463

464+
def upload_score(self, row: EvaluationRow, model_name: str) -> None:
465+
"""Upload evaluation score for a single row back to Langfuse.
466+
467+
Args:
468+
row: Single EvaluationRow with evaluation_result and session_data containing trace ID
469+
model_name: Name of the model (used as the score name in Langfuse)
470+
"""
471+
try:
472+
if (
473+
row.evaluation_result
474+
and row.evaluation_result.is_score_valid
475+
and row.input_metadata
476+
and row.input_metadata.session_data
477+
and "langfuse_trace_id" in row.input_metadata.session_data
478+
):
479+
trace_id = row.input_metadata.session_data["langfuse_trace_id"]
480+
if trace_id:
481+
self.client.create_score(
482+
trace_id=trace_id,
483+
name=model_name,
484+
value=row.evaluation_result.score,
485+
)
486+
except Exception as e:
487+
logger.warning("Failed to push score to Langfuse: %s", e)
488+
467489

468490
def create_langfuse_adapter() -> LangfuseAdapter:
469491
"""Factory function to create a Langfuse adapter."""

eval_protocol/dataset_logger/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,19 @@ def read(self, rollout_id=None):
2222

2323
# Lazy property that creates the logger only when accessed
2424
class _LazyLogger(DatasetLogger):
25+
def __init__(self):
26+
self._logger: DatasetLogger | None = None
27+
28+
def _get_logger(self):
29+
if self._logger is None:
30+
self._logger = _get_default_logger()
31+
return self._logger
32+
2533
def log(self, row):
26-
return _get_default_logger().log(row)
34+
return self._get_logger().log(row)
2735

2836
def read(self, rollout_id=None):
29-
return _get_default_logger().read(rollout_id)
37+
return self._get_logger().read(rollout_id)
3038

3139

3240
default_logger: DatasetLogger = _LazyLogger()

eval_protocol/dataset_logger/sqlite_evaluation_row_store.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ class SqliteEvaluationRowStore:
1717
def __init__(self, db_path: str):
1818
os.makedirs(os.path.dirname(db_path), exist_ok=True)
1919
self._db_path = db_path
20-
self._db = SqliteDatabase(self._db_path)
20+
self._db = SqliteDatabase(self._db_path, pragmas={"journal_mode": "wal"})
2121

2222
class BaseModel(Model):
2323
class Meta:
@@ -41,10 +41,12 @@ def upsert_row(self, data: dict) -> None:
4141
rollout_id = data["execution_metadata"]["rollout_id"]
4242
if rollout_id is None:
4343
raise ValueError("execution_metadata.rollout_id is required to upsert a row")
44-
if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
45-
self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
46-
else:
47-
self._EvaluationRow.create(rollout_id=rollout_id, data=data)
44+
45+
with self._db.atomic("EXCLUSIVE"):
46+
if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
47+
self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
48+
else:
49+
self._EvaluationRow.create(rollout_id=rollout_id, data=data)
4850

4951
def read_rows(self, rollout_id: Optional[str] = None) -> List[dict]:
5052
if rollout_id is None:

eval_protocol/pytest/evaluation_test_postprocess.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from eval_protocol.models import CompletionParams, EvaluationRow, EvaluationThreshold
1111
from eval_protocol.pytest.handle_persist_flow import handle_persist_flow
1212
from eval_protocol.pytest.types import EvaluationTestMode
13-
from eval_protocol.pytest.utils import AggregationMethod, aggregate, extract_effort_tag, sanitize_filename # pyright: ignore[reportUnknownVariableType]
13+
from eval_protocol.pytest.utils import AggregationMethod, aggregate, extract_effort_tag, sanitize_filename
1414
from eval_protocol.stats.confidence_intervals import compute_fixed_set_mu_ci
1515

1616

@@ -25,9 +25,18 @@ def postprocess(
2525
num_runs: int,
2626
experiment_duration_seconds: float,
2727
):
28-
scores = [
29-
sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result) for result in all_results
28+
valid_results = [
29+
[r for r in result if r.evaluation_result and r.evaluation_result.is_score_valid] for result in all_results
3030
]
31+
32+
if aggregation_method == "bootstrap":
33+
scores = [r.evaluation_result.score for result in valid_results for r in result if r.evaluation_result]
34+
else:
35+
scores = [
36+
sum(r.evaluation_result.score for r in result if r.evaluation_result) / len(result)
37+
for result in valid_results
38+
if result
39+
]
3140
agg_score = aggregate(scores, aggregation_method)
3241

3342
# Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)

eval_protocol/pytest/handle_persist_flow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
7171
row_data["evals"] = {"score": 0}
7272
row_data["eval_details"] = {
7373
"score": 0,
74-
"is_score_valid": True,
74+
"is_score_valid": False,
7575
"reason": "No evaluation result",
7676
"metrics": {},
7777
}

eval_protocol/pytest/utils.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,36 @@
2727

2828
import logging
2929
import json
30+
import pandas as pd
3031

3132

32-
AggregationMethod = Literal["mean", "max", "min"]
33+
AggregationMethod = Literal["mean", "max", "min", "bootstrap"]
34+
35+
36+
def calculate_bootstrap_scores(all_scores: list[float]) -> float:
37+
"""
38+
Calculate bootstrap confidence intervals for individual scores.
39+
40+
Args:
41+
all_scores: List of individual scores from all rows
42+
43+
Returns:
44+
Mean bootstrap score
45+
"""
46+
if not all_scores:
47+
return 0.0
48+
49+
# Create DataFrame (single column of scores)
50+
battles = pd.DataFrame({"score": all_scores})
51+
52+
# Bootstrap sampling for calculating relative performance
53+
bootstrap_means = [battles.sample(frac=1.0, replace=True)["score"].mean() for _ in range(100)]
54+
55+
# Calculate final scores
56+
bootstraps = pd.Series(bootstrap_means)
57+
mean_score = bootstraps.mean()
58+
59+
return float(mean_score)
3360

3461

3562
def aggregate(scores: list[float], method: AggregationMethod) -> float:
@@ -41,7 +68,8 @@ def aggregate(scores: list[float], method: AggregationMethod) -> float:
4168
return max(scores)
4269
if method == "min":
4370
return min(scores)
44-
raise ValueError(f"Unknown aggregation method: {method}") # pyright: ignore[reportUnreachable]
71+
if method == "bootstrap":
72+
return calculate_bootstrap_scores(scores)
4573

4674

4775
def log_eval_status_and_rows(
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from .llm_judge import aha_judge
2-
from .utils import split_multi_turn_rows
2+
from .utils import multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
33

4-
__all__ = ["aha_judge"]
4+
__all__ = ["aha_judge", "multi_turn_assistant_to_ground_truth", "assistant_to_ground_truth"]

0 commit comments

Comments
 (0)