2323 print(summary.by_metric)
2424"""
2525
26+ from collections .abc import Callable
2627from typing import Optional
2728
2829from lightspeed_evaluation .core .models import (
2930 EvaluationData ,
3031 EvaluationResult ,
32+ EvaluationRunContext ,
3133 SystemConfig ,
3234 TurnData ,
3335)
3638from lightspeed_evaluation .pipeline .evaluation import EvaluationPipeline
3739
3840
39- def evaluate (
41+ def evaluate ( # pylint: disable=too-many-arguments
4042 config : SystemConfig ,
4143 data : list [EvaluationData ],
4244 output_dir : Optional [str ] = None ,
45+ * ,
46+ evaluation_data_path : Optional [str ] = None ,
47+ on_complete : Optional [
48+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
49+ ] = None ,
4350) -> list [EvaluationResult ]:
4451 """Run evaluation on the provided data using the given configuration.
4552
@@ -51,6 +58,12 @@ def evaluate(
5158 config: A pre-built SystemConfig instance.
5259 data: List of EvaluationData conversations to evaluate.
5360 output_dir: Optional override for the output directory.
61+ evaluation_data_path: Optional path to the evaluation data file; used
62+ for run naming and included in :class:`EvaluationRunContext` (e.g. for Langfuse).
63+ on_complete: Optional callback invoked after a successful run; it receives
64+ the list of results and an :class:`EvaluationRunContext`. See
65+ :mod:`lightspeed_evaluation.integrations.langfuse_reporter` for
66+ a Langfuse helper. Failures in the callback do not fail the run.
5467
5568 Returns:
5669 List of EvaluationResult objects (one per metric per turn/conversation).
@@ -61,16 +74,25 @@ def evaluate(
6174 loader = ConfigLoader .from_config (config )
6275 pipeline = EvaluationPipeline (loader , output_dir )
6376 try :
64- return pipeline .run_evaluation (data )
77+ return pipeline .run_evaluation (
78+ data ,
79+ original_data_path = evaluation_data_path ,
80+ on_complete = on_complete ,
81+ )
6582 finally :
6683 pipeline .close ()
6784
6885
69- def evaluate_with_summary (
86+ def evaluate_with_summary ( # pylint: disable=too-many-arguments
7087 config : SystemConfig ,
7188 data : list [EvaluationData ],
7289 output_dir : Optional [str ] = None ,
7390 compute_confidence_intervals : bool = False ,
91+ * ,
92+ evaluation_data_path : Optional [str ] = None ,
93+ on_complete : Optional [
94+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
95+ ] = None ,
7496) -> EvaluationSummary :
7597 """Run evaluation and return structured results with computed statistics.
7698
@@ -84,22 +106,35 @@ def evaluate_with_summary(
84106 output_dir: Optional override for the output directory.
85107 compute_confidence_intervals: Whether to compute bootstrap confidence
86108 intervals. Default False.
109+ evaluation_data_path: Same as for :func:`evaluate`.
110+ on_complete: Same as for :func:`evaluate`.
87111
88112 Returns:
89113 EvaluationSummary with results and computed statistics.
90114 """
91- results = evaluate (config , data , output_dir = output_dir )
115+ results = evaluate (
116+ config ,
117+ data ,
118+ output_dir = output_dir ,
119+ evaluation_data_path = evaluation_data_path ,
120+ on_complete = on_complete ,
121+ )
92122 return EvaluationSummary .from_results (
93123 results ,
94124 evaluation_data = data if data else None ,
95125 compute_confidence_intervals = compute_confidence_intervals ,
96126 )
97127
98128
99- def evaluate_conversation (
129+ def evaluate_conversation ( # pylint: disable=too-many-arguments
100130 config : SystemConfig ,
101131 data : EvaluationData ,
102132 output_dir : Optional [str ] = None ,
133+ * ,
134+ evaluation_data_path : Optional [str ] = None ,
135+ on_complete : Optional [
136+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
137+ ] = None ,
103138) -> list [EvaluationResult ]:
104139 """Evaluate a single conversation group.
105140
@@ -109,18 +144,31 @@ def evaluate_conversation(
109144 config: A pre-built SystemConfig instance.
110145 data: A single EvaluationData conversation to evaluate.
111146 output_dir: Optional override for the output directory.
147+ evaluation_data_path: Same as for :func:`evaluate`.
148+ on_complete: Same as for :func:`evaluate`.
112149
113150 Returns:
114151 List of EvaluationResult objects.
115152 """
116- return evaluate (config , [data ], output_dir = output_dir )
153+ return evaluate (
154+ config ,
155+ [data ],
156+ output_dir = output_dir ,
157+ evaluation_data_path = evaluation_data_path ,
158+ on_complete = on_complete ,
159+ )
117160
118161
119- def evaluate_conversation_with_summary (
162+ def evaluate_conversation_with_summary ( # pylint: disable=too-many-arguments
120163 config : SystemConfig ,
121164 data : EvaluationData ,
122165 output_dir : Optional [str ] = None ,
123166 compute_confidence_intervals : bool = False ,
167+ * ,
168+ evaluation_data_path : Optional [str ] = None ,
169+ on_complete : Optional [
170+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
171+ ] = None ,
124172) -> EvaluationSummary :
125173 """Evaluate a single conversation and return structured results.
126174
@@ -132,6 +180,8 @@ def evaluate_conversation_with_summary(
132180 output_dir: Optional override for the output directory.
133181 compute_confidence_intervals: Whether to compute bootstrap confidence
134182 intervals. Default False.
183+ evaluation_data_path: Same as for :func:`evaluate`.
184+ on_complete: Same as for :func:`evaluate`.
135185
136186 Returns:
137187 EvaluationSummary with results and computed statistics.
@@ -141,15 +191,22 @@ def evaluate_conversation_with_summary(
141191 [data ],
142192 output_dir = output_dir ,
143193 compute_confidence_intervals = compute_confidence_intervals ,
194+ evaluation_data_path = evaluation_data_path ,
195+ on_complete = on_complete ,
144196 )
145197
146198
147- def evaluate_turn (
199+ def evaluate_turn ( # pylint: disable=too-many-arguments
148200 config : SystemConfig ,
149201 turn : TurnData ,
150202 metrics : Optional [list [str ]] = None ,
151203 conversation_group_id : str = "programmatic_eval" ,
152204 output_dir : Optional [str ] = None ,
205+ * ,
206+ evaluation_data_path : Optional [str ] = None ,
207+ on_complete : Optional [
208+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
209+ ] = None ,
153210) -> list [EvaluationResult ]:
154211 """Evaluate a single turn.
155212
@@ -163,6 +220,8 @@ def evaluate_turn(
163220 metrics: Optional list of metric identifiers to override turn_metrics.
164221 conversation_group_id: Conversation group ID for the wrapper.
165222 output_dir: Optional override for the output directory.
223+ evaluation_data_path: Same as for :func:`evaluate`.
224+ on_complete: Same as for :func:`evaluate`.
166225
167226 Returns:
168227 List of EvaluationResult objects.
@@ -174,15 +233,26 @@ def evaluate_turn(
174233 conversation_group_id = conversation_group_id ,
175234 turns = [turn ],
176235 )
177- return evaluate (config , [data ], output_dir = output_dir )
236+ return evaluate (
237+ config ,
238+ [data ],
239+ output_dir = output_dir ,
240+ evaluation_data_path = evaluation_data_path ,
241+ on_complete = on_complete ,
242+ )
178243
179244
180- def evaluate_turn_with_summary (
245+ def evaluate_turn_with_summary ( # pylint: disable=too-many-arguments
181246 config : SystemConfig ,
182247 turn : TurnData ,
183248 metrics : Optional [list [str ]] = None ,
184249 conversation_group_id : str = "programmatic_eval" ,
185250 output_dir : Optional [str ] = None ,
251+ * ,
252+ evaluation_data_path : Optional [str ] = None ,
253+ on_complete : Optional [
254+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
255+ ] = None ,
186256) -> EvaluationSummary :
187257 """Evaluate a single turn and return structured results.
188258
@@ -194,6 +264,8 @@ def evaluate_turn_with_summary(
194264 metrics: Optional list of metric identifiers to override turn_metrics.
195265 conversation_group_id: Conversation group ID for the wrapper.
196266 output_dir: Optional override for the output directory.
267+ evaluation_data_path: Same as for :func:`evaluate`.
268+ on_complete: Same as for :func:`evaluate`.
197269
198270 Returns:
199271 EvaluationSummary with results and computed statistics.
@@ -210,4 +282,6 @@ def evaluate_turn_with_summary(
210282 [data ],
211283 output_dir = output_dir ,
212284 compute_confidence_intervals = False ,
285+ evaluation_data_path = evaluation_data_path ,
286+ on_complete = on_complete ,
213287 )
0 commit comments