2323 print(summary.by_metric)
2424"""
2525
26+ from collections .abc import Callable
2627from typing import Optional
2728
2829from lightspeed_evaluation .core .models import (
2930 EvaluationData ,
3031 EvaluationResult ,
32+ EvaluationRunContext ,
3133 SystemConfig ,
3234 TurnData ,
3335)
3638from lightspeed_evaluation .pipeline .evaluation import EvaluationPipeline
3739
3840
39- def evaluate (
41+ def _on_complete_with_optional_storage_langfuse (
42+ config : SystemConfig ,
43+ on_complete : Optional [
44+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
45+ ],
46+ ) -> Optional [Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]]:
47+ """Respect an explicit callback; otherwise attach Langfuse when configured in storage."""
48+ if on_complete is not None :
49+ return on_complete
50+ from lightspeed_evaluation .integrations .langfuse_reporter import ( # pylint: disable=import-outside-toplevel
51+ build_langfuse_on_complete_from_storage_configs ,
52+ )
53+
54+ return build_langfuse_on_complete_from_storage_configs (config .storage )
55+
56+
57+ def evaluate ( # pylint: disable=too-many-arguments
4058 config : SystemConfig ,
4159 data : list [EvaluationData ],
4260 output_dir : Optional [str ] = None ,
61+ * ,
62+ evaluation_data_path : Optional [str ] = None ,
63+ on_complete : Optional [
64+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
65+ ] = None ,
4366) -> list [EvaluationResult ]:
4467 """Run evaluation on the provided data using the given configuration.
4568
@@ -51,6 +74,14 @@ def evaluate(
5174 config: A pre-built SystemConfig instance.
5275 data: List of EvaluationData conversations to evaluate.
5376 output_dir: Optional override for the output directory.
77+ evaluation_data_path: Optional path to the evaluation data file, used
78+ for run naming and in :class:`EvaluationRunContext` (e.g. Langfuse).
79+ on_complete: Optional callback after a successful run; receives results
80+ and an :class:`EvaluationRunContext`. See
81+ :mod:`lightspeed_evaluation.integrations.langfuse_reporter` for
82+ a Langfuse helper. If omitted and ``config.storage`` contains
83+ ``type: langfuse`` (with required ``host``), a Langfuse export callback
84+ is attached automatically. Failures in the callback do not fail the run.
5485
5586 Returns:
5687 List of EvaluationResult objects (one per metric per turn/conversation).
@@ -61,16 +92,28 @@ def evaluate(
6192 loader = ConfigLoader .from_config (config )
6293 pipeline = EvaluationPipeline (loader , output_dir )
6394 try :
64- return pipeline .run_evaluation (data )
95+ effective_on_complete = _on_complete_with_optional_storage_langfuse (
96+ config , on_complete
97+ )
98+ return pipeline .run_evaluation (
99+ data ,
100+ original_data_path = evaluation_data_path ,
101+ on_complete = effective_on_complete ,
102+ )
65103 finally :
66104 pipeline .close ()
67105
68106
def evaluate_with_summary(  # pylint: disable=too-many-arguments
    config: SystemConfig,
    data: list[EvaluationData],
    output_dir: Optional[str] = None,
    compute_confidence_intervals: bool = False,
    *,
    evaluation_data_path: Optional[str] = None,
    on_complete: Optional[
        Callable[[list[EvaluationResult], EvaluationRunContext], None]
    ] = None,
) -> EvaluationSummary:
    """Run evaluation and return structured results with computed statistics.

    Delegates the run to :func:`evaluate` and wraps the returned results in an
    :class:`EvaluationSummary`.

    Args:
        config: A pre-built SystemConfig instance.
        data: List of EvaluationData conversations to evaluate.
        output_dir: Optional override for the output directory.
        compute_confidence_intervals: Whether to compute bootstrap confidence
            intervals. Default False.
        evaluation_data_path: Same as for :func:`evaluate`.
        on_complete: Same as for :func:`evaluate`.

    Returns:
        EvaluationSummary with results and computed statistics.
    """
    run_results = evaluate(
        config,
        data,
        output_dir=output_dir,
        evaluation_data_path=evaluation_data_path,
        on_complete=on_complete,
    )
    # An empty ``data`` list is passed to the summary as ``None``.
    summary_input = data or None
    return EvaluationSummary.from_results(
        run_results,
        evaluation_data=summary_input,
        compute_confidence_intervals=compute_confidence_intervals,
    )
97148
98149
def evaluate_conversation(  # pylint: disable=too-many-arguments
    config: SystemConfig,
    data: EvaluationData,
    output_dir: Optional[str] = None,
    *,
    evaluation_data_path: Optional[str] = None,
    on_complete: Optional[
        Callable[[list[EvaluationResult], EvaluationRunContext], None]
    ] = None,
) -> list[EvaluationResult]:
    """Evaluate a single conversation group.

    Wraps ``data`` in a one-element list and forwards everything to
    :func:`evaluate`.

    Args:
        config: A pre-built SystemConfig instance.
        data: A single EvaluationData conversation to evaluate.
        output_dir: Optional override for the output directory.
        evaluation_data_path: Same as for :func:`evaluate`.
        on_complete: Same as for :func:`evaluate`.

    Returns:
        List of EvaluationResult objects.
    """
    single_conversation = [data]
    return evaluate(
        config,
        single_conversation,
        output_dir=output_dir,
        evaluation_data_path=evaluation_data_path,
        on_complete=on_complete,
    )
117181
118182
119- def evaluate_conversation_with_summary (
183+ def evaluate_conversation_with_summary ( # pylint: disable=too-many-arguments
120184 config : SystemConfig ,
121185 data : EvaluationData ,
122186 output_dir : Optional [str ] = None ,
123187 compute_confidence_intervals : bool = False ,
188+ * ,
189+ evaluation_data_path : Optional [str ] = None ,
190+ on_complete : Optional [
191+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
192+ ] = None ,
124193) -> EvaluationSummary :
125194 """Evaluate a single conversation and return structured results.
126195
@@ -132,6 +201,8 @@ def evaluate_conversation_with_summary(
132201 output_dir: Optional override for the output directory.
133202 compute_confidence_intervals: Whether to compute bootstrap confidence
134203 intervals. Default False.
204+ evaluation_data_path: Same as for :func:`evaluate`.
205+ on_complete: Same as for :func:`evaluate`.
135206
136207 Returns:
137208 EvaluationSummary with results and computed statistics.
@@ -141,15 +212,22 @@ def evaluate_conversation_with_summary(
141212 [data ],
142213 output_dir = output_dir ,
143214 compute_confidence_intervals = compute_confidence_intervals ,
215+ evaluation_data_path = evaluation_data_path ,
216+ on_complete = on_complete ,
144217 )
145218
146219
147- def evaluate_turn (
220+ def evaluate_turn ( # pylint: disable=too-many-arguments
148221 config : SystemConfig ,
149222 turn : TurnData ,
150223 metrics : Optional [list [str ]] = None ,
151224 conversation_group_id : str = "programmatic_eval" ,
152225 output_dir : Optional [str ] = None ,
226+ * ,
227+ evaluation_data_path : Optional [str ] = None ,
228+ on_complete : Optional [
229+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
230+ ] = None ,
153231) -> list [EvaluationResult ]:
154232 """Evaluate a single turn.
155233
@@ -163,6 +241,8 @@ def evaluate_turn(
163241 metrics: Optional list of metric identifiers to override turn_metrics.
164242 conversation_group_id: Conversation group ID for the wrapper.
165243 output_dir: Optional override for the output directory.
244+ evaluation_data_path: Same as for :func:`evaluate`.
245+ on_complete: Same as for :func:`evaluate`.
166246
167247 Returns:
168248 List of EvaluationResult objects.
@@ -174,15 +254,26 @@ def evaluate_turn(
174254 conversation_group_id = conversation_group_id ,
175255 turns = [turn ],
176256 )
177- return evaluate (config , [data ], output_dir = output_dir )
257+ return evaluate (
258+ config ,
259+ [data ],
260+ output_dir = output_dir ,
261+ evaluation_data_path = evaluation_data_path ,
262+ on_complete = on_complete ,
263+ )
178264
179265
180- def evaluate_turn_with_summary (
266+ def evaluate_turn_with_summary ( # pylint: disable=too-many-arguments
181267 config : SystemConfig ,
182268 turn : TurnData ,
183269 metrics : Optional [list [str ]] = None ,
184270 conversation_group_id : str = "programmatic_eval" ,
185271 output_dir : Optional [str ] = None ,
272+ * ,
273+ evaluation_data_path : Optional [str ] = None ,
274+ on_complete : Optional [
275+ Callable [[list [EvaluationResult ], EvaluationRunContext ], None ]
276+ ] = None ,
186277) -> EvaluationSummary :
187278 """Evaluate a single turn and return structured results.
188279
@@ -194,6 +285,8 @@ def evaluate_turn_with_summary(
194285 metrics: Optional list of metric identifiers to override turn_metrics.
195286 conversation_group_id: Conversation group ID for the wrapper.
196287 output_dir: Optional override for the output directory.
288+ evaluation_data_path: Same as for :func:`evaluate`.
289+ on_complete: Same as for :func:`evaluate`.
197290
198291 Returns:
199292 EvaluationSummary with results and computed statistics.
@@ -210,4 +303,6 @@ def evaluate_turn_with_summary(
210303 [data ],
211304 output_dir = output_dir ,
212305 compute_confidence_intervals = False ,
306+ evaluation_data_path = evaluation_data_path ,
307+ on_complete = on_complete ,
213308 )
0 commit comments