Skip to content

Commit 2d25fa3

Browse files
committed
Added langfuse compatibility with the lightspeed-eval
1 parent 07cee27 commit 2d25fa3

24 files changed

Lines changed: 1239 additions & 22 deletions

File tree

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,9 @@ export AZURE_API_BASE="https://your-resource.openai.azure.com/"
492492
export API_KEY="your-api-endpoint-key"
493493
```
494494

495+
#### Optional: Langfuse
496+
After a run, you can send one trace with per-metric scores to [Langfuse](https://langfuse.com/). Install `lightspeed-evaluation[langfuse]` and set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY`. To enable the export, add an entry `- type: "langfuse"` under `storage:` in `system.yaml` with a required **`host`** (the API base URL, e.g. `https://cloud.langfuse.com`); that host is always used for the client, and `LANGFUSE_HOST` is not read for this entry. Optional `public_key` / `secret_key` fields on that entry override the environment variables. From Python, you can either rely on the same `storage` list on `SystemConfig` or pass `on_complete=build_langfuse_on_complete_callback()`, imported from `lightspeed_evaluation.integrations.langfuse_reporter`.
497+
495498
## 📈 Output & Visualization
496499

497500
### Generated Reports

config/system.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,10 @@ storage:
301301
# database: "./eval_results.db"
302302
# table_name: "evaluation_results"
303303

304+
# Langfuse backend (optional) - after a run, exports one trace with per-metric scores to Langfuse
305+
# - type: "langfuse"
306+
# host: "https://cloud.langfuse.com"
307+
304308
# Visualization settings
305309
visualization:
306310
figsize: [12, 8] # Graph size (width, height)

pyproject.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,18 @@ nlp-metrics = [
5252
"rapidfuzz>=3.0.0,<=3.14.3", # Required for semantic_similarity_distance
5353
]
5454

55+
# Optional Langfuse reporting. Uses the v2 SDK.
56+
# pip install 'lightspeed-evaluation[langfuse]'
57+
# or
58+
# uv sync --extra langfuse
59+
langfuse = [
60+
"langfuse>=2.0.0,<3.0.0",
61+
]
62+
5563
[dependency-groups]
5664
dev = [
65+
# Matches [project.optional-dependencies] langfuse — for typecheck/tests.
66+
"langfuse>=2.0.0,<3.0.0",
5767
"bandit>=1.7.0,<=1.9.2",
5868
"black==25.1.0",
5969
"mypy>=1.15.0,<=1.17.1",

requirements-all-extras.txt

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ annotated-types==0.7.0
2020
anyio==4.13.0
2121
# via
2222
# httpx
23+
# langfuse
2324
# openai
2425
appdirs==1.4.4
2526
# via ragas
@@ -29,7 +30,9 @@ attrs==26.1.0
2930
# jsonschema
3031
# referencing
3132
backoff==2.2.1
32-
# via posthog
33+
# via
34+
# langfuse
35+
# posthog
3336
certifi==2026.4.22
3437
# via
3538
# httpcore
@@ -134,6 +137,7 @@ httpcore==1.0.9
134137
httpx==0.28.1
135138
# via
136139
# huggingface-hub
140+
# langfuse
137141
# langgraph-sdk
138142
# langsmith
139143
# lightspeed-evaluation
@@ -152,6 +156,7 @@ idna==3.14
152156
# via
153157
# anyio
154158
# httpx
159+
# langfuse
155160
# requests
156161
# yarl
157162
importlib-metadata==8.5.0
@@ -217,6 +222,8 @@ langchain-protocol==0.0.15
217222
# via langchain-core
218223
langchain-text-splitters==1.1.2
219224
# via langchain-classic
225+
langfuse==2.60.10
226+
# via lightspeed-evaluation
220227
langgraph==1.1.10
221228
# via langchain
222229
langgraph-checkpoint==4.1.0
@@ -312,6 +319,7 @@ packaging==26.2
312319
# datasets
313320
# huggingface-hub
314321
# langchain-core
322+
# langfuse
315323
# langsmith
316324
# marshmallow
317325
# matplotlib
@@ -367,6 +375,7 @@ pydantic==2.12.5
367375
# langchain-classic
368376
# langchain-core
369377
# langchain-google-genai
378+
# langfuse
370379
# langgraph
371380
# langsmith
372381
# lightspeed-evaluation
@@ -450,6 +459,7 @@ requests==2.34.0
450459
# instructor
451460
# langchain-classic
452461
# langchain-community
462+
# langfuse
453463
# langsmith
454464
# posthog
455465
# requests-toolbelt
@@ -590,6 +600,8 @@ uuid-utils==0.15.0
590600
# langsmith
591601
wheel==0.47.0
592602
# via deepeval
603+
wrapt==1.17.3
604+
# via langfuse
593605
xxhash==3.7.0
594606
# via
595607
# datasets

src/lightspeed_evaluation/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
APIConfig,
2727
EvaluationData,
2828
EvaluationResult,
29+
EvaluationRunContext,
2930
LLMConfig,
3031
LoggingConfig,
3132
TurnData,
@@ -80,6 +81,10 @@
8081
"EvaluationData": ("lightspeed_evaluation.core.models", "EvaluationData"),
8182
"TurnData": ("lightspeed_evaluation.core.models", "TurnData"),
8283
"EvaluationResult": ("lightspeed_evaluation.core.models", "EvaluationResult"),
84+
"EvaluationRunContext": (
85+
"lightspeed_evaluation.core.models",
86+
"EvaluationRunContext",
87+
),
8388
"EvaluationSummary": (
8489
"lightspeed_evaluation.core.models.summary",
8590
"EvaluationSummary",

src/lightspeed_evaluation/api.py

Lines changed: 105 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@
2323
print(summary.by_metric)
2424
"""
2525

26+
from collections.abc import Callable
2627
from typing import Optional
2728

2829
from lightspeed_evaluation.core.models import (
2930
EvaluationData,
3031
EvaluationResult,
32+
EvaluationRunContext,
3133
SystemConfig,
3234
TurnData,
3335
)
@@ -36,10 +38,31 @@
3638
from lightspeed_evaluation.pipeline.evaluation import EvaluationPipeline
3739

3840

39-
def evaluate(
41+
def _on_complete_with_optional_storage_langfuse(
42+
config: SystemConfig,
43+
on_complete: Optional[
44+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
45+
],
46+
) -> Optional[Callable[[list[EvaluationResult], EvaluationRunContext], None]]:
47+
"""Respect an explicit callback; otherwise attach Langfuse when configured in storage."""
48+
if on_complete is not None:
49+
return on_complete
50+
from lightspeed_evaluation.integrations.langfuse_reporter import ( # pylint: disable=import-outside-toplevel
51+
build_langfuse_on_complete_from_storage_configs,
52+
)
53+
54+
return build_langfuse_on_complete_from_storage_configs(config.storage)
55+
56+
57+
def evaluate( # pylint: disable=too-many-arguments
4058
config: SystemConfig,
4159
data: list[EvaluationData],
4260
output_dir: Optional[str] = None,
61+
*,
62+
evaluation_data_path: Optional[str] = None,
63+
on_complete: Optional[
64+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
65+
] = None,
4366
) -> list[EvaluationResult]:
4467
"""Run evaluation on the provided data using the given configuration.
4568
@@ -51,6 +74,14 @@ def evaluate(
5174
config: A pre-built SystemConfig instance.
5275
data: List of EvaluationData conversations to evaluate.
5376
output_dir: Optional override for the output directory.
77+
evaluation_data_path: Optional path to the evaluation data file, used
78+
for run naming and in :class:`EvaluationRunContext` (e.g. Langfuse).
79+
on_complete: Optional callback after a successful run; receives results
80+
and an :class:`EvaluationRunContext`. See
81+
:mod:`lightspeed_evaluation.integrations.langfuse_reporter` for
82+
a Langfuse helper. If omitted and ``config.storage`` contains
83+
``type: langfuse`` (with required ``host``), a Langfuse export callback
84+
is attached automatically. Failures in the callback do not fail the run.
5485
5586
Returns:
5687
List of EvaluationResult objects (one per metric per turn/conversation).
@@ -61,16 +92,28 @@ def evaluate(
6192
loader = ConfigLoader.from_config(config)
6293
pipeline = EvaluationPipeline(loader, output_dir)
6394
try:
64-
return pipeline.run_evaluation(data)
95+
effective_on_complete = _on_complete_with_optional_storage_langfuse(
96+
config, on_complete
97+
)
98+
return pipeline.run_evaluation(
99+
data,
100+
original_data_path=evaluation_data_path,
101+
on_complete=effective_on_complete,
102+
)
65103
finally:
66104
pipeline.close()
67105

68106

69-
def evaluate_with_summary(
107+
def evaluate_with_summary( # pylint: disable=too-many-arguments
70108
config: SystemConfig,
71109
data: list[EvaluationData],
72110
output_dir: Optional[str] = None,
73111
compute_confidence_intervals: bool = False,
112+
*,
113+
evaluation_data_path: Optional[str] = None,
114+
on_complete: Optional[
115+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
116+
] = None,
74117
) -> EvaluationSummary:
75118
"""Run evaluation and return structured results with computed statistics.
76119
@@ -84,22 +127,35 @@ def evaluate_with_summary(
84127
output_dir: Optional override for the output directory.
85128
compute_confidence_intervals: Whether to compute bootstrap confidence
86129
intervals. Default False.
130+
evaluation_data_path: Same as for :func:`evaluate`.
131+
on_complete: Same as for :func:`evaluate`.
87132
88133
Returns:
89134
EvaluationSummary with results and computed statistics.
90135
"""
91-
results = evaluate(config, data, output_dir=output_dir)
136+
results = evaluate(
137+
config,
138+
data,
139+
output_dir=output_dir,
140+
evaluation_data_path=evaluation_data_path,
141+
on_complete=on_complete,
142+
)
92143
return EvaluationSummary.from_results(
93144
results,
94145
evaluation_data=data if data else None,
95146
compute_confidence_intervals=compute_confidence_intervals,
96147
)
97148

98149

99-
def evaluate_conversation(
150+
def evaluate_conversation( # pylint: disable=too-many-arguments
100151
config: SystemConfig,
101152
data: EvaluationData,
102153
output_dir: Optional[str] = None,
154+
*,
155+
evaluation_data_path: Optional[str] = None,
156+
on_complete: Optional[
157+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
158+
] = None,
103159
) -> list[EvaluationResult]:
104160
"""Evaluate a single conversation group.
105161
@@ -109,18 +165,31 @@ def evaluate_conversation(
109165
config: A pre-built SystemConfig instance.
110166
data: A single EvaluationData conversation to evaluate.
111167
output_dir: Optional override for the output directory.
168+
evaluation_data_path: Same as for :func:`evaluate`.
169+
on_complete: Same as for :func:`evaluate`.
112170
113171
Returns:
114172
List of EvaluationResult objects.
115173
"""
116-
return evaluate(config, [data], output_dir=output_dir)
174+
return evaluate(
175+
config,
176+
[data],
177+
output_dir=output_dir,
178+
evaluation_data_path=evaluation_data_path,
179+
on_complete=on_complete,
180+
)
117181

118182

119-
def evaluate_conversation_with_summary(
183+
def evaluate_conversation_with_summary( # pylint: disable=too-many-arguments
120184
config: SystemConfig,
121185
data: EvaluationData,
122186
output_dir: Optional[str] = None,
123187
compute_confidence_intervals: bool = False,
188+
*,
189+
evaluation_data_path: Optional[str] = None,
190+
on_complete: Optional[
191+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
192+
] = None,
124193
) -> EvaluationSummary:
125194
"""Evaluate a single conversation and return structured results.
126195
@@ -132,6 +201,8 @@ def evaluate_conversation_with_summary(
132201
output_dir: Optional override for the output directory.
133202
compute_confidence_intervals: Whether to compute bootstrap confidence
134203
intervals. Default False.
204+
evaluation_data_path: Same as for :func:`evaluate`.
205+
on_complete: Same as for :func:`evaluate`.
135206
136207
Returns:
137208
EvaluationSummary with results and computed statistics.
@@ -141,15 +212,22 @@ def evaluate_conversation_with_summary(
141212
[data],
142213
output_dir=output_dir,
143214
compute_confidence_intervals=compute_confidence_intervals,
215+
evaluation_data_path=evaluation_data_path,
216+
on_complete=on_complete,
144217
)
145218

146219

147-
def evaluate_turn(
220+
def evaluate_turn( # pylint: disable=too-many-arguments
148221
config: SystemConfig,
149222
turn: TurnData,
150223
metrics: Optional[list[str]] = None,
151224
conversation_group_id: str = "programmatic_eval",
152225
output_dir: Optional[str] = None,
226+
*,
227+
evaluation_data_path: Optional[str] = None,
228+
on_complete: Optional[
229+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
230+
] = None,
153231
) -> list[EvaluationResult]:
154232
"""Evaluate a single turn.
155233
@@ -163,6 +241,8 @@ def evaluate_turn(
163241
metrics: Optional list of metric identifiers to override turn_metrics.
164242
conversation_group_id: Conversation group ID for the wrapper.
165243
output_dir: Optional override for the output directory.
244+
evaluation_data_path: Same as for :func:`evaluate`.
245+
on_complete: Same as for :func:`evaluate`.
166246
167247
Returns:
168248
List of EvaluationResult objects.
@@ -174,15 +254,26 @@ def evaluate_turn(
174254
conversation_group_id=conversation_group_id,
175255
turns=[turn],
176256
)
177-
return evaluate(config, [data], output_dir=output_dir)
257+
return evaluate(
258+
config,
259+
[data],
260+
output_dir=output_dir,
261+
evaluation_data_path=evaluation_data_path,
262+
on_complete=on_complete,
263+
)
178264

179265

180-
def evaluate_turn_with_summary(
266+
def evaluate_turn_with_summary( # pylint: disable=too-many-arguments
181267
config: SystemConfig,
182268
turn: TurnData,
183269
metrics: Optional[list[str]] = None,
184270
conversation_group_id: str = "programmatic_eval",
185271
output_dir: Optional[str] = None,
272+
*,
273+
evaluation_data_path: Optional[str] = None,
274+
on_complete: Optional[
275+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
276+
] = None,
186277
) -> EvaluationSummary:
187278
"""Evaluate a single turn and return structured results.
188279
@@ -194,6 +285,8 @@ def evaluate_turn_with_summary(
194285
metrics: Optional list of metric identifiers to override turn_metrics.
195286
conversation_group_id: Conversation group ID for the wrapper.
196287
output_dir: Optional override for the output directory.
288+
evaluation_data_path: Same as for :func:`evaluate`.
289+
on_complete: Same as for :func:`evaluate`.
197290
198291
Returns:
199292
EvaluationSummary with results and computed statistics.
@@ -210,4 +303,6 @@ def evaluate_turn_with_summary(
210303
[data],
211304
output_dir=output_dir,
212305
compute_confidence_intervals=False,
306+
evaluation_data_path=evaluation_data_path,
307+
on_complete=on_complete,
213308
)

0 commit comments

Comments
 (0)