Skip to content

Commit cac2b29

Browse files
committed
Added Langfuse compatibility to lightspeed-eval
1 parent fd9ee3a commit cac2b29

19 files changed

Lines changed: 730 additions & 58 deletions

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,9 @@ export AZURE_API_BASE="https://your-resource.openai.azure.com/"
491491
export API_KEY="your-api-endpoint-key"
492492
```
493493

494+
#### Optional: Langfuse
495+
After a run, you can send a single trace containing per-metric scores to [Langfuse](https://langfuse.com/). Install the `lightspeed-evaluation[langfuse]` extra, set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` (plus `LANGFUSE_HOST` if you are not using the default cloud host), then either run `lightspeed-eval --langfuse` or set `LIGHTSPEED_USE_LANGFUSE=1`. From Python, pass `on_complete=build_langfuse_on_complete_callback()` (imported from `lightspeed_evaluation.integrations.langfuse_reporter`) to `evaluate()`.
496+
494497
## 📈 Output & Visualization
495498

496499
### Generated Reports

pyproject.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,18 @@ nlp-metrics = [
5252
"rapidfuzz>=3.0.0,<=3.14.3", # Required for semantic_similarity_distance
5353
]
5454

55+
# Optional Langfuse reporting (on_complete / CLI --langfuse). Uses the v2 SDK.
56+
# pip install 'lightspeed-evaluation[langfuse]'
57+
# or
58+
# uv sync --extra langfuse
59+
langfuse = [
60+
"langfuse>=2.0.0,<3.0.0",
61+
]
62+
5563
[dependency-groups]
5664
dev = [
65+
# Matches [project.optional-dependencies] langfuse — for typecheck/tests.
66+
"langfuse>=2.0.0,<3.0.0",
5767
"bandit>=1.7.0,<=1.9.2",
5868
"black==25.1.0",
5969
"mypy>=1.15.0,<=1.17.1",

requirements-all-extras.txt

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ annotated-types==0.7.0
2020
anyio==4.13.0
2121
# via
2222
# httpx
23+
# langfuse
2324
# openai
2425
appdirs==1.4.4
2526
# via ragas
@@ -29,7 +30,9 @@ attrs==26.1.0
2930
# jsonschema
3031
# referencing
3132
backoff==2.2.1
32-
# via posthog
33+
# via
34+
# langfuse
35+
# posthog
3336
certifi==2026.2.25
3437
# via
3538
# httpcore
@@ -134,6 +137,7 @@ httpcore==1.0.9
134137
httpx==0.28.1
135138
# via
136139
# huggingface-hub
140+
# langfuse
137141
# langgraph-sdk
138142
# langsmith
139143
# lightspeed-evaluation
@@ -152,6 +156,7 @@ idna==3.11
152156
# via
153157
# anyio
154158
# httpx
159+
# langfuse
155160
# requests
156161
# yarl
157162
importlib-metadata==8.7.1
@@ -215,6 +220,8 @@ langchain-openai==1.1.12
215220
# via ragas
216221
langchain-text-splitters==1.1.1
217222
# via langchain-classic
223+
langfuse==2.60.10
224+
# via lightspeed-evaluation
218225
langgraph==1.1.6
219226
# via langchain
220227
langgraph-checkpoint==4.0.1
@@ -305,11 +312,12 @@ orjson==3.11.8
305312
# langsmith
306313
ormsgpack==1.12.2
307314
# via langgraph-checkpoint
308-
packaging==26.0
315+
packaging==24.2
309316
# via
310317
# datasets
311318
# huggingface-hub
312319
# langchain-core
320+
# langfuse
313321
# langsmith
314322
# marshmallow
315323
# matplotlib
@@ -365,6 +373,7 @@ pydantic==2.11.7
365373
# langchain-classic
366374
# langchain-core
367375
# langchain-google-genai
376+
# langfuse
368377
# langgraph
369378
# langsmith
370379
# lightspeed-evaluation
@@ -448,6 +457,7 @@ requests==2.33.1
448457
# instructor
449458
# langchain-classic
450459
# langchain-community
460+
# langfuse
451461
# langsmith
452462
# posthog
453463
# requests-toolbelt
@@ -587,6 +597,8 @@ uuid-utils==0.14.1
587597
# langsmith
588598
wheel==0.46.3
589599
# via deepeval
600+
wrapt==1.17.3
601+
# via langfuse
590602
xxhash==3.6.0
591603
# via
592604
# datasets

requirements-local-embeddings.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ orjson==3.11.8
293293
# langsmith
294294
ormsgpack==1.12.2
295295
# via langgraph-checkpoint
296-
packaging==26.0
296+
packaging==24.2
297297
# via
298298
# datasets
299299
# huggingface-hub

requirements-nlp-metrics.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ orjson==3.11.8
291291
# langsmith
292292
ormsgpack==1.12.2
293293
# via langgraph-checkpoint
294-
packaging==26.0
294+
packaging==24.2
295295
# via
296296
# datasets
297297
# huggingface-hub

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ orjson==3.11.8
279279
# langsmith
280280
ormsgpack==1.12.2
281281
# via langgraph-checkpoint
282-
packaging==26.0
282+
packaging==24.2
283283
# via
284284
# datasets
285285
# huggingface-hub

src/lightspeed_evaluation/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
APIConfig,
2727
EvaluationData,
2828
EvaluationResult,
29+
EvaluationRunContext,
2930
LLMConfig,
3031
LoggingConfig,
3132
TurnData,
@@ -80,6 +81,10 @@
8081
"EvaluationData": ("lightspeed_evaluation.core.models", "EvaluationData"),
8182
"TurnData": ("lightspeed_evaluation.core.models", "TurnData"),
8283
"EvaluationResult": ("lightspeed_evaluation.core.models", "EvaluationResult"),
84+
"EvaluationRunContext": (
85+
"lightspeed_evaluation.core.models",
86+
"EvaluationRunContext",
87+
),
8388
"EvaluationSummary": (
8489
"lightspeed_evaluation.core.models.summary",
8590
"EvaluationSummary",

src/lightspeed_evaluation/api.py

Lines changed: 84 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@
2323
print(summary.by_metric)
2424
"""
2525

26+
from collections.abc import Callable
2627
from typing import Optional
2728

2829
from lightspeed_evaluation.core.models import (
2930
EvaluationData,
3031
EvaluationResult,
32+
EvaluationRunContext,
3133
SystemConfig,
3234
TurnData,
3335
)
@@ -36,10 +38,15 @@
3638
from lightspeed_evaluation.pipeline.evaluation import EvaluationPipeline
3739

3840

39-
def evaluate(
41+
def evaluate( # pylint: disable=too-many-arguments
4042
config: SystemConfig,
4143
data: list[EvaluationData],
4244
output_dir: Optional[str] = None,
45+
*,
46+
evaluation_data_path: Optional[str] = None,
47+
on_complete: Optional[
48+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
49+
] = None,
4350
) -> list[EvaluationResult]:
4451
"""Run evaluation on the provided data using the given configuration.
4552
@@ -51,6 +58,12 @@ def evaluate(
5158
config: A pre-built SystemConfig instance.
5259
data: List of EvaluationData conversations to evaluate.
5360
output_dir: Optional override for the output directory.
61+
evaluation_data_path: Optional path to the evaluation data file, used
62+
for run naming and in :class:`EvaluationRunContext` (e.g. Langfuse).
63+
on_complete: Optional callback after a successful run; receives results
64+
and an :class:`EvaluationRunContext`. See
65+
:mod:`lightspeed_evaluation.integrations.langfuse_reporter` for
66+
a Langfuse helper. Failures in the callback do not fail the run.
5467
5568
Returns:
5669
List of EvaluationResult objects (one per metric per turn/conversation).
@@ -61,16 +74,25 @@ def evaluate(
6174
loader = ConfigLoader.from_config(config)
6275
pipeline = EvaluationPipeline(loader, output_dir)
6376
try:
64-
return pipeline.run_evaluation(data)
77+
return pipeline.run_evaluation(
78+
data,
79+
original_data_path=evaluation_data_path,
80+
on_complete=on_complete,
81+
)
6582
finally:
6683
pipeline.close()
6784

6885

69-
def evaluate_with_summary(
86+
def evaluate_with_summary( # pylint: disable=too-many-arguments
7087
config: SystemConfig,
7188
data: list[EvaluationData],
7289
output_dir: Optional[str] = None,
7390
compute_confidence_intervals: bool = False,
91+
*,
92+
evaluation_data_path: Optional[str] = None,
93+
on_complete: Optional[
94+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
95+
] = None,
7496
) -> EvaluationSummary:
7597
"""Run evaluation and return structured results with computed statistics.
7698
@@ -84,22 +106,35 @@ def evaluate_with_summary(
84106
output_dir: Optional override for the output directory.
85107
compute_confidence_intervals: Whether to compute bootstrap confidence
86108
intervals. Default False.
109+
evaluation_data_path: Same as for :func:`evaluate`.
110+
on_complete: Same as for :func:`evaluate`.
87111
88112
Returns:
89113
EvaluationSummary with results and computed statistics.
90114
"""
91-
results = evaluate(config, data, output_dir=output_dir)
115+
results = evaluate(
116+
config,
117+
data,
118+
output_dir=output_dir,
119+
evaluation_data_path=evaluation_data_path,
120+
on_complete=on_complete,
121+
)
92122
return EvaluationSummary.from_results(
93123
results,
94124
evaluation_data=data if data else None,
95125
compute_confidence_intervals=compute_confidence_intervals,
96126
)
97127

98128

99-
def evaluate_conversation(
129+
def evaluate_conversation( # pylint: disable=too-many-arguments
100130
config: SystemConfig,
101131
data: EvaluationData,
102132
output_dir: Optional[str] = None,
133+
*,
134+
evaluation_data_path: Optional[str] = None,
135+
on_complete: Optional[
136+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
137+
] = None,
103138
) -> list[EvaluationResult]:
104139
"""Evaluate a single conversation group.
105140
@@ -109,18 +144,31 @@ def evaluate_conversation(
109144
config: A pre-built SystemConfig instance.
110145
data: A single EvaluationData conversation to evaluate.
111146
output_dir: Optional override for the output directory.
147+
evaluation_data_path: Same as for :func:`evaluate`.
148+
on_complete: Same as for :func:`evaluate`.
112149
113150
Returns:
114151
List of EvaluationResult objects.
115152
"""
116-
return evaluate(config, [data], output_dir=output_dir)
153+
return evaluate(
154+
config,
155+
[data],
156+
output_dir=output_dir,
157+
evaluation_data_path=evaluation_data_path,
158+
on_complete=on_complete,
159+
)
117160

118161

119-
def evaluate_conversation_with_summary(
162+
def evaluate_conversation_with_summary( # pylint: disable=too-many-arguments
120163
config: SystemConfig,
121164
data: EvaluationData,
122165
output_dir: Optional[str] = None,
123166
compute_confidence_intervals: bool = False,
167+
*,
168+
evaluation_data_path: Optional[str] = None,
169+
on_complete: Optional[
170+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
171+
] = None,
124172
) -> EvaluationSummary:
125173
"""Evaluate a single conversation and return structured results.
126174
@@ -132,6 +180,8 @@ def evaluate_conversation_with_summary(
132180
output_dir: Optional override for the output directory.
133181
compute_confidence_intervals: Whether to compute bootstrap confidence
134182
intervals. Default False.
183+
evaluation_data_path: Same as for :func:`evaluate`.
184+
on_complete: Same as for :func:`evaluate`.
135185
136186
Returns:
137187
EvaluationSummary with results and computed statistics.
@@ -141,15 +191,22 @@ def evaluate_conversation_with_summary(
141191
[data],
142192
output_dir=output_dir,
143193
compute_confidence_intervals=compute_confidence_intervals,
194+
evaluation_data_path=evaluation_data_path,
195+
on_complete=on_complete,
144196
)
145197

146198

147-
def evaluate_turn(
199+
def evaluate_turn( # pylint: disable=too-many-arguments
148200
config: SystemConfig,
149201
turn: TurnData,
150202
metrics: Optional[list[str]] = None,
151203
conversation_group_id: str = "programmatic_eval",
152204
output_dir: Optional[str] = None,
205+
*,
206+
evaluation_data_path: Optional[str] = None,
207+
on_complete: Optional[
208+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
209+
] = None,
153210
) -> list[EvaluationResult]:
154211
"""Evaluate a single turn.
155212
@@ -163,6 +220,8 @@ def evaluate_turn(
163220
metrics: Optional list of metric identifiers to override turn_metrics.
164221
conversation_group_id: Conversation group ID for the wrapper.
165222
output_dir: Optional override for the output directory.
223+
evaluation_data_path: Same as for :func:`evaluate`.
224+
on_complete: Same as for :func:`evaluate`.
166225
167226
Returns:
168227
List of EvaluationResult objects.
@@ -174,15 +233,26 @@ def evaluate_turn(
174233
conversation_group_id=conversation_group_id,
175234
turns=[turn],
176235
)
177-
return evaluate(config, [data], output_dir=output_dir)
236+
return evaluate(
237+
config,
238+
[data],
239+
output_dir=output_dir,
240+
evaluation_data_path=evaluation_data_path,
241+
on_complete=on_complete,
242+
)
178243

179244

180-
def evaluate_turn_with_summary(
245+
def evaluate_turn_with_summary( # pylint: disable=too-many-arguments
181246
config: SystemConfig,
182247
turn: TurnData,
183248
metrics: Optional[list[str]] = None,
184249
conversation_group_id: str = "programmatic_eval",
185250
output_dir: Optional[str] = None,
251+
*,
252+
evaluation_data_path: Optional[str] = None,
253+
on_complete: Optional[
254+
Callable[[list[EvaluationResult], EvaluationRunContext], None]
255+
] = None,
186256
) -> EvaluationSummary:
187257
"""Evaluate a single turn and return structured results.
188258
@@ -194,6 +264,8 @@ def evaluate_turn_with_summary(
194264
metrics: Optional list of metric identifiers to override turn_metrics.
195265
conversation_group_id: Conversation group ID for the wrapper.
196266
output_dir: Optional override for the output directory.
267+
evaluation_data_path: Same as for :func:`evaluate`.
268+
on_complete: Same as for :func:`evaluate`.
197269
198270
Returns:
199271
EvaluationSummary with results and computed statistics.
@@ -210,4 +282,6 @@ def evaluate_turn_with_summary(
210282
[data],
211283
output_dir=output_dir,
212284
compute_confidence_intervals=False,
285+
evaluation_data_path=evaluation_data_path,
286+
on_complete=on_complete,
213287
)

0 commit comments

Comments
 (0)