lightspeed-core
diff --git a/‎README.md‎
Lines changed: 3 additions & 0 deletions b/‎README.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎config/system.yaml‎
Lines changed: 4 additions & 0 deletions b/‎config/system.yaml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 10 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎requirements-all-extras.txt‎
Lines changed: 13 additions & 1 deletion b/‎requirements-all-extras.txt‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎src/lightspeed_evaluation/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎src/lightspeed_evaluation/__init__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/lightspeed_evaluation/api.py‎
Lines changed: 105 additions & 10 deletions b/‎src/lightspeed_evaluation/api.py‎
Lines changed: 105 additions & 10 deletions
@@ -492,6 +492,9 @@ export AZURE_API_BASE="https://your-resource.openai.azure.com/"
 export API_KEY="your-api-endpoint-key"
 ```
 
+#### Optional: Langfuse
+After a run, you can send one trace with per-metric scores to [Langfuse](https://langfuse.com/). Install `lightspeed-evaluation[langfuse]` and set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY`. To enable the export, add an entry `- type: "langfuse"` under `storage:` in `system.yaml` with a required **`host`** (API base URL, e.g. `https://cloud.langfuse.com`); that host is always used for the client, and `LANGFUSE_HOST` is not read for this entry. Optional `public_key` / `secret_key` on that entry override the keys from the environment. From Python, you can rely on the same `storage` list on `SystemConfig`, or pass `on_complete=build_langfuse_on_complete_callback()` from `lightspeed_evaluation.integrations.langfuse_reporter`.
+
 ## 📈 Output & Visualization
 
 ### Generated Reports
 
@@ -301,6 +301,10 @@ storage:
   #   database: "./eval_results.db"
   #   table_name: "evaluation_results"
 
+  # Langfuse backend (optional) - sends one trace with per-metric scores to Langfuse after a run
+  # - type: "langfuse"
+  #   host: "https://cloud.langfuse.com"
+
 # Visualization settings
 visualization:
   figsize: [12, 8]            # Graph size (width, height)
 
@@ -52,8 +52,18 @@ nlp-metrics = [
     "rapidfuzz>=3.0.0,<=3.14.3",     # Required for semantic_similarity_distance
 ]
 
+# Optional Langfuse reporting. Uses the v2 SDK.
+#   pip install 'lightspeed-evaluation[langfuse]'
+# or
+#   uv sync --extra langfuse
+langfuse = [
+    "langfuse>=2.0.0,<3.0.0",
+]
+
 [dependency-groups]
 dev = [
+    # Matches [project.optional-dependencies] langfuse — for typecheck/tests.
+    "langfuse>=2.0.0,<3.0.0",
     "bandit>=1.7.0,<=1.9.2",
     "black==25.1.0",
     "mypy>=1.15.0,<=1.17.1",
 
@@ -20,6 +20,7 @@ annotated-types==0.7.0
 anyio==4.13.0
     # via
     #   httpx
+    #   langfuse
     #   openai
 appdirs==1.4.4
     # via ragas
@@ -29,7 +30,9 @@ attrs==26.1.0
     #   jsonschema
     #   referencing
 backoff==2.2.1
-    # via posthog
+    # via
+    #   langfuse
+    #   posthog
 certifi==2026.4.22
     # via
     #   httpcore
@@ -134,6 +137,7 @@ httpcore==1.0.9
 httpx==0.28.1
     # via
     #   huggingface-hub
+    #   langfuse
     #   langgraph-sdk
     #   langsmith
     #   lightspeed-evaluation
@@ -152,6 +156,7 @@ idna==3.14
     # via
     #   anyio
     #   httpx
+    #   langfuse
     #   requests
     #   yarl
 importlib-metadata==8.5.0
@@ -217,6 +222,8 @@ langchain-protocol==0.0.15
     # via langchain-core
 langchain-text-splitters==1.1.2
     # via langchain-classic
+langfuse==2.60.10
+    # via lightspeed-evaluation
 langgraph==1.1.10
     # via langchain
 langgraph-checkpoint==4.1.0
@@ -312,6 +319,7 @@ packaging==26.2
     #   datasets
     #   huggingface-hub
     #   langchain-core
+    #   langfuse
     #   langsmith
     #   marshmallow
     #   matplotlib
@@ -367,6 +375,7 @@ pydantic==2.12.5
     #   langchain-classic
     #   langchain-core
     #   langchain-google-genai
+    #   langfuse
     #   langgraph
     #   langsmith
     #   lightspeed-evaluation
@@ -450,6 +459,7 @@ requests==2.34.0
     #   instructor
     #   langchain-classic
     #   langchain-community
+    #   langfuse
     #   langsmith
     #   posthog
     #   requests-toolbelt
@@ -590,6 +600,8 @@ uuid-utils==0.15.0
     #   langsmith
 wheel==0.47.0
     # via deepeval
+wrapt==1.17.3
+    # via langfuse
 xxhash==3.7.0
     # via
     #   datasets
 
@@ -26,6 +26,7 @@
         APIConfig,
         EvaluationData,
         EvaluationResult,
+        EvaluationRunContext,
         LLMConfig,
         LoggingConfig,
         TurnData,
@@ -80,6 +81,10 @@
     "EvaluationData": ("lightspeed_evaluation.core.models", "EvaluationData"),
     "TurnData": ("lightspeed_evaluation.core.models", "TurnData"),
     "EvaluationResult": ("lightspeed_evaluation.core.models", "EvaluationResult"),
+    "EvaluationRunContext": (
+        "lightspeed_evaluation.core.models",
+        "EvaluationRunContext",
+    ),
     "EvaluationSummary": (
         "lightspeed_evaluation.core.models.summary",
         "EvaluationSummary",
 
@@ -23,11 +23,13 @@
     print(summary.by_metric)
 """
 
+from collections.abc import Callable
 from typing import Optional
 
 from lightspeed_evaluation.core.models import (
     EvaluationData,
     EvaluationResult,
+    EvaluationRunContext,
     SystemConfig,
     TurnData,
 )
@@ -36,10 +38,31 @@
 from lightspeed_evaluation.pipeline.evaluation import EvaluationPipeline
 
 
-def evaluate(
+def _on_complete_with_optional_storage_langfuse(
+    config: SystemConfig,
+    on_complete: Optional[
+        Callable[[list[EvaluationResult], EvaluationRunContext], None]
+    ],
+) -> Optional[Callable[[list[EvaluationResult], EvaluationRunContext], None]]:
+    """Respect an explicit callback; otherwise attach Langfuse when configured in storage."""
+    if on_complete is not None:
+        return on_complete
+    from lightspeed_evaluation.integrations.langfuse_reporter import (  # pylint: disable=import-outside-toplevel
+        build_langfuse_on_complete_from_storage_configs,
+    )
+
+    return build_langfuse_on_complete_from_storage_configs(config.storage)
+
+
+def evaluate(  # pylint: disable=too-many-arguments
     config: SystemConfig,
     data: list[EvaluationData],
     output_dir: Optional[str] = None,
+    *,
+    evaluation_data_path: Optional[str] = None,
+    on_complete: Optional[
+        Callable[[list[EvaluationResult], EvaluationRunContext], None]
+    ] = None,
 ) -> list[EvaluationResult]:
     """Run evaluation on the provided data using the given configuration.
 
@@ -51,6 +74,14 @@ def evaluate(
         config: A pre-built SystemConfig instance.
         data: List of EvaluationData conversations to evaluate.
         output_dir: Optional override for the output directory.
+        evaluation_data_path: Optional path to the evaluation data file, used
+            for run naming and in :class:`EvaluationRunContext` (e.g. Langfuse).
+        on_complete: Optional callback after a successful run; receives results
+            and an :class:`EvaluationRunContext`. See
+            :mod:`lightspeed_evaluation.integrations.langfuse_reporter` for
+            a Langfuse helper. If omitted and ``config.storage`` contains
+            ``type: langfuse`` (with required ``host``), a Langfuse export callback
+            is attached automatically. Failures in the callback do not fail the run.
 
     Returns:
         List of EvaluationResult objects (one per metric per turn/conversation).
@@ -61,16 +92,28 @@ def evaluate(
     loader = ConfigLoader.from_config(config)
     pipeline = EvaluationPipeline(loader, output_dir)
     try:
-        return pipeline.run_evaluation(data)
+        effective_on_complete = _on_complete_with_optional_storage_langfuse(
+            config, on_complete
+        )
+        return pipeline.run_evaluation(
+            data,
+            original_data_path=evaluation_data_path,
+            on_complete=effective_on_complete,
+        )
     finally:
         pipeline.close()
 
 
-def evaluate_with_summary(
+def evaluate_with_summary(  # pylint: disable=too-many-arguments
     config: SystemConfig,
     data: list[EvaluationData],
     output_dir: Optional[str] = None,
     compute_confidence_intervals: bool = False,
+    *,
+    evaluation_data_path: Optional[str] = None,
+    on_complete: Optional[
+        Callable[[list[EvaluationResult], EvaluationRunContext], None]
+    ] = None,
 ) -> EvaluationSummary:
     """Run evaluation and return structured results with computed statistics.
 
@@ -84,22 +127,35 @@ def evaluate_with_summary(
         output_dir: Optional override for the output directory.
         compute_confidence_intervals: Whether to compute bootstrap confidence
             intervals. Default False.
+        evaluation_data_path: Same as for :func:`evaluate`.
+        on_complete: Same as for :func:`evaluate`.
 
     Returns:
         EvaluationSummary with results and computed statistics.
     """
-    results = evaluate(config, data, output_dir=output_dir)
+    results = evaluate(
+        config,
+        data,
+        output_dir=output_dir,
+        evaluation_data_path=evaluation_data_path,
+        on_complete=on_complete,
+    )
     return EvaluationSummary.from_results(
         results,
         evaluation_data=data if data else None,
         compute_confidence_intervals=compute_confidence_intervals,
     )
 
 
-def evaluate_conversation(
+def evaluate_conversation(  # pylint: disable=too-many-arguments
     config: SystemConfig,
     data: EvaluationData,
     output_dir: Optional[str] = None,
+    *,
+    evaluation_data_path: Optional[str] = None,
+    on_complete: Optional[
+        Callable[[list[EvaluationResult], EvaluationRunContext], None]
+    ] = None,
 ) -> list[EvaluationResult]:
     """Evaluate a single conversation group.
 
@@ -109,18 +165,31 @@ def evaluate_conversation(
         config: A pre-built SystemConfig instance.
         data: A single EvaluationData conversation to evaluate.
         output_dir: Optional override for the output directory.
+        evaluation_data_path: Same as for :func:`evaluate`.
+        on_complete: Same as for :func:`evaluate`.
 
     Returns:
         List of EvaluationResult objects.
     """
-    return evaluate(config, [data], output_dir=output_dir)
+    return evaluate(
+        config,
+        [data],
+        output_dir=output_dir,
+        evaluation_data_path=evaluation_data_path,
+        on_complete=on_complete,
+    )
 
 
-def evaluate_conversation_with_summary(
+def evaluate_conversation_with_summary(  # pylint: disable=too-many-arguments
     config: SystemConfig,
     data: EvaluationData,
     output_dir: Optional[str] = None,
     compute_confidence_intervals: bool = False,
+    *,
+    evaluation_data_path: Optional[str] = None,
+    on_complete: Optional[
+        Callable[[list[EvaluationResult], EvaluationRunContext], None]
+    ] = None,
 ) -> EvaluationSummary:
     """Evaluate a single conversation and return structured results.
 
@@ -132,6 +201,8 @@ def evaluate_conversation_with_summary(
         output_dir: Optional override for the output directory.
         compute_confidence_intervals: Whether to compute bootstrap confidence
             intervals. Default False.
+        evaluation_data_path: Same as for :func:`evaluate`.
+        on_complete: Same as for :func:`evaluate`.
 
     Returns:
         EvaluationSummary with results and computed statistics.
@@ -141,15 +212,22 @@ def evaluate_conversation_with_summary(
         [data],
         output_dir=output_dir,
         compute_confidence_intervals=compute_confidence_intervals,
+        evaluation_data_path=evaluation_data_path,
+        on_complete=on_complete,
     )
 
 
-def evaluate_turn(
+def evaluate_turn(  # pylint: disable=too-many-arguments
     config: SystemConfig,
     turn: TurnData,
     metrics: Optional[list[str]] = None,
     conversation_group_id: str = "programmatic_eval",
     output_dir: Optional[str] = None,
+    *,
+    evaluation_data_path: Optional[str] = None,
+    on_complete: Optional[
+        Callable[[list[EvaluationResult], EvaluationRunContext], None]
+    ] = None,
 ) -> list[EvaluationResult]:
     """Evaluate a single turn.
 
@@ -163,6 +241,8 @@ def evaluate_turn(
         metrics: Optional list of metric identifiers to override turn_metrics.
         conversation_group_id: Conversation group ID for the wrapper.
         output_dir: Optional override for the output directory.
+        evaluation_data_path: Same as for :func:`evaluate`.
+        on_complete: Same as for :func:`evaluate`.
 
     Returns:
         List of EvaluationResult objects.
@@ -174,15 +254,26 @@ def evaluate_turn(
         conversation_group_id=conversation_group_id,
         turns=[turn],
     )
-    return evaluate(config, [data], output_dir=output_dir)
+    return evaluate(
+        config,
+        [data],
+        output_dir=output_dir,
+        evaluation_data_path=evaluation_data_path,
+        on_complete=on_complete,
+    )
 
 
-def evaluate_turn_with_summary(
+def evaluate_turn_with_summary(  # pylint: disable=too-many-arguments
     config: SystemConfig,
     turn: TurnData,
     metrics: Optional[list[str]] = None,
     conversation_group_id: str = "programmatic_eval",
     output_dir: Optional[str] = None,
+    *,
+    evaluation_data_path: Optional[str] = None,
+    on_complete: Optional[
+        Callable[[list[EvaluationResult], EvaluationRunContext], None]
+    ] = None,
 ) -> EvaluationSummary:
     """Evaluate a single turn and return structured results.
 
@@ -194,6 +285,8 @@ def evaluate_turn_with_summary(
         metrics: Optional list of metric identifiers to override turn_metrics.
         conversation_group_id: Conversation group ID for the wrapper.
         output_dir: Optional override for the output directory.
+        evaluation_data_path: Same as for :func:`evaluate`.
+        on_complete: Same as for :func:`evaluate`.
 
     Returns:
         EvaluationSummary with results and computed statistics.
@@ -210,4 +303,6 @@ def evaluate_turn_with_summary(
         [data],
         output_dir=output_dir,
         compute_confidence_intervals=False,
+        evaluation_data_path=evaluation_data_path,
+        on_complete=on_complete,
     )