|
| 1 | +"""Langfuse storage backend for evaluation results. |
| 2 | +
|
| 3 | +Implements :class:`~lightspeed_evaluation.core.storage.protocol.BaseStorageBackend` |
| 4 | +so Langfuse plugs into the standard pipeline storage lifecycle without any |
| 5 | +changes to the runner, API, or pipeline modules. |
| 6 | +
|
| 7 | +Install with: ``pip install 'lightspeed-evaluation[langfuse]'`` |
| 8 | +
|
| 9 | +Credentials are resolved from :class:`LangfuseBackendConfig` fields first, |
| 10 | +then from ``LANGFUSE_PUBLIC_KEY``, ``LANGFUSE_SECRET_KEY``, and |
| 11 | +``LANGFUSE_HOST`` environment variables as fallback (standard Langfuse SDK |
| 12 | +behavior). |
| 13 | +
|
| 14 | +Lifecycle: |
| 15 | + 1. ``initialize(run_info)`` — creates the Langfuse client. |
| 16 | + 2. ``save_run(results)`` — accumulates results (called once per conversation). |
| 17 | + 3. ``finalize()`` — creates the trace, writes one score per result, and flushes. |
| 18 | + 4. ``close()`` — shuts down the client. |
| 19 | +""" |
| 20 | + |
from __future__ import annotations

import importlib
import importlib.util
import logging
from typing import Any, Optional

from lightspeed_evaluation.core.models.data import EvaluationData, EvaluationResult
from lightspeed_evaluation.core.storage.config import LangfuseBackendConfig
from lightspeed_evaluation.core.storage.protocol import RunInfo
| 30 | + |
| 31 | +logger = logging.getLogger(__name__) |
| 32 | + |
| 33 | +_HAS_LANGFUSE = importlib.util.find_spec("langfuse") is not None |
| 34 | + |
| 35 | + |
class LangfuseStorageBackend:
    """Storage backend that exports evaluation results to Langfuse.

    Creates one Langfuse trace per evaluation run with one score per
    evaluation result. Results with ``score=None`` (ERROR/SKIPPED) are
    skipped from numeric scoring but their status is logged.

    The export is best-effort: any failure while importing the SDK, creating
    the client, writing the trace/scores, or shutting down is caught and
    logged, so it never fails the evaluation pipeline.
    """

    def __init__(self, config: LangfuseBackendConfig) -> None:
        """Initialize the Langfuse storage backend.

        Args:
            config: Langfuse backend configuration with optional host,
                public_key, and secret_key fields.
        """
        self._config = config
        self._client: Any = None
        self._trace: Any = None
        self._run_info: Optional[RunInfo] = None
        self._results: list[EvaluationResult] = []

    @property
    def backend_name(self) -> str:
        """Return the name of this storage backend."""
        return "langfuse"

    def initialize(self, run_info: RunInfo) -> None:
        """Create the Langfuse client for this run.

        The trace itself is created later, in :meth:`finalize`, once all
        results have been accumulated.  If ``langfuse`` is not installed or
        the client cannot be created, the error is logged and the backend
        stays inert (``finalize`` and ``close`` become no-ops).

        Args:
            run_info: Metadata of the evaluation run; its ``name`` is used
                when the trace is written.
        """
        self._run_info = run_info
        self._results = []

        if not _HAS_LANGFUSE:
            logger.error(
                "langfuse is not installed. "
                "Add: pip install 'lightspeed-evaluation[langfuse]'"
            )
            return

        try:
            # The import is inside the guard too: a package that is present
            # but broken must not take the pipeline down.
            langfuse_mod = importlib.import_module("langfuse")
            self._client = langfuse_mod.Langfuse(**self._build_client_kwargs())
        except Exception:  # pylint: disable=broad-exception-caught
            logger.exception("langfuse: failed to initialize client")
            self._client = None

    def save_result(self, result: EvaluationResult) -> None:
        """Accumulate a single result for batch export at finalize."""
        self._results.append(result)

    def save_run(self, results: list[EvaluationResult]) -> None:
        """Accumulate conversation results for batch export at finalize."""
        self._results.extend(results)

    def set_evaluation_context(
        self, evaluation_data: Optional[list[EvaluationData]] = None
    ) -> None:
        """No-op — Langfuse export does not need the full evaluation dataset."""
        _ = evaluation_data

    def finalize(self) -> None:
        """Create the trace, write all scores, and flush to Langfuse.

        Does nothing when no client is available or no results were saved.
        Any error raised while exporting is logged and swallowed.
        """
        if self._client is None:
            return

        if not self._results:
            logger.info("langfuse: no results to report; skipping")
            return

        try:
            self._write_trace_and_scores()
        except Exception:  # pylint: disable=broad-exception-caught
            # Best-effort boundary: SDK/API/transport errors (and SDK
            # interface mismatches) are reported, never propagated.
            logger.exception("langfuse: failed to write trace and scores")

    def close(self) -> None:
        """Shut down the Langfuse client and drop the reference to it."""
        if self._client is None:
            return
        try:
            self._client.shutdown()
        except Exception:  # pylint: disable=broad-exception-caught
            logger.debug("langfuse: shutdown raised; ignoring", exc_info=True)
        finally:
            # Always release the client, even when shutdown failed, so a
            # repeated close() is a no-op.
            self._client = None

    def _build_client_kwargs(self) -> dict[str, Any]:
        """Build keyword arguments for the Langfuse constructor.

        Only values that are actually set in the config are passed, so the
        SDK can fall back to its environment variables for the rest.

        Returns:
            A dict containing any of ``public_key``, ``secret_key``, ``host``.
        """
        kwargs: dict[str, Any] = {}
        if self._config.public_key:
            kwargs["public_key"] = self._config.public_key
        if self._config.secret_key:
            kwargs["secret_key"] = self._config.secret_key
        # Strip before testing: a whitespace-only host must count as "unset"
        # instead of being passed on as an empty string.
        host = (self._config.host or "").strip()
        if host:
            kwargs["host"] = host
        return kwargs

    def _write_trace_and_scores(self) -> None:
        """Create one trace and emit one score per result row.

        Results whose ``score`` is ``None`` are logged at debug level and
        skipped.  The client is flushed once after all scores are queued.

        NOTE(review): relies on ``client.trace(...)`` / ``trace.score(...)``;
        confirm the pinned Langfuse SDK version still exposes this API.
        """
        run_name = self._run_info.name if self._run_info else "evaluation"

        trace_meta: dict[str, Any] = {
            "run_name": run_name,
            "result_count": len(self._results),
            "rows_preview": self._build_rows_preview(),
        }

        self._trace = self._client.trace(
            name=_truncate(f"lightspeed_eval__{run_name}", 256),
            metadata=trace_meta,
        )

        for r in self._results:
            if r.score is None:
                logger.debug(
                    "langfuse: skipping score for %s (status=%s, no numeric score)",
                    r.metric_identifier,
                    r.result,
                )
                continue

            self._trace.score(
                name=_truncate(r.metric_identifier, 200),
                value=float(r.score),
                comment=_format_comment(r),
                metadata=_build_score_metadata(r),
            )

        self._client.flush()

    def _build_rows_preview(self) -> list[dict[str, Any]]:
        """Build a compact preview of the first 50 rows for trace metadata."""
        return [
            {
                "idx": i,
                "conversation_group_id": r.conversation_group_id,
                "turn_id": r.turn_id or "",
                "metric": r.metric_identifier,
                "result": r.result,
                "score": r.score,
            }
            for i, r in enumerate(self._results[:50])
        ]
| 182 | + |
| 183 | + |
def _format_comment(r: EvaluationResult) -> str:
    """Build a human-readable comment for a Langfuse score entry.

    The comment is a single line of ``key=value`` pairs joined by ``" | "``:
    ``result``, ``conversation_group_id``, ``turn_id`` (empty when unset) and,
    only when a reason is present, ``reason`` capped at 1200 characters.

    Args:
        r: The evaluation result being exported.

    Returns:
        The formatted comment string.
    """
    max_reason = 1200
    parts: list[str] = [
        f"result={r.result}",
        f"conversation_group_id={r.conversation_group_id}",
        f"turn_id={r.turn_id or ''}",
    ]
    if r.reason:
        # Use the shared truncation helper instead of a second copy of the
        # ellipsis logic, so both stay in sync.
        parts.append(f"reason={_truncate(r.reason, max_reason)}")
    return " | ".join(parts)
| 200 | + |
| 201 | + |
def _build_score_metadata(r: EvaluationResult) -> dict[str, Any]:
    """Assemble the metadata dict attached to a single Langfuse score.

    Every text field is capped at 8000 characters and missing values are
    exported as empty strings; the keys follow the evaluation CSV fields.

    Args:
        r: The evaluation result being exported.

    Returns:
        A dict of string-valued metadata keyed by field name.
    """
    limit = 8000

    metadata: dict[str, Any] = {
        "query": _truncate(r.query, limit) if r.query else "",
        "response": _truncate(r.response, limit) if r.response else "",
        "conversation_group_id": r.conversation_group_id,
        "turn_id": r.turn_id or "",
    }
    for field in ("tool_calls", "contexts"):
        metadata[field] = _safe_truncate(getattr(r, field), limit)
    # expected_response may be a list, so it has its own formatter.
    metadata["expected_response"] = _format_expected_response(
        r.expected_response, limit
    )
    for field in ("expected_intent", "expected_tool_calls", "expected_keywords"):
        metadata[field] = _safe_truncate(getattr(r, field), limit)
    return metadata
| 217 | + |
| 218 | + |
def _safe_truncate(value: Optional[str], max_len: int) -> str:
    """Return *value* as a truncated string, or ``""`` when there is no text.

    ``None`` and values whose string form is empty or whitespace-only yield
    an empty string; anything else is converted with ``str`` and truncated.
    """
    text = "" if value is None else str(value)
    if not text.strip():
        return ""
    return _truncate(text, max_len)
| 224 | + |
| 225 | + |
def _format_expected_response(
    value: str | list[str] | None, max_len: int
) -> str:
    """Render ``expected_response`` as one truncated string.

    ``None`` becomes ``""``; a list has its items stringified and joined
    with a ``---`` separator line; any other value is stringified as-is.
    """
    if value is None:
        return ""
    text = "\n---\n".join(map(str, value)) if isinstance(value, list) else str(value)
    return _truncate(text, max_len)
| 237 | + |
| 238 | + |
| 239 | +def _truncate(s: str, max_len: int) -> str: |
| 240 | + """Truncate a string with ellipsis if it exceeds max_len.""" |
| 241 | + if len(s) <= max_len: |
| 242 | + return s |
| 243 | + return s[: max_len - 3] + "..." |
0 commit comments