From cbc9611472545d764d2b8a6b3fec6e5b5c6c742c Mon Sep 17 00:00:00 2001 From: Eva Micankova Date: Tue, 28 Apr 2026 17:58:03 +0200 Subject: [PATCH 1/2] LEADS-349-calculate-aggregated-score-from-key-metrics --- src/lightspeed_evaluation/core/system/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightspeed_evaluation/core/system/loader.py b/src/lightspeed_evaluation/core/system/loader.py index 7dbb4c4f..29fcfe5f 100644 --- a/src/lightspeed_evaluation/core/system/loader.py +++ b/src/lightspeed_evaluation/core/system/loader.py @@ -149,7 +149,7 @@ def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig: judge_panel_data = config_data.get("judge_panel") judge_panel = JudgePanelConfig(**judge_panel_data) if judge_panel_data else None - # Parse storage backends with backward compatibility for legacy 'output' section + # Parse storage backends with backward compatibility storage_data = self._get_storage_config_with_backward_compat(config_data) storage_backends = self._parse_storage_config(storage_data) From 6e035483ade2e4ebdf784aa9ad6f83e4c4dc93b9 Mon Sep 17 00:00:00 2001 From: Eva Micankova Date: Tue, 5 May 2026 13:51:36 +0200 Subject: [PATCH 2/2] API latency calculation --- README.md | 1 + config/system.yaml | 1 + docs/EVALUATION_GUIDE.md | 12 +- src/lightspeed_evaluation/core/constants.py | 1 + .../core/models/__init__.py | 4 +- src/lightspeed_evaluation/core/models/data.py | 10 + .../core/models/quality.py | 28 +- .../core/models/statistics.py | 22 +- .../core/models/summary.py | 25 +- .../core/output/generator.py | 148 ++++- .../core/output/statistics.py | 85 ++- .../core/storage/sql_storage.py | 2 + .../core/system/loader.py | 2 +- .../pipeline/evaluation/amender.py | 17 +- .../pipeline/evaluation/evaluator.py | 27 +- .../runner/evaluation.py | 8 +- tests/unit/core/models/conftest.py | 49 +- tests/unit/core/models/test_quality.py | 123 +++- tests/unit/core/models/test_summary.py | 80 ++- 
tests/unit/core/output/test_generator.py | 22 +- tests/unit/core/output/test_statistics.py | 580 +----------------- tests/unit/core/output/test_statistics_api.py | 268 ++++++++ .../core/output/test_statistics_detailed.py | 392 ++++++++++++ tests/unit/core/storage/test_sql_storage.py | 1 + .../unit/pipeline/evaluation/test_amender.py | 87 +++ 25 files changed, 1333 insertions(+), 662 deletions(-) create mode 100644 tests/unit/core/output/test_statistics_api.py create mode 100644 tests/unit/core/output/test_statistics_detailed.py diff --git a/README.md b/README.md index 6d9c4177..d659e722 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ A comprehensive framework for evaluating GenAI applications. - **API Integration**: Direct integration with external API for real-time data generation (if enabled) - **Setup/Cleanup Scripts**: Support for running setup and cleanup scripts before/after each conversation evaluation (applicable when API is enabled) - **Token Usage Tracking**: Track input/output tokens for both API calls and Judge LLM evaluations (per-judge tracking for panel mode) +- **API Latency Tracking**: Measure and analyze API response times with percentile statistics (p50, p95, p99) for performance monitoring - **Streaming Performance Metrics**: Capture time-to-first-token (TTFT), streaming duration, and tokens/second when using streaming endpoint - **Statistical Analysis**: Statistics for every metric with score distribution analysis - **Rich Output**: CSV, JSON, TXT reports + visualization graphs (pass rates, distributions, heatmaps) diff --git a/config/system.yaml b/config/system.yaml index 7187a018..4ff16404 100644 --- a/config/system.yaml +++ b/config/system.yaml @@ -274,6 +274,7 @@ storage: - "response" - "api_input_tokens" - "api_output_tokens" + - "agent_latency" # Streaming performance metrics (only populated when using streaming endpoint) - "time_to_first_token" # Time to first token in seconds - "streaming_duration" # Total streaming duration in 
seconds diff --git a/docs/EVALUATION_GUIDE.md b/docs/EVALUATION_GUIDE.md index 30cf5ea0..683dffce 100644 --- a/docs/EVALUATION_GUIDE.md +++ b/docs/EVALUATION_GUIDE.md @@ -1127,6 +1127,7 @@ Contains every metric evaluation with: - Detailed reasoning - Query and response text - Execution time +- API latency **Use for:** Drilling into specific failures, detailed analysis @@ -1180,6 +1181,16 @@ ragas:faithfulness: - **ERROR** ⚠️: Evaluation couldn't complete (missing data, API failure, etc.) - **SKIPPED** ⏭️: Evaluation skipped due to prior failure (when `skip_on_failure` is enabled) +### Performance Metrics (API Enabled Only) + +**API Latency**: Response time per API call with percentile stats (p50, p95, p99). Cached responses (zero tokens) are excluded to avoid skewing statistics. + +**Streaming Metrics**: Time-to-first-token, streaming duration, and tokens/second when using streaming endpoints. + +**Token Usage**: Track consumption across Judge LLM, embeddings, and API calls. + +**Note:** Cached responses are detected by zero `api_input_tokens` and `api_output_tokens` — latency is set to 0 for these. + ### Score Quality Levels | Score | Quality | Recommendation | @@ -1912,4 +1923,3 @@ This comprehensive guide has covered everything you need to know to effectively *This guide is designed to make AI evaluation accessible to everyone. Whether you're a product manager making decisions, a QA engineer testing systems, or a developer integrating evaluation into workflows, you now have everything you need to ensure your AI applications meet quality standards.* **Happy Evaluating! 
🚀** - diff --git a/src/lightspeed_evaluation/core/constants.py b/src/lightspeed_evaluation/core/constants.py index 22b47dca..bb4cf114 100644 --- a/src/lightspeed_evaluation/core/constants.py +++ b/src/lightspeed_evaluation/core/constants.py @@ -110,6 +110,7 @@ # Streaming performance metrics "time_to_first_token", "streaming_duration", + "agent_latency", "tokens_per_second", "tool_calls", "contexts", diff --git a/src/lightspeed_evaluation/core/models/__init__.py b/src/lightspeed_evaluation/core/models/__init__.py index c1790194..de9b85b6 100644 --- a/src/lightspeed_evaluation/core/models/__init__.py +++ b/src/lightspeed_evaluation/core/models/__init__.py @@ -43,7 +43,7 @@ ConversationStats, TagStats, StreamingStats, - ApiTokenUsage, + AgentTokenUsage, ConfidenceInterval, DetailedStats, ) @@ -84,7 +84,7 @@ "ConversationStats", "TagStats", "StreamingStats", - "ApiTokenUsage", + "AgentTokenUsage", "ConfidenceInterval", "DetailedStats", # API models diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py index 0f22191e..5dfbbca4 100644 --- a/src/lightspeed_evaluation/core/models/data.py +++ b/src/lightspeed_evaluation/core/models/data.py @@ -84,6 +84,11 @@ class TurnData(StreamingMetricsMixin): default=0, ge=0, description="Output tokens used by API call" ) + # API execution time tracking (per turn) + agent_latency: float = Field( + default=0, ge=0, description="API call latency for this turn in seconds" + ) + # Per-turn metrics support turn_metrics: Optional[list[str]] = Field( default=None, @@ -526,6 +531,11 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin): execution_time: float = Field( default=0, ge=0, description="Execution time in seconds" ) + agent_latency: float = Field( + default=0, + ge=0, + description="API latency in seconds (per turn or average for conversation)", + ) api_input_tokens: int = Field(default=0, ge=0, description="API input tokens used") api_output_tokens: int = Field( default=0, 
ge=0, description="API output tokens used" diff --git a/src/lightspeed_evaluation/core/models/quality.py b/src/lightspeed_evaluation/core/models/quality.py index 9f309a8d..5ce1cdfc 100644 --- a/src/lightspeed_evaluation/core/models/quality.py +++ b/src/lightspeed_evaluation/core/models/quality.py @@ -9,7 +9,13 @@ from pydantic import BaseModel, Field -from lightspeed_evaluation.core.models import MetricStats, ScoreStatistics +from lightspeed_evaluation.core.models.statistics import ( + MetricStats, + NumericStats, + ScoreStatistics, + AgentTokenStats, +) + logger = logging.getLogger(__name__) @@ -44,17 +50,18 @@ class QualityReport(BaseModel): default_factory=list, description="Warnings about quality metrics configuration or usage", ) - api_latency: float = Field( - default=0.0, description="[Placeholder] Average API response time in seconds" + agent_latency_stats: Optional[NumericStats] = Field( + default=None, description="Agent latency statistics" ) - api_tokens: int = Field( - default=0, - description="[Placeholder] Total number of tokens consumed across all API calls", + agent_token_stats: Optional[AgentTokenStats] = Field( + default=None, description="Agent token usage statistics" ) @staticmethod def create_report( by_metric: dict[str, MetricStats], + agent_latency_stats: Optional[NumericStats], + agent_token_stats: Optional[AgentTokenStats], quality_score_metrics: list[str], ) -> Optional["QualityReport"]: """Creates a quality report with aggregated quality score from selected metrics. @@ -64,6 +71,8 @@ def create_report( Args: by_metric: Dictionary mapping metric identifiers to their computed statistics. + agent_latency_stats: Agent API latency statistics (p50, p95, p99). + agent_token_stats: Agent token usage statistics with percentiles. quality_score_metrics: Metric identifiers to include in quality score calculation. All specified metrics must exist in by_metric. 
@@ -148,14 +157,13 @@ def create_report( if stats is not None: extra_metrics[metric_id] = stats - # Calculate aggregated quality score - aggregated_score = QualityReport._calculate_quality_score(quality_metrics) - return QualityReport( - quality_score=aggregated_score, + quality_score=QualityReport._calculate_quality_score(quality_metrics), quality_metrics=quality_metrics, extra_metrics=extra_metrics, warnings=warnings, + agent_latency_stats=agent_latency_stats, + agent_token_stats=agent_token_stats, ) @staticmethod diff --git a/src/lightspeed_evaluation/core/models/statistics.py b/src/lightspeed_evaluation/core/models/statistics.py index da114fbd..7f144e9c 100644 --- a/src/lightspeed_evaluation/core/models/statistics.py +++ b/src/lightspeed_evaluation/core/models/statistics.py @@ -5,7 +5,7 @@ class NumericStats(BaseModel): - """Numeric statistics for a set of values (e.g., TTFT, duration).""" + """Numeric statistics for a set of values (e.g., TTFT, duration, latency).""" count: int = Field(default=0, description="Number of values") mean: Optional[float] = Field(default=None, description="Mean value") @@ -13,6 +13,8 @@ class NumericStats(BaseModel): std: Optional[float] = Field(default=None, description="Standard deviation") min_value: Optional[float] = Field(default=None, description="Minimum value") max_value: Optional[float] = Field(default=None, description="Maximum value") + p95: Optional[float] = Field(default=None, description="95th percentile") + p99: Optional[float] = Field(default=None, description="99th percentile") class ConfidenceInterval(BaseModel): @@ -116,11 +118,25 @@ class StreamingStats(BaseModel): ) -class ApiTokenUsage(BaseModel): - """API token usage totals.""" +class AgentTokenStats(BaseModel): + """Agent token usage statistics with percentiles.""" + + input: Optional[NumericStats] = Field( + default=None, description="Input token statistics" + ) + output: Optional[NumericStats] = Field( + default=None, description="Output token statistics" + 
) + + +class AgentTokenUsage(BaseModel): + """Agent token usage totals and statistics.""" total_api_input_tokens: int = Field(default=0, description="Total API input tokens") total_api_output_tokens: int = Field( default=0, description="Total API output tokens" ) total_api_tokens: int = Field(default=0, description="Total API tokens") + statistics: Optional[AgentTokenStats] = Field( + default=None, description="Agent token usage statistics with percentiles" + ) diff --git a/src/lightspeed_evaluation/core/models/summary.py b/src/lightspeed_evaluation/core/models/summary.py index f81cd6da..30a70f64 100644 --- a/src/lightspeed_evaluation/core/models/summary.py +++ b/src/lightspeed_evaluation/core/models/summary.py @@ -10,7 +10,8 @@ EvaluationResult, ) from lightspeed_evaluation.core.models.statistics import ( - ApiTokenUsage, + AgentTokenUsage, + NumericStats, ConversationStats, MetricStats, OverallStats, @@ -18,7 +19,8 @@ TagStats, ) from lightspeed_evaluation.core.output.statistics import ( - compute_api_token_usage, + compute_agent_token_usage, + compute_agent_latency_stats, compute_overall_stats, compute_streaming_stats, compute_tag_stats, @@ -50,8 +52,11 @@ class EvaluationSummary(BaseModel): by_tag: dict[str, TagStats] = Field( default_factory=dict, description="Statistics per tag" ) - api_tokens: Optional[ApiTokenUsage] = Field( - default=None, description="API token usage (when evaluation data provided)" + agent_token_usage: Optional[AgentTokenUsage] = Field( + default=None, description="Agent token usage with totals and statistics" + ) + agent_latency_stats: Optional[NumericStats] = Field( + default=None, description="Agent latency statistics (when API enabled)" ) streaming: Optional[StreamingStats] = Field( default=None, description="Streaming performance stats (when available)" @@ -70,7 +75,8 @@ def from_results( Args: results: List of evaluation results to summarize. - evaluation_data: Optional evaluation data for API token and streaming stats. 
+ evaluation_data: Optional evaluation data for API token, agent latency, + and streaming stats. compute_confidence_intervals: Whether to compute bootstrap confidence intervals. Default False. @@ -88,11 +94,13 @@ def from_results( by_tag = compute_tag_stats(results, compute_confidence_intervals) # Compute API token usage and streaming stats if evaluation data provided - api_tokens = None streaming = None + agent_token_usage = None + agent_latency_stats = None if evaluation_data: - api_tokens = compute_api_token_usage(evaluation_data) streaming = compute_streaming_stats(evaluation_data) + agent_token_usage = compute_agent_token_usage(evaluation_data) + agent_latency_stats = compute_agent_latency_stats(evaluation_data) return cls( timestamp=timestamp, @@ -101,6 +109,7 @@ def from_results( by_metric=by_metric, by_conversation=by_conversation, by_tag=by_tag, - api_tokens=api_tokens, + agent_token_usage=agent_token_usage, + agent_latency_stats=agent_latency_stats, streaming=streaming, ) diff --git a/src/lightspeed_evaluation/core/output/generator.py b/src/lightspeed_evaluation/core/output/generator.py index 3aed5f54..3ddfeb1e 100644 --- a/src/lightspeed_evaluation/core/output/generator.py +++ b/src/lightspeed_evaluation/core/output/generator.py @@ -18,9 +18,13 @@ ) from lightspeed_evaluation.core.models import EvaluationData, EvaluationResult from lightspeed_evaluation.core.models.summary import ( - ConversationStats, EvaluationSummary, +) +from lightspeed_evaluation.core.models.statistics import ( + AgentTokenStats, + ConversationStats, MetricStats, + NumericStats, OverallStats, StreamingStats, TagStats, @@ -91,6 +95,12 @@ def generate_reports( if quality_score_metrics: quality_report = QualityReport.create_report( summary.by_metric, + summary.agent_latency_stats, + ( + summary.agent_token_usage.statistics + if summary.agent_token_usage + else None + ), quality_score_metrics, ) @@ -313,6 +323,39 @@ def _generate_json_summary_from_model( return json_file + @staticmethod 
+ def _build_agent_token_stats_dict( + agent_token_stats: Optional[AgentTokenStats], + ) -> Optional[dict[str, Any]]: + """Build agent token stats dict, returning None if all sub-fields are None.""" + if not agent_token_stats: + return None + + input_stats = ( + { + "50%": agent_token_stats.input.median, + "95%": agent_token_stats.input.p95, + "99%": agent_token_stats.input.p99, + } + if agent_token_stats.input + else None + ) + output_stats = ( + { + "50%": agent_token_stats.output.median, + "95%": agent_token_stats.output.p95, + "99%": agent_token_stats.output.p99, + } + if agent_token_stats.output + else None + ) + + # Return None if both input and output are None + if input_stats is None and output_stats is None: + return None + + return {"input": input_stats, "output": output_stats} + def _generate_quality_score_report( self, quality_report: QualityReport, @@ -350,8 +393,14 @@ def _generate_quality_score_report( } for metric_id, stats in quality_report.extra_metrics.items() }, - "api_latency": quality_report.api_latency, - "api_tokens": quality_report.api_tokens, + "agent_latency_stats": ( + _numeric_stats_to_dict(quality_report.agent_latency_stats) + if quality_report.agent_latency_stats is not None + else None + ), + "agent_token_stats": OutputHandler._build_agent_token_stats_dict( + quality_report.agent_token_stats + ), "warnings": quality_report.warnings, } @@ -382,8 +431,12 @@ def _generate_text_summary_from_model( # Build compatible dicts from summary model basic_stats = _overall_to_basic_stats_dict(summary.overall) api_tokens = ( - summary.api_tokens.model_dump() - if summary.api_tokens + { + "total_api_input_tokens": summary.agent_token_usage.total_api_input_tokens, + "total_api_output_tokens": summary.agent_token_usage.total_api_output_tokens, + "total_api_tokens": summary.agent_token_usage.total_api_tokens, + } + if summary.agent_token_usage else { "total_api_input_tokens": 0, "total_api_output_tokens": 0, @@ -408,6 +461,9 @@ def 
_generate_text_summary_from_model( # Token usage statistics self._write_token_stats(f, basic_stats, api_tokens) + # API latency statistics + self._write_agent_latency_stats(f, summary.agent_latency_stats) + # Streaming performance statistics self._write_streaming_stats(f, streaming_stats) @@ -456,6 +512,24 @@ def _write_token_stats( f.write(f"Output Tokens: {api_tokens.get('total_api_output_tokens', 0):,}\n") f.write(f"Total Tokens: {api_tokens.get('total_api_tokens', 0):,}\n\n") + def _write_agent_latency_stats( + self, f: Any, agent_latency: Optional[NumericStats] + ) -> None: + """Write API latency statistics section.""" + if agent_latency is None: + return # No API latency data available + + stats_dict = _numeric_stats_to_dict(agent_latency) + self._write_numeric_stats( + f, + "API Latency (seconds):\n" + "-" * 20, + stats_dict, + precision=3, + include_std=True, + include_percentiles=True, + ) + f.write("\n") + def _write_streaming_stats(self, f: Any, streaming_stats: dict[str, Any]) -> None: """Write streaming performance statistics section.""" # Check if there are any streaming metrics @@ -469,35 +543,46 @@ def _write_streaming_stats(self, f: Any, streaming_stats: dict[str, Any]) -> Non f.write("Streaming Performance:\n") f.write("-" * 20 + "\n") - self._write_numeric_stats(f, "Time to First Token (seconds)", ttft) - self._write_numeric_stats(f, "Streaming Duration (seconds)", duration) - self._write_numeric_stats(f, "Tokens per Second", throughput, precision=1) + self._write_numeric_stats(f, "Time to First Token (seconds):", ttft) + self._write_numeric_stats(f, "Streaming Duration (seconds):", duration) + self._write_numeric_stats(f, "Tokens per Second:", throughput, precision=1) f.write("\n") - def _write_numeric_stats( + def _write_numeric_stats( # pylint: disable=too-many-arguments self, f: Any, title: str, stats: dict[str, Any], *, precision: int = 3, + include_std: bool = False, + include_percentiles: bool = False, ) -> None: - """Write numeric 
statistics with mean, median, min, max. + """Write numeric statistics with mean, median, min, max, and optional percentiles. Args: f: File handle to write to. title: Section title. - stats: Statistics dictionary with mean, median, min, max, count. + stats: Statistics dictionary with mean, median, min, max, count, and optional p95/p99. precision: Decimal precision for formatting numbers. + include_std: Whether to include standard deviation if available. + include_percentiles: Whether to include p95 and p99 percentiles if available. """ if stats.get("count", 0) == 0: return fmt = f".{precision}f" - f.write(f"{title}:\n") + f.write(f"{title}\n") f.write(f" Mean: {stats['mean']:{fmt}}\n") f.write(f" Median: {stats['median']:{fmt}}\n") + if include_percentiles: + if stats.get("p95") is not None: + f.write(f" P95: {stats['p95']:{fmt}}\n") + if stats.get("p99") is not None: + f.write(f" P99: {stats['p99']:{fmt}}\n") + if include_std and stats["count"] > 1: + f.write(f" Std Dev: {stats['std']:{fmt}}\n") f.write(f" Min: {stats['min']:{fmt}}, Max: {stats['max']:{fmt}}\n") def _write_breakdown_section( @@ -681,17 +766,17 @@ def _build_json_summary_stats(summary: EvaluationSummary) -> dict[str, Any]: Dictionary matching the JSON summary format. 
""" overall = summary.overall - api_tokens = summary.api_tokens + agent_token_usage = summary.agent_token_usage judge_tokens = overall.total_judge_llm_tokens - api_total = api_tokens.total_api_tokens if api_tokens else 0 + api_total = agent_token_usage.total_api_tokens if agent_token_usage else 0 overall_stats = { **_overall_to_basic_stats_dict(overall), "total_api_input_tokens": ( - api_tokens.total_api_input_tokens if api_tokens else 0 + agent_token_usage.total_api_input_tokens if agent_token_usage else 0 ), "total_api_output_tokens": ( - api_tokens.total_api_output_tokens if api_tokens else 0 + agent_token_usage.total_api_output_tokens if agent_token_usage else 0 ), "total_api_tokens": api_total, "total_tokens": judge_tokens + api_total, @@ -704,6 +789,11 @@ def _build_json_summary_stats(summary: EvaluationSummary) -> dict[str, Any]: "by_tag": _tag_stats_to_dict(summary.by_tag), } + if summary.agent_latency_stats is not None: + result["agent_latency_stats"] = _numeric_stats_to_dict( + summary.agent_latency_stats + ) + if summary.streaming is not None: result["streaming_performance"] = _streaming_stats_to_dict(summary.streaming) @@ -735,12 +825,13 @@ def _result_to_json_dict(r: EvaluationResult) -> dict[str, Any]: ), "time_to_first_token": r.time_to_first_token, "streaming_duration": r.streaming_duration, + "agent_latency": r.agent_latency, "tokens_per_second": r.tokens_per_second, } def _overall_to_basic_stats_dict( - overall: "OverallStats", + overall: OverallStats, ) -> dict[str, Any]: """Convert OverallStats to the dict format expected by text output. 
@@ -889,14 +980,21 @@ def _streaming_stats_to_dict(streaming: StreamingStats) -> dict[str, Any]: ): numeric = getattr(streaming, field_name, None) if numeric is not None: - result[field_name] = { - "count": numeric.count, - "mean": numeric.mean, - "median": numeric.median, - "std": numeric.std, - "min": numeric.min_value, - "max": numeric.max_value, - } + result[field_name] = _numeric_stats_to_dict(numeric) else: result[field_name] = {"count": 0} return result + + +def _numeric_stats_to_dict(numeric: NumericStats) -> dict[str, Any]: + """Convert NumericStats model to dict format for text output.""" + return { + "count": numeric.count, + "mean": numeric.mean, + "median": numeric.median, + "std": numeric.std, + "min": numeric.min_value, + "max": numeric.max_value, + "p95": numeric.p95, + "p99": numeric.p99, + } diff --git a/src/lightspeed_evaluation/core/output/statistics.py b/src/lightspeed_evaluation/core/output/statistics.py index 91c1c8d2..bf467491 100644 --- a/src/lightspeed_evaluation/core/output/statistics.py +++ b/src/lightspeed_evaluation/core/output/statistics.py @@ -6,13 +6,17 @@ import numpy as np import pandas as pd -from lightspeed_evaluation.core.models import ( +from lightspeed_evaluation.core.models.data import ( EvaluationData, EvaluationResult, +) + +from lightspeed_evaluation.core.models.statistics import ( NumericStats, ScoreStatistics, StreamingStats, - ApiTokenUsage, + AgentTokenUsage, + AgentTokenStats, OverallStats, MetricStats, ConversationStats, @@ -77,6 +81,8 @@ def compute_numeric_stats(values: list[float]) -> Optional[NumericStats]: std=statistics.stdev(values) if len(values) > 1 else 0.0, min_value=min(values), max_value=max(values), + p95=float(np.percentile(values, 95)), + p99=float(np.percentile(values, 99)), ) @@ -164,8 +170,8 @@ def compute_score_statistics( ) -def compute_api_token_usage(evaluation_data: list[EvaluationData]) -> ApiTokenUsage: - """Compute total API token usage from evaluation data.""" +def 
compute_agent_token_usage(evaluation_data: list[EvaluationData]) -> AgentTokenUsage: + """Compute agent token usage with totals and statistics from evaluation data.""" total_input_tokens = 0 total_output_tokens = 0 @@ -174,10 +180,11 @@ def compute_api_token_usage(evaluation_data: list[EvaluationData]) -> ApiTokenUs total_input_tokens += turn.api_input_tokens total_output_tokens += turn.api_output_tokens - return ApiTokenUsage( + return AgentTokenUsage( total_api_input_tokens=total_input_tokens, total_api_output_tokens=total_output_tokens, total_api_tokens=total_input_tokens + total_output_tokens, + statistics=compute_agent_token_stats(evaluation_data), ) @@ -261,3 +268,71 @@ def compute_detailed_stats(results: list[EvaluationResult]) -> DetailedStats: by_conversation=compute_conversation_stats(results), by_tag=compute_tag_stats(results, compute_ci=True), ) + + +def compute_field_numeric_stats_from_evaluation_data( + evaluation_data: list[EvaluationData], field_name: str +) -> Optional[NumericStats]: + """Calculate statistics for a numeric field, filtering out zeros (unmeasured values). + + Args: + evaluation_data: List of evaluation records to inspect. + field_name: Name of the numeric field to compute stats for. + + Returns: + Optional[NumericStats]: Dictionary of computed statistics including count, mean, median, + min, max, and standard deviation. Note that zero values are filtered out before + computing statistics as they represent unmeasured values. + """ + values = [] + for conv_data in evaluation_data: + for turn in conv_data.turns: + value = getattr(turn, field_name, 0) + if value > 0: + values.append(value) + + return compute_numeric_stats(values) + + +def compute_agent_latency_stats( + evaluation_data: list[EvaluationData], +) -> Optional[NumericStats]: + """Compute agent latency statistics from evaluation data. + + Args: + evaluation_data: List of evaluation data containing turn-level latency values. 
+ + Returns: + NumericStats instance with computed statistics, or None if no evaluation data + or no valid (non-zero) latency values exist. + """ + if not evaluation_data: + return None + num_stats = compute_field_numeric_stats_from_evaluation_data( + evaluation_data, "agent_latency" + ) + return num_stats + + +def compute_agent_token_stats( + evaluation_data: list[EvaluationData], +) -> Optional[AgentTokenStats]: + """Calculate agent token usage statistics with percentiles from evaluation data. + + Args: + evaluation_data: List of evaluation data containing turn-level token counts. + + Returns: + AgentTokenStats instance with input/output token statistics, or None if no data. + """ + if not evaluation_data: + return None + + input_tokens_stats = compute_field_numeric_stats_from_evaluation_data( + evaluation_data, "api_input_tokens" + ) + output_tokens_stats = compute_field_numeric_stats_from_evaluation_data( + evaluation_data, "api_output_tokens" + ) + + return AgentTokenStats(input=input_tokens_stats, output=output_tokens_stats) diff --git a/src/lightspeed_evaluation/core/storage/sql_storage.py b/src/lightspeed_evaluation/core/storage/sql_storage.py index 892d06db..ae0b07b3 100644 --- a/src/lightspeed_evaluation/core/storage/sql_storage.py +++ b/src/lightspeed_evaluation/core/storage/sql_storage.py @@ -66,6 +66,7 @@ class EvaluationResultDB(Base): # pylint: disable=too-few-public-methods judge_scores = Column(Text, nullable=True) time_to_first_token = Column(Float, nullable=True) streaming_duration = Column(Float, nullable=True) + agent_latency = Column(Float, nullable=True) tokens_per_second = Column(Float, nullable=True) tool_calls = Column(Text, nullable=True) contexts = Column(Text, nullable=True) @@ -326,6 +327,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB: judge_scores=self._serialize_judge_scores(result.judge_scores), time_to_first_token=result.time_to_first_token, streaming_duration=result.streaming_duration, + 
agent_latency=result.agent_latency, tokens_per_second=result.tokens_per_second, tool_calls=result.tool_calls, contexts=result.contexts, diff --git a/src/lightspeed_evaluation/core/system/loader.py b/src/lightspeed_evaluation/core/system/loader.py index 29fcfe5f..7dbb4c4f 100644 --- a/src/lightspeed_evaluation/core/system/loader.py +++ b/src/lightspeed_evaluation/core/system/loader.py @@ -149,7 +149,7 @@ def _create_system_config(self, config_data: dict[str, Any]) -> SystemConfig: judge_panel_data = config_data.get("judge_panel") judge_panel = JudgePanelConfig(**judge_panel_data) if judge_panel_data else None - # Parse storage backends with backward compatibility + # Parse storage backends with backward compatibility for legacy 'output' section storage_data = self._get_storage_config_with_backward_compat(config_data) storage_backends = self._parse_storage_config(storage_data) diff --git a/src/lightspeed_evaluation/pipeline/evaluation/amender.py b/src/lightspeed_evaluation/pipeline/evaluation/amender.py index 17d90506..21d2b22e 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/amender.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/amender.py @@ -1,6 +1,7 @@ """API Data Amendment module - handles API data enrichment.""" import logging +import time from typing import Any, Optional from lightspeed_evaluation.core.api import APIClient @@ -36,6 +37,8 @@ def amend_single_turn( logger.debug("Amending turn %s with API data", turn_data.turn_id) + # Track API call execution time + api_start_time = time.perf_counter() try: api_response = self.api_client.query( query=turn_data.query, @@ -43,6 +46,7 @@ def amend_single_turn( attachments=turn_data.attachments, extra_request_params=turn_data.extra_request_params, ) + api_latency = time.perf_counter() - api_start_time # AMEND EVALUATION DATA: This modifies the loaded TurnData object in-place # Update response from API @@ -63,11 +67,19 @@ def amend_single_turn( # Update token usage from API output (with fallback to 0 
if not present) turn_data.api_input_tokens = getattr(api_response, "input_tokens", 0) turn_data.api_output_tokens = getattr(api_response, "output_tokens", 0) + + # Update API latency only for actual API calls (cached responses have 0 tokens) + turn_data.agent_latency = ( + api_latency + if (turn_data.api_input_tokens > 0 or turn_data.api_output_tokens > 0) + else 0.0 + ) logger.debug( - "Token usage for turn %s: input=%d, output=%d", + "Token usage for turn %s: input=%d, output=%d, API latency=%.3fs", turn_data.turn_id, turn_data.api_input_tokens, turn_data.api_output_tokens, + turn_data.agent_latency, ) # Update streaming performance metrics (only available for streaming endpoint) @@ -105,6 +117,9 @@ def amend_single_turn( return None, api_response.conversation_id except APIError as e: + # Record elapsed time even on error + api_latency = time.perf_counter() - api_start_time + turn_data.agent_latency = api_latency error_msg = f"API Error for turn {turn_data.turn_id}: {e}" logger.error(error_msg) return error_msg, conversation_id diff --git a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py index 80328197..d447a7bf 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py @@ -49,7 +49,7 @@ def _to_json_str(value: Any) -> Optional[str]: return str(value) -def _calculate_api_token_counts_per_request( +def _compute_api_token_counts_per_request( request: EvaluationRequest, ) -> tuple[int, int]: """Resolve API token counts for a request.""" @@ -71,6 +71,21 @@ def _calculate_api_token_counts_per_request( return (api_input_tokens_sum, api_output_tokens_sum) +def _compute_agent_latency_per_request(request: EvaluationRequest) -> float: + """Resolve API latency for a request. + + For turn-level: return the turn's API latency. + For conversation-level: return SUM of API latency per turn. 
+ """ + # For turn-level metric + if request.turn_data is not None: + return request.turn_data.agent_latency + + # For conversation-level: sum of per turn latencies + latencies = [turn.agent_latency for turn in request.conv_data.turns] + return sum(latencies) + + class MetricsEvaluator: """Handles individual metric evaluation with proper scoring and status determination.""" @@ -212,9 +227,10 @@ def evaluate_metric( # pylint: disable=too-many-locals execution_time = time.time() - start_time turn_data = request.turn_data - api_input_tokens, api_output_tokens = ( - _calculate_api_token_counts_per_request(request) + api_input_tokens, api_output_tokens = _compute_api_token_counts_per_request( + request ) + agent_latency = _compute_agent_latency_per_request(request) return EvaluationResult( **metric_result.model_dump(), conversation_group_id=request.conv_data.conversation_group_id, @@ -225,6 +241,7 @@ def evaluate_metric( # pylint: disable=too-many-locals query=turn_data.query if turn_data else "", response=turn_data.response or "" if turn_data else "", execution_time=execution_time, + agent_latency=agent_latency, api_input_tokens=api_input_tokens, api_output_tokens=api_output_tokens, # Streaming performance metrics @@ -707,9 +724,10 @@ def _create_error_result( ) -> EvaluationResult: """Create an ERROR result for failed evaluation.""" turn_data = request.turn_data - api_input_tokens, api_output_tokens = _calculate_api_token_counts_per_request( + api_input_tokens, api_output_tokens = _compute_api_token_counts_per_request( request ) + agent_latency = _compute_agent_latency_per_request(request) return EvaluationResult( conversation_group_id=request.conv_data.conversation_group_id, tag=request.conv_data.tag, @@ -723,6 +741,7 @@ def _create_error_result( query=turn_data.query if turn_data else "", response=turn_data.response or "" if turn_data else "", execution_time=execution_time, + agent_latency=agent_latency, api_input_tokens=api_input_tokens, 
api_output_tokens=api_output_tokens, # Streaming performance metrics diff --git a/src/lightspeed_evaluation/runner/evaluation.py b/src/lightspeed_evaluation/runner/evaluation.py index f81304e1..c54563e0 100644 --- a/src/lightspeed_evaluation/runner/evaluation.py +++ b/src/lightspeed_evaluation/runner/evaluation.py @@ -11,7 +11,7 @@ LLMPoolConfig, SystemConfig, OverallStats, - ApiTokenUsage, + AgentTokenUsage, ) # Import only lightweight modules at top level @@ -65,7 +65,7 @@ def _clear_caches(system_config: SystemConfig) -> None: def _print_summary( summary: OverallStats, - api_tokens: Optional[ApiTokenUsage] = None, + api_tokens: Optional[AgentTokenUsage] = None, ) -> None: """Print evaluation summary and token usage.""" print( @@ -132,7 +132,7 @@ def run_evaluation( # pylint: disable=too-many-locals from lightspeed_evaluation.api import evaluate from lightspeed_evaluation.core.output import OutputHandler from lightspeed_evaluation.core.output.statistics import ( - compute_api_token_usage, + compute_agent_token_usage, compute_overall_stats, ) from lightspeed_evaluation.core.system import DataValidator @@ -201,7 +201,7 @@ def run_evaluation( # pylint: disable=too-many-locals # Final Summary summary = compute_overall_stats(results) api_tokens = ( - compute_api_token_usage(evaluation_data) + compute_agent_token_usage(evaluation_data) if system_config.agents is not None and system_config.agents.enabled else None ) diff --git a/tests/unit/core/models/conftest.py b/tests/unit/core/models/conftest.py index c9a4d699..7d76436f 100644 --- a/tests/unit/core/models/conftest.py +++ b/tests/unit/core/models/conftest.py @@ -2,7 +2,12 @@ import pytest -from lightspeed_evaluation.core.models import MetricStats, ScoreStatistics +from lightspeed_evaluation.core.models.statistics import ( + AgentTokenStats, + MetricStats, + NumericStats, + ScoreStatistics, +) @pytest.fixture @@ -75,3 +80,45 @@ def quality_by_metric_with_none() -> dict[str, MetricStats]: ), ), } + + +@pytest.fixture 
+def api_latency_summary() -> NumericStats: + """Sample API latency statistics for quality report tests.""" + return NumericStats( + count=10, + mean=1.5, + median=1.3, + std=0.5, + min_value=0.8, + max_value=2.5, + p95=2.2, + p99=2.4, + ) + + +@pytest.fixture +def agent_token_stats() -> AgentTokenStats: + """Sample agent token statistics for quality report tests.""" + return AgentTokenStats( + input=NumericStats( + count=10, + mean=450.5, + median=425.0, + std=50.2, + min_value=350.0, + max_value=550.0, + p95=520.0, + p99=545.0, + ), + output=NumericStats( + count=10, + mean=180.3, + median=175.0, + std=25.1, + min_value=140.0, + max_value=220.0, + p95=210.0, + p99=218.0, + ), + ) diff --git a/tests/unit/core/models/test_quality.py b/tests/unit/core/models/test_quality.py index 500154f1..d87ee295 100644 --- a/tests/unit/core/models/test_quality.py +++ b/tests/unit/core/models/test_quality.py @@ -3,7 +3,11 @@ from pytest import LogCaptureFixture from lightspeed_evaluation.core.models.quality import QualityReport -from lightspeed_evaluation.core.models.statistics import MetricStats +from lightspeed_evaluation.core.models.statistics import ( + AgentTokenStats, + MetricStats, + NumericStats, +) class TestQualityReport: @@ -12,6 +16,7 @@ class TestQualityReport: def test_quality_report_creation_happy_path( self, quality_by_metric: dict[str, MetricStats], + api_latency_summary: NumericStats, ) -> None: """Test QualityReport creation with valid metrics.""" @@ -19,7 +24,9 @@ def test_quality_report_creation_happy_path( quality_score_metrics = ["ragas:faithfulness", "ragas:answer_relevancy"] # Create the QualityReport - report = QualityReport.create_report(quality_by_metric, quality_score_metrics) + report = QualityReport.create_report( + quality_by_metric, api_latency_summary, None, quality_score_metrics + ) # Assertions assert report is not None @@ -52,17 +59,26 @@ def test_quality_report_creation_happy_path( # Verify extra metrics contain correct mean scores assert 
report.extra_metrics["custom:context_recall"].mean == 0.75 + # Verify agent latency is set correctly + assert report.agent_latency_stats is not None + assert report.agent_latency_stats.count == 10 + assert report.agent_latency_stats.mean == 1.5 + # Verify no warnings for valid configuration assert len(report.warnings) == 0 def test_quality_report_creation_missing_metric( - self, quality_by_metric: dict[str, MetricStats] + self, + quality_by_metric: dict[str, MetricStats], + api_latency_summary: NumericStats, ) -> None: """Test QualityReport excludes missing metrics and generates warning.""" quality_score_metrics = ["ragas:faithfulness", "ragas:answer_correctness"] # Create the QualityReport - report = QualityReport.create_report(quality_by_metric, quality_score_metrics) + report = QualityReport.create_report( + quality_by_metric, api_latency_summary, None, quality_score_metrics + ) # Assertions assert report is not None @@ -89,7 +105,10 @@ def test_quality_report_creation_missing_metric( assert "ragas:answer_relevancy" in report.extra_metrics def test_quality_report_total_samples_zero( - self, quality_by_metric_zero: dict[str, MetricStats], caplog: LogCaptureFixture + self, + quality_by_metric_zero: dict[str, MetricStats], + api_latency_summary: NumericStats, + caplog: LogCaptureFixture, ) -> None: """Test QualityReport returns None when all quality metrics have zero samples.""" # Define quality score metrics (subset of all metrics) @@ -97,7 +116,7 @@ def test_quality_report_total_samples_zero( # Create the QualityReport report = QualityReport.create_report( - quality_by_metric_zero, quality_score_metrics + quality_by_metric_zero, api_latency_summary, None, quality_score_metrics ) # Assertions @@ -105,7 +124,9 @@ def test_quality_report_total_samples_zero( assert "Quality score computation failed" in caplog.text def test_quality_report_sample_size_zero( - self, quality_by_metric_zero: dict[str, MetricStats] + self, + quality_by_metric_zero: dict[str, MetricStats], 
+ api_latency_summary: NumericStats, ) -> None: """Test QualityReport excludes metrics with zero samples and generates warning.""" # Define quality score metrics (subset of all metrics) @@ -113,7 +134,7 @@ def test_quality_report_sample_size_zero( # Create the QualityReport report = QualityReport.create_report( - quality_by_metric_zero, quality_score_metrics + quality_by_metric_zero, api_latency_summary, None, quality_score_metrics ) # Assertions @@ -126,6 +147,7 @@ def test_quality_report_sample_size_zero( def test_quality_report_none_score_statistics( self, quality_by_metric_with_none: dict[str, MetricStats], + api_latency_summary: NumericStats, caplog: LogCaptureFixture, ) -> None: """Test QualityReport excludes metrics with None score_statistics and logs warning.""" @@ -134,7 +156,10 @@ def test_quality_report_none_score_statistics( # Create the QualityReport report = QualityReport.create_report( - quality_by_metric_with_none, quality_score_metrics + quality_by_metric_with_none, + api_latency_summary, + None, + quality_score_metrics, ) # Assertions @@ -163,3 +188,83 @@ def test_quality_report_none_score_statistics( # Verify warning was logged assert "ragas:faithfulness" in caplog.text assert "Missing score statistics data" in caplog.text + + def test_quality_report_creation_no_api_latency( + self, + quality_by_metric: dict[str, MetricStats], + ) -> None: + """Test QualityReport handles None API latency (api_enabled=False).""" + quality_score_metrics = ["ragas:faithfulness", "ragas:answer_relevancy"] + api_latency_summary = None + + # Create the QualityReport with None agent_latency_stats + report = QualityReport.create_report( + quality_by_metric, + api_latency_summary, # API disabled scenario + None, # No agent token stats + quality_score_metrics, + ) + + # Assertions + assert report is not None + assert report.agent_latency_stats is None # Should gracefully handle None + assert report.quality_score > 0 # Quality score still computed + assert 
len(report.quality_metrics) == 2 + + def test_quality_report_with_agent_token_stats( + self, + quality_by_metric: dict[str, MetricStats], + api_latency_summary: NumericStats, + agent_token_stats: AgentTokenStats, + ) -> None: + """Test QualityReport includes agent token statistics with percentiles.""" + quality_score_metrics = ["ragas:faithfulness", "ragas:answer_relevancy"] + + # Create the QualityReport with agent token stats + report = QualityReport.create_report( + quality_by_metric, + api_latency_summary, + agent_token_stats, + quality_score_metrics, + ) + + # Assertions + assert report is not None + assert report.agent_token_stats is not None + + # Verify input token statistics + assert report.agent_token_stats.input is not None + assert report.agent_token_stats.input.count == 10 + assert report.agent_token_stats.input.mean == 450.5 + assert report.agent_token_stats.input.median == 425.0 + assert report.agent_token_stats.input.p95 == 520.0 + assert report.agent_token_stats.input.p99 == 545.0 + + # Verify output token statistics + assert report.agent_token_stats.output is not None + assert report.agent_token_stats.output.count == 10 + assert report.agent_token_stats.output.mean == 180.3 + assert report.agent_token_stats.output.median == 175.0 + assert report.agent_token_stats.output.p95 == 210.0 + assert report.agent_token_stats.output.p99 == 218.0 + + def test_quality_report_with_no_agent_token_stats( + self, + quality_by_metric: dict[str, MetricStats], + api_latency_summary: NumericStats, + ) -> None: + """Test QualityReport handles None agent token stats gracefully.""" + quality_score_metrics = ["ragas:faithfulness", "ragas:answer_relevancy"] + + # Create the QualityReport without agent token stats + report = QualityReport.create_report( + quality_by_metric, + api_latency_summary, + None, # No agent token stats + quality_score_metrics, + ) + + # Assertions + assert report is not None + assert report.agent_token_stats is None + assert report.quality_score > 
0 # Quality score still computed diff --git a/tests/unit/core/models/test_summary.py b/tests/unit/core/models/test_summary.py index 15c8b1de..3e53d697 100644 --- a/tests/unit/core/models/test_summary.py +++ b/tests/unit/core/models/test_summary.py @@ -188,15 +188,82 @@ def test_with_evaluation_data(self) -> None: summary = EvaluationSummary.from_results(results, evaluation_data=eval_data) - assert summary.api_tokens is not None - assert summary.api_tokens.total_api_input_tokens == 500 - assert summary.api_tokens.total_api_output_tokens == 200 - assert summary.api_tokens.total_api_tokens == 700 + assert summary.agent_token_usage is not None + assert summary.agent_token_usage.total_api_input_tokens == 500 + assert summary.agent_token_usage.total_api_output_tokens == 200 + assert summary.agent_token_usage.total_api_tokens == 700 assert summary.streaming is not None assert summary.streaming.time_to_first_token is not None assert summary.streaming.time_to_first_token.mean == 0.5 + def test_with_api_latency(self) -> None: + """Test from_results computes agent latency statistics from evaluation data.""" + results = [_make_result(turn_id="t1")] + + eval_data = [ + EvaluationData( + conversation_group_id="conv1", + turns=[ + TurnData(turn_id="t1", query="Query 1", agent_latency=1.5), + TurnData(turn_id="t2", query="Query 2", agent_latency=2.0), + TurnData(turn_id="t3", query="Query 3", agent_latency=1.8), + ], + ) + ] + + summary = EvaluationSummary.from_results(results, evaluation_data=eval_data) + + assert summary.agent_latency_stats is not None + assert summary.agent_latency_stats.count == 3 + expected_mean = (1.5 + 2.0 + 1.8) / 3 # ≈ 1.7667 + assert summary.agent_latency_stats.mean is not None + assert abs(summary.agent_latency_stats.mean - expected_mean) < 0.0001 + assert summary.agent_latency_stats.min_value == 1.5 + assert summary.agent_latency_stats.max_value == 2.0 + + def test_without_api_latency(self) -> None: + """Test from_results with no agent latency data 
(default 0).""" + results = [_make_result(turn_id="t1")] + + eval_data = [ + EvaluationData( + conversation_group_id="conv1", + turns=[ + TurnData(turn_id="t1", query="Query 1", agent_latency=0), + TurnData(turn_id="t2", query="Query 2", agent_latency=0), + ], + ) + ] + + summary = EvaluationSummary.from_results(results, evaluation_data=eval_data) + + # agent_latency_stats should be None when all latencies are 0 + assert summary.agent_latency_stats is None + + def test_with_mixed_api_latency(self) -> None: + """Test from_results with mixed zero and non-zero latencies.""" + results = [_make_result(turn_id="t1")] + + eval_data = [ + EvaluationData( + conversation_group_id="conv1", + turns=[ + TurnData(turn_id="t1", query="Query 1", agent_latency=1.5), + TurnData(turn_id="t2", query="Query 2", agent_latency=0), # cached + TurnData(turn_id="t3", query="Query 3", agent_latency=2.0), + ], + ) + ] + + summary = EvaluationSummary.from_results(results, evaluation_data=eval_data) + + # Should compute stats only from non-zero values (1.5, 2.0) + assert summary.agent_latency_stats is not None + assert summary.agent_latency_stats.count == 2 + assert summary.agent_latency_stats.min_value == 1.5 + assert summary.agent_latency_stats.max_value == 2.0 + def test_without_confidence_intervals_by_default(self) -> None: """Test that confidence intervals are not computed by default.""" results = [ @@ -348,8 +415,9 @@ def test_model_dump_excludes_none_optional(self) -> None: summary = EvaluationSummary.from_results(results) dumped = summary.model_dump() - # api_tokens and streaming should be None when not provided - assert dumped["api_tokens"] is None + # agent_token_usage, agent_latency_stats, and streaming should be None when not provided + assert dumped["agent_token_usage"] is None + assert dumped["agent_latency_stats"] is None assert dumped["streaming"] is None def test_pydantic_validation(self) -> None: diff --git a/tests/unit/core/output/test_generator.py
b/tests/unit/core/output/test_generator.py index 4e9a8cfb..21ffd1ec 100644 --- a/tests/unit/core/output/test_generator.py +++ b/tests/unit/core/output/test_generator.py @@ -181,6 +181,8 @@ def test_generate_individual_reports_with_quality_report( # Create a quality report quality_report = QualityReport.create_report( summary.by_metric, + summary.agent_latency_stats, + summary.agent_token_usage.statistics if summary.agent_token_usage else None, ["ragas:faithfulness", "ragas:answer_relevancy"], ) assert quality_report is not None @@ -441,6 +443,8 @@ def test_generate_quality_score_report_all_fields( # Create a quality report quality_report = QualityReport.create_report( summary.by_metric, + summary.agent_latency_stats, + summary.agent_token_usage.statistics if summary.agent_token_usage else None, ["ragas:faithfulness", "ragas:answer_relevancy"], ) @@ -462,8 +466,8 @@ def test_generate_quality_score_report_all_fields( assert "quality_score" in data assert "quality_metrics" in data assert "extra_metrics" in data - assert "api_latency" in data - assert "api_tokens" in data + assert "agent_latency_stats" in data + assert "agent_token_stats" in data assert "warnings" in data # Check quality_score is a number @@ -485,9 +489,15 @@ def test_generate_quality_score_report_all_fields( # Check extra_metrics structure assert isinstance(data["extra_metrics"], dict) - # Check API fields are numeric - assert isinstance(data["api_latency"], (int, float)) - assert isinstance(data["api_tokens"], int) + # Check agent latency and token stats fields + # agent_latency_stats can be None or a dict with numeric stats + assert data["agent_latency_stats"] is None or isinstance( + data["agent_latency_stats"], dict + ) + # agent_token_stats can be None or a dict + assert data["agent_token_stats"] is None or isinstance( + data["agent_token_stats"], dict + ) # Check warnings is a list assert isinstance(data["warnings"], list) @@ -516,6 +526,8 @@ def test_quality_report_with_partial_metrics( # Try 
to create quality report with metrics that don't exist quality_report = QualityReport.create_report( summary.by_metric, + summary.agent_latency_stats, + summary.agent_token_usage.statistics if summary.agent_token_usage else None, ["ragas:faithfulness", "ragas:answer_relevancy", "nonexistent:metric"], ) diff --git a/tests/unit/core/output/test_statistics.py b/tests/unit/core/output/test_statistics.py index a88111a4..0d32eef7 100644 --- a/tests/unit/core/output/test_statistics.py +++ b/tests/unit/core/output/test_statistics.py @@ -1,20 +1,16 @@ -"""Unit tests for output statistics module.""" +"""Unit tests for core statistics module.""" import pytest import pandas as pd -from lightspeed_evaluation.core.models import ( - EvaluationData, +from lightspeed_evaluation.core.models.data import ( EvaluationResult, - TurnData, - OverallStats, ) +from lightspeed_evaluation.core.models.statistics import OverallStats from lightspeed_evaluation.core.output.statistics import ( compute_score_statistics, bootstrap_intervals, - compute_api_token_usage, compute_overall_stats, - compute_detailed_stats, ) @@ -346,573 +342,3 @@ def test_compute_overall_stats_single_result(self) -> None: total_embedding_tokens=0, ) assert stats == expected - - -class TestCalculateDetailedStats: - """Tests for compute_detailed_stats function.""" - - def test_detailed_stats_with_results( - self, sample_results_statistics: list[EvaluationResult] - ) -> None: - """Test detailed stats calculation.""" - stats = compute_detailed_stats(sample_results_statistics).model_dump() - - assert stats["by_metric"] - assert stats["by_conversation"] - - assert "metric1" in stats["by_metric"] - assert "metric2" in stats["by_metric"] - assert "conv1" in stats["by_conversation"] - assert "conv2" in stats["by_conversation"] - - def test_detailed_stats_empty_results(self) -> None: - """Test detailed stats with empty results.""" - stats = compute_detailed_stats([]).model_dump() - - assert not stats["by_metric"] - assert not 
stats["by_conversation"] - - def test_detailed_stats_metric_breakdown( - self, sample_results_statistics: list[EvaluationResult] - ) -> None: - """Test metric breakdown in detailed stats.""" - stats = compute_detailed_stats(sample_results_statistics).model_dump() - - metric1_stats = stats["by_metric"]["metric1"] - assert metric1_stats["passed"] == 1 - assert metric1_stats["failed"] == 1 - - metric2_stats = stats["by_metric"]["metric2"] - assert metric2_stats["passed"] == 1 - assert metric2_stats["error"] == 1 - - def test_detailed_stats_conversation_breakdown( - self, sample_results_statistics: list[EvaluationResult] - ) -> None: - """Test conversation breakdown in detailed stats.""" - stats = compute_detailed_stats(sample_results_statistics).model_dump() - - conv1_stats = stats["by_conversation"]["conv1"] - assert conv1_stats["passed"] == 1 - assert conv1_stats["failed"] == 1 - - conv2_stats = stats["by_conversation"]["conv2"] - assert conv2_stats["passed"] == 1 - assert conv2_stats["error"] == 1 - - def test_detailed_stats_includes_rates( - self, sample_results_statistics: list[EvaluationResult] - ) -> None: - """Test that detailed stats include percentage rates.""" - stats = compute_detailed_stats(sample_results_statistics).model_dump() - - metric1_stats = stats["by_metric"]["metric1"] - assert "pass_rate" in metric1_stats - assert "fail_rate" in metric1_stats - assert metric1_stats["pass_rate"] == 50.0 - assert metric1_stats["fail_rate"] == 50.0 - - def test_detailed_stats_single_metric(self) -> None: - """Test detailed stats with single metric.""" - results = [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="single_metric", - score=0.9, - result="PASS", - threshold=0.7, - ) - ] - - stats = compute_detailed_stats(results).model_dump() - - assert len(stats["by_metric"]) == 1 - assert "single_metric" in stats["by_metric"] - - def test_compute_detailed_stats_single_metric_single_conversation(self) -> None: - """Test 
compute_detailed_stats with single metric and conversation.""" - results = [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="ragas:faithfulness", - result="PASS", - score=0.8, - threshold=0.7, - reason="Good", - ), - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn2", - metric_identifier="ragas:faithfulness", - result="FAIL", - score=0.3, - threshold=0.7, - reason="Poor", - ), - ] - - stats = compute_detailed_stats(results).model_dump() - - # Check by_metric breakdown - assert "ragas:faithfulness" in stats["by_metric"] - metric_stats = stats["by_metric"]["ragas:faithfulness"] - assert metric_stats["passed"] == 1 - assert metric_stats["failed"] == 1 - assert metric_stats["error"] == 0 - assert metric_stats["pass_rate"] == 50.0 - - # Check by_conversation breakdown - assert "conv1" in stats["by_conversation"] - conv_stats = stats["by_conversation"]["conv1"] - assert conv_stats["passed"] == 1 - assert conv_stats["failed"] == 1 - assert conv_stats["error"] == 0 - assert conv_stats["pass_rate"] == 50.0 - - def test_compute_detailed_stats_multiple_metrics_conversations(self) -> None: - """Test compute_detailed_stats with multiple metrics and conversations.""" - results = [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="ragas:faithfulness", - result="PASS", - score=0.8, - threshold=0.7, - reason="Good", - ), - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="ragas:relevancy", - result="FAIL", - score=0.3, - threshold=0.7, - reason="Poor", - ), - EvaluationResult( - conversation_group_id="conv2", - turn_id="turn1", - metric_identifier="ragas:faithfulness", - result="PASS", - score=0.9, - threshold=0.7, - reason="Excellent", - ), - EvaluationResult( - conversation_group_id="conv2", - turn_id="turn1", - metric_identifier="ragas:relevancy", - result="ERROR", - score=0.0, - threshold=0.7, - reason="API error", - ), - ] - - stats 
= compute_detailed_stats(results).model_dump() - - # Check metrics - assert len(stats["by_metric"]) == 2 - assert "ragas:faithfulness" in stats["by_metric"] - assert "ragas:relevancy" in stats["by_metric"] - - faithfulness_stats = stats["by_metric"]["ragas:faithfulness"] - assert faithfulness_stats["passed"] == 2 - assert faithfulness_stats["failed"] == 0 - assert faithfulness_stats["error"] == 0 - assert faithfulness_stats["pass_rate"] == 100.0 - - relevancy_stats = stats["by_metric"]["ragas:relevancy"] - assert relevancy_stats["passed"] == 0 - assert relevancy_stats["failed"] == 1 - assert relevancy_stats["error"] == 1 - assert relevancy_stats["pass_rate"] == 0.0 - assert relevancy_stats["fail_rate"] == 50.0 - assert relevancy_stats["error_rate"] == 50.0 - - # Check conversations - assert len(stats["by_conversation"]) == 2 - assert "conv1" in stats["by_conversation"] - assert "conv2" in stats["by_conversation"] - - conv1_stats = stats["by_conversation"]["conv1"] - assert conv1_stats["passed"] == 1 - assert conv1_stats["failed"] == 1 - assert conv1_stats["error"] == 0 - assert conv1_stats["pass_rate"] == 50.0 - - conv2_stats = stats["by_conversation"]["conv2"] - assert conv2_stats["passed"] == 1 - assert conv2_stats["failed"] == 0 - assert conv2_stats["error"] == 1 - assert conv2_stats["pass_rate"] == 50.0 - assert conv2_stats["error_rate"] == 50.0 - - def test_compute_detailed_stats_score_statistics(self) -> None: - """Test compute_detailed_stats includes score statistics.""" - results = [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="ragas:faithfulness", - result="PASS", - score=0.8, - threshold=0.7, - reason="Good", - ), - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn2", - metric_identifier="ragas:faithfulness", - result="PASS", - score=0.9, - threshold=0.7, - reason="Excellent", - ), - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn3", - 
metric_identifier="ragas:faithfulness", - result="FAIL", - score=0.3, - threshold=0.7, - reason="Poor", - ), - ] - - stats = compute_detailed_stats(results).model_dump() - - metric_stats = stats["by_metric"]["ragas:faithfulness"] - assert "score_statistics" in metric_stats - - score_stats = metric_stats["score_statistics"] - assert score_stats["count"] == 3 - assert score_stats["mean"] == pytest.approx(0.6667, rel=1e-3) - assert score_stats["min_score"] == 0.3 - assert score_stats["max_score"] == 0.9 - assert score_stats["median"] == 0.8 - assert score_stats["std"] > 0 # Should have some standard deviation - # Confidence interval should be calculated for 3+ scores - assert "confidence_interval" in score_stats - ci = score_stats["confidence_interval"] - assert ci is not None - assert "low" in ci - assert "mean" in ci - assert "high" in ci - assert ci["confidence_level"] == 95 - - def test_compute_detailed_stats_no_scores(self) -> None: - """Test compute_detailed_stats with results that have no scores.""" - results = [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="test:metric", - result="ERROR", - threshold=0.7, - reason="API error", - ) - ] - - stats = compute_detailed_stats(results).model_dump() - - metric_stats = stats["by_metric"]["test:metric"] - assert "score_statistics" in metric_stats - - score_stats = metric_stats["score_statistics"] - assert score_stats["count"] == 0 - assert score_stats["mean"] == 0.0 - assert score_stats["median"] == 0.0 - assert score_stats["std"] == 0.0 - # Confidence interval should be None when no scores - assert score_stats["confidence_interval"] is None - - def test_compute_detailed_stats_single_score_no_confidence_interval(self) -> None: - """Test compute_detailed_stats with single score has no CI (needs 2+).""" - results = [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="test:metric", - result="PASS", - score=0.8, - threshold=0.7, - 
reason="Good", - ) - ] - - stats = compute_detailed_stats(results).model_dump() - - metric_stats = stats["by_metric"]["test:metric"] - score_stats = metric_stats["score_statistics"] - assert score_stats["count"] == 1 - # Confidence interval should be None for single score - assert score_stats["confidence_interval"] is None - - def test_compute_detailed_stats_by_tag(self) -> None: - """Test compute_detailed_stats includes by_tag breakdown.""" - results = [ - EvaluationResult( - conversation_group_id="conv1", - tag="production", - turn_id="turn1", - metric_identifier="metric1", - result="PASS", - score=0.9, - threshold=0.7, - reason="Good", - ), - EvaluationResult( - conversation_group_id="conv2", - tag="production", - turn_id="turn1", - metric_identifier="metric1", - result="PASS", - score=0.8, - threshold=0.7, - reason="Good", - ), - EvaluationResult( - conversation_group_id="conv3", - tag="staging", - turn_id="turn1", - metric_identifier="metric1", - result="FAIL", - score=0.5, - threshold=0.7, - reason="Below threshold", - ), - ] - - stats = compute_detailed_stats(results).model_dump() - - # Verify by_tag is present - assert "by_tag" in stats - assert "production" in stats["by_tag"] - assert "staging" in stats["by_tag"] - - # Check production tag stats - prod_stats = stats["by_tag"]["production"] - assert prod_stats["passed"] == 2 - assert prod_stats["failed"] == 0 - assert prod_stats["pass_rate"] == 100.0 - assert "score_statistics" in prod_stats - assert prod_stats["score_statistics"]["count"] == 2 - assert prod_stats["score_statistics"]["mean"] == pytest.approx(0.85) - - # Check staging tag stats - staging_stats = stats["by_tag"]["staging"] - assert staging_stats["passed"] == 0 - assert staging_stats["failed"] == 1 - assert staging_stats["fail_rate"] == 100.0 - assert "score_statistics" in staging_stats - - def test_compute_detailed_stats_default_tag(self) -> None: - """Test compute_detailed_stats with default 'eval' tag.""" - results = [ - EvaluationResult( - 
conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="metric1", - result="PASS", - threshold=0.7, - ), - ] - - stats = compute_detailed_stats(results).model_dump() - - # Default tag should be "eval" - assert "by_tag" in stats - assert "eval" in stats["by_tag"] - assert stats["by_tag"]["eval"]["passed"] == 1 - - -class TestCalculateApiTokenUsage: - """Tests for compute_api_token_usage function.""" - - def test_compute_api_token_usage_empty_data(self) -> None: - """Test compute_api_token_usage with empty data.""" - result = compute_api_token_usage([]) - assert result.total_api_input_tokens == 0 - assert result.total_api_output_tokens == 0 - assert result.total_api_tokens == 0 - - def test_compute_api_token_usage_single_turn(self) -> None: - """Test compute_api_token_usage with single turn.""" - turn = TurnData( - turn_id="turn1", - query="Test query", - response="Test response", - api_input_tokens=100, - api_output_tokens=50, - ) - eval_data = EvaluationData( - conversation_group_id="conv1", - turns=[turn], - ) - result = compute_api_token_usage([eval_data]) - assert result.total_api_input_tokens == 100 - assert result.total_api_output_tokens == 50 - assert result.total_api_tokens == 150 - - def test_compute_api_token_usage_multiple_turns(self) -> None: - """Test compute_api_token_usage with multiple turns.""" - turns = [ - TurnData( - turn_id="turn1", - query="Query 1", - response="Response 1", - api_input_tokens=100, - api_output_tokens=50, - ), - TurnData( - turn_id="turn2", - query="Query 2", - response="Response 2", - api_input_tokens=150, - api_output_tokens=75, - ), - ] - eval_data = EvaluationData( - conversation_group_id="conv1", - turns=turns, - ) - result = compute_api_token_usage([eval_data]) - assert result.total_api_input_tokens == 250 - assert result.total_api_output_tokens == 125 - assert result.total_api_tokens == 375 - - def test_compute_api_token_usage_multiple_conversations(self) -> None: - """Test compute_api_token_usage with 
multiple conversations.""" - eval_data1 = EvaluationData( - conversation_group_id="conv1", - turns=[ - TurnData( - turn_id="turn1", - query="Q1", - response="R1", - api_input_tokens=100, - api_output_tokens=50, - ), - ], - ) - eval_data2 = EvaluationData( - conversation_group_id="conv2", - turns=[ - TurnData( - turn_id="turn1", - query="Q2", - response="R2", - api_input_tokens=200, - api_output_tokens=100, - ), - ], - ) - result = compute_api_token_usage([eval_data1, eval_data2]) - assert result.total_api_input_tokens == 300 - assert result.total_api_output_tokens == 150 - assert result.total_api_tokens == 450 - - def test_compute_api_token_usage_zero_tokens(self) -> None: - """Test compute_api_token_usage with zero token values.""" - turn = TurnData( - turn_id="turn1", - query="Test", - response="Response", - api_input_tokens=0, - api_output_tokens=0, - ) - eval_data = EvaluationData( - conversation_group_id="conv1", - turns=[turn], - ) - result = compute_api_token_usage([eval_data]) - assert result.total_api_input_tokens == 0 - assert result.total_api_output_tokens == 0 - assert result.total_api_tokens == 0 - - -class TestCalculateBasicStatsWithTokens: - """Tests for compute_overall_stats token tracking fields.""" - - def test_basic_stats_includes_token_fields(self) -> None: - """Test that basic stats includes token fields.""" - results = [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="test:metric", - result="PASS", - score=0.8, - threshold=0.7, - judge_llm_input_tokens=100, - judge_llm_output_tokens=50, - embedding_tokens=150, - ) - ] - stats = compute_overall_stats(results) - assert stats.total_judge_llm_input_tokens == 100 - assert stats.total_judge_llm_output_tokens == 50 - assert stats.total_judge_llm_tokens == 150 - assert stats.total_embedding_tokens == 150 - - def test_basic_stats_sums_token_values(self) -> None: - """Test that basic stats correctly sums token values.""" - results = [ - EvaluationResult( - 
conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="test:metric", - result="PASS", - score=0.8, - threshold=0.7, - judge_llm_input_tokens=100, - judge_llm_output_tokens=50, - embedding_tokens=100, - ), - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn2", - metric_identifier="test:metric", - result="PASS", - score=0.9, - threshold=0.7, - judge_llm_input_tokens=200, - judge_llm_output_tokens=100, - embedding_tokens=250, - ), - ] - stats = compute_overall_stats(results) - assert stats.total_judge_llm_input_tokens == 300 - assert stats.total_judge_llm_output_tokens == 150 - assert stats.total_judge_llm_tokens == 450 - assert stats.total_embedding_tokens == 350 - - def test_basic_stats_zero_tokens_by_default(self) -> None: - """Test that results without tokens default to zero.""" - results = [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="test:metric", - result="PASS", - score=0.8, - threshold=0.7, - ) - ] - stats = compute_overall_stats(results) - assert stats.total_judge_llm_input_tokens == 0 - assert stats.total_judge_llm_output_tokens == 0 - assert stats.total_judge_llm_tokens == 0 - assert stats.total_embedding_tokens == 0 - - def test_basic_stats_empty_results_zero_tokens(self) -> None: - """Test that empty results have zero tokens.""" - stats = compute_overall_stats([]) - assert stats.total_judge_llm_input_tokens == 0 - assert stats.total_judge_llm_output_tokens == 0 - assert stats.total_judge_llm_tokens == 0 - assert stats.total_embedding_tokens == 0 diff --git a/tests/unit/core/output/test_statistics_api.py b/tests/unit/core/output/test_statistics_api.py new file mode 100644 index 00000000..86a82dfe --- /dev/null +++ b/tests/unit/core/output/test_statistics_api.py @@ -0,0 +1,268 @@ +"""Unit tests for api statistics module.""" + +import pytest + +from lightspeed_evaluation.core.models import EvaluationData, EvaluationResult, TurnData +from 
lightspeed_evaluation.core.output.statistics import ( + compute_field_numeric_stats_from_evaluation_data, + compute_agent_token_usage, + compute_overall_stats, +) + + +class TestCalculateApiTokenUsage: + """Tests for compute_agent_token_usage function.""" + + def test_compute_agent_token_usage_empty_data(self) -> None: + """Test compute_agent_token_usage with empty data.""" + result = compute_agent_token_usage([]) + assert result.total_api_input_tokens == 0 + assert result.total_api_output_tokens == 0 + assert result.total_api_tokens == 0 + + def test_compute_agent_token_usage_single_turn(self) -> None: + """Test compute_agent_token_usage with single turn.""" + turn = TurnData( + turn_id="turn1", + query="Test query", + response="Test response", + api_input_tokens=100, + api_output_tokens=50, + ) + eval_data = EvaluationData( + conversation_group_id="conv1", + turns=[turn], + ) + result = compute_agent_token_usage([eval_data]) + assert result.total_api_input_tokens == 100 + assert result.total_api_output_tokens == 50 + assert result.total_api_tokens == 150 + + def test_compute_agent_token_usage_multiple_turns(self) -> None: + """Test compute_agent_token_usage with multiple turns.""" + turns = [ + TurnData( + turn_id="turn1", + query="Query 1", + response="Response 1", + api_input_tokens=100, + api_output_tokens=50, + ), + TurnData( + turn_id="turn2", + query="Query 2", + response="Response 2", + api_input_tokens=150, + api_output_tokens=75, + ), + ] + eval_data = EvaluationData( + conversation_group_id="conv1", + turns=turns, + ) + result = compute_agent_token_usage([eval_data]) + assert result.total_api_input_tokens == 250 + assert result.total_api_output_tokens == 125 + assert result.total_api_tokens == 375 + + def test_compute_agent_token_usage_multiple_conversations(self) -> None: + """Test compute_agent_token_usage with multiple conversations.""" + eval_data1 = EvaluationData( + conversation_group_id="conv1", + turns=[ + TurnData( + turn_id="turn1", + 
query="Q1", + response="R1", + api_input_tokens=100, + api_output_tokens=50, + ), + ], + ) + eval_data2 = EvaluationData( + conversation_group_id="conv2", + turns=[ + TurnData( + turn_id="turn1", + query="Q2", + response="R2", + api_input_tokens=200, + api_output_tokens=100, + ), + ], + ) + result = compute_agent_token_usage([eval_data1, eval_data2]) + assert result.total_api_input_tokens == 300 + assert result.total_api_output_tokens == 150 + assert result.total_api_tokens == 450 + + def test_compute_agent_token_usage_zero_tokens(self) -> None: + """Test compute_agent_token_usage with zero token values.""" + turn = TurnData( + turn_id="turn1", + query="Test", + response="Response", + api_input_tokens=0, + api_output_tokens=0, + ) + eval_data = EvaluationData( + conversation_group_id="conv1", + turns=[turn], + ) + result = compute_agent_token_usage([eval_data]) + assert result.total_api_input_tokens == 0 + assert result.total_api_output_tokens == 0 + assert result.total_api_tokens == 0 + + +class TestCalculateApiLatencyStats: + """Tests for compute_field_numeric_stats_from_evaluation_data function.""" + + def test_calculate_api_latency_with_values(self) -> None: + """Test API latency calculation with valid non-zero values.""" + eval_data = [ + EvaluationData( + conversation_group_id="conv1", + turns=[ + TurnData(turn_id="turn1", query="Q1", agent_latency=1.0), + TurnData(turn_id="turn2", query="Q2", agent_latency=2.0), + ], + ), + ] + result = compute_field_numeric_stats_from_evaluation_data( + eval_data, "agent_latency" + ) + + assert result is not None + assert result.count == 2 + assert result.mean == pytest.approx(1.5) + assert result.median == 1.5 + assert result.min_value == 1.0 + assert result.max_value == 2.0 + assert result.p95 is not None + assert result.p99 is not None + + def test_calculate_api_latency_excludes_zeros(self) -> None: + """Test that zero latency values are excluded (no actual API calls).""" + eval_data = [ + EvaluationData( + 
conversation_group_id="conv1", + turns=[ + TurnData(turn_id="turn1", query="Q1", agent_latency=0), + TurnData(turn_id="turn2", query="Q2", agent_latency=1.5), + ], + ), + ] + result = compute_field_numeric_stats_from_evaluation_data( + eval_data, "agent_latency" + ) + + assert result is not None + assert result.count == 1 + assert result.mean == 1.5 + + def test_calculate_api_latency_all_zeros_returns_empty(self) -> None: + """Test that all-zero latencies return empty count (api_enabled=False scenario).""" + eval_data = [ + EvaluationData( + conversation_group_id="conv1", + turns=[ + TurnData(turn_id="turn1", query="Q1", agent_latency=0), + TurnData(turn_id="turn2", query="Q2", agent_latency=0), + ], + ), + ] + result = compute_field_numeric_stats_from_evaluation_data( + eval_data, "agent_latency" + ) + + assert result is None + + +class TestCalculateBasicStatsWithTokens: + """Tests for compute_overall_stats token tracking fields.""" + + def test_basic_stats_includes_token_fields(self) -> None: + """Test that basic stats includes token fields.""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="test:metric", + result="PASS", + score=0.8, + threshold=0.7, + judge_llm_input_tokens=100, + judge_llm_output_tokens=50, + embedding_tokens=150, + ) + ] + stats = compute_overall_stats(results) + assert hasattr(stats, "total_judge_llm_input_tokens") + assert hasattr(stats, "total_judge_llm_output_tokens") + assert hasattr(stats, "total_judge_llm_tokens") + + assert hasattr(stats, "total_embedding_tokens") + + def test_basic_stats_sums_token_values(self) -> None: + """Test that basic stats correctly sums token values.""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="test:metric", + result="PASS", + score=0.8, + threshold=0.7, + judge_llm_input_tokens=100, + judge_llm_output_tokens=50, + embedding_tokens=100, + ), + EvaluationResult( + 
conversation_group_id="conv1", + turn_id="turn2", + metric_identifier="test:metric", + result="PASS", + score=0.9, + threshold=0.7, + judge_llm_input_tokens=200, + judge_llm_output_tokens=100, + embedding_tokens=250, + ), + ] + stats = compute_overall_stats(results) + assert stats.total_judge_llm_input_tokens == 300 + assert stats.total_judge_llm_output_tokens == 150 + assert stats.total_judge_llm_tokens == 450 + + assert stats.total_embedding_tokens == 350 + assert stats.total_embedding_tokens == 350 + + def test_basic_stats_zero_tokens_by_default(self) -> None: + """Test that results without tokens default to zero.""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="test:metric", + result="PASS", + score=0.8, + threshold=0.7, + ) + ] + stats = compute_overall_stats(results) + assert stats.total_judge_llm_input_tokens == 0 + assert stats.total_judge_llm_output_tokens == 0 + assert stats.total_judge_llm_tokens == 0 + + assert stats.total_embedding_tokens == 0 + assert stats.total_embedding_tokens == 0 + + def test_basic_stats_empty_results_zero_tokens(self) -> None: + """Test that empty results have zero tokens.""" + stats = compute_overall_stats([]) + assert stats.total_judge_llm_input_tokens == 0 + assert stats.total_judge_llm_output_tokens == 0 + assert stats.total_judge_llm_tokens == 0 + assert stats.total_embedding_tokens == 0 + assert stats.total_embedding_tokens == 0 diff --git a/tests/unit/core/output/test_statistics_detailed.py b/tests/unit/core/output/test_statistics_detailed.py new file mode 100644 index 00000000..63b128f1 --- /dev/null +++ b/tests/unit/core/output/test_statistics_detailed.py @@ -0,0 +1,392 @@ +"""Unit tests for detailed statistics module.""" + +import pytest + +from lightspeed_evaluation.core.models import ( + EvaluationResult, +) +from lightspeed_evaluation.core.output.statistics import ( + compute_detailed_stats, +) + + +class TestCalculateDetailedStats: + """Tests for 
compute_detailed_stats function.""" + + def test_detailed_stats_with_results( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: + """Test detailed stats calculation.""" + stats = compute_detailed_stats(sample_results_statistics).model_dump() + + assert stats["by_metric"] + assert stats["by_conversation"] + + assert "metric1" in stats["by_metric"] + assert "metric2" in stats["by_metric"] + assert "conv1" in stats["by_conversation"] + assert "conv2" in stats["by_conversation"] + + def test_detailed_stats_empty_results(self) -> None: + """Test detailed stats with empty results.""" + stats = compute_detailed_stats([]).model_dump() + + assert not stats["by_metric"] + assert not stats["by_conversation"] + + def test_detailed_stats_metric_breakdown( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: + """Test metric breakdown in detailed stats.""" + stats = compute_detailed_stats(sample_results_statistics).model_dump() + + metric1_stats = stats["by_metric"]["metric1"] + assert metric1_stats["passed"] == 1 + assert metric1_stats["failed"] == 1 + + metric2_stats = stats["by_metric"]["metric2"] + assert metric2_stats["passed"] == 1 + assert metric2_stats["error"] == 1 + + def test_detailed_stats_conversation_breakdown( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: + """Test conversation breakdown in detailed stats.""" + stats = compute_detailed_stats(sample_results_statistics).model_dump() + + conv1_stats = stats["by_conversation"]["conv1"] + assert conv1_stats["passed"] == 1 + assert conv1_stats["failed"] == 1 + + conv2_stats = stats["by_conversation"]["conv2"] + assert conv2_stats["passed"] == 1 + assert conv2_stats["error"] == 1 + + def test_detailed_stats_includes_rates( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: + """Test that detailed stats include percentage rates.""" + stats = compute_detailed_stats(sample_results_statistics).model_dump() + + metric1_stats = 
stats["by_metric"]["metric1"] + assert "pass_rate" in metric1_stats + assert "fail_rate" in metric1_stats + assert metric1_stats["pass_rate"] == 50.0 + assert metric1_stats["fail_rate"] == 50.0 + + def test_detailed_stats_single_metric(self) -> None: + """Test detailed stats with single metric.""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="single_metric", + score=0.9, + result="PASS", + threshold=0.7, + ) + ] + + stats = compute_detailed_stats(results).model_dump() + + assert len(stats["by_metric"]) == 1 + assert "single_metric" in stats["by_metric"] + + def test_compute_detailed_stats_single_metric_single_conversation(self) -> None: + """Test compute_detailed_stats with single metric and conversation.""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="ragas:faithfulness", + result="PASS", + score=0.8, + threshold=0.7, + reason="Good", + ), + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn2", + metric_identifier="ragas:faithfulness", + result="FAIL", + score=0.3, + threshold=0.7, + reason="Poor", + ), + ] + + stats = compute_detailed_stats(results).model_dump() + + # Check by_metric breakdown + assert "ragas:faithfulness" in stats["by_metric"] + metric_stats = stats["by_metric"]["ragas:faithfulness"] + assert metric_stats["passed"] == 1 + assert metric_stats["failed"] == 1 + assert metric_stats["error"] == 0 + assert metric_stats["pass_rate"] == 50.0 + + # Check by_conversation breakdown + assert "conv1" in stats["by_conversation"] + conv_stats = stats["by_conversation"]["conv1"] + assert conv_stats["passed"] == 1 + assert conv_stats["failed"] == 1 + assert conv_stats["error"] == 0 + assert conv_stats["pass_rate"] == 50.0 + + def test_compute_detailed_stats_multiple_metrics_conversations(self) -> None: + """Test compute_detailed_stats with multiple metrics and conversations.""" + results = [ + EvaluationResult( + 
conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="ragas:faithfulness", + result="PASS", + score=0.8, + threshold=0.7, + reason="Good", + ), + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="ragas:relevancy", + result="FAIL", + score=0.3, + threshold=0.7, + reason="Poor", + ), + EvaluationResult( + conversation_group_id="conv2", + turn_id="turn1", + metric_identifier="ragas:faithfulness", + result="PASS", + score=0.9, + threshold=0.7, + reason="Excellent", + ), + EvaluationResult( + conversation_group_id="conv2", + turn_id="turn1", + metric_identifier="ragas:relevancy", + result="ERROR", + score=0.0, + threshold=0.7, + reason="API error", + ), + ] + + stats = compute_detailed_stats(results).model_dump() + + # Check metrics + assert len(stats["by_metric"]) == 2 + assert "ragas:faithfulness" in stats["by_metric"] + assert "ragas:relevancy" in stats["by_metric"] + + faithfulness_stats = stats["by_metric"]["ragas:faithfulness"] + assert faithfulness_stats["passed"] == 2 + assert faithfulness_stats["failed"] == 0 + assert faithfulness_stats["error"] == 0 + assert faithfulness_stats["pass_rate"] == 100.0 + + relevancy_stats = stats["by_metric"]["ragas:relevancy"] + assert relevancy_stats["passed"] == 0 + assert relevancy_stats["failed"] == 1 + assert relevancy_stats["error"] == 1 + assert relevancy_stats["pass_rate"] == 0.0 + assert relevancy_stats["fail_rate"] == 50.0 + assert relevancy_stats["error_rate"] == 50.0 + + # Check conversations + assert len(stats["by_conversation"]) == 2 + assert "conv1" in stats["by_conversation"] + assert "conv2" in stats["by_conversation"] + + conv1_stats = stats["by_conversation"]["conv1"] + assert conv1_stats["passed"] == 1 + assert conv1_stats["failed"] == 1 + assert conv1_stats["error"] == 0 + assert conv1_stats["pass_rate"] == 50.0 + + conv2_stats = stats["by_conversation"]["conv2"] + assert conv2_stats["passed"] == 1 + assert conv2_stats["failed"] == 0 + assert 
conv2_stats["error"] == 1 + assert conv2_stats["pass_rate"] == 50.0 + assert conv2_stats["error_rate"] == 50.0 + + def test_compute_detailed_stats_score_statistics(self) -> None: + """Test compute_detailed_stats includes score statistics.""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="ragas:faithfulness", + result="PASS", + score=0.8, + threshold=0.7, + reason="Good", + ), + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn2", + metric_identifier="ragas:faithfulness", + result="PASS", + score=0.9, + threshold=0.7, + reason="Excellent", + ), + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn3", + metric_identifier="ragas:faithfulness", + result="FAIL", + score=0.3, + threshold=0.7, + reason="Poor", + ), + ] + + stats = compute_detailed_stats(results).model_dump() + + metric_stats = stats["by_metric"]["ragas:faithfulness"] + assert "score_statistics" in metric_stats + + score_stats = metric_stats["score_statistics"] + assert score_stats["count"] == 3 + assert score_stats["mean"] == pytest.approx(0.6667, rel=1e-3) + assert score_stats["min_score"] == 0.3 + assert score_stats["max_score"] == 0.9 + assert score_stats["median"] == 0.8 + assert score_stats["std"] > 0 # Should have some standard deviation + # Confidence interval should be calculated for 3+ scores + assert "confidence_interval" in score_stats + ci = score_stats["confidence_interval"] + assert ci is not None + assert "low" in ci + assert "mean" in ci + assert "high" in ci + assert ci["confidence_level"] == 95 + + def test_compute_detailed_stats_no_scores(self) -> None: + """Test compute_detailed_stats with results that have no scores.""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="test:metric", + result="ERROR", + threshold=0.7, + reason="API error", + ) + ] + + stats = compute_detailed_stats(results).model_dump() + + metric_stats = 
stats["by_metric"]["test:metric"] + assert "score_statistics" in metric_stats + + score_stats = metric_stats["score_statistics"] + assert score_stats["count"] == 0 + assert score_stats["mean"] == 0.0 + assert score_stats["median"] == 0.0 + assert score_stats["std"] == 0.0 + # Confidence interval should be None when no scores + assert score_stats["confidence_interval"] is None + + def test_compute_detailed_stats_single_score_no_confidence_interval(self) -> None: + """Test compute_detailed_stats with single score has no CI (needs 2+).""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="test:metric", + result="PASS", + score=0.8, + threshold=0.7, + reason="Good", + ) + ] + + stats = compute_detailed_stats(results).model_dump() + + metric_stats = stats["by_metric"]["test:metric"] + score_stats = metric_stats["score_statistics"] + assert score_stats["count"] == 1 + # Confidence interval should be None for single score + assert score_stats["confidence_interval"] is None + + def test_compute_detailed_stats_by_tag(self) -> None: + """Test compute_detailed_stats includes by_tag breakdown.""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + tag="production", + turn_id="turn1", + metric_identifier="metric1", + result="PASS", + score=0.9, + threshold=0.7, + reason="Good", + ), + EvaluationResult( + conversation_group_id="conv2", + tag="production", + turn_id="turn1", + metric_identifier="metric1", + result="PASS", + score=0.8, + threshold=0.7, + reason="Good", + ), + EvaluationResult( + conversation_group_id="conv3", + tag="staging", + turn_id="turn1", + metric_identifier="metric1", + result="FAIL", + score=0.5, + threshold=0.7, + reason="Below threshold", + ), + ] + + stats = compute_detailed_stats(results).model_dump() + + # Verify by_tag is present + assert "by_tag" in stats + assert "production" in stats["by_tag"] + assert "staging" in stats["by_tag"] + + # Check production tag stats + prod_stats 
= stats["by_tag"]["production"] + assert prod_stats["passed"] == 2 + assert prod_stats["failed"] == 0 + assert prod_stats["pass_rate"] == 100.0 + assert "score_statistics" in prod_stats + assert prod_stats["score_statistics"]["count"] == 2 + assert prod_stats["score_statistics"]["mean"] == pytest.approx(0.85) + + # Check staging tag stats + staging_stats = stats["by_tag"]["staging"] + assert staging_stats["passed"] == 0 + assert staging_stats["failed"] == 1 + assert staging_stats["fail_rate"] == 100.0 + assert "score_statistics" in staging_stats + + def test_compute_detailed_stats_default_tag(self) -> None: + """Test compute_detailed_stats with default 'eval' tag.""" + results = [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="metric1", + result="PASS", + threshold=0.7, + ), + ] + + stats = compute_detailed_stats(results).model_dump() + + # Default tag should be "eval" + assert "by_tag" in stats + assert "eval" in stats["by_tag"] + assert stats["by_tag"]["eval"]["passed"] == 1 diff --git a/tests/unit/core/storage/test_sql_storage.py b/tests/unit/core/storage/test_sql_storage.py index 60f6a182..29493135 100644 --- a/tests/unit/core/storage/test_sql_storage.py +++ b/tests/unit/core/storage/test_sql_storage.py @@ -423,6 +423,7 @@ def test_all_csv_columns_present(self) -> None: "judge_scores", "time_to_first_token", "streaming_duration", + "agent_latency", "tokens_per_second", "tool_calls", "contexts", diff --git a/tests/unit/pipeline/evaluation/test_amender.py b/tests/unit/pipeline/evaluation/test_amender.py index efdeae3e..d9f03d31 100644 --- a/tests/unit/pipeline/evaluation/test_amender.py +++ b/tests/unit/pipeline/evaluation/test_amender.py @@ -263,3 +263,90 @@ def test_amend_single_turn_with_extra_request_params( attachments=None, extra_request_params={"mode": "troubleshooting"}, ) + + def test_amend_single_turn_measures_agent_latency( + self, mocker: MockerFixture + ) -> None: + """Test that agent_latency is measured 
for actual API calls (with tokens).""" + mock_client = mocker.Mock() + api_response = APIResponse( + response="Test response", + conversation_id="conv_latency", + contexts=[], + tool_calls=[], + input_tokens=100, + output_tokens=50, + ) + mock_client.query.return_value = api_response + + # Mock time.perf_counter to return deterministic timing values + mocker.patch( + "time.perf_counter", + side_effect=[1.0, 1.5], # Start: 1.0, End: 1.5 → latency = 0.5 + ) + + amender = APIDataAmender(mock_client) + + turn = TurnData(turn_id="9", query="Latency test query", response=None) + + # Initial agent_latency should be 0 (default) + assert turn.agent_latency == 0 + + error_msg, conversation_id = amender.amend_single_turn(turn) + + # No error should be returned + assert error_msg is None + assert conversation_id == "conv_latency" + + # agent_latency should be measured (exactly 0.5s) for actual API call + assert turn.agent_latency == 0.5 + assert turn.api_input_tokens == 100 + assert turn.api_output_tokens == 50 + + def test_amend_single_turn_no_agent_latency_when_no_client(self) -> None: + """Test that agent_latency is NOT measured when API client is None (api_enabled=False).""" + amender = APIDataAmender(None) + + turn = TurnData(turn_id="10", query="No API query", response=None) + + # Initial agent_latency should be 0 (default) + assert turn.agent_latency == 0 + + error_msg, conversation_id = amender.amend_single_turn(turn) + + # No error should be returned + assert error_msg is None + assert conversation_id is None + + # agent_latency should remain 0 since no API call was made + assert turn.agent_latency == 0 + + def test_amend_single_turn_no_latency_for_cached_responses( + self, mocker: MockerFixture + ) -> None: + """Test that agent_latency is 0 for cached responses (zero tokens).""" + mock_client = mocker.Mock() + # Cached responses have zero tokens (set by cache retrieval logic) + cached_response = APIResponse( + response="Cached response", + 
conversation_id="conv_cached", + contexts=[], + tool_calls=[], + input_tokens=0, + output_tokens=0, + ) + mock_client.query.return_value = cached_response + + amender = APIDataAmender(mock_client) + + turn = TurnData(turn_id="11", query="Cached query", response=None) + + error_msg, conversation_id = amender.amend_single_turn(turn) + + assert error_msg is None + assert conversation_id == "conv_cached" + + # Cached response should have zero latency (no actual API call) + assert turn.agent_latency == 0 + assert turn.api_input_tokens == 0 + assert turn.api_output_tokens == 0