Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ A comprehensive framework for evaluating GenAI applications.
- **API Integration**: Direct integration with external API for real-time data generation (if enabled)
- **Setup/Cleanup Scripts**: Support for running setup and cleanup scripts before/after each conversation evaluation (applicable when API is enabled)
- **Token Usage Tracking**: Track input/output tokens for both API calls and Judge LLM evaluations (per-judge tracking for panel mode)
- **API Latency Tracking**: Measure and analyze API response times with percentile statistics (p50, p95, p99) for performance monitoring
- **Streaming Performance Metrics**: Capture time-to-first-token (TTFT), streaming duration, and tokens/second when using the streaming endpoint
- **Statistical Analysis**: Statistics for every metric with score distribution analysis
- **Rich Output**: CSV, JSON, TXT reports + visualization graphs (pass rates, distributions, heatmaps)
Expand Down
1 change: 1 addition & 0 deletions config/system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ storage:
- "response"
- "api_input_tokens"
- "api_output_tokens"
- "agent_latency"
# Streaming performance metrics (only populated when using streaming endpoint)
- "time_to_first_token" # Time to first token in seconds
- "streaming_duration" # Total streaming duration in seconds
Expand Down
12 changes: 11 additions & 1 deletion docs/EVALUATION_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,7 @@ Contains every metric evaluation with:
- Detailed reasoning
- Query and response text
- Execution time
- API latency

**Use for:** Drilling into specific failures, detailed analysis

Expand Down Expand Up @@ -1180,6 +1181,16 @@ ragas:faithfulness:
- **ERROR** ⚠️: Evaluation couldn't complete (missing data, API failure, etc.)
- **SKIPPED** ⏭️: Evaluation skipped due to prior failure (when `skip_on_failure` is enabled)

### Performance Metrics (API Enabled Only)

**API Latency**: Response time per API call with percentile stats (p50, p95, p99). Cached responses (zero tokens) are excluded to avoid skewing statistics.

**Streaming Metrics**: Time-to-first-token, streaming duration, and tokens/second when using streaming endpoints.

**Token Usage**: Track consumption across Judge LLM, embeddings, and API calls.

**Note:** A response is treated as cached when both `api_input_tokens` and `api_output_tokens` are zero. Its latency is recorded as 0, and it is excluded from the latency statistics.

### Score Quality Levels

| Score | Quality | Recommendation |
Expand Down Expand Up @@ -1912,4 +1923,3 @@ This comprehensive guide has covered everything you need to know to effectively
*This guide is designed to make AI evaluation accessible to everyone. Whether you're a product manager making decisions, a QA engineer testing systems, or a developer integrating evaluation into workflows, you now have everything you need to ensure your AI applications meet quality standards.*

**Happy Evaluating! 🚀**

1 change: 1 addition & 0 deletions src/lightspeed_evaluation/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
# Streaming performance metrics
"time_to_first_token",
"streaming_duration",
"agent_latency",
"tokens_per_second",
"tool_calls",
"contexts",
Expand Down
4 changes: 2 additions & 2 deletions src/lightspeed_evaluation/core/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
ConversationStats,
TagStats,
StreamingStats,
ApiTokenUsage,
AgentTokenUsage,
ConfidenceInterval,
DetailedStats,
)
Expand Down Expand Up @@ -84,7 +84,7 @@
"ConversationStats",
"TagStats",
"StreamingStats",
"ApiTokenUsage",
"AgentTokenUsage",
"ConfidenceInterval",
"DetailedStats",
# API models
Expand Down
10 changes: 10 additions & 0 deletions src/lightspeed_evaluation/core/models/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ class TurnData(StreamingMetricsMixin):
default=0, ge=0, description="Output tokens used by API call"
)

# API execution time tracking (per turn)
agent_latency: float = Field(
default=0, ge=0, description="API call latency for this turn in seconds"
)

# Per-turn metrics support
turn_metrics: Optional[list[str]] = Field(
default=None,
Expand Down Expand Up @@ -526,6 +531,11 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
execution_time: float = Field(
default=0, ge=0, description="Execution time in seconds"
)
agent_latency: float = Field(
default=0,
ge=0,
description="API latency in seconds (per turn or average for conversation)",
)
api_input_tokens: int = Field(default=0, ge=0, description="API input tokens used")
api_output_tokens: int = Field(
default=0, ge=0, description="API output tokens used"
Expand Down
28 changes: 18 additions & 10 deletions src/lightspeed_evaluation/core/models/quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,13 @@

from pydantic import BaseModel, Field

from lightspeed_evaluation.core.models import MetricStats, ScoreStatistics
from lightspeed_evaluation.core.models.statistics import (
MetricStats,
NumericStats,
ScoreStatistics,
AgentTokenStats,
)


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -44,17 +50,18 @@ class QualityReport(BaseModel):
default_factory=list,
description="Warnings about quality metrics configuration or usage",
)
api_latency: float = Field(
default=0.0, description="[Placeholder] Average API response time in seconds"
agent_latency_stats: Optional[NumericStats] = Field(
default=None, description="Agent latency statistics"
)
api_tokens: int = Field(
default=0,
description="[Placeholder] Total number of tokens consumed across all API calls",
agent_token_stats: Optional[AgentTokenStats] = Field(
default=None, description="Agent token usage statistics"
)

@staticmethod
def create_report(
by_metric: dict[str, MetricStats],
agent_latency_stats: Optional[NumericStats],
agent_token_stats: Optional[AgentTokenStats],
quality_score_metrics: list[str],
) -> Optional["QualityReport"]:
"""Creates a quality report with aggregated quality score from selected metrics.
Expand All @@ -64,6 +71,8 @@ def create_report(

Args:
by_metric: Dictionary mapping metric identifiers to their computed statistics.
agent_latency_stats: Agent API latency statistics (p50, p95, p99).
agent_token_stats: Agent token usage statistics with percentiles.
quality_score_metrics: Metric identifiers to include in quality score calculation.
All specified metrics must exist in by_metric.

Expand Down Expand Up @@ -148,14 +157,13 @@ def create_report(
if stats is not None:
extra_metrics[metric_id] = stats

# Calculate aggregated quality score
aggregated_score = QualityReport._calculate_quality_score(quality_metrics)

return QualityReport(
quality_score=aggregated_score,
quality_score=QualityReport._calculate_quality_score(quality_metrics),
quality_metrics=quality_metrics,
extra_metrics=extra_metrics,
warnings=warnings,
agent_latency_stats=agent_latency_stats,
agent_token_stats=agent_token_stats,
)

@staticmethod
Expand Down
22 changes: 19 additions & 3 deletions src/lightspeed_evaluation/core/models/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@


class NumericStats(BaseModel):
    """Numeric statistics for a set of values (e.g., TTFT, duration, latency).

    ``count`` defaults to 0; every other statistic defaults to ``None``.
    """

    count: int = Field(default=0, description="Number of values")

    # Central tendency and spread.
    mean: Optional[float] = Field(default=None, description="Mean value")
    median: Optional[float] = Field(default=None, description="Median value")
    std: Optional[float] = Field(default=None, description="Standard deviation")

    # Observed range.
    min_value: Optional[float] = Field(default=None, description="Minimum value")
    max_value: Optional[float] = Field(default=None, description="Maximum value")

    # Tail percentiles.
    p95: Optional[float] = Field(default=None, description="95th percentile")
    p99: Optional[float] = Field(default=None, description="99th percentile")


class ConfidenceInterval(BaseModel):
Expand Down Expand Up @@ -116,11 +118,25 @@ class StreamingStats(BaseModel):
)


class ApiTokenUsage(BaseModel):
"""API token usage totals."""
class AgentTokenStats(BaseModel):
    """Agent token usage statistics with percentiles.

    Holds one optional ``NumericStats`` for input tokens and one for output
    tokens. Both default to ``None``.
    """

    # Statistics over input token counts.
    input: Optional[NumericStats] = Field(
        default=None,
        description="Input token statistics",
    )

    # Statistics over output token counts.
    output: Optional[NumericStats] = Field(
        default=None,
        description="Output token statistics",
    )


class AgentTokenUsage(BaseModel):
    """Agent token usage totals and statistics.

    The three total counters default to 0. ``statistics`` optionally carries
    the ``AgentTokenStats`` breakdown and defaults to ``None``.
    """

    # Aggregate counters.
    total_api_input_tokens: int = Field(
        default=0,
        description="Total API input tokens",
    )
    total_api_output_tokens: int = Field(
        default=0,
        description="Total API output tokens",
    )
    # NOTE(review): presumably input + output; the computation is not shown here.
    total_api_tokens: int = Field(
        default=0,
        description="Total API tokens",
    )

    # Optional distribution statistics for input/output tokens.
    statistics: Optional[AgentTokenStats] = Field(
        default=None,
        description="Agent token usage statistics with percentiles",
    )
25 changes: 17 additions & 8 deletions src/lightspeed_evaluation/core/models/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,17 @@
EvaluationResult,
)
from lightspeed_evaluation.core.models.statistics import (
ApiTokenUsage,
AgentTokenUsage,
NumericStats,
ConversationStats,
MetricStats,
OverallStats,
StreamingStats,
TagStats,
)
from lightspeed_evaluation.core.output.statistics import (
compute_api_token_usage,
compute_agent_token_usage,
compute_agent_latency_stats,
compute_overall_stats,
compute_streaming_stats,
compute_tag_stats,
Expand Down Expand Up @@ -50,8 +52,11 @@ class EvaluationSummary(BaseModel):
by_tag: dict[str, TagStats] = Field(
default_factory=dict, description="Statistics per tag"
)
api_tokens: Optional[ApiTokenUsage] = Field(
default=None, description="API token usage (when evaluation data provided)"
agent_token_usage: Optional[AgentTokenUsage] = Field(
default=None, description="Agent token usage with totals and statistics"
)
agent_latency_stats: Optional[NumericStats] = Field(
default=None, description="Agent latency statistics (when API enabled)"
)
streaming: Optional[StreamingStats] = Field(
default=None, description="Streaming performance stats (when available)"
Expand All @@ -70,7 +75,8 @@ def from_results(

Args:
results: List of evaluation results to summarize.
evaluation_data: Optional evaluation data for API token and streaming stats.
evaluation_data: Optional evaluation data for API token, agent latency,
and streaming stats.
compute_confidence_intervals: Whether to compute bootstrap confidence
intervals. Default False.

Expand All @@ -88,11 +94,13 @@ def from_results(
by_tag = compute_tag_stats(results, compute_confidence_intervals)

# Compute agent token usage, agent latency, and streaming stats if evaluation data is provided
api_tokens = None
streaming = None
agent_token_usage = None
agent_latency_stats = None
if evaluation_data:
api_tokens = compute_api_token_usage(evaluation_data)
streaming = compute_streaming_stats(evaluation_data)
agent_token_usage = compute_agent_token_usage(evaluation_data)
agent_latency_stats = compute_agent_latency_stats(evaluation_data)

return cls(
timestamp=timestamp,
Expand All @@ -101,6 +109,7 @@ def from_results(
by_metric=by_metric,
by_conversation=by_conversation,
by_tag=by_tag,
api_tokens=api_tokens,
agent_token_usage=agent_token_usage,
agent_latency_stats=agent_latency_stats,
streaming=streaming,
)
Loading
Loading