Skip to content

Commit 861d59e

Browse files
committed
API latency calculation
1 parent 09b699a commit 861d59e

22 files changed

Lines changed: 1235 additions & 637 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ A comprehensive framework for evaluating GenAI applications.
1414
- **API Integration**: Direct integration with an external API for real-time data generation (if enabled)
1515
- **Setup/Cleanup Scripts**: Support for running setup and cleanup scripts before/after each conversation evaluation (applicable when API is enabled)
1616
- **Token Usage Tracking**: Track input/output tokens for both API calls and Judge LLM evaluations (per-judge tracking for panel mode)
17+
- **API Latency Tracking**: Measure and analyze API response times with percentile statistics (p50, p95, p99) for performance monitoring
1718
- **Streaming Performance Metrics**: Capture time-to-first-token (TTFT), streaming duration, and tokens/second when using streaming endpoint
1819
- **Statistical Analysis**: Statistics for every metric with score distribution analysis
1920
- **Rich Output**: CSV, JSON, TXT reports + visualization graphs (pass rates, distributions, heatmaps)

config/system.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ storage:
274274
- "response"
275275
- "api_input_tokens"
276276
- "api_output_tokens"
277+
- "agent_latency"
277278
# Streaming performance metrics (only populated when using streaming endpoint)
278279
- "time_to_first_token" # Time to first token in seconds
279280
- "streaming_duration" # Total streaming duration in seconds

docs/EVALUATION_GUIDE.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1127,6 +1127,7 @@ Contains every metric evaluation with:
11271127
- Detailed reasoning
11281128
- Query and response text
11291129
- Execution time
1130+
- API latency
11301131
11311132
**Use for:** Drilling into specific failures, detailed analysis
11321133
@@ -1180,6 +1181,16 @@ ragas:faithfulness:
11801181
- **ERROR** ⚠️: Evaluation couldn't complete (missing data, API failure, etc.)
11811182
- **SKIPPED** ⏭️: Evaluation skipped due to prior failure (when `skip_on_failure` is enabled)
11821183
1184+
### Performance Metrics (API Enabled Only)
1185+
1186+
**API Latency**: Response time per API call with percentile stats (p50, p95, p99). Cached responses (zero tokens) are excluded to avoid skewing statistics.
1187+
1188+
**Streaming Metrics**: Time-to-first-token, streaming duration, and tokens/second when using streaming endpoints.
1189+
1190+
**Token Usage**: Track consumption across Judge LLM, embeddings, and API calls.
1191+
1192+
**Note:** Cached responses are detected by both `api_input_tokens` and `api_output_tokens` being zero. Their latency is recorded as 0, and they are excluded from the latency statistics.
1193+
11831194
### Score Quality Levels
11841195
11851196
| Score | Quality | Recommendation |
@@ -1912,4 +1923,3 @@ This comprehensive guide has covered everything you need to know to effectively
19121923
*This guide is designed to make AI evaluation accessible to everyone. Whether you're a product manager making decisions, a QA engineer testing systems, or a developer integrating evaluation into workflows, you now have everything you need to ensure your AI applications meet quality standards.*
19131924

19141925
**Happy Evaluating! 🚀**
1915-

src/lightspeed_evaluation/core/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@
106106
# Streaming performance metrics
107107
"time_to_first_token",
108108
"streaming_duration",
109+
"agent_latency",
109110
"tokens_per_second",
110111
"tool_calls",
111112
"contexts",

src/lightspeed_evaluation/core/models/data.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@ class TurnData(StreamingMetricsMixin):
8484
default=0, ge=0, description="Output tokens used by API call"
8585
)
8686

87+
# API execution time tracking (per turn)
88+
agent_latency: float = Field(
89+
default=0, ge=0, description="API call latency for this turn in seconds"
90+
)
91+
8792
# Per-turn metrics support
8893
turn_metrics: Optional[list[str]] = Field(
8994
default=None,
@@ -515,6 +520,11 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
515520
execution_time: float = Field(
516521
default=0, ge=0, description="Execution time in seconds"
517522
)
523+
agent_latency: float = Field(
524+
default=0,
525+
ge=0,
526+
description="API latency in seconds (per turn or average for conversation)",
527+
)
518528
api_input_tokens: int = Field(default=0, ge=0, description="API input tokens used")
519529
api_output_tokens: int = Field(
520530
default=0, ge=0, description="API output tokens used"

src/lightspeed_evaluation/core/models/quality.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,13 @@
99

1010
from pydantic import BaseModel, Field
1111

12-
from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics
12+
from lightspeed_evaluation.core.models.summary import (
13+
MetricStats,
14+
NumericStats,
15+
ScoreStatistics,
16+
AgentTokenStats,
17+
)
18+
1319

1420
logger = logging.getLogger(__name__)
1521

@@ -44,17 +50,18 @@ class QualityReport(BaseModel):
4450
default_factory=list,
4551
description="Warnings about quality metrics configuration or usage",
4652
)
47-
api_latency: float = Field(
48-
default=0.0, description="[Placeholder] Average API response time in seconds"
53+
agent_latency_stats: Optional[NumericStats] = Field(
54+
default=None, description="Agent latency statistics"
4955
)
50-
api_tokens: int = Field(
51-
default=0,
52-
description="[Placeholder] Total number of tokens consumed across all API calls",
56+
agent_token_stats: Optional[AgentTokenStats] = Field(
57+
default=None, description="Agent token usage statistics"
5358
)
5459

5560
@staticmethod
5661
def create_report(
5762
by_metric: dict[str, MetricStats],
63+
agent_latency_stats: Optional[NumericStats],
64+
agent_token_stats: Optional[AgentTokenStats],
5865
quality_score_metrics: list[str],
5966
) -> Optional["QualityReport"]:
6067
"""Creates a quality report with aggregated quality score from selected metrics.
@@ -64,6 +71,8 @@ def create_report(
6471
6572
Args:
6673
by_metric: Dictionary mapping metric identifiers to their computed statistics.
74+
agent_latency_stats: Agent API latency statistics (p50, p95, p99).
75+
agent_token_stats: Agent token usage statistics with percentiles.
6776
quality_score_metrics: Metric identifiers to include in quality score calculation.
6877
All specified metrics must exist in by_metric.
6978
@@ -148,14 +157,13 @@ def create_report(
148157
if stats is not None:
149158
extra_metrics[metric_id] = stats
150159

151-
# Calculate aggregated quality score
152-
aggregated_score = QualityReport._calculate_quality_score(quality_metrics)
153-
154160
return QualityReport(
155-
quality_score=aggregated_score,
161+
quality_score=QualityReport._calculate_quality_score(quality_metrics),
156162
quality_metrics=quality_metrics,
157163
extra_metrics=extra_metrics,
158164
warnings=warnings,
165+
agent_latency_stats=agent_latency_stats,
166+
agent_token_stats=agent_token_stats,
159167
)
160168

161169
@staticmethod

src/lightspeed_evaluation/core/models/summary.py

Lines changed: 88 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,23 @@
1414
from lightspeed_evaluation.core.models.data import EvaluationData, EvaluationResult
1515
from lightspeed_evaluation.core.output.statistics import (
1616
bootstrap_intervals,
17+
calculate_field_numeric_stats_from_evaluation_data,
1718
calculate_api_token_usage,
1819
calculate_streaming_stats,
1920
)
2021

2122

2223
class NumericStats(BaseModel):
23-
"""Numeric statistics for a set of values (e.g., TTFT, duration)."""
24+
"""Numeric statistics for a set of values (e.g., TTFT, duration, latency)."""
2425

2526
count: int = Field(default=0, description="Number of values")
2627
mean: Optional[float] = Field(default=None, description="Mean value")
2728
median: Optional[float] = Field(default=None, description="Median value")
2829
std: Optional[float] = Field(default=None, description="Standard deviation")
2930
min_value: Optional[float] = Field(default=None, description="Minimum value")
3031
max_value: Optional[float] = Field(default=None, description="Maximum value")
32+
p95: Optional[float] = Field(default=None, description="95th percentile")
33+
p99: Optional[float] = Field(default=None, description="99th percentile")
3134

3235

3336
class ScoreStatistics(BaseModel):
@@ -101,14 +104,28 @@ class StreamingStats(BaseModel):
101104
)
102105

103106

104-
class ApiTokenUsage(BaseModel):
105-
"""API token usage totals."""
107+
class AgentTokenStats(BaseModel):
108+
"""Agent token usage statistics with percentiles."""
109+
110+
input: Optional[NumericStats] = Field(
111+
default=None, description="Input token statistics"
112+
)
113+
output: Optional[NumericStats] = Field(
114+
default=None, description="Output token statistics"
115+
)
116+
117+
118+
class AgentTokenUsage(BaseModel):
119+
"""Agent token usage totals and statistics."""
106120

107121
total_api_input_tokens: int = Field(default=0, description="Total API input tokens")
108122
total_api_output_tokens: int = Field(
109123
default=0, description="Total API output tokens"
110124
)
111125
total_api_tokens: int = Field(default=0, description="Total API tokens")
126+
statistics: Optional[AgentTokenStats] = Field(
127+
default=None, description="Agent token usage statistics with percentiles"
128+
)
112129

113130

114131
class EvaluationSummary(BaseModel):
@@ -134,8 +151,11 @@ class EvaluationSummary(BaseModel):
134151
by_tag: dict[str, TagStats] = Field(
135152
default_factory=dict, description="Statistics per tag"
136153
)
137-
api_tokens: Optional[ApiTokenUsage] = Field(
138-
default=None, description="API token usage (when evaluation data provided)"
154+
agent_token_usage: Optional[AgentTokenUsage] = Field(
155+
default=None, description="Agent token usage with totals and statistics"
156+
)
157+
agent_latency_stats: Optional[NumericStats] = Field(
158+
default=None, description="API latency statistics (when API enabled)"
139159
)
140160
streaming: Optional[StreamingStats] = Field(
141161
default=None, description="Streaming performance stats (when available)"
@@ -172,11 +192,14 @@ def from_results(
172192
by_tag = _compute_tag_stats(results, compute_confidence_intervals)
173193

174194
# Compute agent token usage, streaming stats, and latency stats if evaluation data is provided
175-
api_tokens = None
195+
agent_token_usage = None
176196
streaming = None
197+
agent_latency_stats = None
177198
if evaluation_data:
178-
api_tokens = _compute_api_token_usage(evaluation_data)
199+
agent_token_usage = _compute_agent_token_usage(evaluation_data)
179200
streaming = _compute_streaming_stats(evaluation_data)
201+
# Compute Agent latency statistics from evaluation data
202+
agent_latency_stats = _compute_agent_latency_stats(evaluation_data)
180203

181204
return cls(
182205
timestamp=timestamp,
@@ -185,7 +208,8 @@ def from_results(
185208
by_metric=by_metric,
186209
by_conversation=by_conversation,
187210
by_tag=by_tag,
188-
api_tokens=api_tokens,
211+
agent_token_usage=agent_token_usage,
212+
agent_latency_stats=agent_latency_stats,
189213
streaming=streaming,
190214
)
191215

@@ -417,7 +441,7 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
417441
"""Convert a raw numeric stats dictionary to a NumericStats model.
418442
419443
Args:
420-
raw: Dictionary with count, mean, median, std, min, max keys.
444+
raw: Dictionary with count, mean, median, std, min, max, p95, p99 keys.
421445
422446
Returns:
423447
NumericStats instance, or None if count is 0.
@@ -432,28 +456,78 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
432456
std=raw.get("std"),
433457
min_value=raw.get("min"),
434458
max_value=raw.get("max"),
459+
p95=raw.get("p95"),
460+
p99=raw.get("p99"),
435461
)
436462

437463

438-
def _compute_api_token_usage(
464+
def _compute_agent_token_usage(
439465
evaluation_data: list[EvaluationData],
440-
) -> ApiTokenUsage:
441-
"""Compute API token usage from evaluation data.
466+
) -> AgentTokenUsage:
467+
"""Compute agent token usage with totals and statistics from evaluation data.
442468
443469
Args:
444470
evaluation_data: List of evaluation data with turn-level token counts.
445471
446472
Returns:
447-
ApiTokenUsage instance.
473+
AgentTokenUsage instance with totals and statistics.
448474
"""
475+
stats = _compute_agent_token_stats(evaluation_data)
449476
raw = calculate_api_token_usage(evaluation_data)
450-
return ApiTokenUsage(
477+
return AgentTokenUsage(
451478
total_api_input_tokens=raw["total_api_input_tokens"],
452479
total_api_output_tokens=raw["total_api_output_tokens"],
453480
total_api_tokens=raw["total_api_tokens"],
481+
statistics=stats,
454482
)
455483

456484

485+
def _compute_agent_latency_stats(
486+
evaluation_data: list[EvaluationData],
487+
) -> Optional[NumericStats]:
488+
"""Compute agent latency statistics from evaluation data.
489+
490+
Args:
491+
evaluation_data: List of evaluation data containing turn-level latency values.
492+
493+
Returns:
494+
NumericStats instance, or None if no agent latency data available.
495+
"""
496+
if not evaluation_data:
497+
return None
498+
raw = calculate_field_numeric_stats_from_evaluation_data(
499+
evaluation_data, "agent_latency"
500+
)
501+
return _numeric_stats_from_dict(raw)
502+
503+
504+
def _compute_agent_token_stats(
505+
evaluation_data: list[EvaluationData],
506+
) -> Optional[AgentTokenStats]:
507+
"""Calculate agent token usage statistics with percentiles from evaluation data.
508+
509+
Args:
510+
evaluation_data: List of evaluation data containing turn-level token counts.
511+
512+
Returns:
513+
AgentTokenStats instance with input/output token statistics, or None if no data.
514+
"""
515+
if not evaluation_data:
516+
return None
517+
518+
input_tokens_stats = calculate_field_numeric_stats_from_evaluation_data(
519+
evaluation_data, "api_input_tokens"
520+
)
521+
output_tokens_stats = calculate_field_numeric_stats_from_evaluation_data(
522+
evaluation_data, "api_output_tokens"
523+
)
524+
525+
input_stats = _numeric_stats_from_dict(input_tokens_stats)
526+
output_stats = _numeric_stats_from_dict(output_tokens_stats)
527+
528+
return AgentTokenStats(input=input_stats, output=output_stats)
529+
530+
457531
def _compute_streaming_stats(
458532
evaluation_data: list[EvaluationData],
459533
) -> Optional[StreamingStats]:

0 commit comments

Comments
 (0)