Skip to content

Commit a00a8f1

Browse files
committed
API latency calculation
1 parent 09b699a commit a00a8f1

20 files changed

Lines changed: 1133 additions & 636 deletions

File tree

config/system.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -274,6 +274,7 @@ storage:
274274
- "response"
275275
- "api_input_tokens"
276276
- "api_output_tokens"
277+
- "agent_latency"
277278
# Streaming performance metrics (only populated when using streaming endpoint)
278279
- "time_to_first_token" # Time to first token in seconds
279280
- "streaming_duration" # Total streaming duration in seconds

src/lightspeed_evaluation/core/constants.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,7 @@
106106
# Streaming performance metrics
107107
"time_to_first_token",
108108
"streaming_duration",
109+
"agent_latency",
109110
"tokens_per_second",
110111
"tool_calls",
111112
"contexts",

src/lightspeed_evaluation/core/models/data.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,11 @@ class TurnData(StreamingMetricsMixin):
8484
default=0, ge=0, description="Output tokens used by API call"
8585
)
8686

87+
# API execution time tracking (per turn)
88+
agent_latency: float = Field(
89+
default=0, ge=0, description="API call latency for this turn in seconds"
90+
)
91+
8792
# Per-turn metrics support
8893
turn_metrics: Optional[list[str]] = Field(
8994
default=None,
@@ -515,6 +520,11 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
515520
execution_time: float = Field(
516521
default=0, ge=0, description="Execution time in seconds"
517522
)
523+
agent_latency: float = Field(
524+
default=0,
525+
ge=0,
526+
description="API latency in seconds (per turn or average for conversation)",
527+
)
518528
api_input_tokens: int = Field(default=0, ge=0, description="API input tokens used")
519529
api_output_tokens: int = Field(
520530
default=0, ge=0, description="API output tokens used"

src/lightspeed_evaluation/core/models/quality.py

Lines changed: 18 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,13 @@
99

1010
from pydantic import BaseModel, Field
1111

12-
from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics
12+
from lightspeed_evaluation.core.models.summary import (
13+
MetricStats,
14+
NumericStats,
15+
ScoreStatistics,
16+
AgentTokenStats,
17+
)
18+
1319

1420
logger = logging.getLogger(__name__)
1521

@@ -44,17 +50,18 @@ class QualityReport(BaseModel):
4450
default_factory=list,
4551
description="Warnings about quality metrics configuration or usage",
4652
)
47-
api_latency: float = Field(
48-
default=0.0, description="[Placeholder] Average API response time in seconds"
53+
agent_latency_stats: Optional[NumericStats] = Field(
54+
default=None, description="Agent latency statistics"
4955
)
50-
api_tokens: int = Field(
51-
default=0,
52-
description="[Placeholder] Total number of tokens consumed across all API calls",
56+
agent_token_stats: Optional[AgentTokenStats] = Field(
57+
default=None, description="Agent token usage statistics"
5358
)
5459

5560
@staticmethod
5661
def create_report(
5762
by_metric: dict[str, MetricStats],
63+
agent_latency_stats: Optional[NumericStats],
64+
agent_token_stats: Optional[AgentTokenStats],
5865
quality_score_metrics: list[str],
5966
) -> Optional["QualityReport"]:
6067
"""Creates a quality report with aggregated quality score from selected metrics.
@@ -64,6 +71,8 @@ def create_report(
6471
6572
Args:
6673
by_metric: Dictionary mapping metric identifiers to their computed statistics.
74+
agent_latency_stats: Agent API latency statistics (e.g. median, p95, p99).
75+
agent_token_stats: Agent token usage statistics with percentiles.
6776
quality_score_metrics: Metric identifiers to include in quality score calculation.
6877
All specified metrics must exist in by_metric.
6978
@@ -148,14 +157,13 @@ def create_report(
148157
if stats is not None:
149158
extra_metrics[metric_id] = stats
150159

151-
# Calculate aggregated quality score
152-
aggregated_score = QualityReport._calculate_quality_score(quality_metrics)
153-
154160
return QualityReport(
155-
quality_score=aggregated_score,
161+
quality_score=QualityReport._calculate_quality_score(quality_metrics),
156162
quality_metrics=quality_metrics,
157163
extra_metrics=extra_metrics,
158164
warnings=warnings,
165+
agent_latency_stats=agent_latency_stats,
166+
agent_token_stats=agent_token_stats,
159167
)
160168

161169
@staticmethod

src/lightspeed_evaluation/core/models/summary.py

Lines changed: 88 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -14,20 +14,23 @@
1414
from lightspeed_evaluation.core.models.data import EvaluationData, EvaluationResult
1515
from lightspeed_evaluation.core.output.statistics import (
1616
bootstrap_intervals,
17+
calculate_field_numeric_stats_from_evaluation_data,
1718
calculate_api_token_usage,
1819
calculate_streaming_stats,
1920
)
2021

2122

2223
class NumericStats(BaseModel):
23-
"""Numeric statistics for a set of values (e.g., TTFT, duration)."""
24+
"""Numeric statistics for a set of values (e.g., TTFT, duration, latency)."""
2425

2526
count: int = Field(default=0, description="Number of values")
2627
mean: Optional[float] = Field(default=None, description="Mean value")
2728
median: Optional[float] = Field(default=None, description="Median value")
2829
std: Optional[float] = Field(default=None, description="Standard deviation")
2930
min_value: Optional[float] = Field(default=None, description="Minimum value")
3031
max_value: Optional[float] = Field(default=None, description="Maximum value")
32+
p95: Optional[float] = Field(default=None, description="95th percentile")
33+
p99: Optional[float] = Field(default=None, description="99th percentile")
3134

3235

3336
class ScoreStatistics(BaseModel):
@@ -101,14 +104,28 @@ class StreamingStats(BaseModel):
101104
)
102105

103106

104-
class ApiTokenUsage(BaseModel):
105-
"""API token usage totals."""
107+
class AgentTokenStats(BaseModel):
108+
"""Agent token usage statistics with percentiles."""
109+
110+
input: Optional[NumericStats] = Field(
111+
default=None, description="Input token statistics"
112+
)
113+
output: Optional[NumericStats] = Field(
114+
default=None, description="Output token statistics"
115+
)
116+
117+
118+
class AgentTokenUsage(BaseModel):
119+
"""Agent token usage totals and statistics."""
106120

107121
total_api_input_tokens: int = Field(default=0, description="Total API input tokens")
108122
total_api_output_tokens: int = Field(
109123
default=0, description="Total API output tokens"
110124
)
111125
total_api_tokens: int = Field(default=0, description="Total API tokens")
126+
statistics: Optional[AgentTokenStats] = Field(
127+
default=None, description="Agent token usage statistics with percentiles"
128+
)
112129

113130

114131
class EvaluationSummary(BaseModel):
@@ -134,8 +151,11 @@ class EvaluationSummary(BaseModel):
134151
by_tag: dict[str, TagStats] = Field(
135152
default_factory=dict, description="Statistics per tag"
136153
)
137-
api_tokens: Optional[ApiTokenUsage] = Field(
138-
default=None, description="API token usage (when evaluation data provided)"
154+
agent_token_usage: Optional[AgentTokenUsage] = Field(
155+
default=None, description="Agent token usage with totals and statistics"
156+
)
157+
agent_latency_stats: Optional[NumericStats] = Field(
158+
default=None, description="API latency statistics (when API enabled)"
139159
)
140160
streaming: Optional[StreamingStats] = Field(
141161
default=None, description="Streaming performance stats (when available)"
@@ -172,11 +192,14 @@ def from_results(
172192
by_tag = _compute_tag_stats(results, compute_confidence_intervals)
173193

174194
# Compute agent token usage, streaming stats, and agent latency stats if evaluation data provided
175-
api_tokens = None
195+
agent_token_usage = None
176196
streaming = None
197+
agent_latency_stats = None
177198
if evaluation_data:
178-
api_tokens = _compute_api_token_usage(evaluation_data)
199+
agent_token_usage = _compute_agent_token_usage(evaluation_data)
179200
streaming = _compute_streaming_stats(evaluation_data)
201+
# Compute Agent latency statistics from evaluation data
202+
agent_latency_stats = _compute_agent_latency_stats(evaluation_data)
180203

181204
return cls(
182205
timestamp=timestamp,
@@ -185,7 +208,8 @@ def from_results(
185208
by_metric=by_metric,
186209
by_conversation=by_conversation,
187210
by_tag=by_tag,
188-
api_tokens=api_tokens,
211+
agent_token_usage=agent_token_usage,
212+
agent_latency_stats=agent_latency_stats,
189213
streaming=streaming,
190214
)
191215

@@ -417,7 +441,7 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
417441
"""Convert a raw numeric stats dictionary to a NumericStats model.
418442
419443
Args:
420-
raw: Dictionary with count, mean, median, std, min, max keys.
444+
raw: Dictionary with count, mean, median, std, min, max, p95, p99 keys.
421445
422446
Returns:
423447
NumericStats instance, or None if count is 0.
@@ -432,28 +456,78 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
432456
std=raw.get("std"),
433457
min_value=raw.get("min"),
434458
max_value=raw.get("max"),
459+
p95=raw.get("p95"),
460+
p99=raw.get("p99"),
435461
)
436462

437463

438-
def _compute_api_token_usage(
464+
def _compute_agent_token_usage(
439465
evaluation_data: list[EvaluationData],
440-
) -> ApiTokenUsage:
441-
"""Compute API token usage from evaluation data.
466+
) -> AgentTokenUsage:
467+
"""Compute agent token usage with totals and statistics from evaluation data.
442468
443469
Args:
444470
evaluation_data: List of evaluation data with turn-level token counts.
445471
446472
Returns:
447-
ApiTokenUsage instance.
473+
AgentTokenUsage instance with totals and statistics.
448474
"""
475+
stats = _compute_agent_token_stats(evaluation_data)
449476
raw = calculate_api_token_usage(evaluation_data)
450-
return ApiTokenUsage(
477+
return AgentTokenUsage(
451478
total_api_input_tokens=raw["total_api_input_tokens"],
452479
total_api_output_tokens=raw["total_api_output_tokens"],
453480
total_api_tokens=raw["total_api_tokens"],
481+
statistics=stats,
454482
)
455483

456484

485+
def _compute_agent_latency_stats(
486+
evaluation_data: list[EvaluationData],
487+
) -> Optional[NumericStats]:
488+
"""Compute agent latency statistics from evaluation data.
489+
490+
Args:
491+
evaluation_data: List of evaluation data containing turn-level latency values.
492+
493+
Returns:
494+
NumericStats instance, or None if no agent latency data available.
495+
"""
496+
if not evaluation_data:
497+
return None
498+
raw = calculate_field_numeric_stats_from_evaluation_data(
499+
evaluation_data, "agent_latency"
500+
)
501+
return _numeric_stats_from_dict(raw)
502+
503+
504+
def _compute_agent_token_stats(
505+
evaluation_data: list[EvaluationData],
506+
) -> Optional[AgentTokenStats]:
507+
"""Calculate agent token usage statistics with percentiles from evaluation data.
508+
509+
Args:
510+
evaluation_data: List of evaluation data containing turn-level token counts.
511+
512+
Returns:
513+
AgentTokenStats instance with input/output token statistics, or None if evaluation_data is empty.
514+
"""
515+
if not evaluation_data:
516+
return None
517+
518+
input_tokens_stats = calculate_field_numeric_stats_from_evaluation_data(
519+
evaluation_data, "api_input_tokens"
520+
)
521+
output_tokens_stats = calculate_field_numeric_stats_from_evaluation_data(
522+
evaluation_data, "api_output_tokens"
523+
)
524+
525+
input_stats = _numeric_stats_from_dict(input_tokens_stats)
526+
output_stats = _numeric_stats_from_dict(output_tokens_stats)
527+
528+
return AgentTokenStats(input=input_stats, output=output_stats)
529+
530+
457531
def _compute_streaming_stats(
458532
evaluation_data: list[EvaluationData],
459533
) -> Optional[StreamingStats]:

0 commit comments

Comments
 (0)