1414from lightspeed_evaluation .core .models .data import EvaluationData , EvaluationResult
1515from lightspeed_evaluation .core .output .statistics import (
1616 bootstrap_intervals ,
17+ calculate_field_numeric_stats_from_evaluation_data ,
1718 calculate_api_token_usage ,
1819 calculate_streaming_stats ,
1920)
2021
2122
class NumericStats(BaseModel):
    """Numeric statistics for a set of values (e.g., TTFT, duration, latency)."""

    # Sample size; defaults to 0 when nothing was supplied.
    count: int = Field(default=0, description="Number of values")

    # Central tendency and spread; each defaults to None.
    mean: Optional[float] = Field(default=None, description="Mean value")
    median: Optional[float] = Field(default=None, description="Median value")
    std: Optional[float] = Field(default=None, description="Standard deviation")

    # Observed extremes.
    min_value: Optional[float] = Field(default=None, description="Minimum value")
    max_value: Optional[float] = Field(default=None, description="Maximum value")

    # Tail percentiles.
    p95: Optional[float] = Field(default=None, description="95th percentile")
    p99: Optional[float] = Field(default=None, description="99th percentile")
3134
3235
3336class ScoreStatistics (BaseModel ):
@@ -101,14 +104,28 @@ class StreamingStats(BaseModel):
101104 )
102105
103106
104- class ApiTokenUsage (BaseModel ):
105- """API token usage totals."""
class AgentTokenStats(BaseModel):
    """Agent token usage statistics with percentiles."""

    # Both sides are optional and default to None.
    input: Optional[NumericStats] = Field(
        default=None,
        description="Input token statistics",
    )
    output: Optional[NumericStats] = Field(
        default=None,
        description="Output token statistics",
    )
116+
117+
class AgentTokenUsage(BaseModel):
    """Agent token usage totals and statistics."""

    # Aggregate token counters; each defaults to 0.
    total_api_input_tokens: int = Field(
        default=0,
        description="Total API input tokens",
    )
    total_api_output_tokens: int = Field(
        default=0,
        description="Total API output tokens",
    )
    total_api_tokens: int = Field(
        default=0,
        description="Total API tokens",
    )

    # Optional input/output token statistics; None when not provided.
    statistics: Optional[AgentTokenStats] = Field(
        default=None,
        description="Agent token usage statistics with percentiles",
    )
112129
113130
114131class EvaluationSummary (BaseModel ):
@@ -134,8 +151,11 @@ class EvaluationSummary(BaseModel):
134151 by_tag : dict [str , TagStats ] = Field (
135152 default_factory = dict , description = "Statistics per tag"
136153 )
137- api_tokens : Optional [ApiTokenUsage ] = Field (
138- default = None , description = "API token usage (when evaluation data provided)"
154+ agent_token_usage : Optional [AgentTokenUsage ] = Field (
155+ default = None , description = "Agent token usage with totals and statistics"
156+ )
157+ agent_latency_stats : Optional [NumericStats ] = Field (
158+ default = None , description = "API latency statistics (when API enabled)"
139159 )
140160 streaming : Optional [StreamingStats ] = Field (
141161 default = None , description = "Streaming performance stats (when available)"
@@ -172,11 +192,14 @@ def from_results(
172192 by_tag = _compute_tag_stats (results , compute_confidence_intervals )
173193
174194 # Compute API token usage and streaming stats if evaluation data provided
175- api_tokens = None
195+ agent_token_usage = None
176196 streaming = None
197+ agent_latency_stats = None
177198 if evaluation_data :
178- api_tokens = _compute_api_token_usage (evaluation_data )
199+ agent_token_usage = _compute_agent_token_usage (evaluation_data )
179200 streaming = _compute_streaming_stats (evaluation_data )
201+ # Compute Agent latency statistics from evaluation data
202+ agent_latency_stats = _compute_agent_latency_stats (evaluation_data )
180203
181204 return cls (
182205 timestamp = timestamp ,
@@ -185,7 +208,8 @@ def from_results(
185208 by_metric = by_metric ,
186209 by_conversation = by_conversation ,
187210 by_tag = by_tag ,
188- api_tokens = api_tokens ,
211+ agent_token_usage = agent_token_usage ,
212+ agent_latency_stats = agent_latency_stats ,
189213 streaming = streaming ,
190214 )
191215
@@ -417,7 +441,7 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
417441 """Convert a raw numeric stats dictionary to a NumericStats model.
418442
419443 Args:
420- raw: Dictionary with count, mean, median, std, min, max keys.
444+ raw: Dictionary with count, mean, median, std, min, max, p95, p99 keys.
421445
422446 Returns:
423447 NumericStats instance, or None if count is 0.
@@ -432,28 +456,78 @@ def _numeric_stats_from_dict(raw: dict[str, Any]) -> Optional[NumericStats]:
432456 std = raw .get ("std" ),
433457 min_value = raw .get ("min" ),
434458 max_value = raw .get ("max" ),
459+ p95 = raw .get ("p95" ),
460+ p99 = raw .get ("p99" ),
435461 )
436462
437463
def _compute_agent_token_usage(
    evaluation_data: list[EvaluationData],
) -> AgentTokenUsage:
    """Build the agent token usage summary (totals plus statistics).

    Args:
        evaluation_data: List of evaluation data with turn-level token counts.

    Returns:
        AgentTokenUsage carrying the three total counters from
        ``calculate_api_token_usage`` and the statistics produced by
        ``_compute_agent_token_stats``.
    """
    # Statistics are computed first, then the totals, matching the call order
    # callers have always observed.
    token_statistics = _compute_agent_token_stats(evaluation_data)
    totals = calculate_api_token_usage(evaluation_data)

    total_fields = (
        "total_api_input_tokens",
        "total_api_output_tokens",
        "total_api_tokens",
    )
    return AgentTokenUsage(
        **{field_name: totals[field_name] for field_name in total_fields},
        statistics=token_statistics,
    )
455483
456484
def _compute_agent_latency_stats(
    evaluation_data: list[EvaluationData],
) -> Optional[NumericStats]:
    """Summarize the ``agent_latency`` field across the evaluation data.

    Args:
        evaluation_data: List of evaluation data containing turn-level latency values.

    Returns:
        The result of converting the raw ``agent_latency`` statistics with
        ``_numeric_stats_from_dict``, or None when ``evaluation_data`` is empty.
    """
    if evaluation_data:
        latency_raw = calculate_field_numeric_stats_from_evaluation_data(
            evaluation_data, "agent_latency"
        )
        return _numeric_stats_from_dict(latency_raw)
    return None
502+
503+
def _compute_agent_token_stats(
    evaluation_data: list[EvaluationData],
) -> Optional[AgentTokenStats]:
    """Compute input/output token statistics from evaluation data.

    Args:
        evaluation_data: List of evaluation data containing turn-level token counts.

    Returns:
        AgentTokenStats whose ``input`` and ``output`` hold the converted
        statistics for ``api_input_tokens`` and ``api_output_tokens``
        (each as returned by ``_numeric_stats_from_dict``), or None when
        ``evaluation_data`` is empty.
    """
    if not evaluation_data:
        return None

    sides = (
        ("input", "api_input_tokens"),
        ("output", "api_output_tokens"),
    )

    # Gather both raw stat dictionaries before converting either one, so the
    # helper calls happen in the same order as before.
    raw_by_side = {
        side: calculate_field_numeric_stats_from_evaluation_data(
            evaluation_data, field_name
        )
        for side, field_name in sides
    }
    converted = {
        side: _numeric_stats_from_dict(raw) for side, raw in raw_by_side.items()
    }
    return AgentTokenStats(**converted)
529+
530+
457531def _compute_streaming_stats (
458532 evaluation_data : list [EvaluationData ],
459533) -> Optional [StreamingStats ]:
0 commit comments