1414from lightspeed_evaluation .core .models .data import EvaluationData , EvaluationResult
1515from lightspeed_evaluation .core .output .statistics import (
1616 bootstrap_intervals ,
17+ calculate_field_numeric_stats_from_evaluation_data ,
1718 calculate_api_token_usage ,
1819 calculate_streaming_stats ,
1920)
2021
2122
class NumericStats(BaseModel):
    """Summary statistics describing a collection of numeric samples.

    Intended for measurements such as TTFT, duration, or latency. ``count``
    defaults to 0; every other statistic defaults to ``None``.
    """

    # Sample size.
    count: int = Field(0, description="Number of values")

    # Central tendency and spread.
    mean: Optional[float] = Field(None, description="Mean value")
    median: Optional[float] = Field(None, description="Median value")
    std: Optional[float] = Field(None, description="Standard deviation")

    # Observed range.
    min_value: Optional[float] = Field(None, description="Minimum value")
    max_value: Optional[float] = Field(None, description="Maximum value")

    # Upper-tail percentiles.
    p95: Optional[float] = Field(None, description="95th percentile")
    p99: Optional[float] = Field(None, description="99th percentile")
3134
3235
3336class ScoreStatistics (BaseModel ):
@@ -101,14 +104,28 @@ class StreamingStats(BaseModel):
101104 )
102105
103106
104- class ApiTokenUsage (BaseModel ):
105- """API token usage totals."""
class AgentTokenStats(BaseModel):
    """Numeric statistics (including percentiles) for agent token usage.

    Holds one optional ``NumericStats`` for input tokens and one for output
    tokens; both default to ``None``.
    """

    # NOTE: ``input`` shadows the builtin inside the class body only; the
    # field name is part of the serialized schema and must stay as-is.
    input: Optional[NumericStats] = Field(None, description="Input token statistics")
    output: Optional[NumericStats] = Field(None, description="Output token statistics")
116+
117+
class AgentTokenUsage(BaseModel):
    """Aggregate agent token totals plus optional per-direction statistics.

    The three totals default to 0; ``statistics`` defaults to ``None``.
    """

    # Totals.
    total_api_input_tokens: int = Field(0, description="Total API input tokens")
    total_api_output_tokens: int = Field(0, description="Total API output tokens")
    total_api_tokens: int = Field(0, description="Total API tokens")

    # Distribution of input/output token counts.
    statistics: Optional[AgentTokenStats] = Field(
        None, description="Agent token usage statistics with percentiles"
    )
112129
113130
114131class EvaluationSummary (BaseModel ):
@@ -134,8 +151,11 @@ class EvaluationSummary(BaseModel):
134151 by_tag : dict [str , TagStats ] = Field (
135152 default_factory = dict , description = "Statistics per tag"
136153 )
137- api_tokens : Optional [ApiTokenUsage ] = Field (
138- default = None , description = "API token usage (when evaluation data provided)"
154+ agent_token_usage : Optional [AgentTokenUsage ] = Field (
155+ default = None , description = "Agent token usage with totals and statistics"
156+ )
157+ agent_latency_stats : Optional [NumericStats ] = Field (
158+ default = None , description = "API latency statistics (when API enabled)"
139159 )
140160 streaming : Optional [StreamingStats ] = Field (
141161 default = None , description = "Streaming performance stats (when available)"
@@ -172,11 +192,14 @@ def from_results(
172192 by_tag = _compute_tag_stats (results , compute_confidence_intervals )
173193
174194 # Compute API token usage and streaming stats if evaluation data provided
175- api_tokens = None
195+ agent_token_usage = None
176196 streaming = None
197+ agent_latency_stats = None
177198 if evaluation_data :
178- api_tokens = _compute_api_token_usage (evaluation_data )
199+ agent_token_usage = _compute_agent_token_usage (evaluation_data )
179200 streaming = _compute_streaming_stats (evaluation_data )
201+ # Compute Agent latency statistics from evaluation data
202+ agent_latency_stats = _compute_agent_latency_stats (evaluation_data )
180203
181204 return cls (
182205 timestamp = timestamp ,
@@ -185,7 +208,8 @@ def from_results(
185208 by_metric = by_metric ,
186209 by_conversation = by_conversation ,
187210 by_tag = by_tag ,
188- api_tokens = api_tokens ,
211+ agent_token_usage = agent_token_usage ,
212+ agent_latency_stats = agent_latency_stats ,
189213 streaming = streaming ,
190214 )
191215
@@ -413,47 +437,94 @@ def _try_bootstrap(scores: list[float]) -> Optional[dict[str, float]]:
413437 return None
414438
415439
def _numeric_stats_from_dict(raw: dict[str, Any]) -> NumericStats:
    """Build a NumericStats model from a raw statistics dictionary.

    Args:
        raw: Dictionary that may carry the keys count, mean, median, std, min,
            max, p95 and p99. Missing keys are tolerated.

    Returns:
        NumericStats populated from ``raw``. ``count`` falls back to 0 and every
        other statistic to None when its key is absent, so a model is always
        returned even when no values were collected.
    """
    # Model field name -> key in the raw dictionary (min/max are renamed).
    field_to_key = {
        "mean": "mean",
        "median": "median",
        "std": "std",
        "min_value": "min",
        "max_value": "max",
        "p95": "p95",
        "p99": "p99",
    }
    optional_values = {field: raw.get(key) for field, key in field_to_key.items()}
    return NumericStats(count=raw.get("count", 0), **optional_values)
436459
437460
def _compute_agent_token_usage(
    evaluation_data: list[EvaluationData],
) -> AgentTokenUsage:
    """Assemble agent token totals and statistics from evaluation data.

    Args:
        evaluation_data: List of evaluation data with turn-level token counts.

    Returns:
        AgentTokenUsage carrying the three totals reported by
        ``calculate_api_token_usage`` and the statistics produced by
        ``_compute_agent_token_stats``.
    """
    token_stats = _compute_agent_token_stats(evaluation_data)
    totals = calculate_api_token_usage(evaluation_data)

    # Copy only the total fields the model declares; a missing key raises KeyError.
    total_fields = (
        "total_api_input_tokens",
        "total_api_output_tokens",
        "total_api_tokens",
    )
    return AgentTokenUsage(
        statistics=token_stats,
        **{name: totals[name] for name in total_fields},
    )
455480
456481
def _compute_agent_latency_stats(
    evaluation_data: list[EvaluationData],
) -> Optional[NumericStats]:
    """Derive agent latency statistics from evaluation data.

    Args:
        evaluation_data: List of evaluation data containing turn-level latency
            values.

    Returns:
        None when ``evaluation_data`` is empty. Otherwise a NumericStats built
        from the "agent_latency" field statistics (its count may be 0 if no
        valid values were found).
    """
    if not evaluation_data:
        return None

    latency_field = "agent_latency"
    return _numeric_stats_from_dict(
        calculate_field_numeric_stats_from_evaluation_data(
            evaluation_data, latency_field
        )
    )
499+
500+
def _compute_agent_token_stats(
    evaluation_data: list[EvaluationData],
) -> Optional[AgentTokenStats]:
    """Compute input/output agent token statistics from evaluation data.

    Args:
        evaluation_data: List of evaluation data containing turn-level token
            counts.

    Returns:
        None when ``evaluation_data`` is empty. Otherwise an AgentTokenStats
        whose ``input`` and ``output`` are built from the "api_input_tokens"
        and "api_output_tokens" field statistics respectively.
    """
    if not evaluation_data:
        return None

    # AgentTokenStats field name -> evaluation-data field it summarizes.
    source_fields = {"input": "api_input_tokens", "output": "api_output_tokens"}

    # Gather all raw statistics first, then convert them to models.
    raw_by_direction = {
        direction: calculate_field_numeric_stats_from_evaluation_data(
            evaluation_data, field_name
        )
        for direction, field_name in source_fields.items()
    }
    converted = {
        direction: _numeric_stats_from_dict(raw_stats)
        for direction, raw_stats in raw_by_direction.items()
    }
    return AgentTokenStats(**converted)
526+
527+
457528def _compute_streaming_stats (
458529 evaluation_data : list [EvaluationData ],
459530) -> Optional [StreamingStats ]:
0 commit comments