Skip to content

Commit 8687ca1

Browse files
authored
Merge pull request #233 from xmican10/summary-statistics-refactor
chore: Summary and statistics refactor
2 parents f25b262 + 51b1cd9 commit 8687ca1

14 files changed

Lines changed: 725 additions & 1006 deletions

File tree

src/lightspeed_evaluation/core/models/__init__.py

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,18 @@
3535
SystemConfig,
3636
VisualizationConfig,
3737
)
38+
from lightspeed_evaluation.core.models.statistics import (
39+
NumericStats,
40+
ScoreStatistics,
41+
OverallStats,
42+
MetricStats,
43+
ConversationStats,
44+
TagStats,
45+
StreamingStats,
46+
ApiTokenUsage,
47+
ConfidenceInterval,
48+
DetailedStats,
49+
)
3850

3951
__all__ = [
4052
# Agent config models
@@ -64,6 +76,17 @@
6476
"LoggingConfig",
6577
"SystemConfig",
6678
"VisualizationConfig",
79+
# Stats models
80+
"NumericStats",
81+
"ScoreStatistics",
82+
"OverallStats",
83+
"MetricStats",
84+
"ConversationStats",
85+
"TagStats",
86+
"StreamingStats",
87+
"ApiTokenUsage",
88+
"ConfidenceInterval",
89+
"DetailedStats",
6790
# API models
6891
"APIRequest",
6992
"APIResponse",

src/lightspeed_evaluation/core/models/quality.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99

1010
from pydantic import BaseModel, Field
1111

12-
from lightspeed_evaluation.core.models.summary import MetricStats, ScoreStatistics
12+
from lightspeed_evaluation.core.models import MetricStats, ScoreStatistics
1313

1414
logger = logging.getLogger(__name__)
1515

src/lightspeed_evaluation/core/models/statistics.py

Lines changed: 126 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,126 @@
1+
"""Pydantic models for evaluation statistics."""
2+
3+
from typing import Optional
4+
from pydantic import BaseModel, Field
5+
6+
7+
class NumericStats(BaseModel):
    """Numeric statistics for a set of values (e.g., TTFT, duration)."""

    # Sample size is the only field here with a numeric default.
    count: int = Field(0, description="Number of values")

    # Every aggregate is optional and defaults to None (not 0.0).
    mean: Optional[float] = Field(
        None,
        description="Mean value",
    )
    median: Optional[float] = Field(
        None,
        description="Median value",
    )
    std: Optional[float] = Field(
        None,
        description="Standard deviation",
    )
    min_value: Optional[float] = Field(
        None,
        description="Minimum value",
    )
    max_value: Optional[float] = Field(
        None,
        description="Maximum value",
    )
16+
17+
18+
class ConfidenceInterval(BaseModel):
    """Bootstrap confidence interval for score statistics."""

    # Interval bounds and centre; all three default to 0.0.
    low: float = Field(0.0, description="Lower bound of the confidence interval")
    mean: float = Field(
        0.0,
        description="Mean value from bootstrap samples",
    )
    high: float = Field(0.0, description="Upper bound of the confidence interval")

    # Stored as a percentage value (default 95.0); no bounds are enforced here.
    confidence_level: float = Field(
        95.0,
        description="Confidence level as a percentage (e.g., 95.0 for 95%)",
    )
32+
33+
34+
class ScoreStatistics(BaseModel):
    """Score statistics for a metric or group."""

    # Number of scores the aggregates below refer to.
    count: int = Field(0, description="Number of scored results")

    # Aggregates are required-typed floats that default to 0.0 (never None).
    mean: float = Field(
        0.0,
        description="Mean score",
    )
    median: float = Field(
        0.0,
        description="Median score",
    )
    std: float = Field(
        0.0,
        description="Standard deviation",
    )
    min_score: float = Field(
        0.0,
        description="Minimum score",
    )
    max_score: float = Field(
        0.0,
        description="Maximum score",
    )

    # The interval is the only optional member; it is None unless supplied.
    confidence_interval: Optional[ConfidenceInterval] = Field(
        None,
        description="Bootstrap confidence interval with low, mean, high, confidence_level",
    )
47+
48+
49+
class OverallStats(BaseModel):
    """Overall pass/fail/error/skipped statistics."""

    # --- Outcome counts -----------------------------------------------------
    total: int = Field(
        0,
        description="Total number of evaluations",
    )
    passed: int = Field(
        0,
        description="Number of passed evaluations",
    )
    failed: int = Field(
        0,
        description="Number of failed evaluations",
    )
    error: int = Field(
        0,
        description="Number of error evaluations",
    )
    skipped: int = Field(
        0,
        description="Number of skipped evaluations",
    )

    # --- Outcome rates ------------------------------------------------------
    # Plain stored fields: nothing in this class derives them from the counts.
    pass_rate: float = Field(
        0.0,
        description="Pass rate percentage",
    )
    fail_rate: float = Field(
        0.0,
        description="Fail rate percentage",
    )
    error_rate: float = Field(
        0.0,
        description="Error rate percentage",
    )
    skipped_rate: float = Field(
        0.0,
        description="Skipped rate percentage",
    )

    # --- Token totals -------------------------------------------------------
    total_judge_llm_input_tokens: int = Field(
        0,
        description="Total judge LLM input tokens",
    )
    total_judge_llm_output_tokens: int = Field(
        0,
        description="Total judge LLM output tokens",
    )
    total_judge_llm_tokens: int = Field(
        0,
        description="Total judge LLM tokens",
    )
    total_embedding_tokens: int = Field(
        0,
        description="Total embedding tokens",
    )
69+
70+
71+
class MetricStats(OverallStats):
    """Statistics for a specific metric, extending OverallStats with score statistics."""

    # The single field added on top of the inherited OverallStats fields;
    # it stays None unless score statistics are supplied.
    score_statistics: Optional[ScoreStatistics] = Field(
        None,
        description="Score statistics for this metric",
    )
77+
78+
79+
class ConversationStats(OverallStats):
    """Statistics for a specific conversation group."""

    # Declares no fields of its own; every field comes from OverallStats.
81+
82+
83+
class TagStats(OverallStats):
    """Statistics for a specific tag, extending OverallStats with score statistics."""

    # The single field added on top of the inherited OverallStats fields;
    # it stays None unless score statistics are supplied.
    score_statistics: Optional[ScoreStatistics] = Field(
        None,
        description="Score statistics for this tag",
    )
89+
90+
91+
class DetailedStats(BaseModel):
    """Detailed statistics broken down by metric, conversation, and tag."""

    # Each mapping uses default_factory so every instance starts with its own
    # empty dict.
    by_metric: dict[str, MetricStats] = Field(
        description="Statistics grouped by metric name",
        default_factory=dict,
    )
    by_conversation: dict[str, ConversationStats] = Field(
        description="Statistics grouped by conversation ID",
        default_factory=dict,
    )
    by_tag: dict[str, TagStats] = Field(
        description="Statistics grouped by tag",
        default_factory=dict,
    )
103+
104+
105+
class StreamingStats(BaseModel):
    """Streaming performance statistics."""

    # Each measurement is an optional NumericStats summary and is None unless
    # a value is supplied.
    time_to_first_token: Optional[NumericStats] = Field(
        None,
        description="Time to first token statistics",
    )
    streaming_duration: Optional[NumericStats] = Field(
        None,
        description="Streaming duration statistics",
    )
    tokens_per_second: Optional[NumericStats] = Field(
        None,
        description="Tokens per second statistics",
    )
117+
118+
119+
class ApiTokenUsage(BaseModel):
    """API token usage totals."""

    # Three independent integer counters, each defaulting to 0. The combined
    # total is a stored field; this class does not compute it from the others.
    total_api_input_tokens: int = Field(
        0,
        description="Total API input tokens",
    )
    total_api_output_tokens: int = Field(
        0,
        description="Total API output tokens",
    )
    total_api_tokens: int = Field(
        0,
        description="Total API tokens",
    )

0 commit comments

Comments
 (0)