Skip to content

Commit fc5f537

Browse files
authored
Merge pull request #247 from xmican10/chore-refactor-generator
chore: refactoring of the generator module
2 parents 9ada2a8 + 6abfbb6 commit fc5f537

2 files changed

Lines changed: 239 additions & 219 deletions

File tree

src/lightspeed_evaluation/core/output/generator.py

Lines changed: 23 additions & 219 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,21 @@
2222
)
2323
from lightspeed_evaluation.core.models.statistics import (
2424
AgentTokenStats,
25-
ConversationStats,
26-
MetricStats,
2725
NumericStats,
28-
OverallStats,
29-
StreamingStats,
30-
TagStats,
3126
)
3227
from lightspeed_evaluation.core.models.quality import QualityReport
3328
from lightspeed_evaluation.core.storage import FileBackendConfig, get_file_config
3429
from lightspeed_evaluation.core.output.visualization import GraphGenerator
30+
from lightspeed_evaluation.core.output.serializers import (
31+
numeric_stats_to_dict,
32+
streaming_stats_to_dict,
33+
summary_to_detailed_stats_dict,
34+
overall_to_basic_stats_dict,
35+
result_to_json_dict,
36+
metric_stats_to_dict,
37+
conversation_stats_to_dict,
38+
tag_stats_to_dict,
39+
)
3540

3641
logger = logging.getLogger(__name__)
3742

@@ -229,7 +234,7 @@ def _create_graphs(
229234

230235
# Convert summary by_metric/by_conversation/by_tag to dict format
231236
# that the GraphGenerator expects
232-
detailed_stats = _summary_to_detailed_stats_dict(summary)
237+
detailed_stats = summary_to_detailed_stats_dict(summary)
233238

234239
graph_generator = GraphGenerator(
235240
output_dir=str(self.output_dir), figsize=figsize, dpi=dpi
@@ -312,7 +317,7 @@ def _generate_json_summary_from_model(
312317
"total_evaluations": len(summary.results),
313318
"summary_stats": summary_stats,
314319
"configuration": self._build_config_dict(),
315-
"results": [_result_to_json_dict(r) for r in summary.results],
320+
"results": [result_to_json_dict(r) for r in summary.results],
316321
}
317322

318323
with open(json_file, "w", encoding="utf-8") as f:
@@ -391,7 +396,7 @@ def _generate_quality_score_report(
391396
for metric_id, stats in quality_report.extra_metrics.items()
392397
},
393398
"agent_latency_stats": (
394-
_numeric_stats_to_dict(quality_report.agent_latency_stats)
399+
numeric_stats_to_dict(quality_report.agent_latency_stats)
395400
if quality_report.agent_latency_stats is not None
396401
else None
397402
),
@@ -426,7 +431,7 @@ def _generate_text_summary_from_model(
426431
txt_file = out / f"{base_filename}_summary.txt"
427432

428433
# Build compatible dicts from summary model
429-
basic_stats = _overall_to_basic_stats_dict(summary.overall)
434+
basic_stats = overall_to_basic_stats_dict(summary.overall)
430435
api_tokens = (
431436
{
432437
"total_api_input_tokens": summary.agent_token_usage.total_api_input_tokens,
@@ -441,9 +446,9 @@ def _generate_text_summary_from_model(
441446
}
442447
)
443448
streaming_stats = (
444-
_streaming_stats_to_dict(summary.streaming) if summary.streaming else {}
449+
streaming_stats_to_dict(summary.streaming) if summary.streaming else {}
445450
)
446-
detailed_stats = _summary_to_detailed_stats_dict(summary)
451+
detailed_stats = summary_to_detailed_stats_dict(summary)
447452

448453
with open(txt_file, "w", encoding="utf-8") as f:
449454
f.write("LSC Evaluation Framework - Summary Report\n")
@@ -516,7 +521,7 @@ def _write_agent_latency_stats(
516521
if agent_latency is None:
517522
return # No API latency data available
518523

519-
stats_dict = _numeric_stats_to_dict(agent_latency)
524+
stats_dict = numeric_stats_to_dict(agent_latency)
520525
self._write_numeric_stats(
521526
f,
522527
"API Latency (seconds):\n" + "-" * 20,
@@ -768,7 +773,7 @@ def _build_json_summary_stats(summary: EvaluationSummary) -> dict[str, Any]:
768773
api_total = agent_token_usage.total_api_tokens if agent_token_usage else 0
769774

770775
overall_stats = {
771-
**_overall_to_basic_stats_dict(overall),
776+
**overall_to_basic_stats_dict(overall),
772777
"total_api_input_tokens": (
773778
agent_token_usage.total_api_input_tokens if agent_token_usage else 0
774779
),
@@ -781,218 +786,17 @@ def _build_json_summary_stats(summary: EvaluationSummary) -> dict[str, Any]:
781786

782787
result: dict[str, Any] = {
783788
"overall": overall_stats,
784-
"by_metric": _metric_stats_to_dict(summary.by_metric),
785-
"by_conversation": _conversation_stats_to_dict(summary.by_conversation),
786-
"by_tag": _tag_stats_to_dict(summary.by_tag),
789+
"by_metric": metric_stats_to_dict(summary.by_metric),
790+
"by_conversation": conversation_stats_to_dict(summary.by_conversation),
791+
"by_tag": tag_stats_to_dict(summary.by_tag),
787792
}
788793

789794
if summary.agent_latency_stats is not None:
790-
result["agent_latency_stats"] = _numeric_stats_to_dict(
795+
result["agent_latency_stats"] = numeric_stats_to_dict(
791796
summary.agent_latency_stats
792797
)
793798

794799
if summary.streaming is not None:
795-
result["streaming_performance"] = _streaming_stats_to_dict(summary.streaming)
800+
result["streaming_performance"] = streaming_stats_to_dict(summary.streaming)
796801

797802
return result
798-
799-
800-
def _result_to_json_dict(r: EvaluationResult) -> dict[str, Any]:
801-
"""Convert a single EvaluationResult to JSON-serializable dict.
802-
803-
Args:
804-
r: The evaluation result to convert.
805-
806-
Returns:
807-
Dictionary matching the existing JSON summary result format.
808-
"""
809-
return {
810-
"conversation_group_id": r.conversation_group_id,
811-
"tag": r.tag,
812-
"turn_id": r.turn_id,
813-
"metric_identifier": r.metric_identifier,
814-
"result": r.result,
815-
"score": r.score,
816-
"threshold": r.threshold,
817-
"execution_time": r.execution_time,
818-
"evaluation_latency": r.evaluation_latency,
819-
"judge_llm_input_tokens": r.judge_llm_input_tokens,
820-
"judge_llm_output_tokens": r.judge_llm_output_tokens,
821-
"judge_scores": (
822-
[js.model_dump() for js in r.judge_scores] if r.judge_scores else None
823-
),
824-
"time_to_first_token": r.time_to_first_token,
825-
"streaming_duration": r.streaming_duration,
826-
"agent_latency": r.agent_latency,
827-
"tokens_per_second": r.tokens_per_second,
828-
}
829-
830-
831-
def _overall_to_basic_stats_dict(
832-
overall: OverallStats,
833-
) -> dict[str, Any]:
834-
"""Convert OverallStats to the dict format expected by text output.
835-
836-
Args:
837-
overall: OverallStats model instance.
838-
839-
Returns:
840-
Dictionary with keys matching the original calculate_basic_stats format.
841-
"""
842-
return {
843-
"TOTAL": overall.total,
844-
"PASS": overall.passed,
845-
"FAIL": overall.failed,
846-
"ERROR": overall.error,
847-
"SKIPPED": overall.skipped,
848-
"pass_rate": overall.pass_rate,
849-
"fail_rate": overall.fail_rate,
850-
"error_rate": overall.error_rate,
851-
"skipped_rate": overall.skipped_rate,
852-
"total_judge_llm_input_tokens": overall.total_judge_llm_input_tokens,
853-
"total_judge_llm_output_tokens": overall.total_judge_llm_output_tokens,
854-
"total_judge_llm_tokens": overall.total_judge_llm_tokens,
855-
"total_embedding_tokens": overall.total_embedding_tokens,
856-
}
857-
858-
859-
def _group_stats_to_dict(
860-
stats: MetricStats | ConversationStats | TagStats,
861-
) -> dict[str, Any]:
862-
"""Convert a group stats model to the dict format for text output.
863-
864-
Args:
865-
stats: MetricStats, ConversationStats, or TagStats instance.
866-
867-
Returns:
868-
Dictionary with lowercase keys matching original detailed stats format.
869-
"""
870-
result: dict[str, Any] = {
871-
"pass": stats.passed,
872-
"fail": stats.failed,
873-
"error": stats.error,
874-
"skipped": stats.skipped,
875-
"pass_rate": stats.pass_rate,
876-
"fail_rate": stats.fail_rate,
877-
"error_rate": stats.error_rate,
878-
"skipped_rate": stats.skipped_rate,
879-
}
880-
if (
881-
isinstance(stats, (MetricStats, TagStats))
882-
and stats.score_statistics is not None
883-
):
884-
score_stats = stats.score_statistics
885-
result["score_statistics"] = {
886-
"count": score_stats.count,
887-
"mean": score_stats.mean,
888-
"median": score_stats.median,
889-
"std": score_stats.std,
890-
"min": score_stats.min_score,
891-
"max": score_stats.max_score,
892-
"confidence_interval": (
893-
score_stats.confidence_interval.model_dump()
894-
if score_stats.confidence_interval is not None
895-
else None
896-
),
897-
}
898-
return result
899-
900-
901-
def _metric_stats_to_dict(
902-
by_metric: dict[str, MetricStats],
903-
) -> dict[str, dict[str, Any]]:
904-
"""Convert by_metric model dict to legacy dict format.
905-
906-
Args:
907-
by_metric: Dictionary mapping metric IDs to MetricStats models.
908-
909-
Returns:
910-
Dictionary in the original detailed stats format.
911-
"""
912-
return {k: _group_stats_to_dict(v) for k, v in by_metric.items()}
913-
914-
915-
def _conversation_stats_to_dict(
916-
by_conversation: dict[str, ConversationStats],
917-
) -> dict[str, dict[str, Any]]:
918-
"""Convert by_conversation model dict to legacy dict format.
919-
920-
Args:
921-
by_conversation: Dictionary mapping conversation IDs to ConversationStats.
922-
923-
Returns:
924-
Dictionary in the original detailed stats format.
925-
"""
926-
return {k: _group_stats_to_dict(v) for k, v in by_conversation.items()}
927-
928-
929-
def _tag_stats_to_dict(
930-
by_tag: dict[str, TagStats],
931-
) -> dict[str, dict[str, Any]]:
932-
"""Convert by_tag model dict to legacy dict format.
933-
934-
Args:
935-
by_tag: Dictionary mapping tags to TagStats models.
936-
937-
Returns:
938-
Dictionary in the original detailed stats format.
939-
"""
940-
return {k: _group_stats_to_dict(v) for k, v in by_tag.items()}
941-
942-
943-
def _summary_to_detailed_stats_dict(
944-
summary: EvaluationSummary,
945-
) -> dict[str, Any]:
946-
"""Convert EvaluationSummary to the detailed stats dict format.
947-
948-
This produces a dictionary with by_metric, by_conversation, by_tag keys
949-
matching the format from compute_detailed_stats().
950-
951-
Args:
952-
summary: The EvaluationSummary instance.
953-
954-
Returns:
955-
Dictionary matching the original detailed stats format.
956-
"""
957-
return {
958-
"by_metric": _metric_stats_to_dict(summary.by_metric),
959-
"by_conversation": _conversation_stats_to_dict(summary.by_conversation),
960-
"by_tag": _tag_stats_to_dict(summary.by_tag),
961-
}
962-
963-
964-
def _streaming_stats_to_dict(streaming: StreamingStats) -> dict[str, Any]:
965-
"""Convert StreamingStats model to the dict format for text output.
966-
967-
Args:
968-
streaming: StreamingStats model instance.
969-
970-
Returns:
971-
Dictionary matching the original streaming stats format.
972-
"""
973-
result: dict[str, Any] = {}
974-
for field_name in (
975-
"time_to_first_token",
976-
"streaming_duration",
977-
"tokens_per_second",
978-
):
979-
numeric = getattr(streaming, field_name, None)
980-
if numeric is not None:
981-
result[field_name] = _numeric_stats_to_dict(numeric)
982-
else:
983-
result[field_name] = {"count": 0}
984-
return result
985-
986-
987-
def _numeric_stats_to_dict(numeric: NumericStats) -> dict[str, Any]:
988-
"""Convert NumericStats model to dict format for text output."""
989-
return {
990-
"count": numeric.count,
991-
"mean": numeric.mean,
992-
"median": numeric.median,
993-
"std": numeric.std,
994-
"min": numeric.min_value,
995-
"max": numeric.max_value,
996-
"p95": numeric.p95,
997-
"p99": numeric.p99,
998-
}

0 commit comments

Comments
 (0)