2222)
2323from lightspeed_evaluation .core .models .statistics import (
2424 AgentTokenStats ,
25- ConversationStats ,
26- MetricStats ,
2725 NumericStats ,
28- OverallStats ,
29- StreamingStats ,
30- TagStats ,
3126)
3227from lightspeed_evaluation .core .models .quality import QualityReport
3328from lightspeed_evaluation .core .storage import FileBackendConfig , get_file_config
3429from lightspeed_evaluation .core .output .visualization import GraphGenerator
30+ from lightspeed_evaluation .core .output .serializers import (
31+ numeric_stats_to_dict ,
32+ streaming_stats_to_dict ,
33+ summary_to_detailed_stats_dict ,
34+ overall_to_basic_stats_dict ,
35+ result_to_json_dict ,
36+ metric_stats_to_dict ,
37+ conversation_stats_to_dict ,
38+ tag_stats_to_dict ,
39+ )
3540
3641logger = logging .getLogger (__name__ )
3742
@@ -229,7 +234,7 @@ def _create_graphs(
229234
230235 # Convert summary by_metric/by_conversation/by_tag to dict format
231236 # that the GraphGenerator expects
232- detailed_stats = _summary_to_detailed_stats_dict (summary )
237+ detailed_stats = summary_to_detailed_stats_dict (summary )
233238
234239 graph_generator = GraphGenerator (
235240 output_dir = str (self .output_dir ), figsize = figsize , dpi = dpi
@@ -312,7 +317,7 @@ def _generate_json_summary_from_model(
312317 "total_evaluations" : len (summary .results ),
313318 "summary_stats" : summary_stats ,
314319 "configuration" : self ._build_config_dict (),
315- "results" : [_result_to_json_dict (r ) for r in summary .results ],
320+ "results" : [result_to_json_dict (r ) for r in summary .results ],
316321 }
317322
318323 with open (json_file , "w" , encoding = "utf-8" ) as f :
@@ -391,7 +396,7 @@ def _generate_quality_score_report(
391396 for metric_id , stats in quality_report .extra_metrics .items ()
392397 },
393398 "agent_latency_stats" : (
394- _numeric_stats_to_dict (quality_report .agent_latency_stats )
399+ numeric_stats_to_dict (quality_report .agent_latency_stats )
395400 if quality_report .agent_latency_stats is not None
396401 else None
397402 ),
@@ -426,7 +431,7 @@ def _generate_text_summary_from_model(
426431 txt_file = out / f"{ base_filename } _summary.txt"
427432
428433 # Build compatible dicts from summary model
429- basic_stats = _overall_to_basic_stats_dict (summary .overall )
434+ basic_stats = overall_to_basic_stats_dict (summary .overall )
430435 api_tokens = (
431436 {
432437 "total_api_input_tokens" : summary .agent_token_usage .total_api_input_tokens ,
@@ -441,9 +446,9 @@ def _generate_text_summary_from_model(
441446 }
442447 )
443448 streaming_stats = (
444- _streaming_stats_to_dict (summary .streaming ) if summary .streaming else {}
449+ streaming_stats_to_dict (summary .streaming ) if summary .streaming else {}
445450 )
446- detailed_stats = _summary_to_detailed_stats_dict (summary )
451+ detailed_stats = summary_to_detailed_stats_dict (summary )
447452
448453 with open (txt_file , "w" , encoding = "utf-8" ) as f :
449454 f .write ("LSC Evaluation Framework - Summary Report\n " )
@@ -516,7 +521,7 @@ def _write_agent_latency_stats(
516521 if agent_latency is None :
517522 return # No API latency data available
518523
519- stats_dict = _numeric_stats_to_dict (agent_latency )
524+ stats_dict = numeric_stats_to_dict (agent_latency )
520525 self ._write_numeric_stats (
521526 f ,
522527 "API Latency (seconds):\n " + "-" * 20 ,
@@ -768,7 +773,7 @@ def _build_json_summary_stats(summary: EvaluationSummary) -> dict[str, Any]:
768773 api_total = agent_token_usage .total_api_tokens if agent_token_usage else 0
769774
770775 overall_stats = {
771- ** _overall_to_basic_stats_dict (overall ),
776+ ** overall_to_basic_stats_dict (overall ),
772777 "total_api_input_tokens" : (
773778 agent_token_usage .total_api_input_tokens if agent_token_usage else 0
774779 ),
@@ -781,218 +786,17 @@ def _build_json_summary_stats(summary: EvaluationSummary) -> dict[str, Any]:
781786
782787 result : dict [str , Any ] = {
783788 "overall" : overall_stats ,
784- "by_metric" : _metric_stats_to_dict (summary .by_metric ),
785- "by_conversation" : _conversation_stats_to_dict (summary .by_conversation ),
786- "by_tag" : _tag_stats_to_dict (summary .by_tag ),
789+ "by_metric" : metric_stats_to_dict (summary .by_metric ),
790+ "by_conversation" : conversation_stats_to_dict (summary .by_conversation ),
791+ "by_tag" : tag_stats_to_dict (summary .by_tag ),
787792 }
788793
789794 if summary .agent_latency_stats is not None :
790- result ["agent_latency_stats" ] = _numeric_stats_to_dict (
795+ result ["agent_latency_stats" ] = numeric_stats_to_dict (
791796 summary .agent_latency_stats
792797 )
793798
794799 if summary .streaming is not None :
795- result ["streaming_performance" ] = _streaming_stats_to_dict (summary .streaming )
800+ result ["streaming_performance" ] = streaming_stats_to_dict (summary .streaming )
796801
797802 return result
798-
799-
800- def _result_to_json_dict (r : EvaluationResult ) -> dict [str , Any ]:
801- """Convert a single EvaluationResult to JSON-serializable dict.
802-
803- Args:
804- r: The evaluation result to convert.
805-
806- Returns:
807- Dictionary matching the existing JSON summary result format.
808- """
809- return {
810- "conversation_group_id" : r .conversation_group_id ,
811- "tag" : r .tag ,
812- "turn_id" : r .turn_id ,
813- "metric_identifier" : r .metric_identifier ,
814- "result" : r .result ,
815- "score" : r .score ,
816- "threshold" : r .threshold ,
817- "execution_time" : r .execution_time ,
818- "evaluation_latency" : r .evaluation_latency ,
819- "judge_llm_input_tokens" : r .judge_llm_input_tokens ,
820- "judge_llm_output_tokens" : r .judge_llm_output_tokens ,
821- "judge_scores" : (
822- [js .model_dump () for js in r .judge_scores ] if r .judge_scores else None
823- ),
824- "time_to_first_token" : r .time_to_first_token ,
825- "streaming_duration" : r .streaming_duration ,
826- "agent_latency" : r .agent_latency ,
827- "tokens_per_second" : r .tokens_per_second ,
828- }
829-
830-
831- def _overall_to_basic_stats_dict (
832- overall : OverallStats ,
833- ) -> dict [str , Any ]:
834- """Convert OverallStats to the dict format expected by text output.
835-
836- Args:
837- overall: OverallStats model instance.
838-
839- Returns:
840- Dictionary with keys matching the original calculate_basic_stats format.
841- """
842- return {
843- "TOTAL" : overall .total ,
844- "PASS" : overall .passed ,
845- "FAIL" : overall .failed ,
846- "ERROR" : overall .error ,
847- "SKIPPED" : overall .skipped ,
848- "pass_rate" : overall .pass_rate ,
849- "fail_rate" : overall .fail_rate ,
850- "error_rate" : overall .error_rate ,
851- "skipped_rate" : overall .skipped_rate ,
852- "total_judge_llm_input_tokens" : overall .total_judge_llm_input_tokens ,
853- "total_judge_llm_output_tokens" : overall .total_judge_llm_output_tokens ,
854- "total_judge_llm_tokens" : overall .total_judge_llm_tokens ,
855- "total_embedding_tokens" : overall .total_embedding_tokens ,
856- }
857-
858-
859- def _group_stats_to_dict (
860- stats : MetricStats | ConversationStats | TagStats ,
861- ) -> dict [str , Any ]:
862- """Convert a group stats model to the dict format for text output.
863-
864- Args:
865- stats: MetricStats, ConversationStats, or TagStats instance.
866-
867- Returns:
868- Dictionary with lowercase keys matching original detailed stats format.
869- """
870- result : dict [str , Any ] = {
871- "pass" : stats .passed ,
872- "fail" : stats .failed ,
873- "error" : stats .error ,
874- "skipped" : stats .skipped ,
875- "pass_rate" : stats .pass_rate ,
876- "fail_rate" : stats .fail_rate ,
877- "error_rate" : stats .error_rate ,
878- "skipped_rate" : stats .skipped_rate ,
879- }
880- if (
881- isinstance (stats , (MetricStats , TagStats ))
882- and stats .score_statistics is not None
883- ):
884- score_stats = stats .score_statistics
885- result ["score_statistics" ] = {
886- "count" : score_stats .count ,
887- "mean" : score_stats .mean ,
888- "median" : score_stats .median ,
889- "std" : score_stats .std ,
890- "min" : score_stats .min_score ,
891- "max" : score_stats .max_score ,
892- "confidence_interval" : (
893- score_stats .confidence_interval .model_dump ()
894- if score_stats .confidence_interval is not None
895- else None
896- ),
897- }
898- return result
899-
900-
901- def _metric_stats_to_dict (
902- by_metric : dict [str , MetricStats ],
903- ) -> dict [str , dict [str , Any ]]:
904- """Convert by_metric model dict to legacy dict format.
905-
906- Args:
907- by_metric: Dictionary mapping metric IDs to MetricStats models.
908-
909- Returns:
910- Dictionary in the original detailed stats format.
911- """
912- return {k : _group_stats_to_dict (v ) for k , v in by_metric .items ()}
913-
914-
915- def _conversation_stats_to_dict (
916- by_conversation : dict [str , ConversationStats ],
917- ) -> dict [str , dict [str , Any ]]:
918- """Convert by_conversation model dict to legacy dict format.
919-
920- Args:
921- by_conversation: Dictionary mapping conversation IDs to ConversationStats.
922-
923- Returns:
924- Dictionary in the original detailed stats format.
925- """
926- return {k : _group_stats_to_dict (v ) for k , v in by_conversation .items ()}
927-
928-
929- def _tag_stats_to_dict (
930- by_tag : dict [str , TagStats ],
931- ) -> dict [str , dict [str , Any ]]:
932- """Convert by_tag model dict to legacy dict format.
933-
934- Args:
935- by_tag: Dictionary mapping tags to TagStats models.
936-
937- Returns:
938- Dictionary in the original detailed stats format.
939- """
940- return {k : _group_stats_to_dict (v ) for k , v in by_tag .items ()}
941-
942-
943- def _summary_to_detailed_stats_dict (
944- summary : EvaluationSummary ,
945- ) -> dict [str , Any ]:
946- """Convert EvaluationSummary to the detailed stats dict format.
947-
948- This produces a dictionary with by_metric, by_conversation, by_tag keys
949- matching the format from compute_detailed_stats().
950-
951- Args:
952- summary: The EvaluationSummary instance.
953-
954- Returns:
955- Dictionary matching the original detailed stats format.
956- """
957- return {
958- "by_metric" : _metric_stats_to_dict (summary .by_metric ),
959- "by_conversation" : _conversation_stats_to_dict (summary .by_conversation ),
960- "by_tag" : _tag_stats_to_dict (summary .by_tag ),
961- }
962-
963-
964- def _streaming_stats_to_dict (streaming : StreamingStats ) -> dict [str , Any ]:
965- """Convert StreamingStats model to the dict format for text output.
966-
967- Args:
968- streaming: StreamingStats model instance.
969-
970- Returns:
971- Dictionary matching the original streaming stats format.
972- """
973- result : dict [str , Any ] = {}
974- for field_name in (
975- "time_to_first_token" ,
976- "streaming_duration" ,
977- "tokens_per_second" ,
978- ):
979- numeric = getattr (streaming , field_name , None )
980- if numeric is not None :
981- result [field_name ] = _numeric_stats_to_dict (numeric )
982- else :
983- result [field_name ] = {"count" : 0 }
984- return result
985-
986-
987- def _numeric_stats_to_dict (numeric : NumericStats ) -> dict [str , Any ]:
988- """Convert NumericStats model to dict format for text output."""
989- return {
990- "count" : numeric .count ,
991- "mean" : numeric .mean ,
992- "median" : numeric .median ,
993- "std" : numeric .std ,
994- "min" : numeric .min_value ,
995- "max" : numeric .max_value ,
996- "p95" : numeric .p95 ,
997- "p99" : numeric .p99 ,
998- }
0 commit comments