2323_DEFAULT_BINS = 20
# How many distinct error messages to list in the failure breakdown.
_TOP_ERRORS = 10
# How many of the most-repeated sentences to list in the duplicates breakdown;
# any further duplicated sentences are summarised in one "… and N more" row.
_TOP_DUPLICATES = 10
# Maximum number of characters shown per sentence when listing duplicates.
# Longer sentences are cut and end with "…", which counts towards this width.
_SENTENCE_WIDTH = 80
2630
2731
2832def run_stats (args : argparse .Namespace ) -> None :
@@ -40,6 +44,7 @@ def run_stats(args: argparse.Namespace) -> None:
4044 token_counts : list [int ] = []
4145 type_counter : Counter [str ] = Counter ()
4246 error_counter : Counter [str ] = Counter ()
47+ text_counter : Counter [str ] = Counter ()
4348
4449 with open (path , encoding = "utf-8" ) as f :
4550 for line in f :
@@ -52,6 +57,7 @@ def run_stats(args: argparse.Namespace) -> None:
5257 except Exception :
5358 malformed += 1
5459 continue
60+ text_counter .update ([pr .text ])
5561 if pr .failed :
5662 failed += 1
5763 error_counter .update (pr .errors or ["(no error message)" ])
@@ -75,6 +81,14 @@ def run_stats(args: argparse.Namespace) -> None:
7581
7682 console .print (_overview_table (path , total , successful , failed , malformed ))
7783
84+ total_texts = sum (text_counter .values ())
85+ if total_texts :
86+ console .print ()
87+ console .print (_duplicates_table (text_counter , total_texts ))
88+ if any (c > 1 for c in text_counter .values ()):
89+ console .print ()
90+ console .print (_top_duplicates_table (text_counter ))
91+
7892 if successful :
7993 console .print ()
8094 console .print (_summary_table (sizes , depths , token_counts ))
@@ -209,3 +223,39 @@ def _failure_table(error_counter: Counter[str], failed: int) -> Table:
209223 if remaining > 0 :
210224 table .add_row (f"… and { remaining } more distinct messages" , "" )
211225 return table
226+
227+
def _truncate(text: str, width: int = _SENTENCE_WIDTH) -> str:
    """Return *text* as a single display line of at most *width* characters.

    Every run of whitespace (newlines, carriage returns, tabs, repeated
    spaces) is collapsed to one space and both ends are trimmed, so embedded
    line breaks never reach the output. Text longer than *width* is cut and
    ends with "…"; the ellipsis counts towards *width*.

    Args:
        text: The sentence to shorten.
        width: Maximum length of the result, in characters.

    Returns:
        The flattened, possibly shortened text. A non-positive *width*
        yields an empty string.
    """
    if width <= 0:
        # Without this guard ``text[: width - 1]`` would slice from the end
        # of the string and return something longer than requested.
        return ""
    # ``str.split()`` with no separator splits on any whitespace run, which
    # also covers "\r\n" and tabs rather than just "\n".
    flat = " ".join(text.split())
    if len(flat) <= width:
        return flat
    return flat[: width - 1] + "…"
231+
232+
def _duplicates_table(text_counter: Counter[str], total_texts: int) -> Table:
    """Build the summary table of how much of the input is repeated text.

    Args:
        text_counter: Number of occurrences of each sentence.
        total_texts: Total number of sentences counted, i.e. the sum of all
            values in *text_counter*.

    Returns:
        A header-less two-column table with four rows: the total, the number
        of distinct sentences, the number of distinct sentences seen more
        than once, and the number of occurrences beyond the first of each
        sentence. Ratios are formatted by ``_pct`` (defined elsewhere in
        this module).
    """
    # Occurrence counts of the sentences that appear more than once.
    repeat_counts = [n for n in text_counter.values() if n > 1]
    n_unique = len(text_counter)
    n_duplicated = len(repeat_counts)
    # Each repeated sentence contributes all of its copies except the first.
    n_redundant = sum(repeat_counts) - n_duplicated

    table = Table(title="Duplicate sentences", show_header=False)
    table.add_column("Metric", style="bold")
    table.add_column("Value")

    rows = (
        ("Total sentences", str(total_texts)),
        ("Unique sentences", f"{n_unique} ({_pct(n_unique, total_texts)})"),
        (
            "Duplicated sentences",
            f"{n_duplicated} ({_pct(n_duplicated, n_unique)} of unique)",
        ),
        ("Redundant copies", f"{n_redundant} ({_pct(n_redundant, total_texts)})"),
    )
    for label, value in rows:
        table.add_row(label, value)
    return table
249+
250+
def _top_duplicates_table(text_counter: Counter[str]) -> Table:
    """Build the table listing the most frequently repeated sentences.

    Only sentences seen more than once are considered. At most
    ``_TOP_DUPLICATES`` of them are listed, highest count first (ties keep
    the order in which the sentences were first counted); any others are
    summarised in a final "… and N more" row.

    Args:
        text_counter: Number of occurrences of each sentence.

    Returns:
        A two-column table: occurrence count and the sentence as shortened
        by ``_truncate``.
    """
    # NOTE(review): assumes ``Table`` is ``rich.table.Table``. Rich parses
    # ``str`` cells as console markup, so a sentence containing "[bold]" would
    # lose text and an unmatched "[/x]" would raise MarkupError. A ``Text``
    # instance is rendered literally. Imported locally so this function does
    # not depend on the module's import block.
    from rich.text import Text

    # Drop singletons before ranking, so only the repeated sentences are
    # ordered instead of every distinct sentence in the input.
    duplicated = Counter({t: c for t, c in text_counter.items() if c > 1})

    table = Table(title="Most duplicated sentences")
    table.add_column("Count", justify="right")
    table.add_column("Sentence")
    for text, count in duplicated.most_common(_TOP_DUPLICATES):
        table.add_row(str(count), Text(_truncate(text)))
    remaining = len(duplicated) - _TOP_DUPLICATES
    if remaining > 0:
        table.add_row("", f"… and {remaining} more duplicated sentences")
    return table
0 commit comments