Skip to content

Commit 2e77b31

Browse files
committed
stats reports duplicates
1 parent 8d40ba5 commit 2e77b31

1 file changed

Lines changed: 50 additions & 0 deletions

File tree

src/hyperbase/cli/stats.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
_DEFAULT_BINS = 20
2424
# How many distinct error messages to list in the failure breakdown.
2525
_TOP_ERRORS = 10
26+
# How many of the most-repeated sentences to list in the duplicates breakdown.
27+
_TOP_DUPLICATES = 10
28+
# Width to truncate sentences to when listing duplicates.
29+
_SENTENCE_WIDTH = 80
2630

2731

2832
def run_stats(args: argparse.Namespace) -> None:
@@ -40,6 +44,7 @@ def run_stats(args: argparse.Namespace) -> None:
4044
token_counts: list[int] = []
4145
type_counter: Counter[str] = Counter()
4246
error_counter: Counter[str] = Counter()
47+
text_counter: Counter[str] = Counter()
4348

4449
with open(path, encoding="utf-8") as f:
4550
for line in f:
@@ -52,6 +57,7 @@ def run_stats(args: argparse.Namespace) -> None:
5257
except Exception:
5358
malformed += 1
5459
continue
60+
text_counter.update([pr.text])
5561
if pr.failed:
5662
failed += 1
5763
error_counter.update(pr.errors or ["(no error message)"])
@@ -75,6 +81,14 @@ def run_stats(args: argparse.Namespace) -> None:
7581

7682
console.print(_overview_table(path, total, successful, failed, malformed))
7783

84+
total_texts = sum(text_counter.values())
85+
if total_texts:
86+
console.print()
87+
console.print(_duplicates_table(text_counter, total_texts))
88+
if any(c > 1 for c in text_counter.values()):
89+
console.print()
90+
console.print(_top_duplicates_table(text_counter))
91+
7892
if successful:
7993
console.print()
8094
console.print(_summary_table(sizes, depths, token_counts))
@@ -209,3 +223,39 @@ def _failure_table(error_counter: Counter[str], failed: int) -> Table:
209223
if remaining > 0:
210224
table.add_row(f"… and {remaining} more distinct messages", "")
211225
return table
226+
227+
228+
def _truncate(text: str, width: int = _SENTENCE_WIDTH) -> str:
    """Return *text* flattened to one line and cut to at most *width* characters.

    Every line boundary recognised by ``str.splitlines`` (LF, CRLF, CR and the
    other Unicode line separators) is replaced by a single space, then leading
    and trailing whitespace is stripped. If the result is longer than *width*
    it is shortened to ``width - 1`` characters followed by an ellipsis.

    Args:
        text: The sentence to prepare for display.
        width: Maximum length of the returned string. A non-positive width
            yields an empty string.

    Returns:
        The single-line, possibly truncated text.
    """
    # splitlines() also covers "\r\n" and bare "\r"; replacing only "\n"
    # would leave a carriage return inside the rendered cell.
    flattened = " ".join(text.splitlines()).strip()
    if width <= 0:
        # text[: width - 1] would slice from the end and return more than
        # `width` characters, so handle the degenerate case explicitly.
        return ""
    if len(flattened) <= width:
        return flattened
    return flattened[: width - 1] + "…"
231+
232+
233+
def _duplicates_table(text_counter: Counter[str], total_texts: int) -> Table:
    """Build the headerless summary table describing repeated sentences.

    Args:
        text_counter: Occurrence count for each distinct sentence text.
        total_texts: Total number of sentences counted (the sum of all
            occurrence counts), used as the denominator for the "unique"
            and "redundant" ratios.

    Returns:
        A table with four rows: total sentences, unique sentences, sentences
        that occur more than once, and the number of surplus copies (every
        occurrence beyond the first of a repeated sentence). Ratios are
        rendered by ``_pct``.
    """
    # Counts of the sentences that appear more than once.
    repeat_counts = [n for n in text_counter.values() if n > 1]

    n_unique = len(text_counter)
    n_duplicated = len(repeat_counts)
    # Each repeated sentence contributes (count - 1) surplus copies.
    n_redundant = sum(repeat_counts) - n_duplicated

    summary = Table(title="Duplicate sentences", show_header=False)
    summary.add_column("Metric", style="bold")
    summary.add_column("Value")

    rows = (
        ("Total sentences", str(total_texts)),
        ("Unique sentences", f"{n_unique} ({_pct(n_unique, total_texts)})"),
        (
            "Duplicated sentences",
            f"{n_duplicated} ({_pct(n_duplicated, n_unique)} of unique)",
        ),
        ("Redundant copies", f"{n_redundant} ({_pct(n_redundant, total_texts)})"),
    )
    for label, value in rows:
        summary.add_row(label, value)
    return summary
249+
250+
251+
def _top_duplicates_table(text_counter: Counter[str]) -> Table:
    """Build the table listing the most frequently repeated sentences.

    Args:
        text_counter: Occurrence count for each distinct sentence text.

    Returns:
        A two-column table (count, sentence) with at most ``_TOP_DUPLICATES``
        rows, ordered as ``Counter.most_common`` returns them and restricted
        to sentences seen more than once. Sentences are shortened with
        ``_truncate``. When more repeated sentences exist than are listed,
        a final row states how many were left out.
    """
    # most_common() yields (text, count) pairs, highest count first.
    repeated = [pair for pair in text_counter.most_common() if pair[1] > 1]
    overflow = len(repeated) - _TOP_DUPLICATES

    listing = Table(title="Most duplicated sentences")
    listing.add_column("Count", justify="right")
    listing.add_column("Sentence")

    for sentence, times in repeated[:_TOP_DUPLICATES]:
        listing.add_row(str(times), _truncate(sentence))

    if overflow > 0:
        listing.add_row("", f"… and {overflow} more duplicated sentences")
    return listing

0 commit comments

Comments
 (0)