Skip to content

Commit d3e9114

Browse files
committed
dedup command
1 parent 701f8f8 commit d3e9114

1 file changed

Lines changed: 77 additions & 0 deletions

File tree

src/hyperbase/cli/dedup.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import argparse
2+
import contextlib
3+
import json
4+
import os
5+
import sys
6+
import tempfile
7+
from pathlib import Path
8+
9+
10+
def run_dedup(args: argparse.Namespace) -> None:
    """Remove duplicate sentences from a JSONL parse-results file.

    Keeps the first occurrence of each distinct ``text`` (exact match, the same
    key used by ``stats`` duplicate detection) and drops the rest. Writes either
    to ``-o/--output`` or back over the input with ``--in-place``.

    Args:
        args: Parsed CLI arguments. Reads ``args.file`` (input path) and the
            optional ``args.output`` (destination path) and ``args.in_place``
            (overwrite the input); exactly one of the latter two must be set.

    Raises:
        SystemExit: With status 1 if the input file does not exist or the
            ``output`` / ``in_place`` combination is invalid.

    Blank lines are skipped silently. Lines that are not valid JSON, have no
    ``text`` key, or whose ``text`` is not usable as a set key (a JSON list or
    object) are counted as malformed and dropped. A summary is printed to
    stderr.
    """
    path = Path(args.file).expanduser()
    if not path.is_file():
        print(f"Error: file not found: {path}", file=sys.stderr)
        sys.exit(1)

    in_place = bool(getattr(args, "in_place", False))
    output = getattr(args, "output", None)
    if in_place and output:
        print("Error: use either -o/--output or --in-place, not both", file=sys.stderr)
        sys.exit(1)
    if not in_place and not output:
        print("Error: specify -o/--output <path> or --in-place", file=sys.stderr)
        sys.exit(1)

    dest = path if in_place else Path(output).expanduser()
    dest.parent.mkdir(parents=True, exist_ok=True)

    # mkstemp creates its file readable/writable by the owner only. Remember
    # the permission bits of a destination we are about to replace (always the
    # case for --in-place) so the rewrite does not silently change them.
    dest_mode = dest.stat().st_mode & 0o7777 if dest.is_file() else None

    total = kept = duplicates = malformed = 0
    seen: set[str] = set()

    # Stream to a temp file in the destination directory, then atomically
    # replace — so --in-place never reads and overwrites the same file at once,
    # and an interrupted run never leaves a half-written output.
    fd, tmp_name = tempfile.mkstemp(dir=dest.parent, suffix=".tmp")
    try:
        # Wrap the raw descriptor first so it is closed even if opening the
        # input fails.
        with (
            os.fdopen(fd, "w", encoding="utf-8") as fout,
            open(path, encoding="utf-8") as fin,
        ):
            for line in fin:
                stripped = line.strip()
                if not stripped:
                    continue
                total += 1
                # Key on the raw ``text`` field only. We deliberately do NOT
                # reconstruct a ParseResult: that would run hedge() on the edge
                # string and drop an otherwise-valid line whose edge fails to
                # parse. Kept lines are written as read, minus surrounding
                # whitespace, with a single "\n" terminator.
                try:
                    text = json.loads(stripped)["text"]
                    # The membership test lives inside the try because an
                    # unhashable ``text`` (JSON list/object) raises TypeError
                    # here; that is a malformed record, not a fatal error.
                    is_duplicate = text in seen
                except (ValueError, KeyError, TypeError, RecursionError):
                    malformed += 1
                    continue
                if is_duplicate:
                    duplicates += 1
                    continue
                seen.add(text)
                kept += 1
                fout.write(stripped + "\n")
        if dest_mode is not None:
            # Best-effort: failing to restore the mode should not discard an
            # otherwise complete result.
            with contextlib.suppress(OSError):
                os.chmod(tmp_name, dest_mode)
        os.replace(tmp_name, dest)
    except BaseException:
        with contextlib.suppress(OSError):
            os.unlink(tmp_name)
        raise

    print(f"Total lines:    {total}", file=sys.stderr)
    print(f"Kept (unique):  {kept}", file=sys.stderr)
    print(f"Duplicates:     {duplicates}", file=sys.stderr)
    if malformed:
        print(f"Malformed:      {malformed} (dropped)", file=sys.stderr)
    print(f"Output:         {dest}", file=sys.stderr)

0 commit comments

Comments
 (0)