|
| 1 | +import argparse |
| 2 | +import contextlib |
| 3 | +import json |
| 4 | +import os |
| 5 | +import sys |
| 6 | +import tempfile |
| 7 | +from pathlib import Path |
| 8 | + |
| 9 | + |
| 10 | +def run_dedup(args: argparse.Namespace) -> None: |
| 11 | + """Remove duplicate sentences from a JSONL parse-results file. |
| 12 | +
|
| 13 | + Keeps the first occurrence of each distinct ``text`` (exact match, the same |
| 14 | + key used by ``stats`` duplicate detection) and drops the rest. Writes either |
| 15 | + to ``-o/--output`` or back over the input with ``--in-place``. |
| 16 | + """ |
| 17 | + path = Path(args.file).expanduser() |
| 18 | + if not path.is_file(): |
| 19 | + print(f"Error: file not found: {path}", file=sys.stderr) |
| 20 | + sys.exit(1) |
| 21 | + |
| 22 | + in_place = bool(getattr(args, "in_place", False)) |
| 23 | + output = getattr(args, "output", None) |
| 24 | + if in_place and output: |
| 25 | + print("Error: use either -o/--output or --in-place, not both", file=sys.stderr) |
| 26 | + sys.exit(1) |
| 27 | + if not in_place and not output: |
| 28 | + print("Error: specify -o/--output <path> or --in-place", file=sys.stderr) |
| 29 | + sys.exit(1) |
| 30 | + |
| 31 | + dest = path if in_place else Path(output).expanduser() |
| 32 | + dest.parent.mkdir(parents=True, exist_ok=True) |
| 33 | + |
| 34 | + total = kept = duplicates = malformed = 0 |
| 35 | + seen: set[str] = set() |
| 36 | + |
| 37 | + # Stream to a temp file in the destination directory, then atomically |
| 38 | + # replace — so --in-place never reads and overwrites the same file at once, |
| 39 | + # and an interrupted run never leaves a half-written output. |
| 40 | + fd, tmp_name = tempfile.mkstemp(dir=dest.parent, suffix=".tmp") |
| 41 | + try: |
| 42 | + with ( |
| 43 | + open(path, encoding="utf-8") as fin, |
| 44 | + os.fdopen(fd, "w", encoding="utf-8") as fout, |
| 45 | + ): |
| 46 | + for line in fin: |
| 47 | + stripped = line.strip() |
| 48 | + if not stripped: |
| 49 | + continue |
| 50 | + total += 1 |
| 51 | + # Key on the raw ``text`` field only. We deliberately do NOT |
| 52 | + # reconstruct a ParseResult: that would run hedge() on the edge |
| 53 | + # string and drop an otherwise-valid line whose edge fails to |
| 54 | + # parse. Kept lines are written verbatim (byte-lossless). |
| 55 | + try: |
| 56 | + text = json.loads(stripped)["text"] |
| 57 | + except Exception: |
| 58 | + malformed += 1 |
| 59 | + continue |
| 60 | + if text in seen: |
| 61 | + duplicates += 1 |
| 62 | + continue |
| 63 | + seen.add(text) |
| 64 | + kept += 1 |
| 65 | + fout.write(stripped + "\n") |
| 66 | + os.replace(tmp_name, dest) |
| 67 | + except BaseException: |
| 68 | + with contextlib.suppress(OSError): |
| 69 | + os.unlink(tmp_name) |
| 70 | + raise |
| 71 | + |
| 72 | + print(f"Total lines: {total}", file=sys.stderr) |
| 73 | + print(f"Kept (unique): {kept}", file=sys.stderr) |
| 74 | + print(f"Duplicates: {duplicates}", file=sys.stderr) |
| 75 | + if malformed: |
| 76 | + print(f"Malformed: {malformed} (dropped)", file=sys.stderr) |
| 77 | + print(f"Output: {dest}", file=sys.stderr) |
0 commit comments