Skip to content

Commit ba412b2

Browse files
committed
adding dedup test
1 parent d3e9114 commit ba412b2

1 file changed

Lines changed: 129 additions & 0 deletions

File tree

tests/test_cli_dedup.py

Lines changed: 129 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,129 @@
1+
import argparse
2+
import json
3+
4+
import pytest
5+
6+
from hyperbase.builders import hedge
7+
from hyperbase.cli.dedup import run_dedup
8+
from hyperbase.parsers.result import ParseResult
9+
10+
11+
def _line(text: str, failed: bool = False) -> str:
    """Return the JSON form of a synthetic ``ParseResult`` built around *text*.

    The edge and ``tok_pos`` are always the same placeholder hyperedge, so
    records produced here differ only in ``text``, its whitespace-split
    tokens, and the failure fields.

    Args:
        text: Sentence stored in the record; also split on whitespace to
            produce the token list.
        failed: When true, the record is flagged as failed and carries a
            single ``"no valid parse"`` error; otherwise the error list is
            empty.

    Returns:
        Whatever ``ParseResult.to_json()`` yields for the record.
    """
    error_list = ["no valid parse"] if failed else []
    result = ParseResult(
        edge=hedge("(is/P a/C b/C)"),
        text=text,
        tokens=text.split(),
        tok_pos=hedge("(is/P a/C b/C)"),
        failed=failed,
        errors=error_list,
    )
    return result.to_json()
22+
23+
24+
def _write(path, lines: list[str]) -> str:
25+
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
26+
return str(path)
27+
28+
29+
def _ns(file, output=None, in_place=False) -> argparse.Namespace:
30+
return argparse.Namespace(file=file, output=output, in_place=in_place)
31+
32+
33+
def test_removes_duplicates_preserving_first_occurrence_and_order(tmp_path, capsys):
    """Repeated records are dropped and the survivors keep first-seen order.

    Also checks the summary counters that ``run_dedup`` prints to stderr.
    """
    sentences = (
        "the cat sat",
        "hello world",
        "the cat sat",
        "a third one",
        "hello world",
    )
    src = _write(tmp_path / "in.jsonl", [_line(sentence) for sentence in sentences])
    out = tmp_path / "out.jsonl"
    run_dedup(_ns(src, output=str(out)))

    # Recover the text of every record that made it into the output file.
    kept = []
    for raw in out.read_text(encoding="utf-8").splitlines():
        kept.append(json.loads(raw)["text"])
    assert kept == ["the cat sat", "hello world", "a third one"]

    stderr = capsys.readouterr().err
    for fragment in ("Total lines: 5", "Kept (unique): 3", "Duplicates: 2"):
        assert fragment in stderr
56+
57+
58+
def test_in_place_rewrites_input(tmp_path, capsys):
    """With ``in_place=True`` the deduplicated result replaces the input file."""
    target = tmp_path / "data.jsonl"
    src = _write(target, [_line("one"), _line("one"), _line("two")])
    run_dedup(_ns(src, in_place=True))

    # Three records went in, one of them a repeat, so two must remain.
    remaining = target.read_text(encoding="utf-8").splitlines()
    assert len(remaining) == 2

    stderr = capsys.readouterr().err
    for fragment in ("Kept (unique): 2", "Duplicates: 1"):
        assert fragment in stderr
70+
71+
72+
def test_already_unique_file_is_byte_identical(tmp_path):
    """A file with no repeats is copied to the output without any change.

    The comparison is made on the UTF-8 decoded text of both files.
    """
    src_path = tmp_path / "uniq.jsonl"
    _write(src_path, [_line(word) for word in ("alpha", "beta", "gamma")])
    expected = src_path.read_text(encoding="utf-8")

    out = tmp_path / "out.jsonl"
    args = _ns(str(src_path), output=str(out))
    run_dedup(args)

    produced = out.read_text(encoding="utf-8")
    assert produced == expected
80+
81+
82+
def test_malformed_line_dropped_and_reported(tmp_path, capsys):
    """An unparsable JSON line is left out of the output and counted on stderr."""
    src_path = tmp_path / "in.jsonl"
    # Two valid records with one broken JSON line between them.
    content = "".join(
        (
            _line("good one"),
            "\n",
            "{bad json\n",
            _line("good two"),
            "\n",
        )
    )
    src_path.write_text(content, encoding="utf-8")

    out = tmp_path / "out.jsonl"
    run_dedup(_ns(str(src_path), output=str(out)))

    surviving = out.read_text(encoding="utf-8").splitlines()
    assert len(surviving) == 2

    stderr = capsys.readouterr().err
    assert "Malformed: 1 (dropped)" in stderr
95+
96+
97+
def test_failed_parse_duplicates_collapse(tmp_path, capsys):
    """Two identical failed-parse records are reduced to a single one."""
    failed_records = [_line("garble", failed=True) for _ in range(2)]
    src = _write(tmp_path / "in.jsonl", failed_records)

    out = tmp_path / "out.jsonl"
    run_dedup(_ns(src, output=str(out)))

    written = out.read_text(encoding="utf-8").splitlines()
    assert len(written) == 1

    stderr = capsys.readouterr().err
    assert "Duplicates: 1" in stderr
106+
107+
108+
def test_neither_output_nor_in_place_exits(tmp_path, capsys):
    """Giving neither an output path nor ``in_place`` exits with status 1."""
    src = _write(tmp_path / "in.jsonl", [_line("x")])
    args = _ns(src)

    # Only the call under test sits inside the ``raises`` block.
    with pytest.raises(SystemExit) as excinfo:
        run_dedup(args)

    assert excinfo.value.code == 1
    stderr = capsys.readouterr().err
    assert "specify -o/--output" in stderr
114+
115+
116+
def test_both_output_and_in_place_exits(tmp_path, capsys):
    """Giving an output path together with ``in_place`` exits with status 1."""
    src = _write(tmp_path / "in.jsonl", [_line("x")])
    out_path = str(tmp_path / "out.jsonl")
    args = _ns(src, output=out_path, in_place=True)

    # Only the call under test sits inside the ``raises`` block.
    with pytest.raises(SystemExit) as excinfo:
        run_dedup(args)

    assert excinfo.value.code == 1
    stderr = capsys.readouterr().err
    assert "not both" in stderr
122+
123+
124+
def test_missing_file_exits(tmp_path, capsys):
    """Pointing ``run_dedup`` at a non-existent input exits with status 1."""
    missing = tmp_path / "nope.jsonl"  # never created
    out_path = str(tmp_path / "out.jsonl")
    args = _ns(str(missing), output=out_path)

    # Only the call under test sits inside the ``raises`` block.
    with pytest.raises(SystemExit) as excinfo:
        run_dedup(args)

    assert excinfo.value.code == 1
    stderr = capsys.readouterr().err
    assert "file not found" in stderr

0 commit comments

Comments
 (0)