Skip to content

Commit ece0141

Browse files
committed
shuffle cli command
1 parent ba412b2 commit ece0141

6 files changed

Lines changed: 257 additions & 55 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
- built-in REPL setting `check_badness` to render a badness panel after each parse, available regardless of the active parser plugin.
1616
- parser plugin settings have an optional flag to indicate that no parser reload is needed; REPL adjusted to accommodate this mechanism.
1717
- functional pattern `deep`.
18-
- CLI `stats` and `dedup` commands.
18+
- CLI `stats`, `dedup`, and `shuffle` commands.
1919

2020
### Changed
2121

src/hyperbase/cli/__init__.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,35 @@ def main() -> None:
211211
help="Overwrite the input file in place",
212212
)
213213

214+
# --- shuffle subcommand ------------------------------------------------
215+
shuffle_parser = subparsers.add_parser(
216+
"shuffle",
217+
help="Randomly shuffle the lines of a JSONL parse-results file",
218+
)
219+
shuffle_parser.add_argument(
220+
"file",
221+
type=str,
222+
help="Path to a .jsonl parse-results file",
223+
)
224+
shuffle_parser.add_argument(
225+
"-o",
226+
"--output",
227+
type=str,
228+
default=None,
229+
help="Output .jsonl path (required unless --in-place)",
230+
)
231+
shuffle_parser.add_argument(
232+
"--in-place",
233+
action="store_true",
234+
help="Overwrite the input file in place",
235+
)
236+
shuffle_parser.add_argument(
237+
"--seed",
238+
type=int,
239+
default=None,
240+
help="Random seed for a reproducible shuffle (default: random)",
241+
)
242+
214243
# Dynamically inject parser-specific args, derived from the active
215244
# parser's ``accepted_params()``. We do this in two passes so that
216245
# plugin packages stay the source of truth for their CLI surface.
@@ -254,6 +283,12 @@ def main() -> None:
254283
run_dedup(args)
255284
sys.exit(0)
256285

286+
if args.command == "shuffle":
287+
from hyperbase.cli.shuffle import run_shuffle
288+
289+
run_shuffle(args)
290+
sys.exit(0)
291+
257292
if args.command == "repl":
258293
from hyperbase.cli.repl import run_repl
259294

src/hyperbase/cli/_io.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import argparse
2+
import contextlib
3+
import os
4+
import sys
5+
import tempfile
6+
from collections.abc import Iterator
7+
from contextlib import contextmanager
8+
from pathlib import Path
9+
from typing import IO
10+
11+
12+
def resolve_io(args: argparse.Namespace) -> tuple[Path, Path]:
    """Check a transform command's I/O arguments and build its two paths.

    Common entry point for commands that read one parses file and write
    another (``dedup``, ``shuffle``).

    Args:
        args: Parsed CLI namespace. ``file`` is required; ``output`` and
            ``in_place`` are read with ``getattr`` and default to ``None``
            and ``False`` when absent.

    Returns:
        ``(src, dest)`` with ``~`` expanded. ``dest`` is ``src`` itself when
        ``--in-place`` was requested. The parent directory of ``dest`` is
        created if it does not exist yet.

    Raises:
        SystemExit: with code 1, after printing a message to stderr, when
            the input is not an existing file, or when ``-o/--output`` and
            ``--in-place`` are both given or both missing.
    """
    src = Path(args.file).expanduser()
    if not src.is_file():
        print(f"Error: file not found: {src}", file=sys.stderr)
        sys.exit(1)

    output = getattr(args, "output", None)
    in_place = bool(getattr(args, "in_place", False))

    # Exactly one of the two destinations must be selected.
    problem = None
    if in_place and output:
        problem = "Error: use either -o/--output or --in-place, not both"
    elif not (in_place or output):
        problem = "Error: specify -o/--output <path> or --in-place"
    if problem is not None:
        print(problem, file=sys.stderr)
        sys.exit(1)

    if in_place:
        dest = src
    else:
        dest = Path(output).expanduser()
    dest.parent.mkdir(parents=True, exist_ok=True)
    return src, dest
36+
37+
38+
@contextmanager
def atomic_write(dest: Path) -> Iterator[IO[str]]:
    """Yield a text handle whose contents atomically become *dest* on success.

    Writes go to a temp file created in *dest*'s directory. On clean exit
    the temp file is ``os.replace``-d onto *dest*, which is safe even when
    *dest* is also the input being read (``--in-place``). If the body or the
    final replace raises, the temp file is removed and the exception is
    re-raised, leaving any existing *dest* as it was.

    When *dest* already exists, its permission bits are copied onto the temp
    file before the replace, so overwriting a file does not change its mode.

    Args:
        dest: Final path of the written file. Its parent directory must
            already exist.

    Yields:
        A UTF-8 text-mode file object open for writing.
    """
    fd, tmp_name = tempfile.mkstemp(dir=dest.parent, suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as fout:
            yield fout
        # mkstemp creates the file readable/writable by the owner only;
        # without this an in-place rewrite would silently turn e.g. a 0644
        # file into 0600. Best effort: a missing dest (fresh output) or a
        # platform that rejects chmod simply keeps the mkstemp mode.
        with contextlib.suppress(OSError):
            os.chmod(tmp_name, os.stat(dest).st_mode & 0o7777)
        os.replace(tmp_name, dest)
    except BaseException:
        # Covers KeyboardInterrupt/SystemExit too, so an interrupted run
        # never leaves a stray temp file behind.
        with contextlib.suppress(OSError):
            os.unlink(tmp_name)
        raise

src/hyperbase/cli/dedup.py

Lines changed: 26 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
import argparse
2-
import contextlib
32
import json
4-
import os
53
import sys
6-
import tempfile
7-
from pathlib import Path
4+
5+
from hyperbase.cli._io import atomic_write, resolve_io
86

97

108
def run_dedup(args: argparse.Namespace) -> None:
@@ -14,60 +12,34 @@ def run_dedup(args: argparse.Namespace) -> None:
1412
key used by ``stats`` duplicate detection) and drops the rest. Writes either
1513
to ``-o/--output`` or back over the input with ``--in-place``.
1614
"""
17-
path = Path(args.file).expanduser()
18-
if not path.is_file():
19-
print(f"Error: file not found: {path}", file=sys.stderr)
20-
sys.exit(1)
21-
22-
in_place = bool(getattr(args, "in_place", False))
23-
output = getattr(args, "output", None)
24-
if in_place and output:
25-
print("Error: use either -o/--output or --in-place, not both", file=sys.stderr)
26-
sys.exit(1)
27-
if not in_place and not output:
28-
print("Error: specify -o/--output <path> or --in-place", file=sys.stderr)
29-
sys.exit(1)
30-
31-
dest = path if in_place else Path(output).expanduser()
32-
dest.parent.mkdir(parents=True, exist_ok=True)
15+
src, dest = resolve_io(args)
3316

3417
total = kept = duplicates = malformed = 0
3518
seen: set[str] = set()
3619

37-
# Stream to a temp file in the destination directory, then atomically
38-
# replace — so --in-place never reads and overwrites the same file at once,
39-
# and an interrupted run never leaves a half-written output.
40-
fd, tmp_name = tempfile.mkstemp(dir=dest.parent, suffix=".tmp")
41-
try:
42-
with (
43-
open(path, encoding="utf-8") as fin,
44-
os.fdopen(fd, "w", encoding="utf-8") as fout,
45-
):
46-
for line in fin:
47-
stripped = line.strip()
48-
if not stripped:
49-
continue
50-
total += 1
51-
# Key on the raw ``text`` field only. We deliberately do NOT
52-
# reconstruct a ParseResult: that would run hedge() on the edge
53-
# string and drop an otherwise-valid line whose edge fails to
54-
# parse. Kept lines are written verbatim (byte-lossless).
55-
try:
56-
text = json.loads(stripped)["text"]
57-
except Exception:
58-
malformed += 1
59-
continue
60-
if text in seen:
61-
duplicates += 1
62-
continue
63-
seen.add(text)
64-
kept += 1
65-
fout.write(stripped + "\n")
66-
os.replace(tmp_name, dest)
67-
except BaseException:
68-
with contextlib.suppress(OSError):
69-
os.unlink(tmp_name)
70-
raise
20+
# ``atomic_write`` is the outer context so the input is closed before the
21+
# final os.replace (matters for --in-place, where dest is the input).
22+
with atomic_write(dest) as fout, open(src, encoding="utf-8") as fin:
23+
for line in fin:
24+
stripped = line.strip()
25+
if not stripped:
26+
continue
27+
total += 1
28+
# Key on the raw ``text`` field only. We deliberately do NOT
29+
# reconstruct a ParseResult: that would run hedge() on the edge
30+
# string and drop an otherwise-valid line whose edge fails to
31+
# parse. Kept lines are written verbatim (byte-lossless).
32+
try:
33+
text = json.loads(stripped)["text"]
34+
except Exception:
35+
malformed += 1
36+
continue
37+
if text in seen:
38+
duplicates += 1
39+
continue
40+
seen.add(text)
41+
kept += 1
42+
fout.write(stripped + "\n")
7143

7244
print(f"Total lines: {total}", file=sys.stderr)
7345
print(f"Kept (unique): {kept}", file=sys.stderr)

src/hyperbase/cli/shuffle.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import argparse
2+
import random
3+
import sys
4+
5+
from hyperbase.cli._io import atomic_write, resolve_io
6+
7+
8+
def run_shuffle(args: argparse.Namespace) -> None:
    """Write the non-blank lines of a JSONL parse-results file in random order.

    Each line is handled as opaque text: it is whitespace-stripped and kept
    unless that leaves it empty; nothing is JSON-decoded. The whole input is
    held in memory, shuffled, and written either to ``-o/--output`` or back
    over the input with ``--in-place``. A short summary goes to stderr.

    Args:
        args: Parsed CLI namespace. ``file``, ``output`` and ``in_place`` are
            validated by ``resolve_io``; ``seed`` (optional int) makes the
            ordering reproducible.
    """
    src, dest = resolve_io(args)
    seed = getattr(args, "seed", None)

    with open(src, encoding="utf-8") as fin:
        lines = [entry for entry in map(str.strip, fin) if entry]

    # A ``None`` seed makes Random() seed itself from system entropy, so the
    # seeded and unseeded cases share one code path.
    random.Random(seed).shuffle(lines)

    with atomic_write(dest) as fout:
        fout.writelines(f"{entry}\n" for entry in lines)

    print(f"Lines shuffled: {len(lines)}", file=sys.stderr)
    if seed is not None:
        print(f"Seed: {seed}", file=sys.stderr)
    print(f"Output: {dest}", file=sys.stderr)

tests/test_cli_shuffle.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import argparse
2+
3+
import pytest
4+
5+
from hyperbase.builders import hedge
6+
from hyperbase.cli.shuffle import run_shuffle
7+
from hyperbase.parsers.result import ParseResult
8+
9+
10+
def _line(text: str) -> str:
    """Return the ``to_json()`` form of a minimal ``ParseResult`` for *text*.

    The edge and ``tok_pos`` are the same fixed hyperedge for every call;
    only ``text`` and its whitespace-split ``tokens`` vary, which is enough
    to make each generated line distinct.
    """
    return ParseResult(
        edge=hedge("(is/P a/C b/C)"),
        text=text,
        tokens=text.split(),
        tok_pos=hedge("(is/P a/C b/C)"),
    ).to_json()
19+
20+
21+
def _write(path, lines: list[str]) -> str:
    """Write *lines* to *path*, newline-separated with a trailing newline.

    Returns the path as a ``str``, ready to be used as the CLI ``file``
    argument.
    """
    body = "\n".join(lines)
    path.write_text(f"{body}\n", encoding="utf-8")
    return str(path)
24+
25+
26+
def _ns(file, output=None, in_place=False, seed=None) -> argparse.Namespace:
    """Build the argparse namespace that ``run_shuffle`` receives from the CLI."""
    fields = {
        "file": file,
        "output": output,
        "in_place": in_place,
        "seed": seed,
    }
    return argparse.Namespace(**fields)
28+
29+
30+
def _read(path) -> list[str]:
    """Return the lines of the UTF-8 file at *path*, without line endings."""
    with path.open(encoding="utf-8") as handle:
        return handle.read().splitlines()
32+
33+
34+
def test_shuffle_is_a_permutation(tmp_path, capsys):
    """Shuffling keeps every line exactly once and changes their order."""
    expected = [_line(f"sentence number {i}") for i in range(20)]
    in_file = _write(tmp_path / "in.jsonl", expected)
    out_path = tmp_path / "out.jsonl"

    run_shuffle(_ns(in_file, output=str(out_path), seed=42))
    shuffled = _read(out_path)

    # Nothing added, nothing lost: identical multiset of lines.
    assert len(shuffled) == len(expected)
    assert sorted(shuffled) == sorted(expected)
    # With this fixed seed the 20 distinct lines do not stay in input order.
    assert shuffled != expected

    captured = capsys.readouterr()
    assert "Lines shuffled: 20" in captured.err
47+
48+
49+
def test_seed_is_reproducible(tmp_path):
    """Two runs over the same input with the same seed give the same order."""
    in_file = _write(tmp_path / "in.jsonl", [_line(f"s {i}") for i in range(15)])
    first, second = tmp_path / "a.jsonl", tmp_path / "b.jsonl"

    for target in (first, second):
        run_shuffle(_ns(in_file, output=str(target), seed=7))

    assert _read(first) == _read(second)
57+
58+
59+
def test_in_place_shuffles_input(tmp_path, capsys):
    """``--in-place`` rewrites the input file with the same set of lines."""
    data_file = tmp_path / "data.jsonl"
    original = [_line(f"x {i}") for i in range(12)]
    _write(data_file, original)

    run_shuffle(_ns(str(data_file), in_place=True, seed=3))

    assert sorted(_read(data_file)) == sorted(original)
    # The summary reports both the line count and the seed that was used.
    report = capsys.readouterr().err
    for fragment in ("Lines shuffled: 12", "Seed: 3"):
        assert fragment in report
70+
71+
72+
def test_blank_lines_dropped(tmp_path, capsys):
    """Empty and whitespace-only lines are not carried into the output."""
    in_path = tmp_path / "in.jsonl"
    out_path = tmp_path / "out.jsonl"
    # Three real records separated by an empty line and a spaces-only line.
    payload = "".join(
        [
            _line("one") + "\n",
            "\n",
            _line("two") + "\n",
            " \n",
            _line("three") + "\n",
        ]
    )
    in_path.write_text(payload, encoding="utf-8")

    run_shuffle(_ns(str(in_path), output=str(out_path), seed=1))

    assert len(_read(out_path)) == 3
    assert "Lines shuffled: 3" in capsys.readouterr().err
84+
85+
86+
def test_no_seed_omits_seed_line(tmp_path, capsys):
    """Without ``--seed`` the stderr summary has no ``Seed:`` entry."""
    records = [_line("a"), _line("b")]
    in_file = _write(tmp_path / "in.jsonl", records)
    out_file = str(tmp_path / "out.jsonl")

    run_shuffle(_ns(in_file, output=out_file))

    report = capsys.readouterr().err
    assert "Seed:" not in report
90+
91+
92+
def test_neither_output_nor_in_place_exits(tmp_path, capsys):
    """Omitting both ``-o`` and ``--in-place`` exits with status 1 and a hint."""
    in_file = _write(tmp_path / "in.jsonl", [_line("x")])
    namespace = _ns(in_file)

    with pytest.raises(SystemExit) as excinfo:
        run_shuffle(namespace)

    assert excinfo.value.code == 1
    report = capsys.readouterr().err
    assert "specify -o/--output" in report
98+
99+
100+
def test_missing_file_exits(tmp_path, capsys):
    """A nonexistent input path exits with status 1 and a clear message."""
    namespace = _ns(
        str(tmp_path / "nope.jsonl"),
        output=str(tmp_path / "out.jsonl"),
    )

    with pytest.raises(SystemExit) as excinfo:
        run_shuffle(namespace)

    assert excinfo.value.code == 1
    report = capsys.readouterr().err
    assert "file not found" in report

0 commit comments

Comments
 (0)