Skip to content

Commit 7fd0fe3

Browse files
authored
Add --target-uncompressed-size to the rechunk CLI command (#100)
1 parent 338dcf4 commit 7fd0fe3

5 files changed

Lines changed: 196 additions & 13 deletions

File tree

tests/test_cli.py

Lines changed: 66 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ def test_invalid_zarr_backend_storage_is_rejected(command, command_args):
170170
[
171171
["append", "only-one-store"],
172172
["normalise", "left", "right"],
173-
["rechunk", "store", "variant_contig"],
174173
["remove", "store"],
175174
["copy-store-to-icechunk", "only-one-store"],
176175
],
@@ -228,15 +227,22 @@ def test_remove_cli_updates_vcz_store(tmp_path):
228227
check_removed_sample(vcz, "NA00002")
229228

230229

231-
def test_rechunk_cli_passes_arguments(monkeypatch):
232-
seen = {}
230+
def fake_rechunk(
    vcz,
    variants_array_name,
    variants_chunk_size=None,
    *,
    target_uncompressed_size_bytes=None,
    backend_storage=None,
):
    """Stand-in for the rechunk function that records how it was called.

    The received arguments are stored on the function object itself as
    ``fake_rechunk.seen`` so a test can inspect them after invoking the CLI.
    Each call overwrites the previous record.
    """
    positional = (vcz, variants_array_name, variants_chunk_size)
    fake_rechunk.seen = dict(
        args=positional,
        target_uncompressed_size_bytes=target_uncompressed_size_bytes,
        backend_storage=backend_storage,
    )
233243

234-
def fake_rechunk(
235-
vcz, variants_array_name, variants_chunk_size, *, backend_storage=None
236-
):
237-
seen["args"] = (vcz, variants_array_name, variants_chunk_size)
238-
seen["backend_storage"] = backend_storage
239244

245+
def test_rechunk_cli_passes_arguments(monkeypatch):
240246
monkeypatch.setattr(cli, "rechunk_function", fake_rechunk)
241247

242248
runner = ct.CliRunner()
@@ -247,12 +253,63 @@ def fake_rechunk(
247253
)
248254

249255
assert result.exit_code == 0
250-
assert seen == {
256+
assert fake_rechunk.seen == {
251257
"args": ("store", "variant_contig", 4),
258+
"target_uncompressed_size_bytes": None,
252259
"backend_storage": "icechunk",
253260
}
254261

255262

263+
def test_rechunk_cli_passes_target_uncompressed_size(monkeypatch):
    """--target-uncompressed-size is parsed to bytes and forwarded to rechunk."""
    monkeypatch.setattr(cli, "rechunk_function", fake_rechunk)
    argv = [
        "rechunk",
        "--target-uncompressed-size",
        "100MB",
        "store",
        "variant_contig",
    ]

    result = ct.CliRunner().invoke(cli.vczstore_main, argv, catch_exceptions=False)

    # No positional chunk size was given, so it is forwarded as None.
    expected_call = {
        "args": ("store", "variant_contig", None),
        "target_uncompressed_size_bytes": 100_000_000,
        "backend_storage": None,
    }
    assert result.exit_code == 0
    assert fake_rechunk.seen == expected_call
279+
280+
281+
def test_rechunk_cli_rejects_neither_size_option():
    """Omitting both the chunk size and the byte target is a usage error."""
    argv = ["rechunk", "store", "variant_contig"]

    result = ct.CliRunner().invoke(cli.vczstore_main, argv)

    expected_message = (
        "Must specify either VARIANTS_CHUNK_SIZE or --target-uncompressed-size"
    )
    assert result.exit_code == 2
    assert expected_message in result.output
290+
291+
292+
def test_rechunk_cli_rejects_both_size_options():
    """Giving both the chunk size and the byte target is a usage error."""
    option = ["--target-uncompressed-size", "100MB"]
    positionals = ["store", "variant_contig", "4"]
    argv = ["rechunk", *option, *positionals]

    result = ct.CliRunner().invoke(cli.vczstore_main, argv)

    expected_message = (
        "Cannot specify both VARIANTS_CHUNK_SIZE and --target-uncompressed-size"
    )
    assert result.exit_code == 2
    assert expected_message in result.output
311+
312+
256313
@pytest.mark.parametrize(
257314
("backend_storage", "zarr_format"),
258315
[(None, None), ("icechunk", 3)],

tests/test_rechunk.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
import numpy as np
12
import pytest
3+
import zarr
24
from vcztools.utils import open_zarr
35

46
from tests.utils import make_vcz
5-
from vczstore.rechunk import rechunk
7+
from vczstore.rechunk import compute_variants_chunk_size_from_target, rechunk
68

79

810
def make_simple_vcz(variants_chunk_size=2):
@@ -48,3 +50,44 @@ def test_rechunk_not_multiple_of_min_chunk_size():
4850
vcz = make_simple_vcz()
4951
with pytest.raises(ValueError, match="not an exact multiple"):
5052
rechunk(vcz, "variant_contig", 3)
53+
54+
55+
def test_rechunk_with_target_uncompressed_size():
    """Rechunking by byte target picks the chunk size for the named array."""
    vcz = make_simple_vcz()

    rechunk(vcz, "variant_contig", target_uncompressed_size_bytes=16)

    root = open_zarr(vcz)
    # variant_contig is int32 (4 bytes), 1D: 16 bytes // 4 = 4 variants, which
    # is already a multiple of 2, so it becomes 4. call_genotype must stay at 2.
    expected_first_chunk = {"variant_contig": 4, "call_genotype": 2}
    for array_name, chunk_len in expected_first_chunk.items():
        assert root[array_name].chunks[0] == chunk_len
65+
66+
67+
@pytest.mark.parametrize(
    ("dtype", "extra_chunks", "target_bytes", "min_chunk_size", "expected"),
    [
        # 1D int32 (4 bytes per variant): 16 // 4 = 4, already a multiple of 2
        (np.int32, (), 16, 2, 4),
        # 1D int32: 20 // 4 = 5, rounded down to a multiple of 2 -> 4
        (np.int32, (), 20, 2, 4),
        # 1D int32: target smaller than one variant -> clamp to min_chunk_size
        (np.int32, (), 1, 2, 2),
        # 2D int8 with a trailing chunk dim of 3 (3 bytes per variant):
        # 12 // 3 = 4, already a multiple of 2
        (np.int8, (3,), 12, 2, 4),
    ],
)
def test_compute_variants_chunk_size_from_target(
    dtype, extra_chunks, target_bytes, min_chunk_size, expected
):
    """Check the byte-target to chunk-size computation on an in-memory array."""
    group = zarr.open_group(store=zarr.storage.MemoryStore())
    # 10 variants in chunks of 2; every trailing axis spans two chunks.
    full_shape = (10, *(2 * c for c in extra_chunks))
    chunk_shape = (2, *extra_chunks)
    zeros = np.zeros(full_shape, dtype=dtype)
    arr = group.create_array(name="x", data=zeros, chunks=chunk_shape)

    result = compute_variants_chunk_size_from_target(arr, target_bytes, min_chunk_size)

    assert result == expected
    assert result % min_chunk_size == 0

vczstore/cli.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from vczstore.normalise import normalise as normalise_function
77
from vczstore.rechunk import rechunk as rechunk_function
88
from vczstore.remove import remove as remove_function
9+
from vczstore.utils import parse_size
910

1011

1112
class NaturalOrderGroup(click.Group):
@@ -155,18 +156,54 @@ def normalise(
155156
@click.command()
@click.argument("vcz", type=click.Path())
@click.argument("variants_array_name", type=str)
@click.argument(
    "variants_chunk_size", type=click.IntRange(min=1), required=False, default=None
)
@click.option(
    "--target-uncompressed-size",
    type=str,
    default=None,
    help=(
        "Target uncompressed chunk size (e.g. '100MB'). "
        "Alternative to VARIANTS_CHUNK_SIZE."
    ),
)
@verbose
@backend_storage
def rechunk(
    vcz,
    variants_array_name,
    variants_chunk_size,
    target_uncompressed_size,
    verbose,
    backend_storage,
):
    """Rechunk a variants array with a larger variants chunk size that is
    an exact multiple of the min variants chunk size"""
    # Exactly one of VARIANTS_CHUNK_SIZE / --target-uncompressed-size must be
    # given; both error paths exit with click's usage-error status.
    setup_logging(verbose)
    if variants_chunk_size is not None and target_uncompressed_size is not None:
        raise click.UsageError(
            "Cannot specify both VARIANTS_CHUNK_SIZE and --target-uncompressed-size"
        )
    if variants_chunk_size is None and target_uncompressed_size is None:
        raise click.UsageError(
            "Must specify either VARIANTS_CHUNK_SIZE or --target-uncompressed-size"
        )
    target_bytes = None
    if target_uncompressed_size is not None:
        try:
            target_bytes = parse_size(target_uncompressed_size)
        except (ValueError, OverflowError) as e:
            # OverflowError: an infinite value such as "infB" cannot be
            # converted to int; report it as a bad parameter, not a traceback.
            raise click.BadParameter(
                f"Cannot parse size: {target_uncompressed_size!r}",
                param_hint="'--target-uncompressed-size'",
            ) from e
        if target_bytes <= 0:
            # A zero or negative byte budget is meaningless; reject it here
            # rather than pass it on to the rechunk function.
            raise click.BadParameter(
                f"Size must be positive: {target_uncompressed_size!r}",
                param_hint="'--target-uncompressed-size'",
            )
    call_or_error(
        rechunk_function,
        vcz,
        variants_array_name,
        variants_chunk_size,
        target_uncompressed_size_bytes=target_bytes,
        backend_storage=backend_storage,
    )
172209

vczstore/rechunk.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import math
2+
13
from bio2zarr.zarr_utils import create_group_array, get_compressor_config
24
from vcztools.utils import array_dims, open_zarr
35

@@ -11,8 +13,9 @@
1113
def rechunk(
1214
vcz,
1315
variants_array_name,
14-
variants_chunk_size,
16+
variants_chunk_size=None,
1517
*,
18+
target_uncompressed_size_bytes=None,
1619
backend_storage=None,
1720
):
1821
"""Rechunk a variants array with a larger variants chunk size that is
@@ -30,6 +33,12 @@ def rechunk(
3033
)
3134

3235
min_chunk_size = compute_min_variants_chunk_size(root)
36+
37+
if target_uncompressed_size_bytes is not None:
38+
variants_chunk_size = compute_variants_chunk_size_from_target(
39+
arr, target_uncompressed_size_bytes, min_chunk_size
40+
)
41+
3342
if variants_chunk_size % min_chunk_size != 0:
3443
raise ValueError(
3544
f"variants_chunk_size={variants_chunk_size} is not an exact multiple "
@@ -55,3 +64,13 @@ def rechunk(
5564
compressor=get_compressor_config(arr),
5665
dimension_names=array_dims(arr),
5766
)
67+
68+
69+
def compute_variants_chunk_size_from_target(
    arr, target_bytes: int, min_chunk_size: int
) -> int:
    """Return a variants chunk size whose uncompressed chunks are close to
    ``target_bytes``.

    The size of one variant within a chunk is taken as the product of the
    array's chunk lengths on every axis after the first, times the dtype's
    item size. The number of variants that fit in ``target_bytes`` is rounded
    down to a multiple of ``min_chunk_size``, and never goes below
    ``min_chunk_size`` itself.
    """
    trailing_chunk_lens = arr.chunks[1:]
    bytes_per_variant = arr.dtype.itemsize * math.prod(trailing_chunk_lens)
    variants_in_budget = target_bytes // bytes_per_variant
    # Drop the remainder so the result is an exact multiple of min_chunk_size.
    aligned = variants_in_budget - variants_in_budget % min_chunk_size
    return aligned if aligned > min_chunk_size else min_chunk_size

vczstore/utils.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,33 @@
1717
)
1818

1919

20+
def parse_size(s: str) -> int:
    """Parse a human-readable size string (e.g. '100MB', '2GiB') to bytes.

    Accepts a number followed by an optional unit suffix, matched
    case-insensitively: decimal units (KB, MB, GB, TB) are powers of 1000,
    binary units (KiB, MiB, GiB, TiB) are powers of 1024, and B is bytes.
    With a suffix the number may be fractional and the result is truncated
    to a whole number of bytes. Without a suffix the string must be an
    integer byte count. Surrounding whitespace is ignored.

    Raises:
        ValueError: if the string cannot be parsed, is not finite, or is
            negative.
    """
    # Suffixes are written in upper case because they are matched against the
    # upper-cased input. Longer suffixes come first so that "MIB" is tried
    # before "B".
    suffixes = [
        ("TIB", 1024**4),
        ("GIB", 1024**3),
        ("MIB", 1024**2),
        ("KIB", 1024),
        ("TB", 1000**4),
        ("GB", 1000**3),
        ("MB", 1000**2),
        ("KB", 1000),
        ("B", 1),
    ]
    text = s.strip()
    upper = text.upper()
    try:
        for suffix, factor in suffixes:
            if upper.endswith(suffix):
                # Slice the stripped text so that trailing whitespace in the
                # input cannot shift where the number ends.
                num = text[: len(text) - len(suffix)].strip()
                size = int(float(num) * factor)
                break
        else:
            size = int(text)
    except (ValueError, OverflowError) as e:
        # OverflowError comes from int() of an infinite float such as "infMB".
        raise ValueError(f"Cannot parse size: {s!r}") from e
    if size < 0:
        raise ValueError(f"Size must not be negative: {s!r}")
    return size
45+
46+
2047
def missing_val(arr):
2148
if arr.dtype.kind == "i":
2249
return INT_MISSING

0 commit comments

Comments
 (0)