Skip to content

Commit e447ed9

Browse files
committed
Add a --zarr-format CLI option
1 parent 4b50f88 commit e447ed9

6 files changed

Lines changed: 69 additions & 42 deletions

File tree

bio2zarr/cli.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,13 @@ def list_commands(self, ctx):
4343
"zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
4444
)
4545

46+
zarr_format = click.option(
47+
"--zarr-format",
48+
type=click.Choice([2, 3]),
49+
default=None,
50+
help="Zarr format version of output",
51+
)
52+
4653
num_partitions = click.option(
4754
"-n",
4855
"--num-partitions",
@@ -500,6 +507,7 @@ def dencode_finalise(zarr_path, verbose, progress):
500507
@click.command(name="convert")
501508
@vcfs
502509
@new_zarr_path
510+
@zarr_format
503511
@force
504512
@variants_chunk_size
505513
@samples_chunk_size
@@ -510,6 +518,7 @@ def dencode_finalise(zarr_path, verbose, progress):
510518
def convert_vcf(
511519
vcfs,
512520
zarr_path,
521+
zarr_format,
513522
force,
514523
variants_chunk_size,
515524
samples_chunk_size,
@@ -526,6 +535,7 @@ def convert_vcf(
526535
vcf_mod.convert(
527536
vcfs,
528537
zarr_path,
538+
zarr_format=zarr_format,
529539
variants_chunk_size=variants_chunk_size,
530540
samples_chunk_size=samples_chunk_size,
531541
show_progress=progress,

bio2zarr/vcf.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1654,6 +1654,7 @@ def convert(
16541654
vcz_path=None,
16551655
*,
16561656
mode="r",
1657+
zarr_format=None,
16571658
variants_chunk_size=None,
16581659
samples_chunk_size=None,
16591660
worker_processes=core.DEFAULT_WORKER_PROCESSES,
@@ -1719,6 +1720,7 @@ def convert(
17191720
icf_path,
17201721
vcz_path,
17211722
mode=mode,
1723+
zarr_format=zarr_format,
17221724
variants_chunk_size=variants_chunk_size,
17231725
samples_chunk_size=samples_chunk_size,
17241726
worker_processes=worker_processes,
@@ -1747,6 +1749,7 @@ def encode(
17471749
worker_processes=core.DEFAULT_WORKER_PROCESSES,
17481750
show_progress=False,
17491751
mode="r",
1752+
zarr_format=None,
17501753
):
17511754
with vcz.open_zarr(zarr_path, mode=mode) as zr:
17521755
# Rough heuristic to split work up enough to keep utilisation high
@@ -1761,6 +1764,7 @@ def encode(
17611764
local_alleles=local_alleles,
17621765
ploidy=ploidy,
17631766
max_variant_chunks=max_variant_chunks,
1767+
zarr_format=zarr_format,
17641768
)
17651769
vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zr.dir)
17661770
vzw.encode_all_partitions(
@@ -1787,6 +1791,7 @@ def encode_init(
17871791
max_memory=None,
17881792
worker_processes=core.DEFAULT_WORKER_PROCESSES,
17891793
show_progress=False,
1794+
zarr_format=None,
17901795
):
17911796
icf_store = IntermediateColumnarFormat(icf_path)
17921797
if schema_path is None:
@@ -1805,7 +1810,7 @@ def encode_init(
18051810
with open(schema_path) as f:
18061811
schema_instance = vcz.VcfZarrSchema.fromjson(f.read())
18071812
zarr_path = pathlib.Path(zarr_path)
1808-
vzw = vcz.VcfZarrWriter("icf", zarr_path)
1813+
vzw = vcz.VcfZarrWriter("icf", zarr_path, zarr_format=zarr_format)
18091814
return vzw.init(
18101815
icf_store,
18111816
target_num_partitions=target_num_partitions,

bio2zarr/vcz.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -618,9 +618,10 @@ def encode(
618618

619619

620620
class VcfZarrWriter:
621-
def __init__(self, source_type, path):
621+
def __init__(self, source_type, path, zarr_format=None):
622622
self.source_type = source_type
623623
self.path = pathlib.Path(path)
624+
self.zarr_format = zarr_format or zarr_utils.ZARR_FORMAT
624625
self.wip_path = self.path / "wip"
625626
self.arrays_path = self.wip_path / "arrays"
626627
self.partitions_path = self.wip_path / "partitions"
@@ -679,7 +680,10 @@ def init(
679680
)
680681

681682
self.path.mkdir()
682-
root = zarr.open(store=self.path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS)
683+
zarr_format_kwargs = dict(
684+
zarr_format=self.zarr_format or zarr_utils.ZARR_FORMAT
685+
)
686+
root = zarr.open(store=self.path, mode="a", **zarr_format_kwargs)
683687
root.attrs.update(
684688
{
685689
"vcf_zarr_version": "0.4",
@@ -698,9 +702,7 @@ def init(
698702
self.wip_path.mkdir()
699703
self.arrays_path.mkdir()
700704
self.partitions_path.mkdir()
701-
root = zarr.open(
702-
store=self.arrays_path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS
703-
)
705+
root = zarr.open(store=self.arrays_path, mode="a", **zarr_format_kwargs)
704706

705707
total_chunks = 0
706708
for field in self.schema.fields:
@@ -788,7 +790,7 @@ def init_array(self, root, schema, array_spec, variants_dim_size):
788790
else schema.defaults["compressor"]
789791
)
790792

791-
kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
793+
kwargs = {}
792794
# see https://github.com/zarr-developers/zarr-python/issues/3197
793795
kwargs["fill_value"] = None
794796

@@ -1050,7 +1052,9 @@ def finalise_array(self, name):
10501052
if not src.exists():
10511053
# Needs test
10521054
raise ValueError(f"Partition {partition} of {name} does not exist")
1053-
zarr_utils.move_chunks(src, self.arrays_path, partition, name)
1055+
zarr_utils.move_chunks(
1056+
src, self.arrays_path, partition, name, self.zarr_format
1057+
)
10541058
# Finally, once all the chunks have moved into the arrays dir,
10551059
# we move it out of wip
10561060
os.rename(self.arrays_path / name, self.path / name)

bio2zarr/zarr_utils.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
ZARR_FORMAT = 2
2626

2727

28-
ZARR_FORMAT_KWARGS = dict(zarr_format=ZARR_FORMAT)
2928
# In zarr-python v3 strings are stored as string arrays (T) with itemsize 16
3029
STRING_DTYPE_NAME = "T"
3130
STRING_ITEMSIZE = 16
@@ -63,7 +62,7 @@
6362
}
6463

6564

66-
def make_compressor(config):
65+
def make_compressor(config, zarr_format):
6766
"""Build a format-correct compressor from a numcodecs-style config dict.
6867
6968
Returns a numcodecs codec under ZARR_FORMAT==2 and a zarr v3 codec under
@@ -74,7 +73,7 @@ def make_compressor(config):
7473
raise NotImplementedError(
7574
f"Only blosc compressors are supported, got {config!r}"
7675
)
77-
if ZARR_FORMAT == 2:
76+
if zarr_format == 2:
7877
return numcodecs.get_codec(config)
7978
return BloscCodec(
8079
cname=config["cname"],
@@ -123,11 +122,13 @@ def create_group_array(
123122
"""Create an array within a group."""
124123
new_kwargs = {**kwargs}
125124
if compressor is not None:
126-
new_kwargs["compressors"] = [make_compressor(compressor)]
125+
new_kwargs["compressors"] = [
126+
make_compressor(compressor, group.metadata.zarr_format)
127+
]
127128

128129
# Zarr format v2 rejects dimension_names on create_array; we instead
129130
# write the xarray _ARRAY_DIMENSIONS attribute after the fact.
130-
if ZARR_FORMAT == 3:
131+
if group.metadata.zarr_format == 3:
131132
new_kwargs.pop("zarr_format", None)
132133
new_kwargs["dimension_names"] = dimension_names
133134

@@ -139,7 +140,7 @@ def create_group_array(
139140
)
140141
else:
141142
array = group.create_array(name, shape=shape, dtype=dtype, **new_kwargs)
142-
if ZARR_FORMAT == 2 and dimension_names is not None:
143+
if group.metadata.zarr_format == 2 and dimension_names is not None:
143144
array.attrs["_ARRAY_DIMENSIONS"] = dimension_names
144145
return array
145146

@@ -160,8 +161,10 @@ def create_empty_group_array(
160161
new_kwargs = {**kwargs}
161162
new_kwargs.pop("zarr_format", None)
162163
if compressor is not None:
163-
new_kwargs["compressors"] = [make_compressor(compressor)]
164-
if ZARR_FORMAT == 2:
164+
new_kwargs["compressors"] = [
165+
make_compressor(compressor, group.metadata.zarr_format)
166+
]
167+
if group.metadata.zarr_format == 2:
165168
# Filter list is interpreted as numcodecs-style config dicts and
166169
# converted here so callers don't need to import numcodecs.
167170
# Zarr v2 also requires a VLenUTF8 filter to accompany "T"-dtype
@@ -184,7 +187,7 @@ def create_empty_group_array(
184187
array = group.create_array(
185188
name=name, shape=shape, dtype=dtype, chunks=chunks, **new_kwargs
186189
)
187-
if ZARR_FORMAT == 2 and dimension_names is not None:
190+
if group.metadata.zarr_format == 2 and dimension_names is not None:
188191
array.attrs["_ARRAY_DIMENSIONS"] = dimension_names
189192
return array
190193

@@ -210,10 +213,10 @@ def get_compressor_config(array):
210213
raise TypeError(f"Unsupported compressor type: {type(compressor).__name__}")
211214

212215

213-
def move_chunks(src_path, dest_path, partition, name):
216+
def move_chunks(src_path, dest_path, partition, name, zarr_format):
214217
# Zarr v2 stores chunk files directly in the array directory; v3 places
215218
# them under a c/ subdirectory.
216-
if ZARR_FORMAT == 2:
219+
if zarr_format == 2:
217220
dest = dest_path / name
218221
chunk_files = [
219222
path for path in src_path.iterdir() if not path.name.startswith(".")

tests/test_cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
show_progress=True,
6767
worker_processes=core.DEFAULT_WORKER_PROCESSES,
6868
local_alleles=False,
69+
zarr_format=None,
6970
)
7071

7172
DEFAULT_TSKIT_CONVERT_ARGS = dict(

tests/test_zarr_utils.py

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -133,15 +133,17 @@ def test_default_mode_read_only(self, tmp_path):
133133

134134

135135
class TestMakeCompressor:
136-
def test_v2_returns_numcodecs(self, monkeypatch):
137-
monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 2)
138-
c = zarr_utils.make_compressor(zarr_utils.DEFAULT_COMPRESSOR_CONFIG)
136+
def test_v2_returns_numcodecs(self):
137+
c = zarr_utils.make_compressor(
138+
zarr_utils.DEFAULT_COMPRESSOR_CONFIG, zarr_format=2
139+
)
139140
assert isinstance(c, numcodecs.Blosc)
140141
assert c.get_config() == zarr_utils.DEFAULT_COMPRESSOR_CONFIG
141142

142-
def test_v3_returns_blosc_codec(self, monkeypatch):
143-
monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3)
144-
c = zarr_utils.make_compressor(zarr_utils.DEFAULT_COMPRESSOR_CONFIG)
143+
def test_v3_returns_blosc_codec(self):
144+
c = zarr_utils.make_compressor(
145+
zarr_utils.DEFAULT_COMPRESSOR_CONFIG, zarr_format=3
146+
)
145147
assert isinstance(c, BloscCodec)
146148
assert c.cname.value == "zstd"
147149
assert c.clevel == 7
@@ -156,31 +158,36 @@ def test_v3_returns_blosc_codec(self, monkeypatch):
156158
(numcodecs.Blosc.BITSHUFFLE, BloscShuffle.bitshuffle),
157159
],
158160
)
159-
def test_v3_shuffle_mapping(self, monkeypatch, numcodecs_shuffle, v3_shuffle):
160-
monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3)
161+
def test_v3_shuffle_mapping(self, numcodecs_shuffle, v3_shuffle):
161162
c = zarr_utils.make_compressor(
162163
{
163164
"id": "blosc",
164165
"cname": "zstd",
165166
"clevel": 5,
166167
"shuffle": numcodecs_shuffle,
167-
}
168+
},
169+
zarr_format=3,
168170
)
169171
assert c.shuffle == v3_shuffle
170172
assert c.blocksize == 0
171173

172-
def test_v3_default_shuffle_when_omitted(self, monkeypatch):
173-
monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3)
174-
c = zarr_utils.make_compressor({"id": "blosc", "cname": "lz4", "clevel": 1})
174+
def test_v3_default_shuffle_when_omitted(self):
175+
c = zarr_utils.make_compressor(
176+
{"id": "blosc", "cname": "lz4", "clevel": 1}, zarr_format=3
177+
)
175178
assert c.shuffle == BloscShuffle.shuffle
176179

180+
@pytest.mark.parametrize("zarr_format", [2, 3])
177181
def test_non_blosc_raises(self, zarr_format):
178182
with pytest.raises(NotImplementedError, match="Only blosc"):
179-
zarr_utils.make_compressor({"id": "zlib", "level": 1})
183+
zarr_utils.make_compressor(
184+
{"id": "zlib", "level": 1}, zarr_format=zarr_format
185+
)
180186

187+
@pytest.mark.parametrize("zarr_format", [2, 3])
181188
def test_missing_id_raises(self, zarr_format):
182189
with pytest.raises(NotImplementedError, match="Only blosc"):
183-
zarr_utils.make_compressor({"cname": "zstd"})
190+
zarr_utils.make_compressor({"cname": "zstd"}, zarr_format=zarr_format)
184191

185192

186193
class TestDefaultCompressorConstants:
@@ -576,8 +583,7 @@ class Stub:
576583

577584

578585
class TestMoveChunks:
579-
def test_v2_layout(self, tmp_path, monkeypatch):
580-
monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 2)
586+
def test_v2_layout(self, tmp_path):
581587
src = tmp_path / "src"
582588
src.mkdir()
583589
(src / ".zarray").write_text("{}")
@@ -587,15 +593,14 @@ def test_v2_layout(self, tmp_path, monkeypatch):
587593
dest_root = tmp_path / "dest"
588594
(dest_root / "arr").mkdir(parents=True)
589595

590-
zarr_utils.move_chunks(src, dest_root, partition=0, name="arr")
596+
zarr_utils.move_chunks(src, dest_root, partition=0, name="arr", zarr_format=2)
591597

592598
assert (dest_root / "arr" / "0").read_text() == "chunk0"
593599
assert (dest_root / "arr" / "1").read_text() == "chunk1"
594600
# Hidden file is left behind.
595601
assert (src / ".zarray").exists()
596602

597-
def test_v3_layout(self, tmp_path, monkeypatch):
598-
monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3)
603+
def test_v3_layout(self, tmp_path):
599604
src = tmp_path / "src"
600605
(src / "c").mkdir(parents=True)
601606
(src / "c" / "0").write_text("chunk0")
@@ -605,20 +610,19 @@ def test_v3_layout(self, tmp_path, monkeypatch):
605610
dest_root = tmp_path / "dest"
606611
(dest_root / "arr").mkdir(parents=True)
607612

608-
zarr_utils.move_chunks(src, dest_root, partition=0, name="arr")
613+
zarr_utils.move_chunks(src, dest_root, partition=0, name="arr", zarr_format=3)
609614

610615
assert (dest_root / "arr" / "c" / "0").read_text() == "chunk0"
611616
assert (dest_root / "arr" / "c" / "1").read_text() == "chunk1"
612617
assert (src / "c" / ".hidden").exists()
613618

614-
def test_v3_missing_c_directory(self, tmp_path, monkeypatch):
615-
monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3)
619+
def test_v3_missing_c_directory(self, tmp_path):
616620
src = tmp_path / "src"
617621
src.mkdir()
618622

619623
dest_root = tmp_path / "dest"
620624
(dest_root / "arr").mkdir(parents=True)
621625

622626
# Should not raise even though src/c/ does not exist.
623-
zarr_utils.move_chunks(src, dest_root, partition=0, name="arr")
627+
zarr_utils.move_chunks(src, dest_root, partition=0, name="arr", zarr_format=3)
624628
assert list((dest_root / "arr" / "c").iterdir()) == []

0 commit comments

Comments
 (0)