From 43205f7d35748d87a0d6db56e9e99f96f65e6aeb Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 23 Apr 2026 11:49:23 +0100 Subject: [PATCH 1/4] Add a --zarr-format CLI option Remove unnecessary monkeypatching Rename ZARR_FORMAT to DEFAULT_ZARR_FORMAT and similarly for the env var Add zarr_format to more signatures Add open_zarr_append convenience function --- .github/workflows/ci.yml | 2 +- bio2zarr/cli.py | 23 +++++++++++++++ bio2zarr/plink.py | 2 ++ bio2zarr/tskit.py | 2 ++ bio2zarr/vcf.py | 11 ++++++-- bio2zarr/vcz.py | 18 +++++++----- bio2zarr/zarr_utils.py | 51 +++++++++++++++++++-------------- tests/test_cli.py | 5 ++++ tests/test_zarr_utils.py | 61 ++++++++++++++++++++-------------------- 9 files changed, 113 insertions(+), 62 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c3e16ab..ad4e01f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -227,4 +227,4 @@ jobs: run: | uv run python -m pytest env: - BIO2ZARR_ZARR_FORMAT: ${{ matrix.zarr-format }} + BIO2ZARR_DEFAULT_ZARR_FORMAT: ${{ matrix.zarr-format }} diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 4fefc3f..493315e 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -43,6 +43,14 @@ def list_commands(self, ctx): "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True) ) +zarr_format = click.option( + "--zarr-format", + type=click.Choice([2, 3]), + default=None, + help="Zarr format version of output (default: 2, or value of " + "BIO2ZARR_DEFAULT_ZARR_FORMAT env var)", +) + num_partitions = click.option( "-n", "--num-partitions", @@ -373,6 +381,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles, p @click.command @icf_path @new_zarr_path +@zarr_format @force @verbose @schema @@ -385,6 +394,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles, p def encode( icf_path, zarr_path, + zarr_format, force, verbose, schema, @@ -403,6 +413,7 @@ def encode( vcf_mod.encode( icf_path, zarr_path, + zarr_format=zarr_format, schema_path=schema, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, @@ -416,6 +427,7 @@ def encode( @click.command @icf_path @new_zarr_path +@zarr_format @num_partitions @force @schema @@ -428,6 +440,7 @@ def encode( def dencode_init( icf_path, zarr_path, + zarr_format, num_partitions, force, schema, @@ -457,6 +470,7 @@ def dencode_init( work_summary = vcf_mod.encode_init( icf_path, zarr_path, + zarr_format=zarr_format, target_num_partitions=num_partitions, schema_path=schema, variants_chunk_size=variants_chunk_size, @@ -500,6 +514,7 @@ def dencode_finalise(zarr_path, verbose, progress): @click.command(name="convert") @vcfs @new_zarr_path +@zarr_format @force @variants_chunk_size @samples_chunk_size @@ -510,6 +525,7 @@ def dencode_finalise(zarr_path, verbose, progress): def convert_vcf( vcfs, zarr_path, + zarr_format, force, variants_chunk_size, samples_chunk_size, @@ -526,6 +542,7 @@ def convert_vcf( vcf_mod.convert( vcfs, zarr_path, + zarr_format=zarr_format, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, show_progress=progress, @@ -563,6 +580,7 @@ def vcf2zarr_main(): @click.argument("in_path", type=click.Path()) @click.argument("zarr_path", type=click.Path()) @force +@zarr_format @worker_processes @progress @verbose @@ -572,6 +590,7 @@ def convert_plink( in_path, zarr_path, force, + zarr_format, verbose, worker_processes, progress, @@ -588,6 +607,7 @@ def convert_plink( plink.convert( in_path, zarr_path, + zarr_format=zarr_format, show_progress=progress, worker_processes=worker_processes, samples_chunk_size=samples_chunk_size, @@ -740,6 +760,7 @@ def zipzarr(src, dest, unzip, keep, force, verbose, progress): @progress @worker_processes @force +@zarr_format @core.requires_optional_dependency("tskit", "tskit") def convert_tskit( ts_path, @@ -752,6 +773,7 @@ def convert_tskit( progress, worker_processes, force, + zarr_format, ): setup_logging(verbose) check_overwrite_dir(zarr_path, force) @@ -772,6 +794,7 @@ def convert_tskit( samples_chunk_size=samples_chunk_size, worker_processes=worker_processes, show_progress=progress, + zarr_format=zarr_format, ) diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 650b1ec..f7dc8c5 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -310,6 +310,7 @@ def convert( samples_chunk_size=None, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False, + zarr_format=None, ): """ Convert a PLINK fileset to VCF Zarr format. @@ -359,4 +360,5 @@ def convert( mode=mode, worker_processes=worker_processes, show_progress=show_progress, + zarr_format=zarr_format, ) diff --git a/bio2zarr/tskit.py b/bio2zarr/tskit.py index 4a01c45..d9154c5 100644 --- a/bio2zarr/tskit.py +++ b/bio2zarr/tskit.py @@ -265,6 +265,7 @@ def convert( samples_chunk_size=None, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False, + zarr_format=None, ): """ Convert a :class:`tskit.TreeSequence` (or path to a tree sequence @@ -330,4 +331,5 @@ def convert( mode=mode, worker_processes=worker_processes, show_progress=show_progress, + zarr_format=zarr_format, ) diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index 46fe990..7bb284c 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -1654,6 +1654,7 @@ def convert( vcz_path=None, *, mode="r", + zarr_format=None, variants_chunk_size=None, samples_chunk_size=None, worker_processes=core.DEFAULT_WORKER_PROCESSES, @@ -1719,6 +1720,7 @@ def convert( icf_path, vcz_path, mode=mode, + zarr_format=zarr_format, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, worker_processes=worker_processes, @@ -1747,6 +1749,7 @@ def encode( worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False, mode="r", + zarr_format=None, ): with vcz.open_zarr(zarr_path, mode=mode) as zr: # Rough heuristic to split work up enough to keep utilisation high @@ -1761,8 +1764,11 @@ def encode( local_alleles=local_alleles, ploidy=ploidy, max_variant_chunks=max_variant_chunks, + zarr_format=zarr_format, + ) + vzw = vcz.VcfZarrWriter( + IntermediateColumnarFormat, zr.dir, zarr_format=zarr_format ) - vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zr.dir) vzw.encode_all_partitions( worker_processes=worker_processes, show_progress=show_progress, @@ -1787,6 +1793,7 @@ def encode_init( max_memory=None, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False, + zarr_format=None, ): icf_store = IntermediateColumnarFormat(icf_path) if schema_path is None: @@ -1805,7 +1812,7 @@ def encode_init( with open(schema_path) as f: schema_instance = vcz.VcfZarrSchema.fromjson(f.read()) zarr_path = pathlib.Path(zarr_path) - vzw = vcz.VcfZarrWriter("icf", zarr_path) + vzw = vcz.VcfZarrWriter("icf", zarr_path, zarr_format=zarr_format) return vzw.init( icf_store, target_num_partitions=target_num_partitions, diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 3e18cb0..8e98648 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -569,6 +569,7 @@ def encode( mode="r", worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False, + zarr_format=None, ): """Encode a source format object into a Zarr store. @@ -600,7 +601,7 @@ def encode( """ source_type = type(source_format) with open_zarr(zarr_path, mode=mode) as zr: - vzw = VcfZarrWriter(source_type, zr.dir) + vzw = VcfZarrWriter(source_type, zr.dir, zarr_format=zarr_format) # Rough heuristic to split work up enough to keep utilisation high target_num_partitions = max(1, worker_processes * 4) vzw.init( @@ -618,9 +619,10 @@ def encode( class VcfZarrWriter: - def __init__(self, source_type, path): + def __init__(self, source_type, path, zarr_format=None): self.source_type = source_type self.path = pathlib.Path(path) + self.zarr_format = zarr_format or zarr_utils.DEFAULT_ZARR_FORMAT self.wip_path = self.path / "wip" self.arrays_path = self.wip_path / "arrays" self.partitions_path = self.wip_path / "partitions" @@ -679,7 +681,7 @@ def init( ) self.path.mkdir() - root = zarr.open(store=self.path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS) + root = zarr_utils.open_zarr_append(self.path, zarr_format=self.zarr_format) root.attrs.update( { "vcf_zarr_version": "0.4", @@ -698,8 +700,8 @@ def init( self.wip_path.mkdir() self.arrays_path.mkdir() self.partitions_path.mkdir() - root = zarr.open( - store=self.arrays_path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS + root = zarr_utils.open_zarr_append( + self.arrays_path, zarr_format=self.zarr_format ) total_chunks = 0 @@ -788,7 +790,7 @@ def init_array(self, root, schema, array_spec, variants_dim_size): else schema.defaults["compressor"] ) - kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS) + kwargs = {} # see https://github.com/zarr-developers/zarr-python/issues/3197 kwargs["fill_value"] = None @@ -1050,7 +1052,9 @@ def finalise_array(self, name): if not src.exists(): # Needs test raise ValueError(f"Partition {partition} of {name} does not exist") - zarr_utils.move_chunks(src, self.arrays_path, partition, name) + zarr_utils.move_chunks( + src, self.arrays_path, partition, name, self.zarr_format + ) # Finally, once all the chunks have moved into the arrays dir, # we move it out of wip os.rename(self.arrays_path / name, self.path / name) diff --git a/bio2zarr/zarr_utils.py b/bio2zarr/zarr_utils.py index c3b24ee..0cca469 100644 --- a/bio2zarr/zarr_utils.py +++ b/bio2zarr/zarr_utils.py @@ -14,25 +14,24 @@ numcodecs.blosc.use_threads = False -# Storage format (zarr v2 vs v3) is chosen at runtime via the -# BIO2ZARR_ZARR_FORMAT env var. The underlying zarr-python library is always -# v3 (>=3.1); this flag only controls which on-disk format we write. -# NOTE: this inferface for v3 storage was introduced for experimentation and -# is not envisaged as a long-term interface. +# Storage format (zarr v2 vs v3) is chosen at runtime in the CLI via +# --zarr-format, in the API by zarr_format, or by the BIO2ZARR_DEFAULT_ZARR_FORMAT +# env var. If specified in the CLI or API these override any env var setting. +# The default is Zarr format version 2. The underlying zarr-python library is +# always v3 (>=3.1); this flag only controls which on-disk format we write. try: - ZARR_FORMAT = int(os.environ.get("BIO2ZARR_ZARR_FORMAT", "2")) + DEFAULT_ZARR_FORMAT = int(os.environ.get("BIO2ZARR_DEFAULT_ZARR_FORMAT", "2")) except ValueError: - ZARR_FORMAT = 2 + DEFAULT_ZARR_FORMAT = 2 -ZARR_FORMAT_KWARGS = dict(zarr_format=ZARR_FORMAT) # In zarr-python v3 strings are stored as string arrays (T) with itemsize 16 STRING_DTYPE_NAME = "T" STRING_ITEMSIZE = 16 # Canonical, format-independent compressor configs (numcodecs-style dicts). -# Stored verbatim in schema JSON so a schema written under one ZARR_FORMAT +# Stored verbatim in schema JSON so a schema written under one Zarr format # stays readable under the other. DEFAULT_COMPRESSOR_CONFIG = { "id": "blosc", @@ -63,18 +62,18 @@ } -def make_compressor(config): +def make_compressor(config, zarr_format): """Build a format-correct compressor from a numcodecs-style config dict. - Returns a numcodecs codec under ZARR_FORMAT==2 and a zarr v3 codec under - ZARR_FORMAT==3. Only Blosc is supported, since that is the only + Returns a numcodecs codec under zarr_format==2 and a zarr v3 codec under + zarr_format==3. Only Blosc is supported, since that is the only compressor bio2zarr produces. """ if config.get("id") != "blosc": raise NotImplementedError( f"Only blosc compressors are supported, got {config!r}" ) - if ZARR_FORMAT == 2: + if zarr_format == 2: return numcodecs.get_codec(config) return BloscCodec( cname=config["cname"], @@ -90,6 +89,12 @@ def first_dim_iter(z): yield from z.blocks[chunk] +def open_zarr_append(store, zarr_format): + """Open a Zarr store for append using the given Zarr format""" + zarr_format_kwargs = dict(zarr_format=zarr_format or DEFAULT_ZARR_FORMAT) + return zarr.open(store=store, mode="a", **zarr_format_kwargs) + + def open_vcf_zarr(path): """Open a VCF Zarr store at ``path``, returning the root group. @@ -123,11 +128,13 @@ def create_group_array( """Create an array within a group.""" new_kwargs = {**kwargs} if compressor is not None: - new_kwargs["compressors"] = [make_compressor(compressor)] + new_kwargs["compressors"] = [ + make_compressor(compressor, group.metadata.zarr_format) + ] # Zarr format v2 rejects dimension_names on create_array; we instead # write the xarray _ARRAY_DIMENSIONS attribute after the fact. - if ZARR_FORMAT == 3: + if group.metadata.zarr_format == 3: new_kwargs.pop("zarr_format", None) new_kwargs["dimension_names"] = dimension_names @@ -139,7 +146,7 @@ def create_group_array( ) else: array = group.create_array(name, shape=shape, dtype=dtype, **new_kwargs) - if ZARR_FORMAT == 2 and dimension_names is not None: + if group.metadata.zarr_format == 2 and dimension_names is not None: array.attrs["_ARRAY_DIMENSIONS"] = dimension_names return array @@ -160,8 +167,10 @@ def create_empty_group_array( new_kwargs = {**kwargs} new_kwargs.pop("zarr_format", None) if compressor is not None: - new_kwargs["compressors"] = [make_compressor(compressor)] - if ZARR_FORMAT == 2: + new_kwargs["compressors"] = [ + make_compressor(compressor, group.metadata.zarr_format) + ] + if group.metadata.zarr_format == 2: # Filter list is interpreted as numcodecs-style config dicts and # converted here so callers don't need to import numcodecs. # Zarr v2 also requires a VLenUTF8 filter to accompany "T"-dtype @@ -184,7 +193,7 @@ def create_empty_group_array( array = group.create_array( name=name, shape=shape, dtype=dtype, chunks=chunks, **new_kwargs ) - if ZARR_FORMAT == 2 and dimension_names is not None: + if group.metadata.zarr_format == 2 and dimension_names is not None: array.attrs["_ARRAY_DIMENSIONS"] = dimension_names return array @@ -210,10 +219,10 @@ def get_compressor_config(array): raise TypeError(f"Unsupported compressor type: {type(compressor).__name__}") -def move_chunks(src_path, dest_path, partition, name): +def move_chunks(src_path, dest_path, partition, name, zarr_format): # Zarr v2 stores chunk files directly in the array directory; v3 places # them under a c/ subdirectory. - if ZARR_FORMAT == 2: + if zarr_format == 2: dest = dest_path / name chunk_files = [ path for path in src_path.iterdir() if not path.name.startswith(".") diff --git a/tests/test_cli.py b/tests/test_cli.py index 5f06385..3e9a290 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -40,6 +40,7 @@ worker_processes=core.DEFAULT_WORKER_PROCESSES, max_memory=None, show_progress=True, + zarr_format=None, ) DEFAULT_DENCODE_INIT_ARGS = dict( @@ -48,6 +49,7 @@ samples_chunk_size=None, max_variant_chunks=None, show_progress=True, + zarr_format=None, ) DEFAULT_DENCODE_PARTITION_ARGS = dict() @@ -66,6 +68,7 @@ show_progress=True, worker_processes=core.DEFAULT_WORKER_PROCESSES, local_alleles=False, + zarr_format=None, ) DEFAULT_TSKIT_CONVERT_ARGS = dict( @@ -74,6 +77,7 @@ samples_chunk_size=None, show_progress=True, worker_processes=core.DEFAULT_WORKER_PROCESSES, + zarr_format=None, ) DEFAULT_PLINK_CONVERT_ARGS = dict( @@ -81,6 +85,7 @@ samples_chunk_size=None, show_progress=True, worker_processes=core.DEFAULT_WORKER_PROCESSES, + zarr_format=None, ) diff --git a/tests/test_zarr_utils.py b/tests/test_zarr_utils.py index 6daf9d1..5adf368 100644 --- a/tests/test_zarr_utils.py +++ b/tests/test_zarr_utils.py @@ -13,8 +13,7 @@ @pytest.fixture(params=[2, 3]) -def zarr_format(request, monkeypatch): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", request.param) +def zarr_format(request): return request.param @@ -133,15 +132,17 @@ def test_default_mode_read_only(self, tmp_path): class TestMakeCompressor: - def test_v2_returns_numcodecs(self, monkeypatch): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 2) - c = zarr_utils.make_compressor(zarr_utils.DEFAULT_COMPRESSOR_CONFIG) + def test_v2_returns_numcodecs(self): + c = zarr_utils.make_compressor( + zarr_utils.DEFAULT_COMPRESSOR_CONFIG, zarr_format=2 + ) assert isinstance(c, numcodecs.Blosc) assert c.get_config() == zarr_utils.DEFAULT_COMPRESSOR_CONFIG - def test_v3_returns_blosc_codec(self, monkeypatch): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3) - c = zarr_utils.make_compressor(zarr_utils.DEFAULT_COMPRESSOR_CONFIG) + def test_v3_returns_blosc_codec(self): + c = zarr_utils.make_compressor( + zarr_utils.DEFAULT_COMPRESSOR_CONFIG, zarr_format=3 + ) assert isinstance(c, BloscCodec) assert c.cname.value == "zstd" assert c.clevel == 7 @@ -156,31 +157,36 @@ def test_v3_returns_blosc_codec(self, monkeypatch): (numcodecs.Blosc.BITSHUFFLE, BloscShuffle.bitshuffle), ], ) - def test_v3_shuffle_mapping(self, monkeypatch, numcodecs_shuffle, v3_shuffle): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3) + def test_v3_shuffle_mapping(self, numcodecs_shuffle, v3_shuffle): c = zarr_utils.make_compressor( { "id": "blosc", "cname": "zstd", "clevel": 5, "shuffle": numcodecs_shuffle, - } + }, + zarr_format=3, ) assert c.shuffle == v3_shuffle assert c.blocksize == 0 - def test_v3_default_shuffle_when_omitted(self, monkeypatch): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3) - c = zarr_utils.make_compressor({"id": "blosc", "cname": "lz4", "clevel": 1}) + def test_v3_default_shuffle_when_omitted(self): + c = zarr_utils.make_compressor( + {"id": "blosc", "cname": "lz4", "clevel": 1}, zarr_format=3 + ) assert c.shuffle == BloscShuffle.shuffle + @pytest.mark.parametrize("zarr_format", [2, 3]) def test_non_blosc_raises(self, zarr_format): with pytest.raises(NotImplementedError, match="Only blosc"): - zarr_utils.make_compressor({"id": "zlib", "level": 1}) + zarr_utils.make_compressor( + {"id": "zlib", "level": 1}, zarr_format=zarr_format + ) + @pytest.mark.parametrize("zarr_format", [2, 3]) def test_missing_id_raises(self, zarr_format): with pytest.raises(NotImplementedError, match="Only blosc"): - zarr_utils.make_compressor({"cname": "zstd"}) + zarr_utils.make_compressor({"cname": "zstd"}, zarr_format=zarr_format) class TestDefaultCompressorConstants: @@ -428,8 +434,7 @@ def test_no_filters_non_string(self, group, zarr_format): if zarr_format == 2: assert a.metadata.filters is None - def test_user_filters_non_string_v2(self, tmp_path, monkeypatch): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 2) + def test_user_filters_non_string_v2(self, tmp_path): root = zarr.open_group(tmp_path / "s", mode="w", zarr_format=2) a = zarr_utils.create_empty_group_array( root, @@ -464,10 +469,7 @@ def test_string_dtype_no_user_filters(self, group, zarr_format, string_dtype): @pytest.mark.parametrize( "string_dtype", [zarr_utils.STRING_DTYPE_NAME, StringDType()] ) - def test_string_dtype_with_user_filters_v2( - self, tmp_path, monkeypatch, string_dtype - ): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 2) + def test_string_dtype_with_user_filters_v2(self, tmp_path, string_dtype): root = zarr.open_group(tmp_path / "s", mode="w", zarr_format=2) # Place a VLenUTF8 in the user filter list; the helper should # still append its own VLenUTF8 (the conversion path does not @@ -576,8 +578,7 @@ class Stub: class TestMoveChunks: - def test_v2_layout(self, tmp_path, monkeypatch): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 2) + def test_v2_layout(self, tmp_path): src = tmp_path / "src" src.mkdir() (src / ".zarray").write_text("{}") @@ -587,15 +588,14 @@ def test_v2_layout(self, tmp_path, monkeypatch): dest_root = tmp_path / "dest" (dest_root / "arr").mkdir(parents=True) - zarr_utils.move_chunks(src, dest_root, partition=0, name="arr") + zarr_utils.move_chunks(src, dest_root, partition=0, name="arr", zarr_format=2) assert (dest_root / "arr" / "0").read_text() == "chunk0" assert (dest_root / "arr" / "1").read_text() == "chunk1" # Hidden file is left behind. assert (src / ".zarray").exists() - def test_v3_layout(self, tmp_path, monkeypatch): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3) + def test_v3_layout(self, tmp_path): src = tmp_path / "src" (src / "c").mkdir(parents=True) (src / "c" / "0").write_text("chunk0") @@ -605,14 +605,13 @@ def test_v3_layout(self, tmp_path, monkeypatch): dest_root = tmp_path / "dest" (dest_root / "arr").mkdir(parents=True) - zarr_utils.move_chunks(src, dest_root, partition=0, name="arr") + zarr_utils.move_chunks(src, dest_root, partition=0, name="arr", zarr_format=3) assert (dest_root / "arr" / "c" / "0").read_text() == "chunk0" assert (dest_root / "arr" / "c" / "1").read_text() == "chunk1" assert (src / "c" / ".hidden").exists() - def test_v3_missing_c_directory(self, tmp_path, monkeypatch): - monkeypatch.setattr(zarr_utils, "ZARR_FORMAT", 3) + def test_v3_missing_c_directory(self, tmp_path): src = tmp_path / "src" src.mkdir() @@ -620,5 +619,5 @@ def test_v3_missing_c_directory(self, tmp_path, monkeypatch): (dest_root / "arr").mkdir(parents=True) # Should not raise even though src/c/ does not exist. - zarr_utils.move_chunks(src, dest_root, partition=0, name="arr") + zarr_utils.move_chunks(src, dest_root, partition=0, name="arr", zarr_format=3) assert list((dest_root / "arr" / "c").iterdir()) == [] From 2810977fc8508331793dbf20694448d650d83c5c Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 24 Apr 2026 11:34:10 +0100 Subject: [PATCH 2/4] Refactor --- bio2zarr/cli.py | 24 ++++++++++++------------ bio2zarr/plink.py | 3 +++ bio2zarr/tskit.py | 3 +++ bio2zarr/vcf.py | 5 ++++- bio2zarr/vcz.py | 3 +++ 5 files changed, 25 insertions(+), 13 deletions(-) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 493315e..438a236 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -381,7 +381,6 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles, p @click.command @icf_path @new_zarr_path -@zarr_format @force @verbose @schema @@ -391,10 +390,10 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles, p @max_memory @progress @worker_processes +@zarr_format def encode( icf_path, zarr_path, - zarr_format, force, verbose, schema, @@ -404,6 +403,7 @@ def encode( max_memory, progress, worker_processes, + zarr_format, ): """ Convert intermediate columnar format to VCF Zarr. @@ -413,7 +413,6 @@ def encode( vcf_mod.encode( icf_path, zarr_path, - zarr_format=zarr_format, schema_path=schema, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, @@ -421,13 +420,13 @@ def encode( worker_processes=worker_processes, max_memory=max_memory, show_progress=progress, + zarr_format=zarr_format, ) @click.command @icf_path @new_zarr_path -@zarr_format @num_partitions @force @schema @@ -437,10 +436,10 @@ def encode( @json @progress @verbose +@zarr_format def dencode_init( icf_path, zarr_path, - zarr_format, num_partitions, force, schema, @@ -450,6 +449,7 @@ def dencode_init( json, progress, verbose, + zarr_format, ): """ Initialise conversion of intermediate format to VCF Zarr. This will @@ -470,13 +470,13 @@ def dencode_init( work_summary = vcf_mod.encode_init( icf_path, zarr_path, - zarr_format=zarr_format, target_num_partitions=num_partitions, schema_path=schema, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, max_variant_chunks=max_variant_chunks, show_progress=progress, + zarr_format=zarr_format, ) show_work_summary(work_summary, json) @@ -514,7 +514,6 @@ def dencode_finalise(zarr_path, verbose, progress): @click.command(name="convert") @vcfs @new_zarr_path -@zarr_format @force @variants_chunk_size @samples_chunk_size @@ -522,10 +521,10 @@ def dencode_finalise(zarr_path, verbose, progress): @progress @worker_processes @local_alleles +@zarr_format def convert_vcf( vcfs, zarr_path, - zarr_format, force, variants_chunk_size, samples_chunk_size, @@ -533,6 +532,7 @@ def convert_vcf( progress, worker_processes, local_alleles, + zarr_format, ): """ Convert input VCF(s) directly to VCF Zarr (not recommended for large files). @@ -542,12 +542,12 @@ def convert_vcf( vcf_mod.convert( vcfs, zarr_path, - zarr_format=zarr_format, variants_chunk_size=variants_chunk_size, samples_chunk_size=samples_chunk_size, show_progress=progress, worker_processes=worker_processes, local_alleles=local_alleles, + zarr_format=zarr_format, ) @@ -580,22 +580,22 @@ def vcf2zarr_main(): @click.argument("in_path", type=click.Path()) @click.argument("zarr_path", type=click.Path()) @force -@zarr_format @worker_processes @progress @verbose @variants_chunk_size @samples_chunk_size +@zarr_format def convert_plink( in_path, zarr_path, force, - zarr_format, verbose, worker_processes, progress, variants_chunk_size, samples_chunk_size, + zarr_format, ): """ Convert plink fileset to VCF Zarr. Results are equivalent to @@ -607,11 +607,11 @@ def convert_plink( plink.convert( in_path, zarr_path, - zarr_format=zarr_format, show_progress=progress, worker_processes=worker_processes, samples_chunk_size=samples_chunk_size, variants_chunk_size=variants_chunk_size, + zarr_format=zarr_format, ) diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index f7dc8c5..0538067 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -342,6 +342,9 @@ def convert( means use the main process only. show_progress : bool If True, display a progress bar during conversion. + zarr_format : int, optional + Zarr format version of output (default: 2, or value of + BIO2ZARR_DEFAULT_ZARR_FORMAT env var) Returns ------- diff --git a/bio2zarr/tskit.py b/bio2zarr/tskit.py index d9154c5..be34263 100644 --- a/bio2zarr/tskit.py +++ b/bio2zarr/tskit.py @@ -301,6 +301,9 @@ def convert( means use the main process only. show_progress : bool If True, display a progress bar during conversion. + zarr_format : int, optional + Zarr format version of output (default: 2, or value of + BIO2ZARR_DEFAULT_ZARR_FORMAT env var) Returns ------- diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index 7bb284c..8d0a907 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -1654,7 +1654,6 @@ def convert( vcz_path=None, *, mode="r", - zarr_format=None, variants_chunk_size=None, samples_chunk_size=None, worker_processes=core.DEFAULT_WORKER_PROCESSES, @@ -1662,6 +1661,7 @@ def convert( ploidy=None, show_progress=False, icf_path=None, + zarr_format=None, ): """ Convert VCF file(s) to VCF Zarr format. @@ -1698,6 +1698,9 @@ def convert( icf_path : str, Path, or None Path for the intermediate columnar format (ICF) data. If None, a temporary directory is used and cleaned up automatically. + zarr_format : int, optional + Zarr format version of output (default: 2, or value of + BIO2ZARR_DEFAULT_ZARR_FORMAT env var) Returns ------- diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 8e98648..71a7c83 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -593,6 +593,9 @@ def encode( Number of worker processes for parallel encoding. show_progress : bool If True, display a progress bar. + zarr_format : int, optional + Zarr format version of output (default: 2, or value of + BIO2ZARR_DEFAULT_ZARR_FORMAT env var) Returns ------- From c903d26c58e367d51fe92631479eb72f0fa52014 Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 24 Apr 2026 10:51:44 +0100 Subject: [PATCH 3/4] Add tskit zarr_format test Add plink zarr_format test Add vcf zarr_format test --- tests/test_plink.py | 9 +++++++++ tests/test_tskit.py | 10 ++++++++++ tests/test_vcf_examples.py | 10 ++++++++++ 3 files changed, 29 insertions(+) diff --git a/tests/test_plink.py b/tests/test_plink.py index 1f19101..c0a186e 100644 --- a/tests/test_plink.py +++ b/tests/test_plink.py @@ -443,6 +443,15 @@ def test_by_validating( validate(path, out) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_zarr_format(tmp_path, zarr_format): + path = "tests/data/plink/plink_sim_10s_100v_10pmiss" + out = tmp_path / "example.zarr" + root = plink.convert(path, out, zarr_format=zarr_format) + assert root.metadata.zarr_format == zarr_format + validate(path, out) + + class TestMultipleContigs: """Test handling of multiple contigs in PLINK files.""" diff --git a/tests/test_tskit.py b/tests/test_tskit.py index 34df5d7..a15347e 100644 --- a/tests/test_tskit.py +++ b/tests/test_tskit.py @@ -575,3 +575,13 @@ def test_workers(tmp_path, worker_processes): root = tsk.convert(ts, worker_processes=worker_processes) ds = load_dataset(root) assert_ts_ds_equal(ts, ds) + + +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_zarr_format(zarr_format): + ts = msprime.sim_ancestry(10, sequence_length=1000, random_seed=42) + ts = add_mutations(ts) + root = tsk.convert(ts, zarr_format=zarr_format) + assert root.metadata.zarr_format == zarr_format + ds = load_dataset(root) + assert_ts_ds_equal(ts, ds) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index d8b2467..dd3c235 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -394,6 +394,16 @@ def test_worker_processes(self, ds, worker_processes): ds2 = load_dataset(root) assert_dataset_equal(ds, ds2, drop_vars=["region_index"]) + @pytest.mark.parametrize("zarr_format", [2, 3]) + def test_zarr_format(self, ds, zarr_format): + root = vcf_mod.convert( + [self.data_path], + zarr_format=zarr_format, + ) + assert root.metadata.zarr_format == zarr_format + ds2 = load_dataset(root) + assert_dataset_equal(ds, ds2, drop_vars=["region_index"]) + def test_inspect(self, tmp_path): # TODO pretty weak test, we should be doing this better somewhere else out = tmp_path / "example.vcf.zarr" From 5c9a8edd03b8c7fba48cb9acb6f2f1cac1ae61cf Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 24 Apr 2026 12:03:35 +0100 Subject: [PATCH 4/4] Review feedback --- bio2zarr/vcz.py | 2 +- bio2zarr/zarr_utils.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 71a7c83..c8eb807 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -625,7 +625,7 @@ class VcfZarrWriter: def __init__(self, source_type, path, zarr_format=None): self.source_type = source_type self.path = pathlib.Path(path) - self.zarr_format = zarr_format or zarr_utils.DEFAULT_ZARR_FORMAT + self.zarr_format = zarr_format self.wip_path = self.path / "wip" self.arrays_path = self.wip_path / "arrays" self.partitions_path = self.wip_path / "partitions" diff --git a/bio2zarr/zarr_utils.py b/bio2zarr/zarr_utils.py index 0cca469..a67a804 100644 --- a/bio2zarr/zarr_utils.py +++ b/bio2zarr/zarr_utils.py @@ -91,8 +91,9 @@ def first_dim_iter(z): def open_zarr_append(store, zarr_format): """Open a Zarr store for append using the given Zarr format""" - zarr_format_kwargs = dict(zarr_format=zarr_format or DEFAULT_ZARR_FORMAT) - return zarr.open(store=store, mode="a", **zarr_format_kwargs) + if zarr_format is None: + zarr_format = DEFAULT_ZARR_FORMAT + return zarr.open(store=store, mode="a", zarr_format=zarr_format) def open_vcf_zarr(path): @@ -222,6 +223,8 @@ def get_compressor_config(array): def move_chunks(src_path, dest_path, partition, name, zarr_format): # Zarr v2 stores chunk files directly in the array directory; v3 places # them under a c/ subdirectory. + if zarr_format is None: + zarr_format = DEFAULT_ZARR_FORMAT if zarr_format == 2: dest = dest_path / name chunk_files = [