Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -227,4 +227,4 @@ jobs:
run: |
uv run python -m pytest
env:
BIO2ZARR_ZARR_FORMAT: ${{ matrix.zarr-format }}
BIO2ZARR_DEFAULT_ZARR_FORMAT: ${{ matrix.zarr-format }}
23 changes: 23 additions & 0 deletions bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ def list_commands(self, ctx):
"zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
)

zarr_format = click.option(
"--zarr-format",
type=click.Choice([2, 3]),
default=None,
help="Zarr format version of output (default: 2, or value of "
"BIO2ZARR_DEFAULT_ZARR_FORMAT env var)",
)

num_partitions = click.option(
"-n",
"--num-partitions",
Expand Down Expand Up @@ -382,6 +390,7 @@ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles, p
@max_memory
@progress
@worker_processes
@zarr_format
def encode(
icf_path,
zarr_path,
Expand All @@ -394,6 +403,7 @@ def encode(
max_memory,
progress,
worker_processes,
zarr_format,
):
"""
Convert intermediate columnar format to VCF Zarr.
Expand All @@ -410,6 +420,7 @@ def encode(
worker_processes=worker_processes,
max_memory=max_memory,
show_progress=progress,
zarr_format=zarr_format,
)


Expand All @@ -425,6 +436,7 @@ def encode(
@json
@progress
@verbose
@zarr_format
def dencode_init(
icf_path,
zarr_path,
Expand All @@ -437,6 +449,7 @@ def dencode_init(
json,
progress,
verbose,
zarr_format,
):
"""
Initialise conversion of intermediate format to VCF Zarr. This will
Expand All @@ -463,6 +476,7 @@ def dencode_init(
samples_chunk_size=samples_chunk_size,
max_variant_chunks=max_variant_chunks,
show_progress=progress,
zarr_format=zarr_format,
)
show_work_summary(work_summary, json)

Expand Down Expand Up @@ -507,6 +521,7 @@ def dencode_finalise(zarr_path, verbose, progress):
@progress
@worker_processes
@local_alleles
@zarr_format
def convert_vcf(
vcfs,
zarr_path,
Expand All @@ -517,6 +532,7 @@ def convert_vcf(
progress,
worker_processes,
local_alleles,
zarr_format,
):
"""
Convert input VCF(s) directly to VCF Zarr (not recommended for large files).
Expand All @@ -531,6 +547,7 @@ def convert_vcf(
show_progress=progress,
worker_processes=worker_processes,
local_alleles=local_alleles,
zarr_format=zarr_format,
)


Expand Down Expand Up @@ -568,6 +585,7 @@ def vcf2zarr_main():
@verbose
@variants_chunk_size
@samples_chunk_size
@zarr_format
def convert_plink(
in_path,
zarr_path,
Expand All @@ -577,6 +595,7 @@ def convert_plink(
progress,
variants_chunk_size,
samples_chunk_size,
zarr_format,
):
"""
Convert plink fileset to VCF Zarr. Results are equivalent to
Expand All @@ -592,6 +611,7 @@ def convert_plink(
worker_processes=worker_processes,
samples_chunk_size=samples_chunk_size,
variants_chunk_size=variants_chunk_size,
zarr_format=zarr_format,
)


Expand Down Expand Up @@ -740,6 +760,7 @@ def zipzarr(src, dest, unzip, keep, force, verbose, progress):
@progress
@worker_processes
@force
@zarr_format
@core.requires_optional_dependency("tskit", "tskit")
def convert_tskit(
ts_path,
Expand All @@ -752,6 +773,7 @@ def convert_tskit(
progress,
worker_processes,
force,
zarr_format,
):
setup_logging(verbose)
check_overwrite_dir(zarr_path, force)
Expand All @@ -772,6 +794,7 @@ def convert_tskit(
samples_chunk_size=samples_chunk_size,
worker_processes=worker_processes,
show_progress=progress,
zarr_format=zarr_format,
)


Expand Down
5 changes: 5 additions & 0 deletions bio2zarr/plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@ def convert(
samples_chunk_size=None,
worker_processes=core.DEFAULT_WORKER_PROCESSES,
show_progress=False,
zarr_format=None,
):
"""
Convert a PLINK fileset to VCF Zarr format.
Expand Down Expand Up @@ -341,6 +342,9 @@ def convert(
means use the main process only.
show_progress : bool
If True, display a progress bar during conversion.
zarr_format : int, optional
Zarr format version of output (default: 2, or value of
BIO2ZARR_DEFAULT_ZARR_FORMAT env var)

Returns
-------
Expand All @@ -359,4 +363,5 @@ def convert(
mode=mode,
worker_processes=worker_processes,
show_progress=show_progress,
zarr_format=zarr_format,
)
5 changes: 5 additions & 0 deletions bio2zarr/tskit.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ def convert(
samples_chunk_size=None,
worker_processes=core.DEFAULT_WORKER_PROCESSES,
show_progress=False,
zarr_format=None,
):
"""
Convert a :class:`tskit.TreeSequence` (or path to a tree sequence
Expand Down Expand Up @@ -300,6 +301,9 @@ def convert(
means use the main process only.
show_progress : bool
If True, display a progress bar during conversion.
zarr_format : int, optional
Zarr format version of output (default: 2, or value of
BIO2ZARR_DEFAULT_ZARR_FORMAT env var)

Returns
-------
Expand Down Expand Up @@ -330,4 +334,5 @@ def convert(
mode=mode,
worker_processes=worker_processes,
show_progress=show_progress,
zarr_format=zarr_format,
)
14 changes: 12 additions & 2 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1661,6 +1661,7 @@ def convert(
ploidy=None,
show_progress=False,
icf_path=None,
zarr_format=None,
):
"""
Convert VCF file(s) to VCF Zarr format.
Expand Down Expand Up @@ -1697,6 +1698,9 @@ def convert(
icf_path : str, Path, or None
Path for the intermediate columnar format (ICF) data. If None,
a temporary directory is used and cleaned up automatically.
zarr_format : int, optional
Zarr format version of output (default: 2, or value of
BIO2ZARR_DEFAULT_ZARR_FORMAT env var)

Returns
-------
Expand All @@ -1719,6 +1723,7 @@ def convert(
icf_path,
vcz_path,
mode=mode,
zarr_format=zarr_format,
variants_chunk_size=variants_chunk_size,
samples_chunk_size=samples_chunk_size,
worker_processes=worker_processes,
Expand Down Expand Up @@ -1747,6 +1752,7 @@ def encode(
worker_processes=core.DEFAULT_WORKER_PROCESSES,
show_progress=False,
mode="r",
zarr_format=None,
):
with vcz.open_zarr(zarr_path, mode=mode) as zr:
# Rough heuristic to split work up enough to keep utilisation high
Expand All @@ -1761,8 +1767,11 @@ def encode(
local_alleles=local_alleles,
ploidy=ploidy,
max_variant_chunks=max_variant_chunks,
zarr_format=zarr_format,
)
vzw = vcz.VcfZarrWriter(
IntermediateColumnarFormat, zr.dir, zarr_format=zarr_format
)
vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zr.dir)
vzw.encode_all_partitions(
worker_processes=worker_processes,
show_progress=show_progress,
Expand All @@ -1787,6 +1796,7 @@ def encode_init(
max_memory=None,
worker_processes=core.DEFAULT_WORKER_PROCESSES,
show_progress=False,
zarr_format=None,
):
icf_store = IntermediateColumnarFormat(icf_path)
if schema_path is None:
Expand All @@ -1805,7 +1815,7 @@ def encode_init(
with open(schema_path) as f:
schema_instance = vcz.VcfZarrSchema.fromjson(f.read())
zarr_path = pathlib.Path(zarr_path)
vzw = vcz.VcfZarrWriter("icf", zarr_path)
vzw = vcz.VcfZarrWriter("icf", zarr_path, zarr_format=zarr_format)
return vzw.init(
icf_store,
target_num_partitions=target_num_partitions,
Expand Down
21 changes: 14 additions & 7 deletions bio2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,7 @@ def encode(
mode="r",
worker_processes=core.DEFAULT_WORKER_PROCESSES,
show_progress=False,
zarr_format=None,
):
"""Encode a source format object into a Zarr store.

Expand All @@ -592,6 +593,9 @@ def encode(
Number of worker processes for parallel encoding.
show_progress : bool
If True, display a progress bar.
zarr_format : int, optional
Zarr format version of output (default: 2, or value of
BIO2ZARR_DEFAULT_ZARR_FORMAT env var)

Returns
-------
Expand All @@ -600,7 +604,7 @@ def encode(
"""
source_type = type(source_format)
with open_zarr(zarr_path, mode=mode) as zr:
vzw = VcfZarrWriter(source_type, zr.dir)
vzw = VcfZarrWriter(source_type, zr.dir, zarr_format=zarr_format)
# Rough heuristic to split work up enough to keep utilisation high
target_num_partitions = max(1, worker_processes * 4)
vzw.init(
Expand All @@ -618,9 +622,10 @@ def encode(


class VcfZarrWriter:
def __init__(self, source_type, path):
def __init__(self, source_type, path, zarr_format=None):
self.source_type = source_type
self.path = pathlib.Path(path)
self.zarr_format = zarr_format
self.wip_path = self.path / "wip"
self.arrays_path = self.wip_path / "arrays"
self.partitions_path = self.wip_path / "partitions"
Expand Down Expand Up @@ -679,7 +684,7 @@ def init(
)

self.path.mkdir()
root = zarr.open(store=self.path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS)
root = zarr_utils.open_zarr_append(self.path, zarr_format=self.zarr_format)
root.attrs.update(
{
"vcf_zarr_version": "0.4",
Expand All @@ -698,8 +703,8 @@ def init(
self.wip_path.mkdir()
self.arrays_path.mkdir()
self.partitions_path.mkdir()
root = zarr.open(
store=self.arrays_path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS
root = zarr_utils.open_zarr_append(
self.arrays_path, zarr_format=self.zarr_format
)

total_chunks = 0
Expand Down Expand Up @@ -788,7 +793,7 @@ def init_array(self, root, schema, array_spec, variants_dim_size):
else schema.defaults["compressor"]
)

kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
kwargs = {}
# see https://github.com/zarr-developers/zarr-python/issues/3197
kwargs["fill_value"] = None

Expand Down Expand Up @@ -1050,7 +1055,9 @@ def finalise_array(self, name):
if not src.exists():
# Needs test
raise ValueError(f"Partition {partition} of {name} does not exist")
zarr_utils.move_chunks(src, self.arrays_path, partition, name)
zarr_utils.move_chunks(
src, self.arrays_path, partition, name, self.zarr_format
)
# Finally, once all the chunks have moved into the arrays dir,
# we move it out of wip
os.rename(self.arrays_path / name, self.path / name)
Expand Down
Loading
Loading