Skip to content

Commit 0603228

Browse files
authored
Support a single vcz in create (#105)
1 parent c25223f commit 0603228

3 files changed

Lines changed: 157 additions & 122 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ Multiallelic sites, and split alleles (mutiple records for a site) are both acce
3434

3535
A VCZ store is just a VCZ file - typically in cloud object store - so it's possible to create one using bio2zarr (e.g. vcf2zarr).
3636

37-
However, when the VCFs being appended contain different variants the store must be created with the full set of variants. This is achieved by calling `vczstore create` with the VCZ files that collectively define the set of variants (e.g. one for each genotype array). (Currently `vczstore create` can only be called with two arguments - but you can call it repeatedly to built up the store from multiple files.)
37+
However, when the VCFs being appended contain different variants the store must be created with the full set of variants. This is achieved by calling `vczstore create` with the VCZ files that collectively define the set of variants (e.g. one for each genotype array). (Currently `vczstore create` can only be called with one or two input VCZ arguments - but you can call it repeatedly to built up the store from multiple files.)
3838

3939
After creation the store contains no samples.
4040

tests/test_create.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,16 @@ def test_merge_alts(alts1, alts2, expected):
5151
assert _merge_alts(alts1, alts2) == expected
5252

5353

54+
def test_create__single():
55+
vcz1 = make_vcz([0], [100], [["A", "T"]])
56+
vcz_out = zarr.storage.MemoryStore()
57+
create(vcz_out, vcz1)
58+
root = zarr.open(vcz_out)
59+
assert_array_equal(root["variant_position"][:], [100])
60+
assert root["variant_allele"][0, 0] == "A"
61+
assert root["variant_allele"][0, 1] == "T"
62+
63+
5464
def test_create__no_match():
5565
# Disjoint alts at same position → 2 output variants
5666
vcz1 = make_vcz([0], [100], [["A", "T"]])

vczstore/create.py

Lines changed: 146 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -525,14 +525,16 @@ def create(vcz_out, *vczs, show_progress=False, backend_storage=None) -> None:
525525
"""Create a new, empty store vcz_out using merged variants from vczs
526526
using -m none semantics with stable variant ordering.
527527
528-
Currently vczs must contain exactly two stores. This requirement may be lifted
529-
in the future.
528+
Currently vczs must contain exactly one or two stores. This requirement may be
529+
lifted in the future.
530530
531-
Both stores must have identical contig_id arrays. Output contains all variants
532-
from both stores; variants at the same position whose alt sets overlap are merged
533-
into a single record with combined alleles. Only fixed variant fields are
534-
written; other variant fields are not included, and sample and call
535-
fields are included, but with no samples.
531+
In the case of one store, the new store is the same except it contains no samples.
532+
533+
In the case of two stores, both must have identical contig_id arrays. Output
534+
contains all variants from both stores; variants at the same position whose alt
535+
sets overlap are merged into a single record with combined alleles. Only fixed
536+
variant fields are written; other variant fields are not included, and sample and
537+
call fields are included, but with no samples.
536538
537539
When two records are merged: variant_id are joined with ";", variant_quality takes
538540
the max (ignoring missing), variant_filter is the union. variant_filter requires
@@ -541,150 +543,177 @@ def create(vcz_out, *vczs, show_progress=False, backend_storage=None) -> None:
541543
them.
542544
"""
543545

544-
if len(vczs) != 2:
545-
raise NotImplementedError("A store can only be created from two VCZ files.")
546-
547-
vcz1, vcz2 = vczs
546+
if len(vczs) not in (1, 2):
547+
raise NotImplementedError(
548+
"A store can only be created from one or two VCZ files."
549+
)
548550

549551
with transaction(
550552
vcz_out, backend_storage=backend_storage, message="create", create_repo=True
551553
) as vcz_out:
554+
vcz1 = vczs[0]
552555
root1 = zarr.open(vcz1, mode="r")
553-
root2 = zarr.open(vcz2, mode="r")
554-
555-
if not np.all(root1["contig_id"][:] == root2["contig_id"][:]):
556-
raise ValueError("contig_id fields must be identical")
557-
558-
(
559-
out_contig,
560-
out_position,
561-
out_length,
562-
out_id,
563-
out_quality,
564-
out_filter,
565-
out_allele,
566-
) = _compute_merged_variants(root1, root2, show_progress=show_progress)
567-
568-
n_variants = out_contig.shape[0]
569-
570-
out_root = open_zarr(
571-
vcz_out,
572-
mode="w",
573-
backend_storage=backend_storage,
574-
zarr_format=root1.metadata.zarr_format,
575-
)
576-
out_root.attrs.update(root1.attrs)
577556

578-
# copy direct from vcz1
579-
vcz1_copy_vars = [
580-
var
581-
for var in root1.keys()
582-
if var.startswith("contig_") or var.startswith("filter_")
583-
]
584-
logger.debug(f"Copying arrays for {', '.join(vcz1_copy_vars)}")
585-
copy_store(vcz1, vcz_out, array_keys=vcz1_copy_vars)
586-
587-
logger.debug("Creating variant arrays")
588-
arr = root1["variant_contig"]
589-
create_group_array(
590-
out_root,
591-
"variant_contig",
592-
data=out_contig,
593-
shape=out_contig.shape,
594-
dtype=arr.dtype,
595-
chunks=arr.chunks,
596-
compressor=get_compressor_config(arr),
597-
dimension_names=["variants"],
598-
)
599-
arr = root1["variant_position"]
600-
create_group_array(
601-
out_root,
602-
"variant_position",
603-
data=out_position,
604-
shape=out_position.shape,
605-
dtype=arr.dtype,
606-
chunks=arr.chunks,
607-
compressor=get_compressor_config(arr),
608-
dimension_names=["variants"],
609-
)
610-
if out_length is not None:
611-
arr = root1["variant_length"]
612-
create_group_array(
613-
out_root,
614-
"variant_length",
615-
data=out_length,
616-
shape=out_length.shape,
617-
dtype=arr.dtype,
618-
chunks=arr.chunks,
619-
compressor=get_compressor_config(arr),
620-
dimension_names=["variants"],
557+
if len(vczs) == 1:
558+
out_root = open_zarr(
559+
vcz_out,
560+
mode="w",
561+
backend_storage=backend_storage,
562+
zarr_format=root1.metadata.zarr_format,
621563
)
622-
623-
if out_id is not None:
624-
arr = root1["variant_id"]
625-
create_group_array(
626-
out_root,
627-
"variant_id",
628-
data=out_id,
629-
shape=out_id.shape,
630-
dtype=STRING_DTYPE_NAME,
631-
chunks=arr.chunks,
632-
compressor=get_compressor_config(arr),
633-
dimension_names=["variants"],
564+
out_root.attrs.update(root1.attrs)
565+
566+
# copy direct from vcz1
567+
vcz1_copy_vars = [
568+
var
569+
for var in root1.keys()
570+
if not var.startswith("call_") and not var == "sample_id"
571+
]
572+
logger.debug(f"Copying arrays for {', '.join(vcz1_copy_vars)}")
573+
copy_store(vcz1, vcz_out, array_keys=vcz1_copy_vars)
574+
575+
else: # len(vczs) == 2
576+
vcz2 = vczs[1]
577+
root2 = zarr.open(vcz2, mode="r")
578+
579+
if not np.all(root1["contig_id"][:] == root2["contig_id"][:]):
580+
raise ValueError("contig_id fields must be identical")
581+
582+
(
583+
out_contig,
584+
out_position,
585+
out_length,
586+
out_id,
587+
out_quality,
588+
out_filter,
589+
out_allele,
590+
) = _compute_merged_variants(root1, root2, show_progress=show_progress)
591+
592+
n_variants = out_contig.shape[0]
593+
594+
out_root = open_zarr(
595+
vcz_out,
596+
mode="w",
597+
backend_storage=backend_storage,
598+
zarr_format=root1.metadata.zarr_format,
634599
)
635-
arr = root1["variant_id_mask"]
600+
out_root.attrs.update(root1.attrs)
601+
602+
# copy direct from vcz1
603+
vcz1_copy_vars = [
604+
var
605+
for var in root1.keys()
606+
if var.startswith("contig_") or var.startswith("filter_")
607+
]
608+
logger.debug(f"Copying arrays for {', '.join(vcz1_copy_vars)}")
609+
copy_store(vcz1, vcz_out, array_keys=vcz1_copy_vars)
610+
611+
logger.debug("Creating variant arrays")
612+
arr = root1["variant_contig"]
636613
create_group_array(
637614
out_root,
638-
"variant_id_mask",
639-
data=out_id == ".",
640-
shape=out_id.shape,
615+
"variant_contig",
616+
data=out_contig,
617+
shape=out_contig.shape,
641618
dtype=arr.dtype,
642619
chunks=arr.chunks,
643620
compressor=get_compressor_config(arr),
644621
dimension_names=["variants"],
645622
)
646-
if out_quality is not None:
647-
arr = root1["variant_quality"]
623+
arr = root1["variant_position"]
648624
create_group_array(
649625
out_root,
650-
"variant_quality",
651-
data=out_quality,
652-
shape=out_quality.shape,
626+
"variant_position",
627+
data=out_position,
628+
shape=out_position.shape,
653629
dtype=arr.dtype,
654630
chunks=arr.chunks,
655631
compressor=get_compressor_config(arr),
656632
dimension_names=["variants"],
657633
)
658-
if out_filter is not None:
659-
arr = root1["variant_filter"]
634+
if out_length is not None:
635+
arr = root1["variant_length"]
636+
create_group_array(
637+
out_root,
638+
"variant_length",
639+
data=out_length,
640+
shape=out_length.shape,
641+
dtype=arr.dtype,
642+
chunks=arr.chunks,
643+
compressor=get_compressor_config(arr),
644+
dimension_names=["variants"],
645+
)
646+
647+
if out_id is not None:
648+
arr = root1["variant_id"]
649+
create_group_array(
650+
out_root,
651+
"variant_id",
652+
data=out_id,
653+
shape=out_id.shape,
654+
dtype=STRING_DTYPE_NAME,
655+
chunks=arr.chunks,
656+
compressor=get_compressor_config(arr),
657+
dimension_names=["variants"],
658+
)
659+
arr = root1["variant_id_mask"]
660+
create_group_array(
661+
out_root,
662+
"variant_id_mask",
663+
data=out_id == ".",
664+
shape=out_id.shape,
665+
dtype=arr.dtype,
666+
chunks=arr.chunks,
667+
compressor=get_compressor_config(arr),
668+
dimension_names=["variants"],
669+
)
670+
if out_quality is not None:
671+
arr = root1["variant_quality"]
672+
create_group_array(
673+
out_root,
674+
"variant_quality",
675+
data=out_quality,
676+
shape=out_quality.shape,
677+
dtype=arr.dtype,
678+
chunks=arr.chunks,
679+
compressor=get_compressor_config(arr),
680+
dimension_names=["variants"],
681+
)
682+
if out_filter is not None:
683+
arr = root1["variant_filter"]
684+
create_group_array(
685+
out_root,
686+
"variant_filter",
687+
data=out_filter,
688+
shape=out_filter.shape,
689+
dtype=arr.dtype,
690+
chunks=arr.chunks,
691+
compressor=get_compressor_config(arr),
692+
dimension_names=["variants", "filters"],
693+
)
694+
695+
arr = root1["variant_allele"]
660696
create_group_array(
661697
out_root,
662-
"variant_filter",
663-
data=out_filter,
664-
shape=out_filter.shape,
665-
dtype=arr.dtype,
698+
"variant_allele",
699+
data=out_allele,
700+
shape=out_allele.shape,
701+
dtype=STRING_DTYPE_NAME,
666702
chunks=arr.chunks,
667703
compressor=get_compressor_config(arr),
668-
dimension_names=["variants", "filters"],
704+
dimension_names=["variants", "alleles"],
669705
)
670706

671-
arr = root1["variant_allele"]
672-
create_group_array(
673-
out_root,
674-
"variant_allele",
675-
data=out_allele,
676-
shape=out_allele.shape,
677-
dtype=STRING_DTYPE_NAME,
678-
chunks=arr.chunks,
679-
compressor=get_compressor_config(arr),
680-
dimension_names=["variants", "alleles"],
681-
)
707+
if out_length is not None:
708+
indexer = VcfZarrIndexer(vcz_out)
709+
indexer.create_index()
682710

683711
logger.debug("Creating empty sample and call arrays")
684712
for var in root1.keys():
685713
if var.startswith("call_"):
686714
arr = root1[var]
687715
shape = (n_variants, 0) + arr.shape[2:]
716+
# TODO: should allow sample chunk size to be overridden/enforced here
688717
create_empty_group_array(
689718
out_root,
690719
var,
@@ -706,7 +735,3 @@ def create(vcz_out, *vczs, show_progress=False, backend_storage=None) -> None:
706735
compressor=get_compressor_config(arr),
707736
dimension_names=array_dims(arr),
708737
)
709-
710-
if out_length is not None:
711-
indexer = VcfZarrIndexer(vcz_out)
712-
indexer.create_index()

0 commit comments

Comments
 (0)