@@ -525,14 +525,16 @@ def create(vcz_out, *vczs, show_progress=False, backend_storage=None) -> None:
525525 """Create a new, empty store vcz_out using merged variants from vczs
526526 using -m none semantics with stable variant ordering.
527527
528- Currently vczs must contain exactly two stores. This requirement may be lifted
529- in the future.
528+ Currently vczs must contain exactly one or two stores. This requirement may be
529+ lifted in the future.
530530
531- Both stores must have identical contig_id arrays. Output contains all variants
532- from both stores; variants at the same position whose alt sets overlap are merged
533- into a single record with combined alleles. Only fixed variant fields are
534- written; other variant fields are not included, and sample and call
535- fields are included, but with no samples.
531+ In the case of one store, the new store is the same except it contains no samples.
532+
533+ In the case of two stores, both must have identical contig_id arrays. Output
534+ contains all variants from both stores; variants at the same position whose alt
535+ sets overlap are merged into a single record with combined alleles. Only fixed
536+ variant fields are written; other variant fields are not included, and sample and
537+ call fields are included, but with no samples.
536538
537539 When two records are merged: variant_id are joined with ";", variant_quality takes
538540 the max (ignoring missing), variant_filter is the union. variant_filter requires
@@ -541,150 +543,177 @@ def create(vcz_out, *vczs, show_progress=False, backend_storage=None) -> None:
541543 them.
542544 """
543545
544- if len (vczs ) != 2 :
545- raise NotImplementedError ("A store can only be created from two VCZ files." )
546-
547- vcz1 , vcz2 = vczs
546+ if len (vczs ) not in ( 1 , 2 ) :
547+ raise NotImplementedError (
548+ "A store can only be created from one or two VCZ files."
549+ )
548550
549551 with transaction (
550552 vcz_out , backend_storage = backend_storage , message = "create" , create_repo = True
551553 ) as vcz_out :
554+ vcz1 = vczs [0 ]
552555 root1 = zarr .open (vcz1 , mode = "r" )
553- root2 = zarr .open (vcz2 , mode = "r" )
554-
555- if not np .all (root1 ["contig_id" ][:] == root2 ["contig_id" ][:]):
556- raise ValueError ("contig_id fields must be identical" )
557-
558- (
559- out_contig ,
560- out_position ,
561- out_length ,
562- out_id ,
563- out_quality ,
564- out_filter ,
565- out_allele ,
566- ) = _compute_merged_variants (root1 , root2 , show_progress = show_progress )
567-
568- n_variants = out_contig .shape [0 ]
569-
570- out_root = open_zarr (
571- vcz_out ,
572- mode = "w" ,
573- backend_storage = backend_storage ,
574- zarr_format = root1 .metadata .zarr_format ,
575- )
576- out_root .attrs .update (root1 .attrs )
577556
578- # copy direct from vcz1
579- vcz1_copy_vars = [
580- var
581- for var in root1 .keys ()
582- if var .startswith ("contig_" ) or var .startswith ("filter_" )
583- ]
584- logger .debug (f"Copying arrays for { ', ' .join (vcz1_copy_vars )} " )
585- copy_store (vcz1 , vcz_out , array_keys = vcz1_copy_vars )
586-
587- logger .debug ("Creating variant arrays" )
588- arr = root1 ["variant_contig" ]
589- create_group_array (
590- out_root ,
591- "variant_contig" ,
592- data = out_contig ,
593- shape = out_contig .shape ,
594- dtype = arr .dtype ,
595- chunks = arr .chunks ,
596- compressor = get_compressor_config (arr ),
597- dimension_names = ["variants" ],
598- )
599- arr = root1 ["variant_position" ]
600- create_group_array (
601- out_root ,
602- "variant_position" ,
603- data = out_position ,
604- shape = out_position .shape ,
605- dtype = arr .dtype ,
606- chunks = arr .chunks ,
607- compressor = get_compressor_config (arr ),
608- dimension_names = ["variants" ],
609- )
610- if out_length is not None :
611- arr = root1 ["variant_length" ]
612- create_group_array (
613- out_root ,
614- "variant_length" ,
615- data = out_length ,
616- shape = out_length .shape ,
617- dtype = arr .dtype ,
618- chunks = arr .chunks ,
619- compressor = get_compressor_config (arr ),
620- dimension_names = ["variants" ],
557+ if len (vczs ) == 1 :
558+ out_root = open_zarr (
559+ vcz_out ,
560+ mode = "w" ,
561+ backend_storage = backend_storage ,
562+ zarr_format = root1 .metadata .zarr_format ,
621563 )
622-
623- if out_id is not None :
624- arr = root1 ["variant_id" ]
625- create_group_array (
626- out_root ,
627- "variant_id" ,
628- data = out_id ,
629- shape = out_id .shape ,
630- dtype = STRING_DTYPE_NAME ,
631- chunks = arr .chunks ,
632- compressor = get_compressor_config (arr ),
633- dimension_names = ["variants" ],
564+ out_root .attrs .update (root1 .attrs )
565+
566+ # copy direct from vcz1
567+ vcz1_copy_vars = [
568+ var
569+ for var in root1 .keys ()
570+ if not var .startswith ("call_" ) and not var == "sample_id"
571+ ]
572+ logger .debug (f"Copying arrays for { ', ' .join (vcz1_copy_vars )} " )
573+ copy_store (vcz1 , vcz_out , array_keys = vcz1_copy_vars )
574+
575+ else : # len(vczs) == 2
576+ vcz2 = vczs [1 ]
577+ root2 = zarr .open (vcz2 , mode = "r" )
578+
579+ if not np .all (root1 ["contig_id" ][:] == root2 ["contig_id" ][:]):
580+ raise ValueError ("contig_id fields must be identical" )
581+
582+ (
583+ out_contig ,
584+ out_position ,
585+ out_length ,
586+ out_id ,
587+ out_quality ,
588+ out_filter ,
589+ out_allele ,
590+ ) = _compute_merged_variants (root1 , root2 , show_progress = show_progress )
591+
592+ n_variants = out_contig .shape [0 ]
593+
594+ out_root = open_zarr (
595+ vcz_out ,
596+ mode = "w" ,
597+ backend_storage = backend_storage ,
598+ zarr_format = root1 .metadata .zarr_format ,
634599 )
635- arr = root1 ["variant_id_mask" ]
600+ out_root .attrs .update (root1 .attrs )
601+
602+ # copy direct from vcz1
603+ vcz1_copy_vars = [
604+ var
605+ for var in root1 .keys ()
606+ if var .startswith ("contig_" ) or var .startswith ("filter_" )
607+ ]
608+ logger .debug (f"Copying arrays for { ', ' .join (vcz1_copy_vars )} " )
609+ copy_store (vcz1 , vcz_out , array_keys = vcz1_copy_vars )
610+
611+ logger .debug ("Creating variant arrays" )
612+ arr = root1 ["variant_contig" ]
636613 create_group_array (
637614 out_root ,
638- "variant_id_mask " ,
639- data = out_id == "." ,
640- shape = out_id .shape ,
615+ "variant_contig " ,
616+ data = out_contig ,
617+ shape = out_contig .shape ,
641618 dtype = arr .dtype ,
642619 chunks = arr .chunks ,
643620 compressor = get_compressor_config (arr ),
644621 dimension_names = ["variants" ],
645622 )
646- if out_quality is not None :
647- arr = root1 ["variant_quality" ]
623+ arr = root1 ["variant_position" ]
648624 create_group_array (
649625 out_root ,
650- "variant_quality " ,
651- data = out_quality ,
652- shape = out_quality .shape ,
626+ "variant_position " ,
627+ data = out_position ,
628+ shape = out_position .shape ,
653629 dtype = arr .dtype ,
654630 chunks = arr .chunks ,
655631 compressor = get_compressor_config (arr ),
656632 dimension_names = ["variants" ],
657633 )
658- if out_filter is not None :
659- arr = root1 ["variant_filter" ]
634+ if out_length is not None :
635+ arr = root1 ["variant_length" ]
636+ create_group_array (
637+ out_root ,
638+ "variant_length" ,
639+ data = out_length ,
640+ shape = out_length .shape ,
641+ dtype = arr .dtype ,
642+ chunks = arr .chunks ,
643+ compressor = get_compressor_config (arr ),
644+ dimension_names = ["variants" ],
645+ )
646+
647+ if out_id is not None :
648+ arr = root1 ["variant_id" ]
649+ create_group_array (
650+ out_root ,
651+ "variant_id" ,
652+ data = out_id ,
653+ shape = out_id .shape ,
654+ dtype = STRING_DTYPE_NAME ,
655+ chunks = arr .chunks ,
656+ compressor = get_compressor_config (arr ),
657+ dimension_names = ["variants" ],
658+ )
659+ arr = root1 ["variant_id_mask" ]
660+ create_group_array (
661+ out_root ,
662+ "variant_id_mask" ,
663+ data = out_id == "." ,
664+ shape = out_id .shape ,
665+ dtype = arr .dtype ,
666+ chunks = arr .chunks ,
667+ compressor = get_compressor_config (arr ),
668+ dimension_names = ["variants" ],
669+ )
670+ if out_quality is not None :
671+ arr = root1 ["variant_quality" ]
672+ create_group_array (
673+ out_root ,
674+ "variant_quality" ,
675+ data = out_quality ,
676+ shape = out_quality .shape ,
677+ dtype = arr .dtype ,
678+ chunks = arr .chunks ,
679+ compressor = get_compressor_config (arr ),
680+ dimension_names = ["variants" ],
681+ )
682+ if out_filter is not None :
683+ arr = root1 ["variant_filter" ]
684+ create_group_array (
685+ out_root ,
686+ "variant_filter" ,
687+ data = out_filter ,
688+ shape = out_filter .shape ,
689+ dtype = arr .dtype ,
690+ chunks = arr .chunks ,
691+ compressor = get_compressor_config (arr ),
692+ dimension_names = ["variants" , "filters" ],
693+ )
694+
695+ arr = root1 ["variant_allele" ]
660696 create_group_array (
661697 out_root ,
662- "variant_filter " ,
663- data = out_filter ,
664- shape = out_filter .shape ,
665- dtype = arr . dtype ,
698+ "variant_allele " ,
699+ data = out_allele ,
700+ shape = out_allele .shape ,
701+ dtype = STRING_DTYPE_NAME ,
666702 chunks = arr .chunks ,
667703 compressor = get_compressor_config (arr ),
668- dimension_names = ["variants" , "filters " ],
704+ dimension_names = ["variants" , "alleles " ],
669705 )
670706
671- arr = root1 ["variant_allele" ]
672- create_group_array (
673- out_root ,
674- "variant_allele" ,
675- data = out_allele ,
676- shape = out_allele .shape ,
677- dtype = STRING_DTYPE_NAME ,
678- chunks = arr .chunks ,
679- compressor = get_compressor_config (arr ),
680- dimension_names = ["variants" , "alleles" ],
681- )
707+ if out_length is not None :
708+ indexer = VcfZarrIndexer (vcz_out )
709+ indexer .create_index ()
682710
683711 logger .debug ("Creating empty sample and call arrays" )
684712 for var in root1 .keys ():
685713 if var .startswith ("call_" ):
686714 arr = root1 [var ]
687715 shape = (n_variants , 0 ) + arr .shape [2 :]
716+ # TODO: should allow sample chunk size to be overridden/enforced here
688717 create_empty_group_array (
689718 out_root ,
690719 var ,
@@ -706,7 +735,3 @@ def create(vcz_out, *vczs, show_progress=False, backend_storage=None) -> None:
706735 compressor = get_compressor_config (arr ),
707736 dimension_names = array_dims (arr ),
708737 )
709-
710- if out_length is not None :
711- indexer = VcfZarrIndexer (vcz_out )
712- indexer .create_index ()
0 commit comments