1616import numcodecs
1717import numpy as np
1818
19+ from bio2zarr .zarr_utils import STRING_DTYPE_NAME
20+
1921from . import constants , core , provenance , vcf_utils , vcz
2022
2123logger = logging .getLogger (__name__ )
@@ -110,7 +112,7 @@ def smallest_dtype(self):
110112 ret = "U1"
111113 else :
112114 assert self .vcf_type == "String"
113- ret = "O"
115+ ret = STRING_DTYPE_NAME
114116 return ret
115117
116118
@@ -397,7 +399,7 @@ def sanitise_value_string_scalar(shape, value):
397399
398400def sanitise_value_string_1d (shape , value ):
399401 if value is None :
400- return np .full (shape , "." , dtype = "O" )
402+ return np .full (shape , "." , dtype = STRING_DTYPE_NAME )
401403 else :
402404 value = drop_empty_second_dim (value )
403405 result = np .full (shape , "" , dtype = value .dtype )
@@ -407,9 +409,9 @@ def sanitise_value_string_1d(shape, value):
407409
408410def sanitise_value_string_2d (shape , value ):
409411 if value is None :
410- return np .full (shape , "." , dtype = "O" )
412+ return np .full (shape , "." , dtype = STRING_DTYPE_NAME )
411413 else :
412- result = np .full (shape , "" , dtype = "O" )
414+ result = np .full (shape , "" , dtype = STRING_DTYPE_NAME )
413415 if value .ndim == 2 :
414416 result [: value .shape [0 ], : value .shape [1 ]] = value
415417 else :
@@ -569,7 +571,12 @@ def transform(self, vcf_value):
569571 value = np .array (list (vcf_value .split ("," )))
570572 else :
571573 # TODO can we make this faster??
572- value = np .array ([v .split ("," ) for v in vcf_value ], dtype = "O" )
574+ var_len_values = [v .split ("," ) for v in vcf_value ]
575+ number = max (len (v ) for v in var_len_values )
576+ value = np .array (
577+ [v + ["" ] * (number - len (v )) for v in var_len_values ],
578+ dtype = STRING_DTYPE_NAME ,
579+ )
573580 # print("HERE", vcf_value, value)
574581 # for v in vcf_value:
575582 # print("\t", type(v), len(v), v.split(","))
@@ -1044,7 +1051,7 @@ def iter_alleles(self, start, stop, num_alleles):
10441051 ref_field .iter_values (start , stop ),
10451052 alt_field .iter_values (start , stop ),
10461053 ):
1047- alleles = np .full (num_alleles , constants .STR_FILL , dtype = "O" )
1054+ alleles = np .full (num_alleles , constants .STR_FILL , dtype = STRING_DTYPE_NAME )
10481055 alleles [0 ] = ref [0 ]
10491056 alleles [1 : 1 + len (alt )] = alt
10501057 yield alleles
@@ -1163,7 +1170,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11631170 ),
11641171 fixed_field_spec (
11651172 name = "variant_allele" ,
1166- dtype = "O" ,
1173+ dtype = STRING_DTYPE_NAME ,
11671174 dimensions = ["variants" , "alleles" ],
11681175 ),
11691176 fixed_field_spec (
@@ -1173,7 +1180,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11731180 ),
11741181 fixed_field_spec (
11751182 name = "variant_id" ,
1176- dtype = "O" ,
1183+ dtype = STRING_DTYPE_NAME ,
11771184 ),
11781185 fixed_field_spec (
11791186 name = "variant_id_mask" ,
0 commit comments