diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 6f59adf..019a3c3 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -6,6 +6,7 @@ import pandas as pd from bio2zarr import constants, core, vcz +from bio2zarr.zarr_utils import STRING_DTYPE_NAME logger = logging.getLogger(__name__) @@ -198,7 +199,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): ref_iter = self.bim.allele_2.values[start:stop] gt_iter = self.bed_reader.iter_decode(start, stop) for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter): - alleles = np.full(num_alleles, constants.STR_FILL, dtype="O") + alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME) alleles[0] = ref alleles[1 : 1 + len(alt)] = alt phased = np.zeros(gt.shape[0], dtype=bool) @@ -246,13 +247,13 @@ def generate_schema( ), vcz.ZarrArraySpec( name="variant_allele", - dtype="O", + dtype=STRING_DTYPE_NAME, dimensions=["variants", "alleles"], description=None, ), vcz.ZarrArraySpec( name="variant_id", - dtype="O", + dtype=STRING_DTYPE_NAME, dimensions=["variants"], description=None, ), diff --git a/bio2zarr/tskit.py b/bio2zarr/tskit.py index 7a63277..02da5ff 100644 --- a/bio2zarr/tskit.py +++ b/bio2zarr/tskit.py @@ -4,6 +4,7 @@ import numpy as np from bio2zarr import constants, core, vcz +from bio2zarr.zarr_utils import STRING_DTYPE_NAME logger = logging.getLogger(__name__) @@ -116,7 +117,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles): copy=False, ): gt = np.full(shape, constants.INT_FILL, dtype=np.int8) - alleles = np.full(num_alleles, constants.STR_FILL, dtype="O") + alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME) # length is the length of the REF allele unless other fields # are included. variant_length = len(variant.alleles[0]) @@ -200,7 +201,7 @@ def generate_schema( vcz.ZarrArraySpec( source=None, name="variant_allele", - dtype="O", + dtype=STRING_DTYPE_NAME, dimensions=["variants", "alleles"], description="Alleles for each variant", ), diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index 9665325..8d4d0e1 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -16,6 +16,8 @@ import numcodecs import numpy as np +from bio2zarr.zarr_utils import STRING_DTYPE_NAME + from . import constants, core, provenance, vcf_utils, vcz logger = logging.getLogger(__name__) @@ -110,7 +112,7 @@ def smallest_dtype(self): ret = "U1" else: assert self.vcf_type == "String" - ret = "O" + ret = STRING_DTYPE_NAME return ret @@ -397,7 +399,7 @@ def sanitise_value_string_scalar(shape, value): def sanitise_value_string_1d(shape, value): if value is None: - return np.full(shape, ".", dtype="O") + return np.full(shape, ".", dtype=STRING_DTYPE_NAME) else: value = drop_empty_second_dim(value) result = np.full(shape, "", dtype=value.dtype) @@ -407,9 +409,9 @@ def sanitise_value_string_1d(shape, value): def sanitise_value_string_2d(shape, value): if value is None: - return np.full(shape, ".", dtype="O") + return np.full(shape, ".", dtype=STRING_DTYPE_NAME) else: - result = np.full(shape, "", dtype="O") + result = np.full(shape, "", dtype=STRING_DTYPE_NAME) if value.ndim == 2: result[: value.shape[0], : value.shape[1]] = value else: @@ -569,7 +571,12 @@ def transform(self, vcf_value): value = np.array(list(vcf_value.split(","))) else: # TODO can we make this faster?? - value = np.array([v.split(",") for v in vcf_value], dtype="O") + var_len_values = [v.split(",") for v in vcf_value] + number = max(len(v) for v in var_len_values) + value = np.array( + [v + [""] * (number - len(v)) for v in var_len_values], + dtype=STRING_DTYPE_NAME, + ) # print("HERE", vcf_value, value) # for v in vcf_value: # print("\t", type(v), len(v), v.split(",")) @@ -1044,7 +1051,7 @@ def iter_alleles(self, start, stop, num_alleles): ref_field.iter_values(start, stop), alt_field.iter_values(start, stop), ): - alleles = np.full(num_alleles, constants.STR_FILL, dtype="O") + alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME) alleles[0] = ref[0] alleles[1 : 1 + len(alt)] = alt yield alleles @@ -1165,7 +1172,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)): ), fixed_field_spec( name="variant_allele", - dtype="O", + dtype=STRING_DTYPE_NAME, dimensions=["variants", "alleles"], ), fixed_field_spec( @@ -1175,7 +1182,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)): ), fixed_field_spec( name="variant_id", - dtype="O", + dtype=STRING_DTYPE_NAME, ), fixed_field_spec( name="variant_id_mask", diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 45864ac..3351ecf 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -284,7 +284,7 @@ def variant_chunk_nbytes(self, schema): for size in self.get_shape(schema)[1:]: chunk_items *= size dt = np.dtype(self.dtype) - if dt.kind == "O" and "samples" in self.dimensions: + if dt.kind == zarr_utils.STRING_DTYPE_NAME and "samples" in self.dimensions: logger.warning( f"Field {self.name} is a string; max memory usage may " "be a significant underestimate" @@ -707,13 +707,16 @@ def init_array(self, root, schema, array_spec, variants_dim_size): else schema.defaults["compressor"] ) compressor = numcodecs.get_codec(compressor) - if array_spec.dtype == "O": + if array_spec.dtype == zarr_utils.STRING_DTYPE_NAME: if zarr_utils.zarr_v3(): filters = [*list(filters), numcodecs.VLenUTF8()] else: kwargs["object_codec"] = numcodecs.VLenUTF8() - if not zarr_utils.zarr_v3(): + if zarr_utils.zarr_v3(): + # see https://github.com/zarr-developers/zarr-python/issues/3197 + kwargs["fill_value"] = None + else: kwargs["dimension_separator"] = self.metadata.dimension_separator shape = schema.get_shape(array_spec.dimensions) diff --git a/bio2zarr/zarr_utils.py b/bio2zarr/zarr_utils.py index d99b71a..ad87071 100644 --- a/bio2zarr/zarr_utils.py +++ b/bio2zarr/zarr_utils.py @@ -8,8 +8,10 @@ def zarr_v3() -> bool: if zarr_v3(): # Use zarr format v2 even when running with zarr-python v3 ZARR_FORMAT_KWARGS = dict(zarr_format=2) + STRING_DTYPE_NAME = "T" else: ZARR_FORMAT_KWARGS = dict() + STRING_DTYPE_NAME = "O" # See discussion in https://github.com/zarr-developers/zarr-python/issues/2529 diff --git a/tests/test_icf.py b/tests/test_icf.py index a08064a..666d9ab 100644 --- a/tests/test_icf.py +++ b/tests/test_icf.py @@ -8,6 +8,7 @@ from bio2zarr import provenance, vcf_utils, vcz from bio2zarr import vcf as vcf_mod +from bio2zarr.zarr_utils import STRING_DTYPE_NAME class TestSmallExample: @@ -227,9 +228,14 @@ def schema(self, icf): ("variant_IFD", "f4", (208, 9), ("variants", "INFO_IFD_dim")), ("variant_IC1", "U1", (208,), ("variants",)), ("variant_IC2", "U1", (208, 2), ("variants", "INFO_IC2_dim")), - ("variant_IS1", "O", (208,), ("variants",)), - ("variant_IS2", "O", (208, 2), ("variants", "INFO_IS2_dim")), - ("call_FS2", "O", (208, 2, 2), ("variants", "samples", "FORMAT_FS2_dim")), + ("variant_IS1", STRING_DTYPE_NAME, (208,), ("variants",)), + ("variant_IS2", STRING_DTYPE_NAME, (208, 2), ("variants", "INFO_IS2_dim")), + ( + "call_FS2", + STRING_DTYPE_NAME, + (208, 2, 2), + ("variants", "samples", "FORMAT_FS2_dim"), + ), ("call_FC2", "U1", (208, 2, 2), ("variants", "samples", "FORMAT_FC2_dim")), ("call_FIG", "i2", (208, 2, 6), ("variants", "samples", "genotypes")), ("call_FIA", "i2", (208, 2, 2), ("variants", "samples", "alt_alleles")), diff --git a/tests/test_tskit.py b/tests/test_tskit.py index 81e28f6..864ceb9 100644 --- a/tests/test_tskit.py +++ b/tests/test_tskit.py @@ -11,6 +11,7 @@ from bio2zarr import tskit as tsk from bio2zarr import vcf +from bio2zarr.zarr_utils import STRING_DTYPE_NAME def test_missing_dependency(): @@ -115,7 +116,7 @@ def test_alleles(self, conversion): ts, zroot = conversion alleles = zroot["variant_allele"][:] assert alleles.shape == (3, 2) - assert alleles.dtype == "O" + assert alleles.dtype.kind == STRING_DTYPE_NAME nt.assert_array_equal(alleles, [["A", "TTTT"], ["CCC", "G"], ["G", "AA"]]) def test_variant_length(self, conversion): @@ -146,7 +147,7 @@ def test_contig_id(self, conversion): ts, zroot = conversion contigs = zroot["contig_id"][:] assert contigs.shape == (1,) - assert contigs.dtype == "O" + assert contigs.dtype.kind == STRING_DTYPE_NAME nt.assert_array_equal(contigs, ["1"]) def test_variant_contig(self, conversion): @@ -160,7 +161,7 @@ def test_sample_id(self, conversion): ts, zroot = conversion samples = zroot["sample_id"][:] assert samples.shape == (4,) - assert samples.dtype == "O" + assert samples.dtype.kind == STRING_DTYPE_NAME nt.assert_array_equal(samples, ["tsk_0", "tsk_1", "tsk_2", "tsk_3"]) def test_region_index(self, conversion): diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index 3698f9e..9cdcef4 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -11,6 +11,7 @@ from bio2zarr import constants, provenance, vcz_verification from bio2zarr import vcf as vcf_mod +from bio2zarr.zarr_utils import zarr_v3 def assert_dataset_equal(ds1, ds2, drop_vars=None): @@ -275,6 +276,10 @@ def test_no_genotypes(self, ds, tmp_path): if field_name != "sample_id" and not field_name.startswith("call_"): xt.assert_equal(ds[field_name], ds2[field_name]) + @pytest.mark.skipif( + zarr_v3(), + reason="Zarr chunks ignored when reading arrays with StringDType: https://github.com/pydata/xarray/issues/11054", + ) @pytest.mark.parametrize( ("variants_chunk_size", "samples_chunk_size", "y_chunks", "x_chunks"), [ diff --git a/tests/test_vcz.py b/tests/test_vcz.py index 4f60e6e..5483a40 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -13,6 +13,10 @@ from bio2zarr import vcf as vcf_mod from bio2zarr.zarr_utils import zarr_v3 +# In zarr-python v2 strings are stored as object arrays (O) with itemsize 8 +# In zarr-python v3 strings are stored as string arrays (T) with itemsize 16 +STRING_ITEMSIZE = 16 if zarr_v3() else 8 + @pytest.fixture(scope="module") def vcf_file(): @@ -82,7 +86,10 @@ def test_not_enough_memory(self, tmp_path, icf_path, max_memory): with pytest.raises(ValueError, match="Insufficient memory"): vcf_mod.encode(icf_path, zarr_path, max_memory=max_memory) - @pytest.mark.parametrize("max_memory", ["315KiB", "500KiB"]) + # zarr-python v3 string use more memory + @pytest.mark.parametrize( + "max_memory", ["630KiB", "1000KiB"] if zarr_v3() else ["315KiB", "500KiB"] + ) def test_not_enough_memory_for_two( self, tmp_path, icf_path, zarr_path, caplog, max_memory ): @@ -292,9 +299,8 @@ class TestChunkNbytes: ("variant_position", 36), # 9 * 4 ("variant_H2", 9), ("variant_AC", 18), # 9 * 2 - # Object fields have an itemsize of 8 - ("variant_AA", 72), # 9 * 8 - ("variant_allele", 9 * 4 * 8), + ("variant_AA", 9 * STRING_ITEMSIZE), + ("variant_allele", 9 * 4 * STRING_ITEMSIZE), ], ) def test_example_schema(self, schema, field, value):