Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions bio2zarr/plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd

from bio2zarr import constants, core, vcz
from bio2zarr.zarr_utils import STRING_DTYPE_NAME

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -198,7 +199,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
ref_iter = self.bim.allele_2.values[start:stop]
gt_iter = self.bed_reader.iter_decode(start, stop)
for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
alleles[0] = ref
alleles[1 : 1 + len(alt)] = alt
phased = np.zeros(gt.shape[0], dtype=bool)
Expand Down Expand Up @@ -246,13 +247,13 @@ def generate_schema(
),
vcz.ZarrArraySpec(
name="variant_allele",
dtype="O",
dtype=STRING_DTYPE_NAME,
dimensions=["variants", "alleles"],
description=None,
),
vcz.ZarrArraySpec(
name="variant_id",
dtype="O",
dtype=STRING_DTYPE_NAME,
dimensions=["variants"],
description=None,
),
Expand Down
5 changes: 3 additions & 2 deletions bio2zarr/tskit.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np

from bio2zarr import constants, core, vcz
from bio2zarr.zarr_utils import STRING_DTYPE_NAME

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -116,7 +117,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
copy=False,
):
gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
# length is the length of the REF allele unless other fields
# are included.
variant_length = len(variant.alleles[0])
Expand Down Expand Up @@ -200,7 +201,7 @@ def generate_schema(
vcz.ZarrArraySpec(
source=None,
name="variant_allele",
dtype="O",
dtype=STRING_DTYPE_NAME,
dimensions=["variants", "alleles"],
description="Alleles for each variant",
),
Expand Down
23 changes: 15 additions & 8 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import numcodecs
import numpy as np

from bio2zarr.zarr_utils import STRING_DTYPE_NAME

from . import constants, core, provenance, vcf_utils, vcz

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -110,7 +112,7 @@ def smallest_dtype(self):
ret = "U1"
else:
assert self.vcf_type == "String"
ret = "O"
ret = STRING_DTYPE_NAME
return ret


Expand Down Expand Up @@ -397,7 +399,7 @@ def sanitise_value_string_scalar(shape, value):

def sanitise_value_string_1d(shape, value):
if value is None:
return np.full(shape, ".", dtype="O")
return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
else:
value = drop_empty_second_dim(value)
result = np.full(shape, "", dtype=value.dtype)
Expand All @@ -407,9 +409,9 @@ def sanitise_value_string_1d(shape, value):

def sanitise_value_string_2d(shape, value):
if value is None:
return np.full(shape, ".", dtype="O")
return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
else:
result = np.full(shape, "", dtype="O")
result = np.full(shape, "", dtype=STRING_DTYPE_NAME)
if value.ndim == 2:
result[: value.shape[0], : value.shape[1]] = value
else:
Expand Down Expand Up @@ -569,7 +571,12 @@ def transform(self, vcf_value):
value = np.array(list(vcf_value.split(",")))
else:
# TODO can we make this faster??
value = np.array([v.split(",") for v in vcf_value], dtype="O")
var_len_values = [v.split(",") for v in vcf_value]
number = max(len(v) for v in var_len_values)
value = np.array(
[v + [""] * (number - len(v)) for v in var_len_values],
dtype=STRING_DTYPE_NAME,
)
# print("HERE", vcf_value, value)
# for v in vcf_value:
# print("\t", type(v), len(v), v.split(","))
Expand Down Expand Up @@ -1044,7 +1051,7 @@ def iter_alleles(self, start, stop, num_alleles):
ref_field.iter_values(start, stop),
alt_field.iter_values(start, stop),
):
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
alleles[0] = ref[0]
alleles[1 : 1 + len(alt)] = alt
yield alleles
Expand Down Expand Up @@ -1165,7 +1172,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
),
fixed_field_spec(
name="variant_allele",
dtype="O",
dtype=STRING_DTYPE_NAME,
dimensions=["variants", "alleles"],
),
fixed_field_spec(
Expand All @@ -1175,7 +1182,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
),
fixed_field_spec(
name="variant_id",
dtype="O",
dtype=STRING_DTYPE_NAME,
),
fixed_field_spec(
name="variant_id_mask",
Expand Down
9 changes: 6 additions & 3 deletions bio2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def variant_chunk_nbytes(self, schema):
for size in self.get_shape(schema)[1:]:
chunk_items *= size
dt = np.dtype(self.dtype)
if dt.kind == "O" and "samples" in self.dimensions:
if dt.kind == zarr_utils.STRING_DTYPE_NAME and "samples" in self.dimensions:
logger.warning(
f"Field {self.name} is a string; max memory usage may "
"be a significant underestimate"
Expand Down Expand Up @@ -707,13 +707,16 @@ def init_array(self, root, schema, array_spec, variants_dim_size):
else schema.defaults["compressor"]
)
compressor = numcodecs.get_codec(compressor)
if array_spec.dtype == "O":
if array_spec.dtype == zarr_utils.STRING_DTYPE_NAME:
if zarr_utils.zarr_v3():
filters = [*list(filters), numcodecs.VLenUTF8()]
else:
kwargs["object_codec"] = numcodecs.VLenUTF8()

if not zarr_utils.zarr_v3():
if zarr_utils.zarr_v3():
# see https://github.com/zarr-developers/zarr-python/issues/3197
kwargs["fill_value"] = None
else:
kwargs["dimension_separator"] = self.metadata.dimension_separator

shape = schema.get_shape(array_spec.dimensions)
Expand Down
2 changes: 2 additions & 0 deletions bio2zarr/zarr_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ def zarr_v3() -> bool:
if zarr_v3():
# Use zarr format v2 even when running with zarr-python v3
ZARR_FORMAT_KWARGS = dict(zarr_format=2)
STRING_DTYPE_NAME = "T"
else:
ZARR_FORMAT_KWARGS = dict()
STRING_DTYPE_NAME = "O"


# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
Expand Down
12 changes: 9 additions & 3 deletions tests/test_icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from bio2zarr import provenance, vcf_utils, vcz
from bio2zarr import vcf as vcf_mod
from bio2zarr.zarr_utils import STRING_DTYPE_NAME


class TestSmallExample:
Expand Down Expand Up @@ -227,9 +228,14 @@ def schema(self, icf):
("variant_IFD", "f4", (208, 9), ("variants", "INFO_IFD_dim")),
("variant_IC1", "U1", (208,), ("variants",)),
("variant_IC2", "U1", (208, 2), ("variants", "INFO_IC2_dim")),
("variant_IS1", "O", (208,), ("variants",)),
("variant_IS2", "O", (208, 2), ("variants", "INFO_IS2_dim")),
("call_FS2", "O", (208, 2, 2), ("variants", "samples", "FORMAT_FS2_dim")),
("variant_IS1", STRING_DTYPE_NAME, (208,), ("variants",)),
("variant_IS2", STRING_DTYPE_NAME, (208, 2), ("variants", "INFO_IS2_dim")),
(
"call_FS2",
STRING_DTYPE_NAME,
(208, 2, 2),
("variants", "samples", "FORMAT_FS2_dim"),
),
("call_FC2", "U1", (208, 2, 2), ("variants", "samples", "FORMAT_FC2_dim")),
("call_FIG", "i2", (208, 2, 6), ("variants", "samples", "genotypes")),
("call_FIA", "i2", (208, 2, 2), ("variants", "samples", "alt_alleles")),
Expand Down
7 changes: 4 additions & 3 deletions tests/test_tskit.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from bio2zarr import tskit as tsk
from bio2zarr import vcf
from bio2zarr.zarr_utils import STRING_DTYPE_NAME


def test_missing_dependency():
Expand Down Expand Up @@ -115,7 +116,7 @@ def test_alleles(self, conversion):
ts, zroot = conversion
alleles = zroot["variant_allele"][:]
assert alleles.shape == (3, 2)
assert alleles.dtype == "O"
assert alleles.dtype.kind == STRING_DTYPE_NAME
nt.assert_array_equal(alleles, [["A", "TTTT"], ["CCC", "G"], ["G", "AA"]])

def test_variant_length(self, conversion):
Expand Down Expand Up @@ -146,7 +147,7 @@ def test_contig_id(self, conversion):
ts, zroot = conversion
contigs = zroot["contig_id"][:]
assert contigs.shape == (1,)
assert contigs.dtype == "O"
assert contigs.dtype.kind == STRING_DTYPE_NAME
nt.assert_array_equal(contigs, ["1"])

def test_variant_contig(self, conversion):
Expand All @@ -160,7 +161,7 @@ def test_sample_id(self, conversion):
ts, zroot = conversion
samples = zroot["sample_id"][:]
assert samples.shape == (4,)
assert samples.dtype == "O"
assert samples.dtype.kind == STRING_DTYPE_NAME
nt.assert_array_equal(samples, ["tsk_0", "tsk_1", "tsk_2", "tsk_3"])

def test_region_index(self, conversion):
Expand Down
5 changes: 5 additions & 0 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from bio2zarr import constants, provenance, vcz_verification
from bio2zarr import vcf as vcf_mod
from bio2zarr.zarr_utils import zarr_v3


def assert_dataset_equal(ds1, ds2, drop_vars=None):
Expand Down Expand Up @@ -275,6 +276,10 @@ def test_no_genotypes(self, ds, tmp_path):
if field_name != "sample_id" and not field_name.startswith("call_"):
xt.assert_equal(ds[field_name], ds2[field_name])

@pytest.mark.skipif(
zarr_v3(),
reason="Zarr chunks ignored when reading arrays with StringDType: https://github.com/pydata/xarray/issues/11054",
)
@pytest.mark.parametrize(
("variants_chunk_size", "samples_chunk_size", "y_chunks", "x_chunks"),
[
Expand Down
14 changes: 10 additions & 4 deletions tests/test_vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
from bio2zarr import vcf as vcf_mod
from bio2zarr.zarr_utils import zarr_v3

# In zarr-python v2 strings are stored as object arrays (O) with itemsize 8
# In zarr-python v3 strings are stored as string arrays (T) with itemsize 16
STRING_ITEMSIZE = 16 if zarr_v3() else 8


@pytest.fixture(scope="module")
def vcf_file():
Expand Down Expand Up @@ -82,7 +86,10 @@ def test_not_enough_memory(self, tmp_path, icf_path, max_memory):
with pytest.raises(ValueError, match="Insufficient memory"):
vcf_mod.encode(icf_path, zarr_path, max_memory=max_memory)

@pytest.mark.parametrize("max_memory", ["315KiB", "500KiB"])
# zarr-python v3 string use more memory
@pytest.mark.parametrize(
"max_memory", ["630KiB", "1000KiB"] if zarr_v3() else ["315KiB", "500KiB"]
)
def test_not_enough_memory_for_two(
self, tmp_path, icf_path, zarr_path, caplog, max_memory
):
Expand Down Expand Up @@ -292,9 +299,8 @@ class TestChunkNbytes:
("variant_position", 36), # 9 * 4
("variant_H2", 9),
("variant_AC", 18), # 9 * 2
# Object fields have an itemsize of 8
("variant_AA", 72), # 9 * 8
("variant_allele", 9 * 4 * 8),
("variant_AA", 9 * STRING_ITEMSIZE),
("variant_allele", 9 * 4 * STRING_ITEMSIZE),
],
)
def test_example_schema(self, schema, field, value):
Expand Down