Skip to content

Commit 833f536

Browse files
committed
Fix zarr-python 3 tests
1 parent 3f955ce commit 833f536

7 files changed

Lines changed: 43 additions & 22 deletions

File tree

bio2zarr/plink.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import pandas as pd
77

88
from bio2zarr import constants, core, vcz
9+
from bio2zarr.zarr_utils import STRING_DTYPE_NAME
910

1011
logger = logging.getLogger(__name__)
1112

@@ -198,7 +199,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
198199
ref_iter = self.bim.allele_2.values[start:stop]
199200
gt_iter = self.bed_reader.iter_decode(start, stop)
200201
for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
201-
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
202+
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
202203
alleles[0] = ref
203204
alleles[1 : 1 + len(alt)] = alt
204205
phased = np.zeros(gt.shape[0], dtype=bool)
@@ -246,13 +247,13 @@ def generate_schema(
246247
),
247248
vcz.ZarrArraySpec(
248249
name="variant_allele",
249-
dtype="O",
250+
dtype=STRING_DTYPE_NAME,
250251
dimensions=["variants", "alleles"],
251252
description=None,
252253
),
253254
vcz.ZarrArraySpec(
254255
name="variant_id",
255-
dtype="O",
256+
dtype=STRING_DTYPE_NAME,
256257
dimensions=["variants"],
257258
description=None,
258259
),

bio2zarr/tskit.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55

66
from bio2zarr import constants, core, vcz
7+
from bio2zarr.zarr_utils import STRING_DTYPE_NAME
78

89
logger = logging.getLogger(__name__)
910

@@ -116,7 +117,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
116117
copy=False,
117118
):
118119
gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
119-
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
120+
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
120121
# length is the length of the REF allele unless other fields
121122
# are included.
122123
variant_length = len(variant.alleles[0])
@@ -200,7 +201,7 @@ def generate_schema(
200201
vcz.ZarrArraySpec(
201202
source=None,
202203
name="variant_allele",
203-
dtype="O",
204+
dtype=STRING_DTYPE_NAME,
204205
dimensions=["variants", "alleles"],
205206
description="Alleles for each variant",
206207
),

bio2zarr/vcf.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import numcodecs
1717
import numpy as np
1818

19+
from bio2zarr.zarr_utils import STRING_DTYPE_NAME
20+
1921
from . import constants, core, provenance, vcf_utils, vcz
2022

2123
logger = logging.getLogger(__name__)
@@ -110,7 +112,7 @@ def smallest_dtype(self):
110112
ret = "U1"
111113
else:
112114
assert self.vcf_type == "String"
113-
ret = "O"
115+
ret = STRING_DTYPE_NAME
114116
return ret
115117

116118

@@ -397,7 +399,7 @@ def sanitise_value_string_scalar(shape, value):
397399

398400
def sanitise_value_string_1d(shape, value):
399401
if value is None:
400-
return np.full(shape, ".", dtype="O")
402+
return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
401403
else:
402404
value = drop_empty_second_dim(value)
403405
result = np.full(shape, "", dtype=value.dtype)
@@ -407,9 +409,9 @@ def sanitise_value_string_1d(shape, value):
407409

408410
def sanitise_value_string_2d(shape, value):
409411
if value is None:
410-
return np.full(shape, ".", dtype="O")
412+
return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
411413
else:
412-
result = np.full(shape, "", dtype="O")
414+
result = np.full(shape, "", dtype=STRING_DTYPE_NAME)
413415
if value.ndim == 2:
414416
result[: value.shape[0], : value.shape[1]] = value
415417
else:
@@ -569,7 +571,12 @@ def transform(self, vcf_value):
569571
value = np.array(list(vcf_value.split(",")))
570572
else:
571573
# TODO can we make this faster??
572-
value = np.array([v.split(",") for v in vcf_value], dtype="O")
574+
var_len_values = [v.split(",") for v in vcf_value]
575+
number = max(len(v) for v in var_len_values)
576+
value = np.array(
577+
[v + [""] * (number - len(v)) for v in var_len_values],
578+
dtype=STRING_DTYPE_NAME,
579+
)
573580
# print("HERE", vcf_value, value)
574581
# for v in vcf_value:
575582
# print("\t", type(v), len(v), v.split(","))
@@ -1044,7 +1051,7 @@ def iter_alleles(self, start, stop, num_alleles):
10441051
ref_field.iter_values(start, stop),
10451052
alt_field.iter_values(start, stop),
10461053
):
1047-
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
1054+
alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
10481055
alleles[0] = ref[0]
10491056
alleles[1 : 1 + len(alt)] = alt
10501057
yield alleles
@@ -1163,7 +1170,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11631170
),
11641171
fixed_field_spec(
11651172
name="variant_allele",
1166-
dtype="O",
1173+
dtype=STRING_DTYPE_NAME,
11671174
dimensions=["variants", "alleles"],
11681175
),
11691176
fixed_field_spec(
@@ -1173,7 +1180,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11731180
),
11741181
fixed_field_spec(
11751182
name="variant_id",
1176-
dtype="O",
1183+
dtype=STRING_DTYPE_NAME,
11771184
),
11781185
fixed_field_spec(
11791186
name="variant_id_mask",

bio2zarr/vcz.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def variant_chunk_nbytes(self, schema):
284284
for size in self.get_shape(schema)[1:]:
285285
chunk_items *= size
286286
dt = np.dtype(self.dtype)
287-
if dt.kind == "O" and "samples" in self.dimensions:
287+
if dt.kind == zarr_utils.STRING_DTYPE_NAME and "samples" in self.dimensions:
288288
logger.warning(
289289
f"Field {self.name} is a string; max memory usage may "
290290
"be a significant underestimate"
@@ -707,13 +707,16 @@ def init_array(self, root, schema, array_spec, variants_dim_size):
707707
else schema.defaults["compressor"]
708708
)
709709
compressor = numcodecs.get_codec(compressor)
710-
if array_spec.dtype == "O":
710+
if array_spec.dtype == zarr_utils.STRING_DTYPE_NAME:
711711
if zarr_utils.zarr_v3():
712712
filters = [*list(filters), numcodecs.VLenUTF8()]
713713
else:
714714
kwargs["object_codec"] = numcodecs.VLenUTF8()
715715

716-
if not zarr_utils.zarr_v3():
716+
if zarr_utils.zarr_v3():
717+
# see https://github.com/zarr-developers/zarr-python/issues/3197
718+
kwargs["fill_value"] = None
719+
else:
717720
kwargs["dimension_separator"] = self.metadata.dimension_separator
718721

719722
shape = schema.get_shape(array_spec.dimensions)

bio2zarr/zarr_utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@ def zarr_v3() -> bool:
88
if zarr_v3():
99
# Use zarr format v2 even when running with zarr-python v3
1010
ZARR_FORMAT_KWARGS = dict(zarr_format=2)
11+
STRING_DTYPE_NAME = "T"
1112
else:
1213
ZARR_FORMAT_KWARGS = dict()
14+
STRING_DTYPE_NAME = "O"
1315

1416

1517
# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529

tests/test_icf.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from bio2zarr import provenance, vcf_utils, vcz
1010
from bio2zarr import vcf as vcf_mod
11+
from bio2zarr.zarr_utils import STRING_DTYPE_NAME
1112

1213

1314
class TestSmallExample:
@@ -227,9 +228,14 @@ def schema(self, icf):
227228
("variant_IFD", "f4", (208, 9), ("variants", "INFO_IFD_dim")),
228229
("variant_IC1", "U1", (208,), ("variants",)),
229230
("variant_IC2", "U1", (208, 2), ("variants", "INFO_IC2_dim")),
230-
("variant_IS1", "O", (208,), ("variants",)),
231-
("variant_IS2", "O", (208, 2), ("variants", "INFO_IS2_dim")),
232-
("call_FS2", "O", (208, 2, 2), ("variants", "samples", "FORMAT_FS2_dim")),
231+
("variant_IS1", STRING_DTYPE_NAME, (208,), ("variants",)),
232+
("variant_IS2", STRING_DTYPE_NAME, (208, 2), ("variants", "INFO_IS2_dim")),
233+
(
234+
"call_FS2",
235+
STRING_DTYPE_NAME,
236+
(208, 2, 2),
237+
("variants", "samples", "FORMAT_FS2_dim"),
238+
),
233239
("call_FC2", "U1", (208, 2, 2), ("variants", "samples", "FORMAT_FC2_dim")),
234240
("call_FIG", "i2", (208, 2, 6), ("variants", "samples", "genotypes")),
235241
("call_FIA", "i2", (208, 2, 2), ("variants", "samples", "alt_alleles")),

tests/test_tskit.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from bio2zarr import tskit as tsk
1313
from bio2zarr import vcf
14+
from bio2zarr.zarr_utils import STRING_DTYPE_NAME
1415

1516

1617
def test_missing_dependency():
@@ -115,7 +116,7 @@ def test_alleles(self, conversion):
115116
ts, zroot = conversion
116117
alleles = zroot["variant_allele"][:]
117118
assert alleles.shape == (3, 2)
118-
assert alleles.dtype == "O"
119+
assert alleles.dtype.kind == STRING_DTYPE_NAME
119120
nt.assert_array_equal(alleles, [["A", "TTTT"], ["CCC", "G"], ["G", "AA"]])
120121

121122
def test_variant_length(self, conversion):
@@ -146,7 +147,7 @@ def test_contig_id(self, conversion):
146147
ts, zroot = conversion
147148
contigs = zroot["contig_id"][:]
148149
assert contigs.shape == (1,)
149-
assert contigs.dtype == "O"
150+
assert contigs.dtype.kind == STRING_DTYPE_NAME
150151
nt.assert_array_equal(contigs, ["1"])
151152

152153
def test_variant_contig(self, conversion):
@@ -160,7 +161,7 @@ def test_sample_id(self, conversion):
160161
ts, zroot = conversion
161162
samples = zroot["sample_id"][:]
162163
assert samples.shape == (4,)
163-
assert samples.dtype == "O"
164+
assert samples.dtype.kind == STRING_DTYPE_NAME
164165
nt.assert_array_equal(samples, ["tsk_0", "tsk_1", "tsk_2", "tsk_3"])
165166

166167
def test_region_index(self, conversion):

0 commit comments

Comments
 (0)