sgkit-dev · benjeffery · Jan 6, 2026 · Dec 22, 2025 · Jan 6, 2026 · Jan 6, 2026
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -6,6 +6,7 @@
 import pandas as pd
 
 from bio2zarr import constants, core, vcz
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME
 
 logger = logging.getLogger(__name__)
 
@@ -198,7 +199,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
         ref_iter = self.bim.allele_2.values[start:stop]
         gt_iter = self.bed_reader.iter_decode(start, stop)
         for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
-            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
             alleles[0] = ref
             alleles[1 : 1 + len(alt)] = alt
             phased = np.zeros(gt.shape[0], dtype=bool)
@@ -246,13 +247,13 @@ def generate_schema(
             ),
             vcz.ZarrArraySpec(
                 name="variant_allele",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
                 dimensions=["variants", "alleles"],
                 description=None,
             ),
             vcz.ZarrArraySpec(
                 name="variant_id",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
                 dimensions=["variants"],
                 description=None,
             ),

diff --git a/bio2zarr/tskit.py b/bio2zarr/tskit.py
@@ -4,6 +4,7 @@
 import numpy as np
 
 from bio2zarr import constants, core, vcz
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME
 
 logger = logging.getLogger(__name__)
 
@@ -116,7 +117,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
             copy=False,
         ):
             gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
-            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
             # length is the length of the REF allele unless other fields
             # are included.
             variant_length = len(variant.alleles[0])
@@ -200,7 +201,7 @@ def generate_schema(
             vcz.ZarrArraySpec(
                 source=None,
                 name="variant_allele",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
                 dimensions=["variants", "alleles"],
                 description="Alleles for each variant",
             ),

diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -16,6 +16,8 @@
 import numcodecs
 import numpy as np
 
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME
+
 from . import constants, core, provenance, vcf_utils, vcz
 
 logger = logging.getLogger(__name__)
@@ -110,7 +112,7 @@ def smallest_dtype(self):
             ret = "U1"
         else:
             assert self.vcf_type == "String"
-            ret = "O"
+            ret = STRING_DTYPE_NAME
         return ret
 
 
@@ -397,7 +399,7 @@ def sanitise_value_string_scalar(shape, value):
 
 def sanitise_value_string_1d(shape, value):
     if value is None:
-        return np.full(shape, ".", dtype="O")
+        return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
     else:
         value = drop_empty_second_dim(value)
         result = np.full(shape, "", dtype=value.dtype)
@@ -407,9 +409,9 @@ def sanitise_value_string_1d(shape, value):
 
 def sanitise_value_string_2d(shape, value):
     if value is None:
-        return np.full(shape, ".", dtype="O")
+        return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
     else:
-        result = np.full(shape, "", dtype="O")
+        result = np.full(shape, "", dtype=STRING_DTYPE_NAME)
         if value.ndim == 2:
             result[: value.shape[0], : value.shape[1]] = value
         else:
@@ -569,7 +571,12 @@ def transform(self, vcf_value):
             value = np.array(list(vcf_value.split(",")))
         else:
             # TODO can we make this faster??
-            value = np.array([v.split(",") for v in vcf_value], dtype="O")
+            var_len_values = [v.split(",") for v in vcf_value]
+            number = max(len(v) for v in var_len_values)
+            value = np.array(
+                [v + [""] * (number - len(v)) for v in var_len_values],
+                dtype=STRING_DTYPE_NAME,
+            )
             # print("HERE", vcf_value, value)
             # for v in vcf_value:
             #     print("\t", type(v), len(v), v.split(","))
@@ -1044,7 +1051,7 @@ def iter_alleles(self, start, stop, num_alleles):
             ref_field.iter_values(start, stop),
             alt_field.iter_values(start, stop),
         ):
-            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
             alleles[0] = ref[0]
             alleles[1 : 1 + len(alt)] = alt
             yield alleles
@@ -1165,7 +1172,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
             ),
             fixed_field_spec(
                 name="variant_allele",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
                 dimensions=["variants", "alleles"],
             ),
             fixed_field_spec(
@@ -1175,7 +1182,7 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
             ),
             fixed_field_spec(
                 name="variant_id",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
             ),
             fixed_field_spec(
                 name="variant_id_mask",

diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py
@@ -284,7 +284,7 @@ def variant_chunk_nbytes(self, schema):
         for size in self.get_shape(schema)[1:]:
             chunk_items *= size
         dt = np.dtype(self.dtype)
-        if dt.kind == "O" and "samples" in self.dimensions:
+        if dt.kind == zarr_utils.STRING_DTYPE_NAME and "samples" in self.dimensions:
             logger.warning(
                 f"Field {self.name} is a string; max memory usage may "
                 "be a significant underestimate"
@@ -707,13 +707,16 @@ def init_array(self, root, schema, array_spec, variants_dim_size):
             else schema.defaults["compressor"]
         )
         compressor = numcodecs.get_codec(compressor)
-        if array_spec.dtype == "O":
+        if array_spec.dtype == zarr_utils.STRING_DTYPE_NAME:
             if zarr_utils.zarr_v3():
                 filters = [*list(filters), numcodecs.VLenUTF8()]
             else:
                 kwargs["object_codec"] = numcodecs.VLenUTF8()
 
-        if not zarr_utils.zarr_v3():
+        if zarr_utils.zarr_v3():
+            # see https://github.com/zarr-developers/zarr-python/issues/3197
+            kwargs["fill_value"] = None
+        else:
             kwargs["dimension_separator"] = self.metadata.dimension_separator
 
         shape = schema.get_shape(array_spec.dimensions)

diff --git a/bio2zarr/zarr_utils.py b/bio2zarr/zarr_utils.py
@@ -8,8 +8,10 @@ def zarr_v3() -> bool:
 if zarr_v3():
     # Use zarr format v2 even when running with zarr-python v3
     ZARR_FORMAT_KWARGS = dict(zarr_format=2)
+    STRING_DTYPE_NAME = "T"
 else:
     ZARR_FORMAT_KWARGS = dict()
+    STRING_DTYPE_NAME = "O"
 
 
 # See discussion in https://github.com/zarr-developers/zarr-python/issues/2529

diff --git a/tests/test_icf.py b/tests/test_icf.py
@@ -8,6 +8,7 @@
 
 from bio2zarr import provenance, vcf_utils, vcz
 from bio2zarr import vcf as vcf_mod
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME
 
 
 class TestSmallExample:
@@ -227,9 +228,14 @@ def schema(self, icf):
             ("variant_IFD", "f4", (208, 9), ("variants", "INFO_IFD_dim")),
             ("variant_IC1", "U1", (208,), ("variants",)),
             ("variant_IC2", "U1", (208, 2), ("variants", "INFO_IC2_dim")),
-            ("variant_IS1", "O", (208,), ("variants",)),
-            ("variant_IS2", "O", (208, 2), ("variants", "INFO_IS2_dim")),
-            ("call_FS2", "O", (208, 2, 2), ("variants", "samples", "FORMAT_FS2_dim")),
+            ("variant_IS1", STRING_DTYPE_NAME, (208,), ("variants",)),
+            ("variant_IS2", STRING_DTYPE_NAME, (208, 2), ("variants", "INFO_IS2_dim")),
+            (
+                "call_FS2",
+                STRING_DTYPE_NAME,
+                (208, 2, 2),
+                ("variants", "samples", "FORMAT_FS2_dim"),
+            ),
             ("call_FC2", "U1", (208, 2, 2), ("variants", "samples", "FORMAT_FC2_dim")),
             ("call_FIG", "i2", (208, 2, 6), ("variants", "samples", "genotypes")),
             ("call_FIA", "i2", (208, 2, 2), ("variants", "samples", "alt_alleles")),

diff --git a/tests/test_tskit.py b/tests/test_tskit.py
@@ -11,6 +11,7 @@
 
 from bio2zarr import tskit as tsk
 from bio2zarr import vcf
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME
 
 
 def test_missing_dependency():
@@ -115,7 +116,7 @@ def test_alleles(self, conversion):
         ts, zroot = conversion
         alleles = zroot["variant_allele"][:]
         assert alleles.shape == (3, 2)
-        assert alleles.dtype == "O"
+        assert alleles.dtype.kind == STRING_DTYPE_NAME
         nt.assert_array_equal(alleles, [["A", "TTTT"], ["CCC", "G"], ["G", "AA"]])
 
     def test_variant_length(self, conversion):
@@ -146,7 +147,7 @@ def test_contig_id(self, conversion):
         ts, zroot = conversion
         contigs = zroot["contig_id"][:]
         assert contigs.shape == (1,)
-        assert contigs.dtype == "O"
+        assert contigs.dtype.kind == STRING_DTYPE_NAME
         nt.assert_array_equal(contigs, ["1"])
 
     def test_variant_contig(self, conversion):
@@ -160,7 +161,7 @@ def test_sample_id(self, conversion):
         ts, zroot = conversion
         samples = zroot["sample_id"][:]
         assert samples.shape == (4,)
-        assert samples.dtype == "O"
+        assert samples.dtype.kind == STRING_DTYPE_NAME
         nt.assert_array_equal(samples, ["tsk_0", "tsk_1", "tsk_2", "tsk_3"])
 
     def test_region_index(self, conversion):

diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py
@@ -11,6 +11,7 @@
 
 from bio2zarr import constants, provenance, vcz_verification
 from bio2zarr import vcf as vcf_mod
+from bio2zarr.zarr_utils import zarr_v3
 
 
 def assert_dataset_equal(ds1, ds2, drop_vars=None):
@@ -275,6 +276,10 @@ def test_no_genotypes(self, ds, tmp_path):
             if field_name != "sample_id" and not field_name.startswith("call_"):
                 xt.assert_equal(ds[field_name], ds2[field_name])
 
+    @pytest.mark.skipif(
+        zarr_v3(),
+        reason="Zarr chunks ignored when reading arrays with StringDType: https://github.com/pydata/xarray/issues/11054",
+    )
     @pytest.mark.parametrize(
         ("variants_chunk_size", "samples_chunk_size", "y_chunks", "x_chunks"),
         [

diff --git a/tests/test_vcz.py b/tests/test_vcz.py
@@ -13,6 +13,10 @@
 from bio2zarr import vcf as vcf_mod
 from bio2zarr.zarr_utils import zarr_v3
 
+# In zarr-python v2 strings are stored as object arrays (O) with itemsize 8
+# In zarr-python v3 strings are stored as string arrays (T) with itemsize 16
+STRING_ITEMSIZE = 16 if zarr_v3() else 8
+
 
 @pytest.fixture(scope="module")
 def vcf_file():
@@ -82,7 +86,10 @@ def test_not_enough_memory(self, tmp_path, icf_path, max_memory):
         with pytest.raises(ValueError, match="Insufficient memory"):
             vcf_mod.encode(icf_path, zarr_path, max_memory=max_memory)
 
-    @pytest.mark.parametrize("max_memory", ["315KiB", "500KiB"])
+    # zarr-python v3 string use more memory
+    @pytest.mark.parametrize(
+        "max_memory", ["630KiB", "1000KiB"] if zarr_v3() else ["315KiB", "500KiB"]
+    )
     def test_not_enough_memory_for_two(
         self, tmp_path, icf_path, zarr_path, caplog, max_memory
     ):
@@ -292,9 +299,8 @@ class TestChunkNbytes:
             ("variant_position", 36),  # 9 * 4
             ("variant_H2", 9),
             ("variant_AC", 18),  # 9 * 2
-            # Object fields have an itemsize of 8
-            ("variant_AA", 72),  # 9 * 8
-            ("variant_allele", 9 * 4 * 8),
+            ("variant_AA", 9 * STRING_ITEMSIZE),
+            ("variant_allele", 9 * 4 * STRING_ITEMSIZE),
         ],
     )
     def test_example_schema(self, schema, field, value):