From 38eccc0c74f7384ca3c3c5d1ae076888f9cd9e8a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 22 May 2025 14:57:45 +0200 Subject: [PATCH 01/23] (fix): fill type values --- pyproject.toml | 4 +++- src/anndata/_io/specs/methods.py | 2 ++ src/anndata/compat/__init__.py | 6 +++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index aa2d617cf..97e021f3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ "packaging>=24.2", "array_api_compat>=1.7.1", "legacy-api-wrap", - "zarr >=2.18.7, !=3.0.0, !=3.0.1, !=3.0.2, !=3.0.3, !=3.0.4, !=3.0.5, !=3.0.6, !=3.0.7", + "zarr@git+https://github.com/d-v-b/zarr-python.git#egg=feat/fixed-length-strings", ] dynamic = [ "version" ] @@ -119,6 +119,8 @@ source = "vcs" raw-options.version_scheme = "release-branch-semver" [tool.hatch.build.targets.wheel] packages = [ "src/anndata", "src/testing" ] +[tool.hatch.metadata] +allow-direct-references = true [tool.coverage.run] data_file = "test-data/coverage" diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index b934d37f3..41f4616f9 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -640,6 +640,7 @@ def write_vlen_string_array_zarr( shape=elem.shape, dtype=dtype, filters=filters, + fill_value="", **dataset_kwargs, ) f[k][:] = elem @@ -1294,6 +1295,7 @@ def write_scalar_zarr( shape=(), dtype=dtype, filters=filters, + fill_value="" if dtype is object else None, **dataset_kwargs, ) a[...] = np.array(value) diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index a81843f35..53b8bf333 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -54,10 +54,10 @@ class Empty: ############################# @cache def is_zarr_v2() -> bool: - import zarr - from packaging.version import Version + # import zarr + # from packaging.version import Version - return Version(zarr.__version__) < Version("3.0.0") + return False if is_zarr_v2(): From 3df0114ff85ffe833db6924760c471943484dba8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 22 May 2025 15:33:36 +0200 Subject: [PATCH 02/23] (chore): remove recarray checks --- src/anndata/_io/zarr.py | 14 -------------- src/anndata/tests/helpers.py | 5 ----- 2 files changed, 19 deletions(-) diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py index 01a93829a..3b9667300 100644 --- a/src/anndata/_io/zarr.py +++ b/src/anndata/_io/zarr.py @@ -27,19 +27,6 @@ T = TypeVar("T") -def _check_rec_array(adata: AnnData) -> None: - if settings.zarr_write_format == 3 and ( - structured_dtype_keys := { - k - for k, v in adata.uns.items() - if isinstance(v, np.recarray) - or (isinstance(v, np.ndarray) and v.dtype.fields) - } - ): - msg = f"zarr v3 does not support structured dtypes. Found keys {structured_dtype_keys}" - raise NotImplementedError(msg) - - @no_write_dataset_2d def write_zarr( store: StoreLike, @@ -50,7 +37,6 @@ def write_zarr( **ds_kwargs, ) -> None: """See :meth:`~anndata.AnnData.write_zarr`.""" - _check_rec_array(adata) if isinstance(store, Path): store = str(store) if convert_strings_to_categoricals: diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 674fa4d96..be4452a44 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -16,7 +16,6 @@ from pandas.api.types import is_numeric_dtype from scipy import sparse -import anndata from anndata import AnnData, ExperimentalFeatureWarning, Raw from anndata._core.aligned_mapping import AlignedMappingBase from anndata._core.sparse_dataset import BaseCompressedSparseDataset @@ -410,10 +409,6 @@ def gen_adata( # noqa: PLR0913 awkward_ragged=gen_awkward((12, None, None)), # U_recarray=gen_vstr_recarray(N, 5, "U4") ) - # https://github.com/zarr-developers/zarr-python/issues/2134 - # zarr v3 on-disk does not write structured dtypes - if anndata.settings.zarr_write_format == 3: - del uns["O_recarray"] with warnings.catch_warnings(): warnings.simplefilter("ignore", ExperimentalFeatureWarning) adata = AnnData( From 94063e7eb31682c3c0228c76742992ed6997c6a7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 28 May 2025 16:27:24 +0200 Subject: [PATCH 03/23] (fix): read_only --- src/anndata/tests/helpers.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index be4452a44..e6f6d9499 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1132,26 +1132,9 @@ def __getitem__(self, key: str) -> bytes: else: class AccessTrackingStore(AccessTrackingStoreBase): - async def get( - self, - key: str, - prototype: BufferPrototype | None = None, - byte_range: ByteRequest | None = None, - ) -> object: - self._check_and_track_key(key) - return await super().get(key, prototype=prototype, byte_range=byte_range) + def __init__(*args, **kwargs): + super().__init__(*args, **kwargs, read_only=True) - -if is_zarr_v2(): - - class AccessTrackingStore(AccessTrackingStoreBase): - def __getitem__(self, key: str) -> bytes: - self._check_and_track_key(key) - return super().__getitem__(key) - -else: - - class AccessTrackingStore(AccessTrackingStoreBase): async def get( self, key: str, From 534af6652639234abc88110b8aeb60906e49e226 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 3 Jun 2025 11:36:36 +0200 Subject: [PATCH 04/23] (fix): add self --- src/anndata/tests/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index e6f6d9499..d94cc2e12 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1132,7 +1132,7 @@ def __getitem__(self, key: str) -> bytes: else: class AccessTrackingStore(AccessTrackingStoreBase): - def __init__(*args, **kwargs): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs, read_only=True) async def get( From 92b81d408d275116b102f0289abaf36ab37ff7ce Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Jun 2025 14:43:11 +0200 Subject: [PATCH 05/23] (fix): explicit modes for access tracking store --- pyproject.toml | 2 +- tests/test_backed_sparse.py | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 97e021f3e..4a02832dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,7 +109,7 @@ gpu = [ "cupy" ] cu12 = [ "cupy-cuda12x" ] cu11 = [ "cupy-cuda11x" ] # requests and aiohttp needed for zarr remote data -lazy = [ "xarray>=2025.04.0", "aiohttp", "requests", "anndata[dask]" ] +lazy = [ "xarray>=2025.04.0,<2025.06.0", "aiohttp", "requests", "anndata[dask]" ] # https://github.com/dask/dask/issues/11290 # https://github.com/dask/dask/issues/11752 dask = [ "dask[array]>=2023.5.1,!=2024.8.*,!=2024.9.*,<2025.2.0" ] diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index f656e8c2a..d61c14fa6 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -15,13 +15,7 @@ from anndata._core.sparse_dataset import sparse_dataset from anndata._io.specs.registry import read_elem_lazy from anndata._io.zarr import open_write_group -from anndata.compat import ( - CSArray, - CSMatrix, - DaskArray, - ZarrGroup, - is_zarr_v2, -) +from anndata.compat import CSArray, CSMatrix, DaskArray, ZarrGroup, is_zarr_v2 from anndata.experimental import read_dispatched from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func @@ -388,7 +382,7 @@ def test_lazy_array_cache( store = AccessTrackingStore(path) for elem in elems: store.initialize_key_trackers([f"X/{elem}"]) - f = open_write_group(store, mode="a") + f = zarr.open_group(store, mode="r") a_disk = sparse_dataset(f["X"]) a_disk[:1] a_disk[3:5] @@ -512,7 +506,7 @@ def test_data_access( ) store = AccessTrackingStore(path) store.initialize_key_trackers(["X/data"]) - f = zarr.open_group(store) + f = zarr.open_group(store, mode="r") a_disk = AnnData(X=open_func(f["X"])) subset = a_disk[idx_maj, idx_min] if a.format == "csr" else a_disk[idx_min, idx_maj] if isinstance(subset.X, DaskArray): From 99e14ea64eb29835cb81b960d6df785456079ffb Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 13 Jun 2025 10:27:57 +0200 Subject: [PATCH 06/23] (fix): wow! --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4a02832dd..f579903dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ "packaging>=24.2", "array_api_compat>=1.7.1", "legacy-api-wrap", - "zarr@git+https://github.com/d-v-b/zarr-python.git#egg=feat/fixed-length-strings", + "zarr@git+https://github.com/d-v-b/zarr-python.git@feat/fixed-length-strings", ] dynamic = [ "version" ] From 666a2e8414b33fddd993e73efb6cbb1e1b52754d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 13 Jun 2025 14:53:39 +0200 Subject: [PATCH 07/23] (fix): vlen string --- src/anndata/_io/specs/methods.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 41f4616f9..c18c121d1 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -622,19 +622,14 @@ def write_vlen_string_array_zarr( f[k][:] = elem else: from numcodecs import VLenUTF8 + from zarr.core.dtype import VariableLengthUTF8 dataset_kwargs = dataset_kwargs.copy() dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) - match ( - ad.settings.zarr_write_format, - Version(np.__version__) >= Version("2.0.0"), - ): - case 2, _: - filters, dtype = [VLenUTF8()], object - case 3, True: - filters, dtype = None, np.dtypes.StringDType() - case 3, False: - filters, dtype = None, np.dtypes.ObjectDType() + dtype = VariableLengthUTF8() + filters = None + if ad.settings.zarr_write_format == 2: + filters = [VLenUTF8()] f.create_array( k, shape=elem.shape, From 87e09314dbcff879bc97d722f715f2a926d40585 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 13 Jun 2025 14:53:57 +0200 Subject: [PATCH 08/23] (fix): allow structured array test --- tests/test_structured_arrays.py | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/tests/test_structured_arrays.py b/tests/test_structured_arrays.py index a22fa526b..3787d38c8 100644 --- a/tests/test_structured_arrays.py +++ b/tests/test_structured_arrays.py @@ -1,11 +1,9 @@ from __future__ import annotations -from contextlib import nullcontext from itertools import combinations, product from typing import TYPE_CHECKING import numpy as np -import pytest import anndata as ad from anndata import AnnData @@ -45,24 +43,17 @@ def test_io( initial = AnnData(np.zeros((3, 3))) initial.uns = dict(str_rec=str_recarray, u_rec=u_recarray, s_rec=s_recarray) - with ( - pytest.raises( - NotImplementedError, match=r"zarr v3 does not support structured dtypes" - ) - if diskfmt == "zarr" and ad.settings.zarr_write_format == 3 - else nullcontext() - ): - write1(initial, filepth1) - disk_once = read1(filepth1) - write2(disk_once, filepth2) - disk_twice = read2(filepth2) + write1(initial, filepth1) + disk_once = read1(filepth1) + write2(disk_once, filepth2) + disk_twice = read2(filepth2) - adatas = [initial, disk_once, disk_twice] - keys = [ - "str_rec", - "u_rec", - # "s_rec" - ] + adatas = [initial, disk_once, disk_twice] + keys = [ + "str_rec", + "u_rec", + # "s_rec" + ] - for (ad1, key1), (ad2, key2) in combinations(product(adatas, keys), 2): - assert_str_contents_equal(ad1.uns[key1], ad2.uns[key2]) + for (ad1, key1), (ad2, key2) in combinations(product(adatas, keys), 2): + assert_str_contents_equal(ad1.uns[key1], ad2.uns[key2]) From b33295e4738bfc281d382edf1db206fa2f101070 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 13 Jun 2025 15:36:23 +0200 Subject: [PATCH 09/23] (fix): scalar handling --- src/anndata/_io/specs/methods.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index c18c121d1..bd75dcbe2 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -1277,12 +1277,13 @@ def write_scalar_zarr( return f.create_dataset(key, data=np.array(value), shape=(), **dataset_kwargs) else: from numcodecs import VLenUTF8 + from zarr.core.dtype import VariableLengthUTF8 match ad.settings.zarr_write_format, value: case 2, str(): - filters, dtype = [VLenUTF8()], object + filters, dtype = [VLenUTF8()], VariableLengthUTF8() case 3, str(): - filters, dtype = None, np.dtypes.StringDType() + filters, dtype = None, VariableLengthUTF8() case _, _: filters, dtype = None, np.array(value).dtype a = f.create_array( @@ -1290,7 +1291,9 @@ def write_scalar_zarr( shape=(), dtype=dtype, filters=filters, - fill_value="" if dtype is object else None, + fill_value="" + if ad.settings.zarr_write_format == 2 and dtype == VariableLengthUTF8() + else None, **dataset_kwargs, ) a[...] = np.array(value) From 5100a0007957ce573650bac9405dea9132cfa2d5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 13 Jun 2025 15:37:37 +0200 Subject: [PATCH 10/23] (fix): ds chunking --- tests/lazy/conftest.py | 11 ++++++++++- tests/lazy/test_read.py | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/lazy/conftest.py b/tests/lazy/conftest.py index 6e181c70b..e6c9bab4f 100644 --- a/tests/lazy/conftest.py +++ b/tests/lazy/conftest.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pytest +import zarr from scipy import sparse import anndata as ad @@ -126,7 +127,7 @@ def adata_remote_with_store_tall_skinny_path( worker_id: str = "serial", ) -> Path: orig_path = tmp_path_factory.mktemp(f"orig_{worker_id}.zarr") - M = 100_000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access + M = 1000 N = 5 obs_names = pd.Index(f"cell{i}" for i in range(M)) var_names = pd.Index(f"gene{i}" for i in range(N)) @@ -139,6 +140,14 @@ def adata_remote_with_store_tall_skinny_path( ) orig.raw = orig.copy() orig.write_zarr(orig_path) + g = zarr.open_group(orig_path, mode="a", use_consolidated=False) + ad.io.write_elem( + g, + "obs", + obs, + dataset_kwargs=dict(chunks=(250,)), + ) + zarr.consolidate_metadata(g.store) return orig_path diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 4800f898a..4afd6bf7d 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -64,8 +64,8 @@ def test_access_count_subset( ["obs/cat/codes", *non_obs_elem_names] ) adata_remote_tall_skinny[adata_remote_tall_skinny.obs["cat"] == "a", :] - # all codes read in for subset (from 1 chunk) - remote_store_tall_skinny.assert_access_count("obs/cat/codes", 1) + # all codes read in for subset (from 4 chunks as set in the fixture) + remote_store_tall_skinny.assert_access_count("obs/cat/codes", 4) for elem_name in non_obs_elem_names: remote_store_tall_skinny.assert_access_count(elem_name, 0) From 742a166a62a645c086a305a9c5d286eae88dbc1d Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Mon, 16 Jun 2025 10:41:24 +0200 Subject: [PATCH 11/23] (chore): point at main --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b91104130..89d87ca78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ "packaging>=24.2", "array_api_compat>=1.7.1", "legacy-api-wrap", - "zarr@git+https://github.com/d-v-b/zarr-python.git@feat/fixed-length-strings", + "zarr@git+https://github.com/zarr-developers/zarr-python.git", ] dynamic = [ "version" ] From 8914bd5cb0c2f9011a1ce1927e03bdea063d98da Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Mon, 16 Jun 2025 15:44:49 +0200 Subject: [PATCH 12/23] Update test_backed_sparse.py --- tests/test_backed_sparse.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 34a0fa8e1..0535ed18b 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -387,13 +387,8 @@ def test_lazy_array_cache( store = AccessTrackingStore(path) for elem in elems: store.initialize_key_trackers([f"X/{elem}"]) -<<<<<<< ig/zarr_dtype f = zarr.open_group(store, mode="r") - a_disk = sparse_dataset(f["X"]) -======= - f = open_write_group(store, mode="a") a_disk = sparse_dataset(f["X"], should_cache_indptr=should_cache_indptr) ->>>>>>> main a_disk[:1] a_disk[3:5] a_disk[6:7] From e05f7738a69e556ca05c52f7a988f6b82d8428e5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 16 Jun 2025 19:03:20 +0200 Subject: [PATCH 13/23] (fix): v2 string array type --- src/anndata/_core/xarray.py | 2 +- src/anndata/_io/specs/methods.py | 4 +++- src/anndata/experimental/backed/_lazy_arrays.py | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/anndata/_core/xarray.py b/src/anndata/_core/xarray.py index 3eca668e3..70a0da233 100644 --- a/src/anndata/_core/xarray.py +++ b/src/anndata/_core/xarray.py @@ -144,7 +144,7 @@ def to_memory(self, *, copy=False) -> pd.DataFrame: if df.index.name != index_key and index_key is not None: df = df.set_index(index_key) for col in set(self.columns) - non_nullable_string_cols: - df[col] = pd.array(self[col].data, dtype="string") + df[col] = pd.array(df[col].to_numpy(), dtype="string") df.index.name = None # matches old AnnData object return df diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index bd75dcbe2..5d46d0708 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -635,7 +635,9 @@ def write_vlen_string_array_zarr( shape=elem.shape, dtype=dtype, filters=filters, - fill_value="", + fill_value="" + if ad.settings.zarr_write_format == 2 and dtype == VariableLengthUTF8() + else None, **dataset_kwargs, ) f[k][:] = elem diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 5afe4dc84..095c733e6 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -146,6 +146,9 @@ def __getitem__( extension_array = pd.arrays.BooleanArray(values, mask=mask) elif self._dtype_str == "nullable-string-array": # https://github.com/pydata/xarray/issues/10419 + values = values.astype( + "object" + ) # TODO: file bug report around roundtripped v2 arrays values[mask] = np.nan return values else: From 666080b0f6b26989912113e9a66fc7c0c319347e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 16 Jun 2025 23:29:21 +0200 Subject: [PATCH 14/23] (fix): nullable string handling --- src/anndata/_core/xarray.py | 2 +- src/anndata/_io/specs/lazy_methods.py | 9 +++++++-- src/anndata/_io/specs/methods.py | 9 ++++++++- src/anndata/experimental/backed/_lazy_arrays.py | 13 ++++++++----- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/anndata/_core/xarray.py b/src/anndata/_core/xarray.py index 70a0da233..4a3867e4c 100644 --- a/src/anndata/_core/xarray.py +++ b/src/anndata/_core/xarray.py @@ -144,7 +144,7 @@ def to_memory(self, *, copy=False) -> pd.DataFrame: if df.index.name != index_key and index_key is not None: df = df.set_index(index_key) for col in set(self.columns) - non_nullable_string_cols: - df[col] = pd.array(df[col].to_numpy(), dtype="string") + df[col] = df[col].astype(dtype="string") df.index.name = None # matches old AnnData object return df diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 40a38018c..42675ef08 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -8,6 +8,7 @@ import h5py import numpy as np import pandas as pd +from packaging.version import Version from scipy import sparse import anndata as ad @@ -251,8 +252,12 @@ def _gen_xarray_dict_iterator_from_elems( "base_path_or_zarr_group": v.base_path_or_zarr_group, "elem_name": v.elem_name, "is_nullable_string": isinstance(v, MaskedArray) - and v.dtype # CategoricalArray dtype access requires a read nad is not necessary here - == np.dtype("O"), + and v.dtype + == ( + np.dtype("O") + if Version(np.__version__) < Version("2") + else np.dtypes.StringDType(na_object=pd.NA) + ), }, ) elif k == dim_name: diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 5d46d0708..4d6a07b50 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -1208,7 +1208,14 @@ def _string_array( values: np.ndarray, mask: np.ndarray ) -> pd.api.extensions.ExtensionArray: """Construct a string array from values and mask.""" - arr = pd.array(values, dtype=pd.StringDtype()) + arr = pd.array( + values.astype( + "object" + if Version(np.__version__) < Version("2") + else np.dtypes.StringDType(na_object=pd.NA) + ), + dtype=pd.StringDtype(), + ) arr[mask] = pd.NA return arr diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 095c733e6..f6bc24200 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +from packaging.version import Version from anndata._core.index import _subset from anndata._core.views import as_view @@ -146,10 +147,8 @@ def __getitem__( extension_array = pd.arrays.BooleanArray(values, mask=mask) elif self._dtype_str == "nullable-string-array": # https://github.com/pydata/xarray/issues/10419 - values = values.astype( - "object" - ) # TODO: file bug report around roundtripped v2 arrays - values[mask] = np.nan + values = values.astype(self.dtype) + values[mask] = pd.NA return values else: msg = f"Invalid dtype_str {self._dtype_str}" @@ -167,7 +166,11 @@ def dtype(self): return pd.BooleanDtype() elif self._dtype_str == "nullable-string-array": # https://github.com/pydata/xarray/issues/10419 - return np.dtype("O") + return ( + np.dtype("O") + if Version(np.__version__) < Version("2") + else np.dtypes.StringDType(na_object=pd.NA) + ) msg = f"Invalid dtype_str {self._dtype_str}" raise RuntimeError(msg) From 5255329e60f1e5f56a0dbfdb6a7b1ceb938af4af Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 17 Jun 2025 10:07:16 +0200 Subject: [PATCH 15/23] (fix): nullable numpy string type --- src/anndata/_io/specs/lazy_methods.py | 18 ++++++++++-------- src/anndata/_io/specs/methods.py | 7 ++----- src/anndata/compat/__init__.py | 7 +++++++ .../experimental/backed/_lazy_arrays.py | 12 ++++-------- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 42675ef08..bd26d5ac6 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -8,14 +8,21 @@ import h5py import numpy as np import pandas as pd -from packaging.version import Version from scipy import sparse import anndata as ad from anndata._core.file_backing import filename, get_elem_name from anndata._core.xarray import Dataset2D, requires_xarray from anndata.abc import CSCDataset, CSRDataset -from anndata.compat import DaskArray, H5Array, H5Group, XDataArray, ZarrArray, ZarrGroup +from anndata.compat import ( + NULLABLE_NUMPY_STRING_TYPE, + DaskArray, + H5Array, + H5Group, + XDataArray, + ZarrArray, + ZarrGroup, +) from .registry import _LAZY_REGISTRY, IOSpec @@ -252,12 +259,7 @@ def _gen_xarray_dict_iterator_from_elems( "base_path_or_zarr_group": v.base_path_or_zarr_group, "elem_name": v.elem_name, "is_nullable_string": isinstance(v, MaskedArray) - and v.dtype - == ( - np.dtype("O") - if Version(np.__version__) < Version("2") - else np.dtypes.StringDType(na_object=pd.NA) - ), + and v.dtype == NULLABLE_NUMPY_STRING_TYPE, }, ) elif k == dim_name: diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 4d6a07b50..c31429953 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -24,6 +24,7 @@ from anndata._io.utils import H5PY_V3, check_key, zero_dim_array_as_scalar from anndata._warnings import OldFormatWarning from anndata.compat import ( + NULLABLE_NUMPY_STRING_TYPE, AwkArray, CupyArray, CupyCSCMatrix, @@ -1209,11 +1210,7 @@ def _string_array( ) -> pd.api.extensions.ExtensionArray: """Construct a string array from values and mask.""" arr = pd.array( - values.astype( - "object" - if Version(np.__version__) < Version("2") - else np.dtypes.StringDType(na_object=pd.NA) - ), + values.astype(NULLABLE_NUMPY_STRING_TYPE), dtype=pd.StringDtype(), ) arr[mask] = pd.NA diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index ce6c1137c..ba11f4820 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -190,6 +190,13 @@ def old_positionals(*old_positionals): ############################# +NULLABLE_NUMPY_STRING_TYPE = ( + np.dtype("O") + if Version(np.__version__) < Version("2") + else np.dtypes.StringDType(na_object=pd.NA) +) + + @singledispatch def _read_attr(attrs: Mapping, name: str, default: Any | None = Empty): if default is Empty: diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index f6bc24200..abfbfd65a 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -3,14 +3,12 @@ from functools import cached_property from typing import TYPE_CHECKING, Generic, TypeVar -import numpy as np import pandas as pd -from packaging.version import Version from anndata._core.index import _subset from anndata._core.views import as_view from anndata._io.specs.lazy_methods import get_chunksize -from anndata.compat import H5Array, ZarrArray +from anndata.compat import NULLABLE_NUMPY_STRING_TYPE, H5Array, ZarrArray from ..._settings import settings from ...compat import XBackendArray, XDataArray, XZarrArrayWrapper @@ -20,6 +18,8 @@ from pathlib import Path from typing import Literal + import numpy as np + from anndata._core.index import Index from anndata.compat import ZarrGroup @@ -166,11 +166,7 @@ def dtype(self): return pd.BooleanDtype() elif self._dtype_str == "nullable-string-array": # https://github.com/pydata/xarray/issues/10419 - return ( - np.dtype("O") - if Version(np.__version__) < Version("2") - else np.dtypes.StringDType(na_object=pd.NA) - ) + return NULLABLE_NUMPY_STRING_TYPE msg = f"Invalid dtype_str {self._dtype_str}" raise RuntimeError(msg) From 01c15fb4869fb353c6aec2893a6eb6d6b70e2057 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 14 Jul 2025 15:49:41 +0200 Subject: [PATCH 16/23] fix: lint --- src/anndata/compat/__init__.py | 7 ------- src/anndata/experimental/backed/_lazy_arrays.py | 3 ++- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index ce9911d7d..ba11f4820 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -411,10 +411,3 @@ def _map_cat_to_str(cat: pd.Categorical) -> pd.Categorical: return cat.map(str, na_action="ignore") else: return cat.map(str) - - -NULLABLE_NUMPY_STRING_TYPE = ( - np.dtype("O") - if Version(np.__version__) < Version("2") - else np.dtypes.StringDType(na_object=pd.NA) -) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index a87714319..3158fafc5 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -8,14 +8,15 @@ from anndata._core.index import _subset from anndata._core.views import as_view from anndata._io.specs.lazy_methods import get_chunksize -from anndata.compat import NULLABLE_NUMPY_STRING_TYPE, H5Array, ZarrArray from ..._settings import settings from ...compat import ( NULLABLE_NUMPY_STRING_TYPE, + H5Array, XBackendArray, XDataArray, XZarrArrayWrapper, + ZarrArray, ) from ...compat import xarray as xr From 6903366aefb5063b41f7ee83266e28ccd56470ec Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 15 Jul 2025 10:55:39 +0200 Subject: [PATCH 17/23] Update pyproject.toml --- pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2a094fda0..fb5bc0a59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ "packaging>=24.2", "array_api_compat>=1.7.1", "legacy-api-wrap", - "zarr@git+https://github.com/zarr-developers/zarr-python.git", + "zarr >=2.18.7, !=3.0.0, !=3.0.1, !=3.0.2, !=3.0.3, !=3.0.4, !=3.0.5, !=3.0.6, !=3.0.7, !=3.0.8, !=3.0.9, !=3.0.10, <3.1", ] dynamic = [ "version" ] @@ -119,8 +119,6 @@ source = "vcs" raw-options.version_scheme = "release-branch-semver" [tool.hatch.build.targets.wheel] packages = [ "src/anndata", "src/testing" ] -[tool.hatch.metadata] -allow-direct-references = true [tool.coverage.run] data_file = "test-data/coverage" From 6e172bfac792be7a81e7dbc4b6858bc47ece63c4 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 15 Jul 2025 10:55:54 +0200 Subject: [PATCH 18/23] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fb5bc0a59..ac5d052d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ "packaging>=24.2", "array_api_compat>=1.7.1", "legacy-api-wrap", - "zarr >=2.18.7, !=3.0.0, !=3.0.1, !=3.0.2, !=3.0.3, !=3.0.4, !=3.0.5, !=3.0.6, !=3.0.7, !=3.0.8, !=3.0.9, !=3.0.10, <3.1", + "zarr >=2.18.7, !=3.0.0, !=3.0.1, !=3.0.2, !=3.0.3, !=3.0.4, !=3.0.5, !=3.0.6, !=3.0.7, !=3.0.8, !=3.0.9, !=3.0.10", ] dynamic = [ "version" ] From c8e53d4aa3aca379d300f39172a7457d64f32373 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 15 Jul 2025 11:05:54 +0200 Subject: [PATCH 19/23] Update __init__.py --- src/anndata/compat/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index ba11f4820..0a20c5416 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -54,11 +54,10 @@ class Empty: ############################# @cache def is_zarr_v2() -> bool: - # import zarr - # from packaging.version import Version - - return False + import zarr + from packaging.version import Version + return Version(zarr.__version__) < Version("3.0.0") if is_zarr_v2(): msg = "anndata will no longer support zarr v2 in the near future. Please prepare to upgrade to zarr>=3." From 69441ca700241c4641dc4b65777a6ccae25b570e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:06:01 +0000 Subject: [PATCH 20/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/anndata/compat/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index 0a20c5416..00e81a80d 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -59,6 +59,7 @@ def is_zarr_v2() -> bool: return Version(zarr.__version__) < Version("3.0.0") + if is_zarr_v2(): msg = "anndata will no longer support zarr v2 in the near future. Please prepare to upgrade to zarr>=3." warn(msg, DeprecationWarning, stacklevel=2) From 75bba8fe18be1fa63d7a8ce054ba31c52894f417 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 15 Jul 2025 12:18:15 +0200 Subject: [PATCH 21/23] refactor: simplify fill_value handling --- src/anndata/_io/specs/methods.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 2ac790cd3..a311eff0a 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -628,17 +628,15 @@ def write_vlen_string_array_zarr( dataset_kwargs = dataset_kwargs.copy() dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) dtype = VariableLengthUTF8() - filters = None + filters, fill_value = None, None if ad.settings.zarr_write_format == 2: - filters = [VLenUTF8()] + filters, fill_value = [VLenUTF8()], "" f.create_array( k, shape=elem.shape, dtype=dtype, filters=filters, - fill_value="" - if ad.settings.zarr_write_format == 2 and dtype == VariableLengthUTF8() - else None, + fill_value=fill_value, **dataset_kwargs, ) f[k][:] = elem @@ -1287,19 +1285,17 @@ def write_scalar_zarr( match ad.settings.zarr_write_format, value: case 2, str(): - filters, dtype = [VLenUTF8()], VariableLengthUTF8() + filters, dtype, fill_value = [VLenUTF8()], VariableLengthUTF8(), "" case 3, str(): - filters, dtype = None, VariableLengthUTF8() + filters, dtype, fill_value = None, VariableLengthUTF8(), None case _, _: - filters, dtype = None, np.array(value).dtype + filters, dtype, fill_value = None, np.array(value).dtype, None a = f.create_array( key, shape=(), dtype=dtype, filters=filters, - fill_value="" - if ad.settings.zarr_write_format == 2 and dtype == VariableLengthUTF8() - else None, + fill_value=fill_value, **dataset_kwargs, ) a[...] = np.array(value) From 1aac57a83f8585a228ea3cd5b140836541e6d347 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 15 Jul 2025 14:07:58 +0200 Subject: [PATCH 22/23] fix: dont put in h5 datasets --- src/anndata/_io/specs/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index a311eff0a..97d1a8640 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -432,7 +432,7 @@ def write_basic( dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) f.create_array(k, shape=elem.shape, dtype=dtype, **dataset_kwargs) # see https://github.com/zarr-developers/zarr-python/discussions/2712 - if isinstance(elem, ZarrArray): + if isinstance(elem, ZarrArray | H5Array): f[k][...] = elem[...] else: f[k][...] = elem From bbf7507f2c738ea35ffe5511cc60cdba81ac9bde Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 15 Jul 2025 14:19:59 +0200 Subject: [PATCH 23/23] Update pyproject.toml Co-authored-by: Philipp A. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5983ef061..487f02e3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ "packaging>=24.2", "array_api_compat>=1.7.1", "legacy-api-wrap", - "zarr >=2.18.7, !=3.0.0, !=3.0.1, !=3.0.2, !=3.0.3, !=3.0.4, !=3.0.5, !=3.0.6, !=3.0.7, !=3.0.8, !=3.0.9, !=3.0.10", + "zarr >=2.18.7, !=3.0.*", ] dynamic = [ "version" ]