Skip to content

Commit 5cb5596

Browse files
committed
fix: restore old path normalization logic, and add regression tests
1 parent 9556ac0 commit 5cb5596

3 files changed

Lines changed: 156 additions & 46 deletions

File tree

src/zarr/storage/_fsspec.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
)
1717
from zarr.core.buffer import Buffer
1818
from zarr.errors import ZarrUserWarning
19-
from zarr.storage._utils import _join_paths
19+
from zarr.storage._utils import _dereference_path
2020

2121
if TYPE_CHECKING:
2222
from collections.abc import AsyncIterator, Iterable
@@ -127,7 +127,7 @@ def __init__(
127127
) -> None:
128128
super().__init__(read_only=read_only)
129129
self.fs = fs
130-
self.path = path.rstrip("/")
130+
self.path = path
131131
self.allowed_exceptions = allowed_exceptions
132132

133133
if not self.fs.async_impl:
@@ -282,7 +282,7 @@ async def get(
282282
# docstring inherited
283283
if not self._is_open:
284284
await self._open()
285-
path = _join_paths([self.path, key])
285+
path = _dereference_path(self.path, key)
286286

287287
try:
288288
if byte_range is None:
@@ -329,7 +329,7 @@ async def set(
329329
raise TypeError(
330330
f"FsspecStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead."
331331
)
332-
path = _join_paths([self.path, key])
332+
path = _dereference_path(self.path, key)
333333
# write data
334334
if byte_range:
335335
raise NotImplementedError
@@ -338,7 +338,7 @@ async def set(
338338
async def delete(self, key: str) -> None:
339339
# docstring inherited
340340
self._check_writable()
341-
path = _join_paths([self.path, key])
341+
path = _dereference_path(self.path, key)
342342
try:
343343
await self.fs._rm(path)
344344
except FileNotFoundError:
@@ -354,14 +354,14 @@ async def delete_dir(self, prefix: str) -> None:
354354
)
355355
self._check_writable()
356356

357-
path_to_delete = _join_paths([self.path, prefix])
357+
path_to_delete = _dereference_path(self.path, prefix)
358358

359359
with suppress(*self.allowed_exceptions):
360360
await self.fs._rm(path_to_delete, recursive=True)
361361

362362
async def exists(self, key: str) -> bool:
363363
# docstring inherited
364-
path = _join_paths([self.path, key])
364+
path = _dereference_path(self.path, key)
365365
exists: bool = await self.fs._exists(path)
366366
return exists
367367

@@ -378,7 +378,7 @@ async def get_partial_values(
378378
starts: list[int | None] = []
379379
stops: list[int | None] = []
380380
for key, byte_range in key_ranges:
381-
paths.append(_join_paths([self.path, key]))
381+
paths.append(_dereference_path(self.path, key))
382382
if byte_range is None:
383383
starts.append(None)
384384
stops.append(None)
@@ -429,7 +429,7 @@ async def list_prefix(self, prefix: str) -> AsyncIterator[str]:
429429
yield onefile.removeprefix(f"{self.path}/")
430430

431431
async def getsize(self, key: str) -> int:
432-
path = _join_paths([self.path, key])
432+
path = _dereference_path(self.path, key)
433433
info = await self.fs._info(path)
434434

435435
size = info.get("size")

src/zarr/storage/_utils.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,53 @@ def _join_paths(paths: Iterable[str]) -> str:
185185
return "/".join(filter(lambda v: v != "", paths))
186186

187187

188+
def _dereference_path(root: str, path: str) -> str:
189+
"""
190+
Combine a store-side root with a key into a single fully-qualified path.
191+
192+
Unlike `_join_paths`, this is purpose-built for the case where `root` is
193+
an opaque backend-side prefix that may use `"/"` as a sentinel for "root
194+
of the filesystem" (notably for fsspec's `ReferenceFileSystem`). A
195+
trailing `"/"` is stripped from `root` before joining; if `root` is then
196+
empty, the bare `path` is returned so that joining `"/"` with `"key"`
197+
yields `"key"` rather than `"//key"`. A trailing `"/"` on the result is
198+
also stripped.
199+
200+
Leading slashes on `root` are preserved -- a backend-side path like
201+
`"/home/foo/data.zarr"` is an absolute filesystem path for
202+
`LocalFileSystem` and must not lose its leading separator.
203+
204+
Parameters
205+
----------
206+
root : str
207+
The backend-side root of a store. May be `""`, `"/"`, an absolute
208+
filesystem path, or a backend-specific prefix.
209+
path : str
210+
The key within the store, typically a zarr key like `"zarr.json"`
211+
or `"a/b/c/zarr.json"`.
212+
213+
Returns
214+
-------
215+
str
216+
`root` and `path` joined by a single `"/"`, with the `"/"` sentinel
217+
collapsed and trailing slashes removed.
218+
219+
Examples
220+
--------
221+
```python
222+
from zarr.storage._utils import _dereference_path
223+
_dereference_path("/", "zarr.json") # 'zarr.json'
224+
_dereference_path("", "zarr.json") # 'zarr.json'
225+
_dereference_path("/home/foo", "zarr.json") # '/home/foo/zarr.json'
226+
_dereference_path("/home/foo/", "zarr.json") # '/home/foo/zarr.json'
227+
_dereference_path("bucket/p", "zarr.json") # 'bucket/p/zarr.json'
228+
```
229+
"""
230+
root = root.rstrip("/")
231+
path = f"{root}/{path}" if root else path
232+
return path.rstrip("/")
233+
234+
188235
def _relativize_path(*, path: str, prefix: str) -> str:
189236
"""
190237
Make a "/"-delimited path relative to some prefix. If the prefix is '', then the path is

tests/test_store/test_fsspec.py

Lines changed: 100 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -286,51 +286,114 @@ def array_roundtrip(store: FsspecStore) -> None:
286286
np.testing.assert_array_equal(arr[:], data)
287287

288288

289-
@pytest.mark.skipif(
290-
parse_version(fsspec.__version__) < parse_version("2024.12.0"),
291-
reason="No AsyncFileSystemWrapper",
292-
)
293289
@pytest.mark.parametrize(
294-
("path", "expected"),
290+
("root", "key", "expected"),
295291
[
296-
("", ""),
297-
("/", ""),
298-
("//", ""),
299-
("foo", "foo"),
300-
("foo/", "foo"),
301-
# Leading slashes are preserved: `path` is an opaque string passed to the
302-
# underlying fsspec backend, and for some backends (notably LocalFileSystem)
303-
# a leading slash is semantically meaningful. The store only strips trailing
304-
# slashes so that `_join_paths([self.path, key])` produces a single separator
305-
# between the root and the key.
306-
("/foo", "/foo"),
307-
("/foo/", "/foo"),
308-
("//foo//", "//foo"),
309-
# An absolute filesystem path round-trips intact.
310-
("/home/runner/data.zarr", "/home/runner/data.zarr"),
311-
("/home/runner/data.zarr/", "/home/runner/data.zarr"),
292+
# `"/"` as root collapses so that bare-key backends (notably
293+
# ReferenceFileSystem) get the right key. Regression test for
294+
# https://github.com/zarr-developers/zarr-python/issues/3922 .
295+
("/", "zarr.json", "zarr.json"),
296+
("", "zarr.json", "zarr.json"),
297+
# Trailing slashes on the root are stripped before joining.
298+
("foo/", "zarr.json", "foo/zarr.json"),
299+
("foo", "zarr.json", "foo/zarr.json"),
300+
# Leading slashes on the root are preserved -- absolute filesystem
301+
# paths must stay absolute. Regression test for the titiler-xarray
302+
# breakage that #3924 introduced when `normalize_path` was applied to
303+
# `FsspecStore.path`.
304+
("/home/runner/data.zarr", "zarr.json", "/home/runner/data.zarr/zarr.json"),
305+
("/home/runner/data.zarr/", "zarr.json", "/home/runner/data.zarr/zarr.json"),
306+
# Multi-segment keys.
307+
("/home/foo", "a/b/zarr.json", "/home/foo/a/b/zarr.json"),
308+
("", "a/b/zarr.json", "a/b/zarr.json"),
309+
# Trailing slash on the result is stripped (relevant when key is "").
310+
("/home/foo", "", "/home/foo"),
312311
],
313312
)
314-
def test_fsspec_store_path_normalization(path: str, expected: str) -> None:
315-
"""`FsspecStore.path` strips trailing slashes only.
313+
def test_dereference_path(root: str, key: str, expected: str) -> None:
314+
"""Verify the contract `_dereference_path` provides for `FsspecStore`.
315+
316+
`FsspecStore.path` is stored verbatim; the join with a key must collapse a
317+
sentinel `"/"` root, strip trailing slashes, and preserve leading
318+
slashes on absolute paths.
319+
"""
320+
from zarr.storage._utils import _dereference_path
321+
322+
assert _dereference_path(root, key) == expected
323+
324+
325+
async def test_fsspec_store_open_group_via_reference_filesystem() -> None:
326+
"""End-to-end regression test for
327+
https://github.com/zarr-developers/zarr-python/issues/3922 .
328+
329+
``ReferenceFileSystem`` keys its refs by bare strings like ``"zarr.json"``.
330+
The bug was that ``FsspecStore(fs=ref_fs, path="/")`` produced
331+
``"//zarr.json"`` at the join site and failed to find the entry, raising
332+
``GroupNotFoundError``. This test pins ``path="/"`` explicitly to keep
333+
coverage even if the default value changes later.
334+
"""
335+
import json
336+
337+
from fsspec.implementations.reference import ReferenceFileSystem
316338

317-
Regression test for two related bugs:
339+
group_json = json.dumps({"zarr_format": 3, "node_type": "group", "attributes": {}})
340+
fs = ReferenceFileSystem(
341+
fo={"version": 1, "refs": {"zarr.json": group_json}},
342+
asynchronous=True,
343+
)
344+
store = FsspecStore(fs=fs, path="/", read_only=True)
345+
group = await zarr.api.asynchronous.open_group(store, mode="r")
346+
assert group.metadata.zarr_format == 3
318347

319-
- https://github.com/zarr-developers/zarr-python/issues/3922 -- a caller
320-
passing ``path="/"`` resulted in `_join_paths(["/", "key"])` producing
321-
``"//key"``, which missed the underlying object on backends like
322-
``ReferenceFileSystem``. ``"/"`` must collapse to ``""``.
323348

324-
- Titiler regression after #3924 -- normalizing ``path`` via
325-
``normalize_path`` corrupted absolute filesystem paths by stripping the
326-
leading slash, turning ``/home/runner/data.zarr`` into
327-
``home/runner/data.zarr`` (a relative path that doesn't exist). Leading
328-
slashes must be preserved.
349+
async def test_fsspec_store_read_array_chunk_via_reference_filesystem() -> None:
350+
"""End-to-end regression test that exercises the byte-range read path
351+
against ``ReferenceFileSystem``.
352+
353+
Beyond opening a group (covered by
354+
``test_fsspec_store_open_group_via_reference_filesystem``), this test
355+
constructs a small zarr v3 array whose chunk lives in the refs dict and
356+
reads it through the store. Path-handling bugs on the byte-range
357+
fetch path (used by kerchunk-style virtualization) would surface here
358+
rather than at metadata-open time.
329359
"""
330-
sync_fs = fsspec.filesystem("memory")
331-
fs = _make_async(sync_fs)
332-
store = FsspecStore(fs=fs, path=path)
333-
assert store.path == expected
360+
import json
361+
362+
import numpy as np
363+
from fsspec.implementations.reference import ReferenceFileSystem
364+
365+
# Construct a minimal v3 zarr: a single 1-D uint8 array of length 4 with
366+
# one chunk of size 4. The chunk bytes are little-endian uint8s 1..4.
367+
array_meta = json.dumps(
368+
{
369+
"zarr_format": 3,
370+
"node_type": "array",
371+
"shape": [4],
372+
"chunk_grid": {"name": "regular", "configuration": {"chunk_shape": [4]}},
373+
"data_type": "uint8",
374+
"chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}},
375+
"fill_value": 0,
376+
"codecs": [{"name": "bytes", "configuration": {"endian": "little"}}],
377+
"attributes": {},
378+
}
379+
)
380+
chunk_bytes = bytes([1, 2, 3, 4])
381+
382+
refs: dict[str, str] = {
383+
"zarr.json": array_meta,
384+
# ReferenceFileSystem accepts raw bytes via base64 encoding or
385+
# latin-1-decoded strings; latin-1 round-trips bytes 1:1.
386+
"c/0": chunk_bytes.decode("latin-1"),
387+
}
388+
389+
fs = ReferenceFileSystem(
390+
fo={"version": 1, "refs": refs},
391+
asynchronous=True,
392+
)
393+
store = FsspecStore(fs=fs, path="/", read_only=True)
394+
array = await zarr.api.asynchronous.open_array(store=store, mode="r")
395+
data = await array.getitem(slice(None))
396+
np.testing.assert_array_equal(data, np.array([1, 2, 3, 4], dtype="uint8"))
334397

335398

336399
@pytest.mark.skipif(

0 commit comments

Comments
 (0)