diff --git a/CHANGELOG.md b/CHANGELOG.md index 06a0ebd38..222b11e6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ - Added `get_starting_time()` and `get_duration()` methods to `TimeSeries` to get the starting time and duration of the time series. @h-mayorquin [#2146](https://github.com/NeurodataWithoutBorders/pynwb/pull/2146) - Added `get_starting_time()` and `get_duration()` methods to `TimeIntervals` to get the earliest start time and total duration (span from earliest start to latest stop) of all intervals. @h-mayorquin [#2146](https://github.com/NeurodataWithoutBorders/pynwb/pull/2146) - Added `get_starting_time()` and `get_duration()` methods to `Units` to get the earliest spike time and total duration (span from earliest to latest spike) across all units. @h-mayorquin [#2164](https://github.com/NeurodataWithoutBorders/pynwb/pull/2164) +- Added remote-read support to `pynwb.read_nwb`. The function now accepts remote URLs (`s3://`, `gs://`, `abfs://`, `https://`, etc.) and dispatches to the HDF5 or Zarr backend based on the URL shape (a `.zarr` suffix or a DANDI `/zarr/` path selects Zarr; any other URL is read as HDF5). Anonymous public files need no extra configuration; credentials for non-public files are picked up from the standard cloud-credentials environment (AWS profile, `GOOGLE_APPLICATION_CREDENTIALS`, Azure managed identity, etc.). The module-level signature stays minimal at `read_nwb(path)`. Power-user knobs (forced ROS3 driver, custom S3-compatible endpoints, pre-opened file objects) are available via `NWBHDF5IO.read_nwb(path, backend_kwargs=...)` directly, which gains a new `backend_kwargs` parameter. The streaming branch in `NWBHDF5IO.read_nwb`, which previously used `fsspec.filesystem("http")` for every remote scheme, now selects the fsspec filesystem from the URL scheme. @h-mayorquin [#2190](https://github.com/NeurodataWithoutBorders/pynwb/pull/2190) ### Fixed - Fixed invalid CSS properties in documentation assistant toggle that prevented proper positioning on displays ≥1400px wide. 
@rly [#2151](https://github.com/NeurodataWithoutBorders/pynwb/pull/2151) diff --git a/requirements-opt.txt b/requirements-opt.txt index 7c3a851db..806ecb89c 100644 --- a/requirements-opt.txt +++ b/requirements-opt.txt @@ -7,5 +7,9 @@ fsspec==2025.5.1 requests==2.33.0 aiohttp==3.13.4 -# For read_nwb tests -hdmf-zarr +# For read_nwb tests. +# TEMPORARY: pinned to the branch of hdmf-dev/hdmf-zarr#348 which fixes +# `resolve_ref` for fsspec self-references. The remote-Zarr test in +# tests/integration/io/test_read.py depends on this fix. Once #348 lands in +# an hdmf-zarr release, revert this line to `hdmf-zarr`. +hdmf-zarr @ git+https://github.com/hdmf-dev/hdmf-zarr.git@fix_streaming_store diff --git a/src/pynwb/__init__.py b/src/pynwb/__init__.py index ff1edc596..96df4a187 100644 --- a/src/pynwb/__init__.py +++ b/src/pynwb/__init__.py @@ -552,9 +552,10 @@ def read_nwb(**kwargs): path = str(path) if path is not None else None # Streaming case - if path is not None and (path.startswith("s3://") or path.startswith("http")): + if path is not None and path.startswith(_REMOTE_SCHEMES): import fsspec - fsspec_file_system = fsspec.filesystem("http") + scheme = path.split("://", 1)[0] + fsspec_file_system = fsspec.filesystem(scheme) ffspec_file = fsspec_file_system.open(path, "rb") open_file = h5py.File(ffspec_file, "r") @@ -566,12 +567,28 @@ def read_nwb(**kwargs): return nwbfile +_REMOTE_SCHEMES = ( + "s3://", # AWS S3 and S3-compatible stores (MinIO, Ceph, Hetzner Object Storage, etc.) 
+ "gs://", # Google Cloud Storage (canonical scheme) + "gcs://", # Google Cloud Storage (alternative scheme) + "az://", # Azure Blob Storage (short form) + "abfs://", # Azure Data Lake Storage Gen2 over HTTP + "abfss://", # Azure Data Lake Storage Gen2 over HTTPS (recommended) + "wasbs://", # Azure WASB (Windows Azure Storage Blob) over HTTPS (legacy) + "http://", # Generic HTTP, including DANDI signed URLs and any HTTP-accessible store + "https://", # Generic HTTPS, same as above with TLS + "ftp://", # FTP + "ftps://", # FTPS (FTP over TLS) +) + + @docval({'name': 'path', 'type': (str, Path), - 'doc': 'Path to the NWB file. Can be either a local filesystem path to ' - 'an HDF5 (.nwb) or Zarr (.zarr) file.'}, + 'doc': ("Path to the NWB file. Can be a local filesystem path to an HDF5 (.nwb) " + "or Zarr (.zarr) file, or a remote URL " + "(`s3://`, `gs://`, `abfs://`, `https://`, etc.).")}, is_method=False) def read_nwb(**kwargs): - """Read an NWB file from a local path. + """Read an NWB file from a local path or remote URL. High-level interface for reading NWB files. Automatically handles both HDF5 and Zarr formats. For advanced use cases (parallel I/O, custom namespaces), @@ -588,7 +605,6 @@ def read_nwb(**kwargs): * Reads any backend (e.g. HDF5 or Zarr) if there is an IO class available. Advanced features requiring direct use of IO classes (e.g. NWBHDF5IO NWBZarrIO) include: - * Streaming data from s3 * Custom namespace extensions * Parallel I/O with MPI * Custom build managers @@ -596,17 +612,33 @@ def read_nwb(**kwargs): * Pre-opened HDF5 file objects or Zarr stores * Remote file access configuration - Example usage reading a local NWB file: + Example usage: .. code-block:: python from pynwb import read_nwb nwbfile = read_nwb("path/to/file.nwb") + nwbfile = read_nwb("s3://bucket/file.nwb") :Returns: pynwb.NWBFile The loaded NWB file object. 
""" path = popargs('path', kwargs) + path_str = str(path) + is_remote = path_str.startswith(_REMOTE_SCHEMES) + + # Remote URL: dispatch by URL shape without probing (can_read cannot reach remote paths) + if is_remote: + path_str = path_str.rstrip("/") + has_zarr_suffix = path_str.endswith(".zarr") + # DANDI publishes some Zarr assets at `<scheme>://dandiarchive/zarr/<uuid>/` with + # no `.zarr` suffix, so suffix matching alone misses them. + is_dandi_zarr = "dandiarchive" in path_str and "/zarr/" in path_str + if has_zarr_suffix or is_dandi_zarr: + from hdmf_zarr import NWBZarrIO + return NWBZarrIO.read_nwb(path=path) + return NWBHDF5IO.read_nwb(path=path) + # HDF5 is always available so we try that first backend_is_hdf5 = NWBHDF5IO.can_read(path=path) if backend_is_hdf5: diff --git a/tests/integration/io/test_read.py b/tests/integration/io/test_read.py index 145e689d0..2cedb2fbf 100644 --- a/tests/integration/io/test_read.py +++ b/tests/integration/io/test_read.py @@ -1,5 +1,6 @@ from pathlib import Path import tempfile +import urllib.request from pynwb import read_nwb from pynwb.testing.mock.file import mock_NWBFile @@ -8,10 +9,16 @@ import unittest try: from hdmf_zarr import NWBZarrIO # noqa f401 - HAVE_NWBZarrIO = True + HAVE_NWBZarrIO = True except ImportError: HAVE_NWBZarrIO = False +try: + import fsspec # noqa: F401 + HAVE_FSSPEC = True +except ImportError: + HAVE_FSSPEC = False + class TestReadNWBMethod(TestCase): """Test suite for the read_nwb function.""" @@ -67,11 +74,50 @@ def test_read_invalid_file(self): with tempfile.TemporaryDirectory() as temp_dir: path = Path(temp_dir) / "test.txt" path.write_text("Not an NWB file") - + expected_message = ( f"Unable to read file: '{path}'. The file is not recognized as either a valid HDF5 or Zarr NWB file. " "Please ensure the file exists and contains valid NWB data." 
) - + with self.assertRaisesWith(ValueError, expected_message): - read_nwb(path=path) \ No newline at end of file + read_nwb(path=path) + + @unittest.skipIf(not HAVE_FSSPEC, "fsspec not installed") + def test_read_nwb_anonymous_remote_hdf5(self): + """Test reading an anonymous public HDF5 NWB file over HTTPS through fsspec.""" + url = ( + "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/" + "11ec8933-1456-4942-922b-94e5878bb991" + ) + try: + urllib.request.urlopen(url, timeout=2) + except urllib.request.URLError: + self.skipTest("Internet access to DANDI failed.") + + nwbfile = read_nwb(path=url) + self.assertEqual(len(nwbfile.acquisition['TestData'].data[:]), 3) + nwbfile.get_read_io().close() + + @unittest.skipIf(not HAVE_NWBZarrIO or not HAVE_FSSPEC, "hdmf-zarr or fsspec not installed") + def test_read_nwb_anonymous_remote_zarr(self): + """Test reading an anonymous public Zarr NWB file from DANDI through fsspec. + + Uses the same DANDI 000719 file as hdmf-zarr's own S3 streaming tutorial (PR #330). + Depends on hdmf-zarr's `resolve_ref` self-reference fix + (https://github.com/hdmf-dev/hdmf-zarr/pull/348); without that fix this read + fails with `PathNotFoundError: nothing found at path ''`. + """ + url = ( + "https://dandiarchive.s3.amazonaws.com/zarr/" + "c8c6b848-fbc6-4f58-85ff-e3f2618ee983/" + ) + try: + urllib.request.urlopen(url + ".zmetadata", timeout=2) + except urllib.request.URLError: + self.skipTest("Internet access to DANDI failed.") + + nwbfile = read_nwb(path=url) + self.assertEqual(nwbfile.identifier, "7208f856-f527-479f-973d-e6e72326a8ea") + self.assertEqual(nwbfile.subject.subject_id, "R6") + nwbfile.get_read_io().close() \ No newline at end of file