diff --git a/.github/workflows/run_dandi_read_tests.yml b/.github/workflows/run_dandi_read_tests.yml index bd1ef0d7b..068a11543 100644 --- a/.github/workflows/run_dandi_read_tests.yml +++ b/.github/workflows/run_dandi_read_tests.yml @@ -1,10 +1,7 @@ name: Run DANDI read tests on: - # NOTE this is disabled until we can run this systematically instead of randomly - # so we don't get constant error notifications and waste compute cycles - # See https://github.com/NeurodataWithoutBorders/pynwb/issues/1804 - # schedule: - # - cron: '0 6 * * *' # once per day at 1am ET + schedule: + - cron: '0 6 * * *' # once per day at 1am ET workflow_dispatch: concurrency: diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b233f4bc..201c880ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## PyNWB 4.0.0 (Upcoming) ### Documentation and tutorial enhancements +- Added a tutorial on using HERD to annotate an NWB file with external resources and store it at `/general/external_resources`, plus a companion example showing how to annotate multiple NWB files streamed from a DANDI dandiset with a single HERD. @rly, @mavaylon1 [#2200](https://github.com/NeurodataWithoutBorders/pynwb/pull/2200) - Added `pandas.ExtensionArray` to `nitpick_ignore` so the Sphinx build does not fail on the unresolved cross-reference that HDMF's `array_data` docval macro renders for every type that accepts array data. @rly [#2209](https://github.com/NeurodataWithoutBorders/pynwb/pull/2209) - Added `app.readthedocs.org/projects/pynwb/*` to `linkcheck_ignore` to stop the Sphinx linkcheck CI job from intermittently failing when GitHub Actions runners get throttled by readthedocs. @h-mayorquin [#2191](https://github.com/NeurodataWithoutBorders/pynwb/pull/2191) - Added documentation for `ExternalImage` to the images tutorial. 
@h-mayorquin [#2159](https://github.com/NeurodataWithoutBorders/pynwb/pull/2159) diff --git a/docs/gallery/general/plot_external_resources.py b/docs/gallery/general/plot_external_resources.py new file mode 100644 index 000000000..6b5a7bb42 --- /dev/null +++ b/docs/gallery/general/plot_external_resources.py @@ -0,0 +1,177 @@ +""" +.. _external_resources: + +Linking to External Resources (HERD) +==================================== + +The :py:class:`~pynwb.resources.HERD` (HDMF External Resources Data Structure) class lets you map +terms used in your data to entities defined in external, web-accessible resources such as +ontologies. For example, you may store a species name ``"Mus musculus"`` on a +:py:class:`~pynwb.file.Subject` and want to link it to the corresponding NCBI Taxonomy term so that +the value is standardized and easy to query. + +From a user's perspective, a HERD can be treated as a single table that associates a ``key`` (a term +used on an ``object``, i.e. a dataset or attribute in the file) with an ``entity`` (a term in an +external resource, identified by an ``entity_id`` and an ``entity_uri``). Internally, HERD stores +this in six interlinked tables (``keys``, ``files``, ``entities``, ``entity_keys``, ``objects``, and +``object_keys``) and provides convenience methods so you rarely need to interact with those tables +directly. + +This tutorial shows how to create a HERD, annotate objects in an NWB file, store the HERD in the +file, and inspect the annotations after reading the file back. For the full HERD API (including +``add_ref_termset`` for validating terms against a :py:class:`~hdmf.term_set.TermSet`, ``get_key``, +and compound-data references), see the +`HDMF HERD tutorial <https://hdmf.readthedocs.io/en/stable/tutorials/plot_external_resources.html>`_. 
+""" + +# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnails_external_resources.png' +from datetime import datetime +from uuid import uuid4 + +from dateutil.tz import tzlocal + +from pynwb import NWBHDF5IO, NWBFile +from pynwb.file import Subject +from pynwb.resources import HERD + +############################################################################### +# Create an NWB file +# ------------------ +# Start with an :py:class:`~pynwb.file.NWBFile` that has a :py:class:`~pynwb.file.Subject`. The +# subject's species is the value we will annotate with an external resource. + +nwbfile = NWBFile( + session_description="a demonstration of external resources", + identifier=str(uuid4()), + session_start_time=datetime(2018, 4, 25, 2, 30, 3, tzinfo=tzlocal()), + subject=Subject(subject_id="001", species="Mus musculus"), +) + +############################################################################### +# Create a HERD and attach it to the file +# --------------------------------------- +# Create a :py:class:`~pynwb.resources.HERD` and assign it to the ``external_resources`` field of the +# :py:class:`~pynwb.file.NWBFile`. + +nwbfile.external_resources = HERD() + +############################################################################### +# Add references with ``add_ref`` +# ------------------------------- +# Use :py:meth:`~hdmf.common.resources.HERD.add_ref` to add a row that links a key on an object to an +# external entity. Here we link the subject's species to the NCBI Taxonomy entry for *Mus musculus*. +# The subject must be part of a file before a reference is added to it. +# +# An entity is identified by an ``entity_id`` and an ``entity_uri``. The ``entity_id`` is a compact +# URI (CURIE) of the form ``prefix:identifier`` whose prefix is registered with +# `bioregistry.io <https://bioregistry.io>`_, such as ``NCBITaxon`` for the NCBI Taxonomy. The +# ``entity_uri`` is the persistent URL the CURIE resolves to, which you can look up at +# ``https://bioregistry.io/``. 
+ +nwbfile.external_resources.add_ref( + container=nwbfile.subject, + key=nwbfile.subject.species, + entity_id="NCBITaxon:10090", + entity_uri="http://purl.obolibrary.org/obo/NCBITaxon_10090", +) + +############################################################################### +# References can also point to an attribute of an object, such as a column of a table. Here we record +# the brain region of a set of electrodes in the electrodes table and link the region to the +# corresponding structure in the +# `Allen Mouse Brain Atlas <https://atlas.brain-map.org/>`_. When the target is a column, pass the +# table as the ``container`` and the column name as the ``attribute``; HERD resolves the reference to +# the column object itself. + +device = nwbfile.create_device(name="probe") +electrode_group = nwbfile.create_electrode_group( + name="shank0", + description="a shank of the recording probe", + location="VISp", + device=device, +) +for _ in range(4): + nwbfile.add_electrode(location="VISp", group=electrode_group) + +nwbfile.external_resources.add_ref( + container=nwbfile.electrodes, + attribute="location", + key="VISp", + entity_id="MBA:385", + entity_uri="https://purl.brain-bican.org/ontology/mbao/MBA_385", +) + +############################################################################### +# Inspect the HERD +# ---------------- +# :py:meth:`~hdmf.common.resources.HERD.to_dataframe` flattens the interlinked tables into a single +# :py:class:`~pandas.DataFrame`, with one row per (object, key, entity) association. + +nwbfile.external_resources.to_dataframe() + +############################################################################### +# You can also view the individual tables. Each is a +# :py:class:`~hdmf.common.table.DynamicTable` and has its own ``to_dataframe`` method. 
+ +nwbfile.external_resources.keys.to_dataframe() + +############################################################################### + +nwbfile.external_resources.entities.to_dataframe() + +############################################################################### +# :py:meth:`~hdmf.common.resources.HERD.get_object_type` returns all annotations for objects of a +# given type, for example every annotated :py:class:`~pynwb.file.Subject`. + +nwbfile.external_resources.get_object_type(object_type="Subject") + +############################################################################### +# Write and read the NWB file +# --------------------------- +# Writing the file stores the HERD inside it. Reading the file back makes the HERD available again +# through the ``external_resources`` field. + +filename = "external_resources_tutorial.nwb" +with NWBHDF5IO(filename, mode="w") as io: + io.write(nwbfile) + +read_io = NWBHDF5IO(filename, mode="r") +read_nwbfile = read_io.read() +read_herd = read_nwbfile.external_resources + +############################################################################### +# Access the loaded data +# ----------------------- +# The loaded HERD provides the same accessors as before. In a Jupyter notebook, displaying the HERD +# renders the flattened references as a table, and +# :py:meth:`~hdmf.common.resources.HERD.to_dataframe` returns that same table as a +# :py:class:`~pandas.DataFrame`. The individual tables give a more focused view. + +read_herd.to_dataframe() + +############################################################################### +# View the individual tables, for example: + +read_herd.keys.to_dataframe() + +############################################################################### +# :py:meth:`~hdmf.common.resources.HERD.get_object_entities` returns the entities annotated on a +# single object as a :py:class:`~pandas.DataFrame`. 
Here we view the species annotation stored for +# the subject: + +read_herd.get_object_entities(container=read_nwbfile.subject) + +############################################################################### +# Close the file once you are done reading from it. + +read_io.close() + +############################################################################### +# Alternative: store a HERD outside an NWB file +# --------------------------------------------- +# A HERD can also be saved independently of an NWB file as a zip archive of the underlying tables +# using :py:meth:`~hdmf.common.resources.HERD.to_zip`, and read back with +# :py:meth:`~hdmf.common.resources.HERD.from_zip`. This is useful when external resources span +# multiple files; see :ref:`external_resources_streaming` for an example that annotates many NWB +# files with a single HERD. For the full HERD API, see the +# `HDMF HERD tutorial <https://hdmf.readthedocs.io/en/stable/tutorials/plot_external_resources.html>`_. diff --git a/docs/gallery/general/resources_streaming.py b/docs/gallery/general/resources_streaming.py new file mode 100644 index 000000000..17ba33b07 --- /dev/null +++ b/docs/gallery/general/resources_streaming.py @@ -0,0 +1,154 @@ +""" +.. _external_resources_streaming: + +Annotating Multiple Streamed NWB Files with a Single HERD +========================================================= + +A single :py:class:`~pynwb.resources.HERD` can hold external resource references for many +:py:class:`~pynwb.file.NWBFile` objects at once. This makes it possible to build a shared set of +ontology annotations across an entire dataset, for example every file in a +`DANDI <https://dandiarchive.org>`_ dandiset. + +This example streams each NWB file in a dandiset directly from the DANDI Archive (without +downloading the full files) and adds references for two pieces of metadata in each file: the +subject species (mapped to the `NCBI Taxonomy <https://www.ncbi.nlm.nih.gov/taxonomy>`_) and the +experimenter (mapped to an `ORCID <https://orcid.org>`_ iD). 
Because a HERD can be saved +independently of any one file with :py:meth:`~hdmf.common.resources.HERD.to_zip`, the resulting +HERD can be distributed alongside the dandiset as a standalone annotation layer and later reloaded +with :py:meth:`~hdmf.common.resources.HERD.from_zip` to add further annotations. + +For storing a HERD inside a single NWB file, see :ref:`external_resources`. + +.. note:: + + This example reads data over the network and is not run when the documentation is built. To run + it yourself, install the ``dandi``, ``fsspec``, ``aiohttp``, and ``requests`` packages: + + .. code-block:: bash + + pip install dandi fsspec aiohttp requests +""" + +# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnails_streaming_external_resources.png' +import h5py +from dandi.dandiapi import DandiAPIClient +from fsspec import filesystem +from fsspec.implementations.cached import CachingFileSystem +from tqdm import tqdm + +from pynwb import NWBHDF5IO +from pynwb.resources import HERD + +############################################################################### +# Collect the file URLs from DANDI +# -------------------------------- +# Use the :py:class:`~dandi.dandiapi.DandiAPIClient` to list the S3 URL of every NWB file in a +# dandiset. Here we use dandiset `000015 <https://dandiarchive.org/dandiset/000015>`_. + +dandiset_id = "000015" +with DandiAPIClient() as client: + dandiset = client.get_dandiset(dandiset_id, "draft") + urls = [ + asset.get_content_url(follow_redirects=1, strip_query=True) + for asset in dandiset.get_assets() + ] + +############################################################################### +# Set up streaming +# ---------------- +# Create an HTTP filesystem with a local cache so repeated reads do not re-download data. 
+ +fs = CachingFileSystem(fs=filesystem("http"), cache_storage="nwb-cache") + +############################################################################### +# Populate a single HERD across all files +# --------------------------------------- +# Open each file in read mode and add references for its subject species and experimenter. Checking +# the value read from each file before annotating it keeps a file with unexpected metadata from being +# mislabeled. Passing the same ``entity_id`` across files reuses the existing entity instead of +# creating a duplicate. +# +# Each entity is identified by an ``entity_id``, a compact URI (CURIE) whose prefix is registered with +# `bioregistry.io <https://bioregistry.io>`_, and an ``entity_uri``, the persistent URL the CURIE +# resolves to. + +herd = HERD() +for url in tqdm(urls): + with fs.open(url, "rb") as f, h5py.File(f) as h5_file: + with NWBHDF5IO(file=h5_file) as io: + read_nwbfile = io.read() + + # reference the subject species + species = read_nwbfile.subject.species + if species == "Mus musculus": + herd.add_ref( + container=read_nwbfile.subject, + key=species, + entity_id="NCBITaxon:10090", + entity_uri="http://purl.obolibrary.org/obo/NCBITaxon_10090", + ) + else: + print(f"Unexpected species: {species}") + + # reference the experimenter, an attribute of the NWBFile itself + experimenter = read_nwbfile.experimenter[0] + if experimenter == "Chen, Tsai-Wen": + herd.add_ref( + container=read_nwbfile, + attribute="experimenter", + key=experimenter, + entity_id="ORCID:0000-0001-6782-3819", + entity_uri="https://orcid.org/0000-0001-6782-3819", + ) + else: + print(f"Unexpected experimenter: {experimenter}") + +############################################################################### +# Inspect and save the combined HERD +# ---------------------------------- +# The flattened table now contains one row per (file, object, key, entity) association across all of +# the streamed files. 
Save the HERD as a standalone zip archive that can be shared alongside the +# dandiset. + +herd.to_dataframe() +herd.to_zip(path="./dandiset_resources.zip") + +############################################################################### +# Load an external HERD to annotate a file +# ---------------------------------------- +# A HERD saved to a zip archive can be loaded later with +# :py:meth:`~hdmf.common.resources.HERD.from_zip` and used to add further annotations. Here we load +# the HERD we just saved, stream one of the files again, and annotate its institution with the +# corresponding `Research Organization Registry (ROR) <https://ror.org>`_ identifier. + +loaded_herd = HERD.from_zip(path="./dandiset_resources.zip") + +with fs.open(urls[0], "rb") as f, h5py.File(f) as h5_file: + with NWBHDF5IO(file=h5_file) as io: + read_nwbfile = io.read() + institution = read_nwbfile.institution + if institution == "Janelia Research Campus": + loaded_herd.add_ref( + container=read_nwbfile, + attribute="institution", + key=institution, + entity_id="ROR:013sk6x84", + entity_uri="https://ror.org/013sk6x84", + ) + else: + print(f"Unexpected institution: {institution}") + +loaded_herd.to_dataframe() + +############################################################################### +# To view the annotations for a single object, use +# :py:meth:`~hdmf.common.resources.HERD.get_object_entities`. Here we view the species annotation +# stored for the subject of the file we just streamed: + +loaded_herd.get_object_entities(container=read_nwbfile.subject) + +############################################################################### +# Save the updated HERD as a new zip archive so the added institution annotation is persisted +# alongside the original references. 
+ +loaded_herd.to_zip(path="./dandiset_resources_updated.zip") diff --git a/docs/source/conf.py b/docs/source/conf.py index 07e9ded6c..190ce5a91 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -77,7 +77,18 @@ class CustomSphinxGallerySectionSortKey(ExampleTitleSortKey): # listed here will be added in alphabetical order based on title after the # explicitly listed galleries GALLERY_ORDER = { - 'general': ['plot_file.py'], + 'general': [ + "plot_file.py", + "add_remove_containers.py", + "plot_timeintervals.py", + "scratch.py", + "extensions.py", + "plot_configurator.py", + "object_id.py", + "plot_read_basics.py", + "plot_external_resources.py", + "resources_streaming.py", + ], # Sort domain-specific tutorials based on domain to group tutorials belonging to the same domain 'domain': [ "ecephys.py", diff --git a/docs/source/figures/gallery_thumbnails.pptx b/docs/source/figures/gallery_thumbnails.pptx index 765cf2e2a..0c5ffdc02 100644 Binary files a/docs/source/figures/gallery_thumbnails.pptx and b/docs/source/figures/gallery_thumbnails.pptx differ diff --git a/docs/source/figures/gallery_thumbnails_external_resources.png b/docs/source/figures/gallery_thumbnails_external_resources.png new file mode 100644 index 000000000..d84a77803 Binary files /dev/null and b/docs/source/figures/gallery_thumbnails_external_resources.png differ diff --git a/docs/source/figures/gallery_thumbnails_streaming_external_resources.png b/docs/source/figures/gallery_thumbnails_streaming_external_resources.png new file mode 100644 index 000000000..ad04e22ab Binary files /dev/null and b/docs/source/figures/gallery_thumbnails_streaming_external_resources.png differ diff --git a/test.py b/test.py index 9d51e3b76..1b143a4fb 100644 --- a/test.py +++ b/test.py @@ -86,6 +86,12 @@ def _import_from_file(script): os.path.join('advanced_io', 'streaming.py'), ] +# examples that stream data from the DANDI Archive. 
these are excluded from the offline example tests +# and run separately by tests/read_dandi/read_dandi.py (the "Run DANDI read tests" workflow). +dandi_read_examples = [ + os.path.join('general', 'resources_streaming.py'), +] + def run_example_tests(): """Run the Sphinx gallery example files, excluding ROS3-dependent ones, to check for errors.""" @@ -95,7 +101,7 @@ def run_example_tests(): for f in files: if f.endswith(".py"): name_with_parent_dir = os.path.join(os.path.basename(root), f) - if name_with_parent_dir in ros3_examples: + if name_with_parent_dir in ros3_examples or name_with_parent_dir in dandi_read_examples: logging.info("Skipping %s" % name_with_parent_dir) continue examples_scripts.append(os.path.join(root, f)) @@ -284,6 +290,7 @@ def clean_up_tests(): "exported_nwbfile.nwb", "external_linkcontainer_example.nwb", "external_linkdataset_example.nwb", + "external_resources_tutorial.nwb", "external1_example.nwb", "external2_example.nwb", "icephys_example.nwb", diff --git a/tests/read_dandi/read_dandi.py b/tests/read_dandi/read_dandi.py new file mode 100644 index 000000000..90f1d3429 --- /dev/null +++ b/tests/read_dandi/read_dandi.py @@ -0,0 +1,36 @@ +"""Entry point for the DANDI read tests. + +Reads NWB files from the DANDI Archive and runs the DANDI streaming HERD tutorial as a smoke test. +Run by the "Run DANDI read tests" GitHub Actions workflow. 
+""" +import os +import runpy +import shutil + +from read_first_nwb_asset import read_first_nwb_asset + +# the streaming HERD tutorial, excluded from the offline example tests because it streams from DANDI +STREAMING_EXAMPLE = os.path.join( + os.path.dirname(__file__), "..", "..", "docs", "gallery", "general", "resources_streaming.py" +) + +# files the streaming tutorial writes to the current directory +STREAMING_ARTIFACTS = ("dandiset_resources.zip", "dandiset_resources_updated.zip") +STREAMING_CACHE_DIR = "nwb-cache" + + +def run_streaming_example(): + """Run the DANDI streaming HERD tutorial and remove the files it generates, even if the tutorial raises.""" + try: + runpy.run_path(STREAMING_EXAMPLE, run_name="__main__") + finally: # clean up whether or not the tutorial ran to completion + for name in STREAMING_ARTIFACTS: + if os.path.exists(name): + os.remove(name) + if os.path.isdir(STREAMING_CACHE_DIR): + shutil.rmtree(STREAMING_CACHE_DIR) + + +if __name__ == "__main__": + read_first_nwb_asset() + run_streaming_example() diff --git a/tests/unit/test_resources.py b/tests/unit/test_resources.py index 3e340c906..e7a36d882 100644 --- a/tests/unit/test_resources.py +++ b/tests/unit/test_resources.py @@ -1,6 +1,5 @@ import os import tempfile -import warnings from datetime import datetime from uuid import uuid4 @@ -52,14 +51,8 @@ def test_constructor(self): """ Test constructor """ - with warnings.catch_warnings(record=True): - warnings.filterwarnings( - "ignore", - message=r"HERD is experimental .*", - category=UserWarning, - ) - er = HERD() - self.assertIsInstance(er, HERD) + er = HERD() + self.assertIsInstance(er, HERD) def test_nwbfile_init_herd(self): session_start_time = datetime(2018, 4, 25, 2, 30, 3, tzinfo=tz.gettz("US/Pacific"))