From 802bdaec4e3034b70d52a320e5f92e9af6b85559 Mon Sep 17 00:00:00 2001
From: Dan Birman <danbirman@gmail.com>
Date: Mon, 1 Jun 2026 15:46:50 -0700
Subject: [PATCH 1/2] feat: new swdb tables

---
 README.md                                     |  21 +
 scripts/build_swdb_metadata.py                |  33 +
 src/zombie_squirrel/__init__.py               |   1 +
 src/zombie_squirrel/acorn_helpers/__init__.py |   1 +
 .../acorn_helpers/swdb_metadata.py            | 905 ++++++++++++++++++
 src/zombie_squirrel/acorns.py                 |   1 +
 tests/acorn_helpers/test_swdb_metadata.py     | 261 +++++
 7 files changed, 1223 insertions(+)
 create mode 100644 scripts/build_swdb_metadata.py
 create mode 100644 src/zombie_squirrel/acorn_helpers/swdb_metadata.py
 create mode 100644 tests/acorn_helpers/test_swdb_metadata.py

diff --git a/README.md b/README.md
index 02c71a5..8851231 100644
--- a/README.md
+++ b/README.md
@@ -55,9 +55,30 @@ project_names = unique_project_names()
 | `metadata_core` | Presence of core aind-data-schema metadata files per asset (True if file is not null) | `s3://allen-data-views/data-asset-cache/zs_metadata_core.pqt` | metadata | False | `_id`, `_last_modified`, `subject`, `data_description`, `procedures`, `instrument`, `acquisition`, `processing`, `quality_control` |
 | `foraging_sessions` | Foraging behavior sessions with key performance metrics, one row per session | `s3://allen-data-views/data-asset-cache/zs_foraging_sessions.pqt` | metadata | False | `subject_id`, `session_date`, `session`, `nwb_suffix`, `rig`, `trainer`, `task`, `curriculum_name`, `curriculum_version`, `current_stage_actual`, `foraging_eff`, `foraging_eff_random_seed`, `finished_trials`, `finished_rate`, `total_trials`, `bias_naive` |
 | `behavior_curriculum` | Behavior assets with curriculum name and stage, one row per behavior asset | `s3://allen-data-views/data-asset-cache/zs_behavior_curriculum.pqt` | asset | False | `asset_name`, `curriculum_name`, `stage_name`, `stage_node_id` |
+| `swdb_metadata` | Per-project metadata tables for SWDB datasets, one row per data asset | `s3://allen-data-views/data-asset-cache/zs_swdb_metadata/` | metadata | True (by `dataset`) | See dataset-specific columns below |
 
 The `raw_to_derived` function is not a table stored in S3, instead it is used by passing an asset_name (or list of asset names) and a modality. The function returns the latest derived asset matching the requested pattern.
 
+#### swdb_metadata datasets
+
+`swdb_metadata` is parameterized by `dataset`. Available values:
+
+| Dataset | Project filter | Columns |
+| ------- | -------------- | ------- |
+| `v1dd` | `data_description.project_name = "V1 Deep Dive"` | `project_name`, `_id`, `name`, `subject_id`, `golden_mouse`, `genotype`, `date_of_birth`, `sex`, `modality`, `session_date`, `age`, `session_time`, `column`, `volume` |
+| `bci` | `acquisition.acquisition_type = "BCI single neuron stim"`, `data_description.data_level = derived`, `processing.processing_pipeline.data_processes.start_date_time >= 2025-08-03`; assets listed in `BCI_PROBLEM_ASSETS` are excluded | `project_name`, `session_type`, `_id`, `name`, `subject_id`, `genotype`, `virus`, `date_of_birth`, `sex`, `modality`, `session_date`, `age`, `session_time`, `targeted_structure`, `ophys_fov`, `session_number` |
+| `dynamic_foraging` | `data_description.project_name = "Behavior Platform"`, `data_description.data_level = derived`, `session.session_start_time` in 2025, modality not `ecephys` or `fib`, `procedures` present, at least one QC evaluation and all evaluations passing | `project_name`, `name`, `subject_id`, `genotype`, `date_of_birth`, `sex`, `modality`, `session_type`, `session_date`, `age`, `session_time`, `trials_total`, `trials_rewarded` |
+| `np_ultra` | `project_name = "NP Ultra and Psychedelics"`, `data_level = derived` | `project_name`, `_id`, `name`, `subject_id`, `genotype`, `date_of_birth`, `sex`, `modality`, `session_date`, `age`, `session_time`, `session_type`, `stimulus_types`, `notes` |
+
+```python
+from zombie_squirrel import swdb_metadata
+
+df = swdb_metadata("v1dd")
+df = swdb_metadata("bci")
+df = swdb_metadata("dynamic_foraging")
+df = swdb_metadata("np_ultra")
+```
+
 ### Custom acorn
 
 The `custom` function allows you to store and retrieve your own user-defined DataFrames in the cache by name. This requires write authentication to the active backend.
diff --git a/scripts/build_swdb_metadata.py b/scripts/build_swdb_metadata.py
new file mode 100644
index 0000000..506d874
--- /dev/null
+++ b/scripts/build_swdb_metadata.py
@@ -0,0 +1,33 @@
+"""Build SWDB metadata tables and upload to S3.
+
+Usage:
+    python scripts/build_swdb_metadata.py [--dataset v1dd|bci|dynamic_foraging|np_ultra]
+"""
+
+import argparse
+import logging
+
+from zombie_squirrel.acorn_helpers.swdb_metadata import DATASETS
+from zombie_squirrel.acorns import ACORN_REGISTRY
+
+
+def main():
+    """Parse the CLI arguments and force-rebuild each selected swdb_metadata dataset."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--dataset", choices=DATASETS, help="Build only this dataset")
+    selected = cli.parse_args().dataset
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    build = ACORN_REGISTRY["swdb_metadata"]
+    # Without --dataset (selected is None) every entry of DATASETS is rebuilt.
+    names = DATASETS if not selected else [selected]
+
+    for name in names:
+        logging.info(f"Building swdb_metadata/{name}...")
+        logging.info(f"  Done: {len(build(dataset=name, force_update=True))} rows")
+
+    logging.info("Done.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/zombie_squirrel/__init__.py b/src/zombie_squirrel/__init__.py
index 3d729a7..f1b8c47 100644
--- a/src/zombie_squirrel/__init__.py
+++ b/src/zombie_squirrel/__init__.py
@@ -27,4 +27,5 @@
 from zombie_squirrel.acorn_helpers.unique_subject_ids import (  # noqa: F401
     unique_subject_ids,
 )
+from zombie_squirrel.acorn_helpers.swdb_metadata import swdb_metadata  # noqa: F401
 from zombie_squirrel.utils import get_squirrel_info  # noqa: F401
diff --git a/src/zombie_squirrel/acorn_helpers/__init__.py b/src/zombie_squirrel/acorn_helpers/__init__.py
index 52c2bc8..ebb81bf 100644
--- a/src/zombie_squirrel/acorn_helpers/__init__.py
+++ b/src/zombie_squirrel/acorn_helpers/__init__.py
@@ -9,6 +9,7 @@
     qc,
     raw_to_derived,
     source_data,
+    swdb_metadata,
     unique_genotypes,
     unique_project_names,
     unique_subject_ids,
diff --git a/src/zombie_squirrel/acorn_helpers/swdb_metadata.py b/src/zombie_squirrel/acorn_helpers/swdb_metadata.py
new file mode 100644
index 0000000..afaab7c
--- /dev/null
+++ b/src/zombie_squirrel/acorn_helpers/swdb_metadata.py
@@ -0,0 +1,905 @@
+"""SWDB metadata acorn: per-project metadata tables for the Summer Workshop on the Dynamic Brain."""
+
+import logging
+from datetime import datetime
+
+import pandas as pd
+from aind_data_access_api.document_db import MetadataDbClient
+
+import zombie_squirrel.acorns as acorns
+from zombie_squirrel.squirrel import Column
+from zombie_squirrel.utils import SquirrelMessage, setup_logging
+
+DATASETS = ["v1dd", "bci", "dynamic_foraging", "np_ultra"]  # valid values for swdb_metadata's dataset argument
+
+BATCH_SIZE = 50  # number of _id values per "$in" query in _fetch_records
+
+BCI_PROBLEM_ASSETS = [  # asset names that _build_bci removes from the BCI table
+    "single-plane-ophys_731015_2025-01-28_17-40-57_processed_2025-08-04_04-38-08",
+    "single-plane-ophys_772414_2025-02-04_13-21-29_processed_2025-08-12_06-14-42",
+]
+
+NP_ULTRA_SALINE_EPOCHS = [  # epoch names _build_np_ultra assigns to sessions it labels "saline"
+    "Spontaneous_0", "RFMapping_0", "OptoTagging_0", "Injection",
+    "Spontaneous_1", "RFMapping_1", "OptoTagging_1",
+    "Spontaneous_2", "RFMapping_2", "OptoTagging_2", "Anesthesia",
+    "Spontaneous_3", "RFMapping_3", "Spontaneous_4",
+]
+
+NP_ULTRA_PSILO_EPOCHS = [  # epoch names _build_np_ultra assigns to sessions it labels "psilocybin"
+    "Spontaneous_0", "RFMapping_0", "OptoTagging_0", "Injection",
+    "Spontaneous_1", "RFMapping_1", "OptoTagging_1",
+    "Spontaneous_2", "RFMapping_2", "OptoTagging_2",
+]
+
+DATASET_FILTERS = {  # filter_query per dataset; _build passes the entry to _get_ids
+    "v1dd": {
+        "data_description.project_name": "V1 Deep Dive",
+    },
+    "bci": {
+        "acquisition.acquisition_type": "BCI single neuron stim",
+        "data_description.data_level": "derived",
+        "processing.processing_pipeline.data_processes.start_date_time": {"$gte": "2025-08-03"},
+    },
+    "dynamic_foraging": {
+        "session.session_start_time": {"$regex": "^2025"},  # start-time string begins with "2025"
+        "data_description.modality.abbreviation": {"$nin": ["ecephys", "fib"]},
+        "data_description.data_level": "derived",
+        "data_description.project_name": "Behavior Platform",
+        "procedures": {"$ne": None},
+        "$and": [
+            {"quality_control.evaluations": {"$exists": True, "$ne": []}},  # field present and not an empty list
+            {  # and no evaluation whose latest_status differs from "Pass"
+                "quality_control.evaluations": {
+                    "$not": {"$elemMatch": {"latest_status": {"$ne": "Pass"}}}
+                }
+            },
+        ],
+    },
+    "np_ultra": {
+        "data_description.project_name": "NP Ultra and Psychedelics",
+        "data_description.data_level": "derived",
+    },
+}
+
+
+@acorns.register_acorn(acorns.NAMES["swdb"])
+def swdb_metadata(dataset: str, force_update: bool = False) -> pd.DataFrame:
+    """Return the cached SWDB metadata table for a dataset, rebuilding it on request.
+
+    The table is read from the cache under ``swdb_metadata/<dataset>``. With
+    force_update the table is rebuilt via _build and, if non-empty, written back.
+
+    Args:
+        dataset: One of 'v1dd', 'bci', 'dynamic_foraging', 'np_ultra'.
+        force_update: If True, bypass cache and rebuild from database.
+
+    Returns:
+        DataFrame with columns specific to the requested dataset.
+
+    Raises:
+        ValueError: If dataset is not recognized or cache is empty without force_update.
+    """
+    if dataset not in DATASETS:
+        raise ValueError(f"Unknown dataset '{dataset}'. Must be one of {DATASETS}.")
+
+    cache_key = f"swdb_metadata/{dataset}"
+    cached = acorns.TREE.scurry(cache_key)
+
+    if not force_update:
+        # Only a forced call may rebuild; an empty cache is an error otherwise.
+        if cached.empty:
+            raise ValueError(f"Cache is empty for dataset '{dataset}'. Use force_update=True to rebuild.")
+        return cached
+
+    setup_logging()
+    announcement = SquirrelMessage(
+        tree=acorns.TREE.__class__.__name__,
+        acorn=acorns.NAMES["swdb"],
+        message=f"Building SWDB metadata for '{dataset}'",
+    )
+    logging.info(announcement.to_json())
+    rebuilt = _build(dataset)
+    if not rebuilt.empty:
+        acorns.TREE.hide(cache_key, rebuilt)
+    return rebuilt
+
+
+def _build(dataset: str) -> pd.DataFrame:
+    """Query DocDB for a dataset's records and turn them into its metadata table."""
+    docdb = MetadataDbClient(host=acorns.API_GATEWAY_HOST, version="v2")
+    matching_ids = _get_ids(docdb, DATASET_FILTERS[dataset])
+    if len(matching_ids) == 0:
+        # Nothing matched the dataset filter; skip the full-record fetch.
+        return pd.DataFrame()
+    full_records = _fetch_records(docdb, matching_ids, dataset)
+    # Dataset name -> function that flattens full records into a DataFrame.
+    builders = {
+        "v1dd": _build_v1dd,
+        "bci": _build_bci,
+        "dynamic_foraging": _build_dynamic_foraging,
+        "np_ultra": _build_np_ultra,
+    }
+    builder = builders.get(dataset)
+    if builder is None:
+        return pd.DataFrame()
+    return builder(full_records)
+
+
+def _get_ids(client: MetadataDbClient, filter_query: dict) -> list[str]:
+    """Return the _id of every record that matches filter_query."""
+    # Only _id is projected here; the full documents are fetched later in batches.
+    # NOTE(review): limit=0 is presumably "no limit" for this client -- confirm.
+    id_projection = {"_id": 1}
+    matches = client.retrieve_docdb_records(filter_query=filter_query, projection=id_projection, limit=0)
+    ids = [doc["_id"] for doc in matches]
+    return ids
+
+
+def _fetch_records(client: MetadataDbClient, ids: list[str], dataset: str) -> list[dict]:
+    """Download the full records for ids, querying BATCH_SIZE ids at a time and logging progress."""
+    n_batches = -(-len(ids) // BATCH_SIZE)  # ceiling division
+    fetched = []
+    for batch_no, start in enumerate(range(0, len(ids), BATCH_SIZE), start=1):
+        chunk = ids[start: start + BATCH_SIZE]
+        progress = SquirrelMessage(
+            tree=acorns.TREE.__class__.__name__,
+            acorn=acorns.NAMES["swdb"],
+            message=f"Fetching {dataset} batch {batch_no} / {n_batches}",
+        )
+        logging.info(progress.to_json())
+        # dataset is used for the log message only; the query is driven by ids.
+        fetched += client.retrieve_docdb_records(
+            filter_query={"_id": {"$in": chunk}},
+            limit=0,
+        )
+    return fetched
+
+
+def _get(obj: dict, *path, default=None):
+    """Walk obj along path key by key; give back default on a non-dict, missing key, or None value."""
+    node = obj
+    for step in path:
+        # .get() yields None for an absent key, so absent and explicit None are treated alike.
+        node = node.get(step) if isinstance(node, dict) else None
+        if node is None:
+            return default
+    return node
+
+
+def _first_modality_name(record: dict) -> str | None:
+    """Return the name of the first modality entry, trying data_description.modality then .modalities."""
+    lists = (_get(record, "data_description", key, default=[]) or [] for key in ("modality", "modalities"))
+    heads = (entries[0] for entries in lists if entries)
+    # The first dict-shaped head decides the result, even when it has no "name" key.
+    first_dict = next((head for head in heads if isinstance(head, dict)), None)
+    return None if first_dict is None else first_dict.get("name")
+
+
+def _to_datetime(x) -> datetime | None:
+    """Coerce None/NaN, a datetime, or an ISO-8601 string to datetime (or None); a 'Z' suffix is read as UTC."""
+    if x is None or (isinstance(x, float) and pd.isna(x)):
+        return None
+    if isinstance(x, datetime):
+        return x
+    return datetime.fromisoformat(str(x).replace("Z", "+00:00"))  # bare 'Z' is rejected before Python 3.11
+
+
+def _parse_dates(df: pd.DataFrame) -> pd.DataFrame:
+    """Drop rows lacking session_time, split it into session_date/session_time, parse date_of_birth, add age (days)."""
+    df = df.copy()
+    df = df.dropna(subset=["session_time"]).reset_index(drop=True)
+    parsed = df["session_time"].apply(_to_datetime)
+    df["session_date"] = parsed.apply(lambda x: x.date() if x is not None else None)
+    df["session_time"] = parsed.apply(lambda x: x.time() if x is not None else None)
+    df["date_of_birth"] = df["date_of_birth"].apply(
+        lambda x: datetime.strptime(x, "%Y-%m-%d").date()
+        if x and not (isinstance(x, float) and pd.isna(x))
+        else None
+    )
+    # Built with zip rather than df.apply(axis=1): on a frame with no rows, apply
+    # returns a DataFrame, which cannot be assigned to the single "age" column.
+    df["age"] = [
+        (session - born).days if pd.notna(session) and pd.notna(born) else None
+        for session, born in zip(df["session_date"], df["date_of_birth"])
+    ]
+    return df
+
+
+def _reorder(df: pd.DataFrame, order: list[str]) -> pd.DataFrame:
+    """Return df restricted to the columns named in order that it actually has, in that order."""
+    return df.loc[:, [name for name in order if name in df.columns]]
+
+
+def _build_v1dd(records: list[dict]) -> pd.DataFrame:
+    """Flatten V1 Deep Dive records into one row per record with parsed dates and numeric tags."""
+    rows = []
+    for rec in records:
+        tag_list = _get(rec, "data_description", "tags", default=[]) or []
+        modality_list = _get(rec, "data_description", "modalities", default=[]) or []
+        rows.append(
+            {
+                "_id": rec["_id"],
+                "name": rec.get("name"),
+                "subject_id": _get(rec, "data_description", "subject_id"),
+                "genotype": _get(rec, "subject", "subject_details", "genotype"),
+                "date_of_birth": _get(rec, "subject", "subject_details", "date_of_birth"),
+                "sex": _get(rec, "subject", "subject_details", "sex"),
+                "session_time": _get(rec, "acquisition", "acquisition_start_time"),
+                "project_name": _get(rec, "data_description", "project_name"),
+                "modality": [m.get("name") for m in modality_list if isinstance(m, dict)],
+                "column": None if len(tag_list) < 1 else tag_list[0],
+                "volume": None if len(tag_list) < 2 else tag_list[1],
+            }
+        )
+
+    if not rows:
+        return pd.DataFrame(rows)
+    df = _parse_dates(pd.DataFrame(rows))
+    # Keep only the integer after the last space of each tag; empty tags become None.
+    for tag_col in ("column", "volume"):
+        df[tag_col] = df[tag_col].apply(lambda x: int(x.split(" ")[-1]) if x else None)
+    # True only for subject 409828.
+    df["golden_mouse"] = df["subject_id"] == "409828"
+
+    order = [
+        "project_name", "_id", "name", "subject_id", "golden_mouse", "genotype",
+        "date_of_birth", "sex", "modality", "session_date", "age", "session_time",
+        "column", "volume",
+    ]
+    return _reorder(df, order)
+
+
+def _extract_bci_virus(record: dict) -> str | None:
+    """Return the name of the first named injection material found in the subject procedures."""
+    surgeries = _get(record, "procedures", "subject_procedures", default=[]) or []
+    steps = (step for surgery in surgeries for step in surgery.get("procedures", []) or [])
+    materials = (mat for step in steps for mat in step.get("injection_materials", []) or [])
+    names = (mat["name"] for mat in materials if isinstance(mat, dict) and mat.get("name"))
+    # Generators keep the search lazy, so it stops at the first match like a nested loop would.
+    return next(names, None)
+
+
+def _extract_bci_targeted_structure(record: dict) -> str | None:
+    """Return the first truthy stack_parameters.targeted_structure among session.data_streams."""
+    streams = _get(record, "session", "data_streams", default=[]) or []
+    structures = (_get(stream, "stack_parameters", "targeted_structure") for stream in streams)
+    # Falsy values (None, "") are skipped; None is returned when no stream has one.
+    first_hit = next((s for s in structures if s), None)
+    return first_hit
+
+
+def _extract_bci_ophys_fov(record: dict) -> str | None:
+    """Return the first non-empty notes value of any ophys_fovs entry in session.data_streams."""
+    streams = _get(record, "session", "data_streams", default=[]) or []
+    fovs = (fov for stream in streams for fov in stream.get("ophys_fovs", []) or [])
+    # Non-dict FOV entries carry no notes and are ignored.
+    notes = (fov.get("notes") for fov in fovs if isinstance(fov, dict))
+    non_empty = (note for note in notes if note)
+    return next(non_empty, None)
+
+
+def _build_bci(records: list[dict]) -> pd.DataFrame:
+    """Extract BCI single neuron stim fields from full records, one row per asset name."""
+    rows = []
+    for record in records:
+        # Keep only dict epochs so one malformed entry cannot break the whole build.
+        epochs = [e for e in _get(record, "session", "stimulus_epochs", default=[]) or [] if isinstance(e, dict)]
+        session_number = next(
+            (e.get("session_number") for e in epochs if e.get("stimulus_name") == "single neuron BCI conditioning"),
+            None,
+        )
+        row = {
+            "_id": record["_id"],
+            "name": record.get("name"),
+            "subject_id": _get(record, "data_description", "subject_id"),
+            "genotype": _get(record, "subject", "genotype"),
+            "virus": _extract_bci_virus(record),
+            "date_of_birth": _get(record, "subject", "date_of_birth"),
+            "sex": _get(record, "subject", "sex"),
+            "session_type": _get(record, "acquisition", "acquisition_type"),
+            "session_time": _get(record, "acquisition", "acquisition_start_time"),
+            "stimulus_epochs": [e.get("stimulus_name") for e in epochs],
+            "project_name": _get(record, "data_description", "project_name"),
+            "modality": _first_modality_name(record),
+            "targeted_structure": _extract_bci_targeted_structure(record),
+            "ophys_fov": _extract_bci_ophys_fov(record),
+            "session_number": session_number,
+        }
+        rows.append(row)
+
+    df = pd.DataFrame(rows)
+    if df.empty:
+        return df
+
+    df = df.drop_duplicates(subset="name")
+    df = df[~df["name"].isin(BCI_PROBLEM_ASSETS)]
+    df = _parse_dates(df)
+    order = [
+        "project_name", "session_type", "_id", "name", "subject_id", "genotype", "virus",
+        "date_of_birth", "sex", "modality", "session_date", "age", "session_time",
+        "targeted_structure", "ophys_fov", "session_number",
+    ]
+    return _reorder(df, order)
+
+
+def _build_dynamic_foraging(records: list[dict]) -> pd.DataFrame:
+    """Flatten Behavior Platform (dynamic foraging) records into one row per asset name."""
+    rows = []
+    for rec in records:
+        epoch_list = _get(rec, "session", "stimulus_epochs", default=[]) or []
+        # Trial counts are read from the first stimulus epoch only.
+        head_epoch = {} if not epoch_list else epoch_list[0]
+        rows.append(
+            {
+                "_id": rec["_id"],
+                "name": rec.get("name"),
+                "subject_id": _get(rec, "data_description", "subject_id"),
+                "genotype": _get(rec, "subject", "genotype"),
+                "date_of_birth": _get(rec, "subject", "date_of_birth"),
+                "sex": _get(rec, "subject", "sex"),
+                "session_type": _get(rec, "session", "session_type"),
+                "session_time": _get(rec, "session", "session_start_time"),
+                "project_name": _get(rec, "data_description", "project_name"),
+                "modality": _first_modality_name(rec),
+                "trials_total": head_epoch.get("trials_total"),
+                "trials_rewarded": head_epoch.get("trials_rewarded"),
+            }
+        )
+
+    if not rows:
+        return pd.DataFrame(rows)
+
+    df = pd.DataFrame(rows).drop_duplicates(subset="name")
+    df = _parse_dates(df)
+    order = [
+        "project_name", "name", "subject_id", "genotype", "date_of_birth", "sex",
+        "modality", "session_type", "session_date", "age", "session_time",
+        "trials_total", "trials_rewarded",
+    ]
+    return _reorder(df, order)
+
+
+def _build_np_ultra(records: list[dict]) -> pd.DataFrame:
+    """Extract NP Ultra and Psychedelics fields from full records.
+
+    Note: session_type and stimulus epochs are assigned manually because the
+    metadata is incomplete in the database. Within each subject, sessions are
+    ranked by session_time: the first is labelled saline, the second psilocybin;
+    any further sessions are left unlabelled instead of raising.
+    """
+    rows = []
+    for record in records:
+        epochs = _get(record, "session", "stimulus_epochs", default=[]) or []
+        row = {
+            "_id": record["_id"],
+            "name": record.get("name"),
+            "subject_id": _get(record, "data_description", "subject_id"),
+            "genotype": _get(record, "subject", "genotype"),
+            "date_of_birth": _get(record, "subject", "date_of_birth"),
+            "sex": _get(record, "subject", "sex"),
+            "session_time": _get(record, "session", "session_start_time"),
+            "stimulus_epochs": [e.get("stimulus_name") for e in epochs if isinstance(e, dict)],
+            "project_name": _get(record, "data_description", "project_name"),
+            "modality": _first_modality_name(record),
+            "notes": [e.get("notes") for e in epochs if isinstance(e, dict)],
+        }
+        rows.append(row)
+
+    df = pd.DataFrame(rows)
+    if df.empty:
+        return df
+
+    df = df.sort_values(by="session_time").reset_index(drop=True)
+    # Rank per subject: tiling ["saline", "psilocybin"] over the whole frame raised
+    # unless there were exactly two rows per subject, and mislabelled interleaved sessions.
+    rank = df.groupby("subject_id", dropna=False).cumcount()
+    labels = {0: "saline", 1: "psilocybin"}
+    epochs_by_rank = {0: NP_ULTRA_SALINE_EPOCHS, 1: NP_ULTRA_PSILO_EPOCHS}
+    df["session_type"] = rank.map(labels)
+    df["stimulus_epochs"] = rank.map(epochs_by_rank)
+    df["stimulus_types"] = rank.map({k: sorted({s.split("_")[0] for s in v}) for k, v in epochs_by_rank.items()})
+
+    df = _parse_dates(df)
+    order = [
+        "project_name", "_id", "name", "subject_id", "genotype", "date_of_birth",
+        "sex", "modality", "session_date", "age", "session_time", "session_type",
+        "stimulus_types", "notes",
+    ]
+    return _reorder(df, order)
+
+
+def swdb_metadata_columns(dataset: str) -> list[Column]:
+    """Return column definitions for the given SWDB dataset.
+
+    Args:
+        dataset: One of 'v1dd', 'bci', 'dynamic_foraging', 'np_ultra'.
+
+    Returns:
+        List of Column definitions; an empty list if dataset is none of the above.
+    """
+    common = [  # base columns reused by the 'v1dd' and 'np_ultra' branches
+        Column(name="project_name", description="Project name from data_description"),
+        Column(name="_id", description="MongoDB document ID"),
+        Column(name="name", description="Data asset name"),
+        Column(name="subject_id", description="Subject/mouse ID"),
+        Column(name="genotype", description="Mouse genotype"),
+        Column(name="date_of_birth", description="Date of birth (date)"),
+        Column(name="sex", description="Subject sex"),
+        Column(name="modality", description="Data modality name"),
+        Column(name="session_date", description="Session date (date)"),
+        Column(name="age", description="Age at session in days"),
+        Column(name="session_time", description="Session start time (time)"),
+    ]
+    if dataset == "v1dd":
+        return common + [
+            Column(name="golden_mouse", description="True if subject_id is 409828 (golden mouse)"),
+            Column(name="column", description="V1DD column number extracted from data_description.tags[0]"),
+            Column(name="volume", description="V1DD volume number extracted from data_description.tags[1]"),
+        ]
+    if dataset == "bci":
+        return [
+            Column(name="project_name", description="Project name from data_description"),
+            Column(name="session_type", description="Session type (BCI single neuron stim)"),
+            Column(name="_id", description="MongoDB document ID"),
+            Column(name="name", description="Data asset name"),
+            Column(name="subject_id", description="Subject/mouse ID"),
+            Column(name="genotype", description="Mouse genotype"),
+            Column(name="virus", description="Injection material / virus name"),
+            Column(name="date_of_birth", description="Date of birth (date)"),
+            Column(name="sex", description="Subject sex"),
+            Column(name="modality", description="Data modality name"),
+            Column(name="session_date", description="Session date (date)"),
+            Column(name="age", description="Age at session in days"),
+            Column(name="session_time", description="Session start time (time)"),
+            Column(name="targeted_structure", description="Targeted brain structure"),
+            Column(name="ophys_fov", description="Notes from the ophys field-of-view"),
+            Column(name="session_number", description="BCI conditioning session number"),
+        ]
+    if dataset == "dynamic_foraging":
+        return [
+            Column(name="project_name", description="Project name (Behavior Platform)"),
+            Column(name="name", description="Data asset name"),
+            Column(name="subject_id", description="Subject/mouse ID"),
+            Column(name="genotype", description="Mouse genotype"),
+            Column(name="date_of_birth", description="Date of birth (date)"),
+            Column(name="sex", description="Subject sex"),
+            Column(name="modality", description="Data modality name"),
+            Column(name="session_type", description="Session type / task name"),
+            Column(name="session_date", description="Session date (date)"),
+            Column(name="age", description="Age at session in days"),
+            Column(name="session_time", description="Session start time (time)"),
+            Column(name="trials_total", description="Total number of trials in the session"),
+            Column(name="trials_rewarded", description="Number of rewarded trials in the session"),
+        ]
+    if dataset == "np_ultra":
+        return common + [
+            Column(name="session_type", description="Session type: 'saline' or 'psilocybin'"),
+            Column(name="stimulus_types", description="Unique stimulus type names for the session"),
+            Column(name="notes", description="Notes from session stimulus epochs"),
+        ]
+    return []  # dataset matched none of the four branches above
+
+    "single-plane-ophys_731015_2025-01-28_17-40-57_processed_2025-08-04_04-38-08",
+    "single-plane-ophys_772414_2025-02-04_13-21-29_processed_2025-08-12_06-14-42",
+]
+
+NP_ULTRA_SALINE_EPOCHS = [
+    "Spontaneous_0", "RFMapping_0", "OptoTagging_0", "Injection",
+    "Spontaneous_1", "RFMapping_1", "OptoTagging_1",
+    "Spontaneous_2", "RFMapping_2", "OptoTagging_2", "Anesthesia",
+    "Spontaneous_3", "RFMapping_3", "Spontaneous_4",
+]
+
+NP_ULTRA_PSILO_EPOCHS = [
+    "Spontaneous_0", "RFMapping_0", "OptoTagging_0", "Injection",
+    "Spontaneous_1", "RFMapping_1", "OptoTagging_1",
+    "Spontaneous_2", "RFMapping_2", "OptoTagging_2",
+]
+
+
+@acorns.register_acorn(acorns.NAMES["swdb"])
+def swdb_metadata(dataset: str, force_update: bool = False) -> pd.DataFrame:
+    """Build a metadata table for a SWDB project dataset.
+
+    One row per data asset (or per asset/stream for BCI). Results are cached per dataset.
+    NOTE(review): duplicates the swdb_metadata defined earlier in this module and rebinds that name.
+
+    Args:
+        dataset: One of 'v1dd', 'bci', 'dynamic_foraging', 'np_ultra'.
+        force_update: If True, bypass cache and rebuild from database.
+
+    Returns:
+        DataFrame with columns specific to the requested dataset.
+
+    Raises:
+        ValueError: If dataset is not recognized or cache is empty without force_update.
+    """
+    if dataset not in DATASETS:
+        raise ValueError(f"Unknown dataset '{dataset}'. Must be one of {DATASETS}.")
+
+    cache_key = f"swdb_metadata/{dataset}"
+    df = acorns.TREE.scurry(cache_key)
+
+    if df.empty and not force_update:
+        raise ValueError(f"Cache is empty for dataset '{dataset}'. Use force_update=True to rebuild.")
+
+    if df.empty or force_update:
+        setup_logging()
+        logging.info(
+            SquirrelMessage(
+                tree=acorns.TREE.__class__.__name__,
+                acorn=acorns.NAMES["swdb"],
+                message=f"Building SWDB metadata for '{dataset}'",
+            ).to_json()
+        )
+        df = _build(dataset)
+        if not df.empty:
+            acorns.TREE.hide(cache_key, df)
+
+    return df
+
+
+def _build(dataset: str) -> pd.DataFrame:
+    """Build the metadata DataFrame for the given dataset. NOTE(review): redefines the _build above."""
+    client = MetadataDbClient(
+        host=acorns.API_GATEWAY_HOST,
+        version="v2",
+    )
+    if dataset == "v1dd":
+        return _build_v1dd(client)  # this variant hands the client, not fetched records, to the builder
+    if dataset == "bci":
+        return _build_bci(client)
+    if dataset == "dynamic_foraging":
+        return _build_dynamic_foraging(client)
+    if dataset == "np_ultra":
+        return _build_np_ultra(client)
+    return pd.DataFrame()
+
+
+def _to_datetime(x) -> datetime | None:
+    """Coerce None/NaN, a datetime, or an ISO string to datetime. NOTE(review): redefines the _to_datetime above."""
+    if x is None or (isinstance(x, float) and pd.isna(x)):
+        return None
+    if isinstance(x, datetime):
+        return x
+    return datetime.fromisoformat(str(x))
+
+
+def _parse_dates(df: pd.DataFrame) -> pd.DataFrame:
+    """Split session_time into date/time, parse date_of_birth, add age. NOTE(review): redefines _parse_dates above."""
+    df = df.copy()
+    df = df.dropna(subset=["session_time"]).reset_index(drop=True)  # rows without a session_time are dropped
+    parsed = df["session_time"].apply(_to_datetime)
+    df["session_date"] = parsed.apply(lambda x: x.date() if x is not None else None)
+    df["session_time"] = parsed.apply(lambda x: x.time() if x is not None else None)
+    df["date_of_birth"] = df["date_of_birth"].apply(
+        lambda x: datetime.strptime(x, "%Y-%m-%d").date() if x and not (isinstance(x, float) and pd.isna(x)) else None
+    )
+    df["age"] = df.apply(
+        lambda x: (x["session_date"] - x["date_of_birth"]).days
+        if x["session_date"] is not None and x["date_of_birth"] is not None
+        else None,
+        axis=1,
+    )
+    return df
+
+
+def _reorder(df: pd.DataFrame, order: list[str]) -> pd.DataFrame:
+    """Subset and reorder columns, skipping any that are absent. NOTE(review): redefines the _reorder above."""
+    return df[[c for c in order if c in df.columns]]
+
+
+def _build_v1dd(client: MetadataDbClient) -> pd.DataFrame:
+    """Build V1 Deep Dive metadata table via an aggregation. NOTE(review): redefines the _build_v1dd above."""
+    pipeline = [
+        {
+            "$match": {
+                "data_description.project_name": "V1 Deep Dive",
+            }
+        },
+        {
+            "$project": {
+                "name": 1,
+                "subject_id": "$data_description.subject_id",
+                "genotype": "$subject.subject_details.genotype",
+                "date_of_birth": "$subject.subject_details.date_of_birth",
+                "sex": "$subject.subject_details.sex",
+                "session_time": "$acquisition.acquisition_start_time",
+                "project_name": "$data_description.project_name",
+                "modality": "$data_description.modalities.name",
+                "column": {"$arrayElemAt": ["$data_description.tags", 0]},
+                "volume": {"$arrayElemAt": ["$data_description.tags", 1]},
+            }
+        },
+    ]
+    records = client.aggregate_docdb_records(pipeline=pipeline)
+    df = pd.DataFrame(records)
+    if df.empty:
+        return df
+
+    df = _parse_dates(df)
+    df["column"] = df["column"].apply(lambda x: int(x.split(" ")[-1]))  # no guard for a missing tag here
+    df["volume"] = df["volume"].apply(lambda x: int(x.split(" ")[-1]))  # no guard for a missing tag here
+    df["golden_mouse"] = False
+    df.loc[df["subject_id"] == "409828", "golden_mouse"] = True
+
+    order = [
+        "project_name", "_id", "name", "subject_id", "golden_mouse", "genotype",
+        "date_of_birth", "sex", "modality", "session_date", "age", "session_time",
+        "column", "volume",
+    ]
+    return _reorder(df, order)
+
+
+def _build_bci(client: MetadataDbClient) -> pd.DataFrame:
+    """Build BCI single neuron stim metadata table."""
+    pipeline = [
+        {
+            "$match": {
+                "acquisition.acquisition_type": "BCI single neuron stim",
+                "data_description.data_level": "derived",
+                "processing.processing_pipeline.data_processes.start_date_time": {"$gte": "2025-08-03"},
+            }
+        },
+        {
+            "$project": {
+                "name": 1,
+                "subject_id": "$data_description.subject_id",
+                "genotype": "$subject.genotype",
+                "virus": "$procedures.subject_procedures.procedures.injection_materials.name",
+                "date_of_birth": "$subject.date_of_birth",
+                "sex": "$subject.sex",
+                "session_type": "$acquisition.acquisition_type",
+                "session_time": "$acquisition.acquisition_start_time",
+                "stimulus_epochs": "$session.stimulus_epochs.stimulus_name",
+                "project_name": "$data_description.project_name",
+                "modality": "$data_description.modality.name",
+                "targeted_structure": "$session.data_streams.stack_parameters.targeted_structure",
+                "session_number": {
+                    "$filter": {
+                        "input": "$session.stimulus_epochs",
+                        "as": "epoch",
+                        "cond": {"$eq": ["$$epoch.stimulus_name", "single neuron BCI conditioning"]},
+                    }
+                },
+                "ophys_fov": {
+                    "$map": {
+                        "input": "$session.data_streams",
+                        "as": "stream",
+                        "in": {
+                            "$map": {
+                                "input": "$$stream.ophys_fovs",
+                                "as": "fov",
+                                "in": "$$fov.notes",
+                            }
+                        },
+                    }
+                },
+            }
+        },
+        {
+            "$project": {
+                "name": 1,
+                "subject_id": 1,
+                "genotype": 1,
+                "virus": 1,
+                "date_of_birth": 1,
+                "sex": 1,
+                "session_type": 1,
+                "session_time": 1,
+                "stimulus_epochs": 1,
+                "project_name": 1,
+                "modality": 1,
+                "targeted_structure": 1,
+                "session_number": {"$arrayElemAt": ["$session_number.session_number", 0]},
+                "ophys_fov": 1,
+            }
+        },
+        {"$unwind": {"path": "$ophys_fov", "preserveNullAndEmptyArrays": False}},
+        {"$unwind": {"path": "$ophys_fov", "preserveNullAndEmptyArrays": False}},
+        {"$unwind": {"path": "$virus", "preserveNullAndEmptyArrays": False}},
+        {"$unwind": {"path": "$virus", "preserveNullAndEmptyArrays": False}},
+        {"$unwind": {"path": "$virus", "preserveNullAndEmptyArrays": False}},
+        {"$unwind": {"path": "$modality", "preserveNullAndEmptyArrays": False}},
+        {"$unwind": {"path": "$targeted_structure", "preserveNullAndEmptyArrays": False}},
+    ]
+    records = client.aggregate_docdb_records(pipeline=pipeline)
+    df = pd.DataFrame(records)
+    if df.empty:
+        return df
+
+    df = df.drop_duplicates(subset="name")
+    df = df[~df["name"].isin(BCI_PROBLEM_ASSETS)]
+    df = _parse_dates(df)
+
+    order = [
+        "project_name", "session_type", "_id", "name", "subject_id", "genotype", "virus",
+        "date_of_birth", "sex", "modality", "session_date", "age", "session_time",
+        "targeted_structure", "ophys_fov", "session_number",
+    ]
+    return _reorder(df, order)
+
+
+def _build_dynamic_foraging(client: MetadataDbClient) -> pd.DataFrame:
+    """Build Dynamic Foraging (Behavior Platform) metadata table."""
+    pipeline = [
+        {
+            "$match": {
+                "session.session_start_time": {"$regex": "^2025"},
+                "data_description.modality.abbreviation": {"$nin": ["ecephys", "fib"]},
+                "data_description.data_level": "derived",
+                "data_description.project_name": "Behavior Platform",
+                "procedures": {"$ne": None},
+                "$and": [
+                    {"quality_control.evaluations": {"$exists": True, "$ne": []}},
+                    {
+                        "quality_control.evaluations": {
+                            "$not": {
+                                "$elemMatch": {"latest_status": {"$ne": "Pass"}}
+                            }
+                        }
+                    },
+                ],
+            }
+        },
+        {
+            "$project": {
+                "name": 1,
+                "subject_id": "$data_description.subject_id",
+                "genotype": "$subject.genotype",
+                "date_of_birth": "$subject.date_of_birth",
+                "sex": "$subject.sex",
+                "session_type": "$session.session_type",
+                "session_time": "$session.session_start_time",
+                "project_name": "$data_description.project_name",
+                "modality": "$data_description.modality.name",
+                "trials_total": "$session.stimulus_epochs.trials_total",
+                "trials_rewarded": "$session.stimulus_epochs.trials_rewarded",
+            }
+        },
+        {"$unwind": {"path": "$trials_total", "preserveNullAndEmptyArrays": False}},
+        {"$unwind": {"path": "$trials_rewarded", "preserveNullAndEmptyArrays": False}},
+        {"$unwind": {"path": "$modality", "preserveNullAndEmptyArrays": False}},
+    ]
+    records = client.aggregate_docdb_records(pipeline=pipeline)
+    df = pd.DataFrame(records)
+    if df.empty:
+        return df
+
+    df = df.drop_duplicates(subset="name")
+    df = _parse_dates(df)
+
+    order = [
+        "project_name", "name", "subject_id", "genotype", "date_of_birth", "sex",
+        "modality", "session_type", "session_date", "age", "session_time",
+        "trials_total", "trials_rewarded",
+    ]
+    return _reorder(df, order)
+
+
+def _build_np_ultra(client: MetadataDbClient) -> pd.DataFrame:
+    """Build NP Ultra and Psychedelics metadata table.
+
+    Note: stimulus_epochs are assigned manually per subject because the metadata
+    is incomplete in the database. Each subject is assumed to have exactly two
+    sessions in sorted order: saline first, then psilocybin.
+    """
+    pipeline = [
+        {
+            "$match": {
+                "data_description.project_name": "NP Ultra and Psychedelics",
+                "data_description.data_level": "derived",
+            }
+        },
+        {
+            "$project": {
+                "name": 1,
+                "subject_id": "$data_description.subject_id",
+                "genotype": "$subject.genotype",
+                "date_of_birth": "$subject.date_of_birth",
+                "sex": "$subject.sex",
+                "session_time": "$session.session_start_time",
+                "stimulus_epochs": "$session.stimulus_epochs.stimulus_name",
+                "project_name": "$data_description.project_name",
+                "modality": "$data_description.modality.name",
+                "notes": "$session.stimulus_epochs.notes",
+            }
+        },
+        {"$unwind": {"path": "$modality", "preserveNullAndEmptyArrays": False}},
+    ]
+    records = client.aggregate_docdb_records(pipeline=pipeline)
+    df = pd.DataFrame(records)
+    if df.empty:
+        return df
+
+    df = df.sort_values(by="session_time").reset_index(drop=True)
+    n_subjects = len(df["subject_id"].unique())
+
+    df["session_type"] = ["saline", "psilocybin"] * n_subjects
+    df["stimulus_epochs"] = [NP_ULTRA_SALINE_EPOCHS, NP_ULTRA_PSILO_EPOCHS] * n_subjects
+
+    sal_stim_types = sorted(set(s.split("_")[0] for s in NP_ULTRA_SALINE_EPOCHS))
+    psi_stim_types = sorted(set(s.split("_")[0] for s in NP_ULTRA_PSILO_EPOCHS))
+    df["stimulus_types"] = [sal_stim_types, psi_stim_types] * n_subjects
+
+    df = _parse_dates(df)
+
+    order = [
+        "project_name", "_id", "name", "subject_id", "genotype", "date_of_birth",
+        "sex", "modality", "session_date", "age", "session_time", "session_type",
+        "stimulus_types", "notes",
+    ]
+    return _reorder(df, order)
+
+
+def swdb_metadata_columns(dataset: str) -> list[Column]:
+    """Return column definitions for the given SWDB dataset.
+
+    Args:
+        dataset: One of 'v1dd', 'bci', 'dynamic_foraging', 'np_ultra'.
+
+    Returns:
+        List of Column definitions for the dataset.
+    """
+    common = [
+        Column(name="project_name", description="Project name from data_description"),
+        Column(name="_id", description="MongoDB document ID"),
+        Column(name="name", description="Data asset name"),
+        Column(name="subject_id", description="Subject/mouse ID"),
+        Column(name="genotype", description="Mouse genotype"),
+        Column(name="date_of_birth", description="Date of birth (date)"),
+        Column(name="sex", description="Subject sex"),
+        Column(name="modality", description="Data modality name"),
+        Column(name="session_date", description="Session date (date)"),
+        Column(name="age", description="Age at session in days"),
+        Column(name="session_time", description="Session start time (time)"),
+    ]
+    if dataset == "v1dd":
+        return common + [
+            Column(name="golden_mouse", description="True if subject_id is 409828 (golden mouse)"),
+            Column(name="column", description="V1DD column number extracted from data_description.tags[0]"),
+            Column(name="volume", description="V1DD volume number extracted from data_description.tags[1]"),
+        ]
+    if dataset == "bci":
+        return [
+            Column(name="project_name", description="Project name from data_description"),
+            Column(name="session_type", description="Session type (BCI single neuron stim)"),
+            Column(name="_id", description="MongoDB document ID"),
+            Column(name="name", description="Data asset name"),
+            Column(name="subject_id", description="Subject/mouse ID"),
+            Column(name="genotype", description="Mouse genotype"),
+            Column(name="virus", description="Injection material / virus name"),
+            Column(name="date_of_birth", description="Date of birth (date)"),
+            Column(name="sex", description="Subject sex"),
+            Column(name="modality", description="Data modality name"),
+            Column(name="session_date", description="Session date (date)"),
+            Column(name="age", description="Age at session in days"),
+            Column(name="session_time", description="Session start time (time)"),
+            Column(name="targeted_structure", description="Targeted brain structure"),
+            Column(name="ophys_fov", description="Notes from the ophys field-of-view"),
+            Column(name="session_number", description="BCI conditioning session number"),
+        ]
+    if dataset == "dynamic_foraging":
+        return [
+            Column(name="project_name", description="Project name (Behavior Platform)"),
+            Column(name="name", description="Data asset name"),
+            Column(name="subject_id", description="Subject/mouse ID"),
+            Column(name="genotype", description="Mouse genotype"),
+            Column(name="date_of_birth", description="Date of birth (date)"),
+            Column(name="sex", description="Subject sex"),
+            Column(name="modality", description="Data modality name"),
+            Column(name="session_type", description="Session type / task name"),
+            Column(name="session_date", description="Session date (date)"),
+            Column(name="age", description="Age at session in days"),
+            Column(name="session_time", description="Session start time (time)"),
+            Column(name="trials_total", description="Total number of trials in the session"),
+            Column(name="trials_rewarded", description="Number of rewarded trials in the session"),
+        ]
+    if dataset == "np_ultra":
+        return common + [
+            Column(name="session_type", description="Session type: 'saline' or 'psilocybin'"),
+            Column(name="stimulus_types", description="Unique stimulus type names for the session"),
+            Column(name="notes", description="Notes from session stimulus epochs"),
+        ]
+    return []
diff --git a/src/zombie_squirrel/acorns.py b/src/zombie_squirrel/acorns.py
index 368583e..270cd42 100644
--- a/src/zombie_squirrel/acorns.py
+++ b/src/zombie_squirrel/acorns.py
@@ -47,6 +47,7 @@
     "foraging": "foraging_sessions",
     "curriculum": "behavior_curriculum",
     "platform_qc": "platform_qc",
+    "swdb": "swdb_metadata",
 }
 
 ACORN_REGISTRY: dict[str, Callable[[], Any]] = {}
diff --git a/tests/acorn_helpers/test_swdb_metadata.py b/tests/acorn_helpers/test_swdb_metadata.py
new file mode 100644
index 0000000..e2eee0a
--- /dev/null
+++ b/tests/acorn_helpers/test_swdb_metadata.py
@@ -0,0 +1,261 @@
+"""Unit tests for swdb_metadata acorn."""
+
+from unittest.mock import MagicMock, patch
+
+import pandas as pd
+import pytest
+
+import zombie_squirrel.acorns as acorns
+from zombie_squirrel.acorn_helpers.swdb_metadata import (
+    DATASETS,
+    swdb_metadata,
+    swdb_metadata_columns,
+    _parse_dates,
+)
+from zombie_squirrel.forest import MemoryTree
+
+
+@pytest.fixture(autouse=True)
+def memory_tree():
+    acorns.TREE = MemoryTree()
+
+
+def test_datasets_list():
+    assert "v1dd" in DATASETS
+    assert "bci" in DATASETS
+    assert "dynamic_foraging" in DATASETS
+    assert "np_ultra" in DATASETS
+
+
+def test_invalid_dataset_raises():
+    with pytest.raises(ValueError, match="Unknown dataset"):
+        swdb_metadata("nonexistent")
+
+
+def test_empty_cache_raises():
+    with pytest.raises(ValueError, match="Cache is empty"):
+        swdb_metadata("v1dd", force_update=False)
+
+
+def test_cache_hit():
+    cached = pd.DataFrame({"name": ["asset_1"], "subject_id": ["sub1"]})
+    acorns.TREE.hide("swdb_metadata/v1dd", cached)
+    df = swdb_metadata("v1dd", force_update=False)
+    assert len(df) == 1
+    assert df.iloc[0]["name"] == "asset_1"
+
+
+@patch("zombie_squirrel.acorn_helpers.swdb_metadata.MetadataDbClient")
+def test_force_update_replaces_cache(mock_client_class):
+    cached = pd.DataFrame({"name": ["old_asset"], "subject_id": ["sub1"]})
+    acorns.TREE.hide("swdb_metadata/v1dd", cached)
+
+    mock_client = MagicMock()
+    mock_client_class.return_value = mock_client
+    mock_client.aggregate_docdb_records.return_value = [
+        {
+            "_id": "abc",
+            "name": "v1dd_asset",
+            "subject_id": "sub1",
+            "genotype": "wt",
+            "date_of_birth": "2024-01-01",
+            "sex": "M",
+            "session_time": "2025-03-01T10:00:00",
+            "project_name": "V1 Deep Dive",
+            "modality": "SPIM",
+            "column": "Column 1",
+            "volume": "Volume 2",
+        }
+    ]
+
+    df = swdb_metadata("v1dd", force_update=True)
+    assert len(df) == 1
+    assert df.iloc[0]["name"] == "v1dd_asset"
+    assert df.iloc[0]["column"] == 1
+    assert df.iloc[0]["volume"] == 2
+    assert not df.iloc[0]["golden_mouse"]
+
+
+@patch("zombie_squirrel.acorn_helpers.swdb_metadata.MetadataDbClient")
+def test_v1dd_golden_mouse(mock_client_class):
+    mock_client = MagicMock()
+    mock_client_class.return_value = mock_client
+    mock_client.aggregate_docdb_records.return_value = [
+        {
+            "_id": "abc",
+            "name": "v1dd_asset",
+            "subject_id": "409828",
+            "genotype": "wt",
+            "date_of_birth": "2024-01-01",
+            "sex": "M",
+            "session_time": "2025-03-01T10:00:00",
+            "project_name": "V1 Deep Dive",
+            "modality": "SPIM",
+            "column": "Column 3",
+            "volume": "Volume 5",
+        }
+    ]
+    df = swdb_metadata("v1dd", force_update=True)
+    assert df.iloc[0]["golden_mouse"]
+
+
+@patch("zombie_squirrel.acorn_helpers.swdb_metadata.MetadataDbClient")
+def test_dynamic_foraging_deduplication(mock_client_class):
+    mock_client = MagicMock()
+    mock_client_class.return_value = mock_client
+    mock_client.aggregate_docdb_records.return_value = [
+        {
+            "name": "asset_1",
+            "subject_id": "sub1",
+            "genotype": "wt",
+            "date_of_birth": "2024-01-01",
+            "sex": "F",
+            "session_time": "2025-04-01T09:00:00",
+            "project_name": "Behavior Platform",
+            "modality": "behavior",
+            "session_type": "Coupled Baiting",
+            "trials_total": 200,
+            "trials_rewarded": 150,
+        },
+        {
+            "name": "asset_1",
+            "subject_id": "sub1",
+            "genotype": "wt",
+            "date_of_birth": "2024-01-01",
+            "sex": "F",
+            "session_time": "2025-04-01T09:00:00",
+            "project_name": "Behavior Platform",
+            "modality": "behavior",
+            "session_type": "Coupled Baiting",
+            "trials_total": 200,
+            "trials_rewarded": 150,
+        },
+    ]
+    df = swdb_metadata("dynamic_foraging", force_update=True)
+    assert len(df) == 1
+
+
+@patch("zombie_squirrel.acorn_helpers.swdb_metadata.MetadataDbClient")
+def test_bci_problem_assets_excluded(mock_client_class):
+    mock_client = MagicMock()
+    mock_client_class.return_value = mock_client
+    mock_client.aggregate_docdb_records.return_value = [
+        {
+            "_id": "abc",
+            "name": "single-plane-ophys_731015_2025-01-28_17-40-57_processed_2025-08-04_04-38-08",
+            "subject_id": "731015",
+            "genotype": "wt",
+            "date_of_birth": "2024-06-01",
+            "sex": "M",
+            "session_time": "2025-01-28T17:40:57",
+            "project_name": "BCI",
+            "modality": "ophys",
+            "session_type": "BCI single neuron stim",
+            "virus": "AAV",
+            "targeted_structure": "V1",
+            "ophys_fov": "note",
+            "session_number": 1,
+            "stimulus_epochs": [],
+        },
+        {
+            "_id": "def",
+            "name": "good_asset",
+            "subject_id": "sub2",
+            "genotype": "wt",
+            "date_of_birth": "2024-06-01",
+            "sex": "F",
+            "session_time": "2025-05-01T12:00:00",
+            "project_name": "BCI",
+            "modality": "ophys",
+            "session_type": "BCI single neuron stim",
+            "virus": "AAV",
+            "targeted_structure": "V1",
+            "ophys_fov": "note",
+            "session_number": 2,
+            "stimulus_epochs": [],
+        },
+    ]
+    df = swdb_metadata("bci", force_update=True)
+    assert "single-plane-ophys_731015" not in df["name"].values
+    assert "good_asset" in df["name"].values
+
+
+@patch("zombie_squirrel.acorn_helpers.swdb_metadata.MetadataDbClient")
+def test_np_ultra_session_types(mock_client_class):
+    mock_client = MagicMock()
+    mock_client_class.return_value = mock_client
+    mock_client.aggregate_docdb_records.return_value = [
+        {
+            "_id": "a1",
+            "name": "np_asset_1",
+            "subject_id": "sub1",
+            "genotype": "wt",
+            "date_of_birth": "2024-01-15",
+            "sex": "M",
+            "session_time": "2025-02-01T10:00:00",
+            "project_name": "NP Ultra and Psychedelics",
+            "modality": "ecephys",
+            "stimulus_epochs": [],
+            "notes": [],
+        },
+        {
+            "_id": "a2",
+            "name": "np_asset_2",
+            "subject_id": "sub1",
+            "genotype": "wt",
+            "date_of_birth": "2024-01-15",
+            "sex": "M",
+            "session_time": "2025-03-01T10:00:00",
+            "project_name": "NP Ultra and Psychedelics",
+            "modality": "ecephys",
+            "stimulus_epochs": [],
+            "notes": [],
+        },
+    ]
+    df = swdb_metadata("np_ultra", force_update=True)
+    assert list(df["session_type"]) == ["saline", "psilocybin"]
+    assert isinstance(df.iloc[0]["stimulus_types"], list)
+
+
+def test_parse_dates():
+    df = pd.DataFrame({
+        "session_time": ["2025-06-01T14:30:00"],
+        "date_of_birth": ["2024-01-15"],
+    })
+    result = _parse_dates(df)
+    assert result.iloc[0]["age"] == (result.iloc[0]["session_date"] - result.iloc[0]["date_of_birth"]).days
+
+
+def test_swdb_metadata_columns_v1dd():
+    cols = swdb_metadata_columns("v1dd")
+    names = [c.name for c in cols]
+    assert "golden_mouse" in names
+    assert "column" in names
+    assert "volume" in names
+
+
+def test_swdb_metadata_columns_bci():
+    cols = swdb_metadata_columns("bci")
+    names = [c.name for c in cols]
+    assert "virus" in names
+    assert "ophys_fov" in names
+    assert "session_number" in names
+
+
+def test_swdb_metadata_columns_dynamic_foraging():
+    cols = swdb_metadata_columns("dynamic_foraging")
+    names = [c.name for c in cols]
+    assert "trials_total" in names
+    assert "trials_rewarded" in names
+
+
+def test_swdb_metadata_columns_np_ultra():
+    cols = swdb_metadata_columns("np_ultra")
+    names = [c.name for c in cols]
+    assert "session_type" in names
+    assert "stimulus_types" in names
+    assert "notes" in names
+
+
+def test_swdb_metadata_columns_unknown():
+    assert swdb_metadata_columns("unknown") == []

From 03c7c9a5a8012f5a0ce31114af4dfb2c114f0998 Mon Sep 17 00:00:00 2001
From: Dan Birman <danbirman@gmail.com>
Date: Tue, 2 Jun 2026 13:43:30 -0700
Subject: [PATCH 2/2] refactor: replacing v1 with v2 paths

---
 README.md                                     |   4 +-
 .../acorn_helpers/swdb_metadata.py            | 456 +-----------------
 tests/acorn_helpers/test_swdb_metadata.py     | 211 ++++----
 3 files changed, 107 insertions(+), 564 deletions(-)

diff --git a/README.md b/README.md
index 8851231..fc9b588 100644
--- a/README.md
+++ b/README.md
@@ -66,8 +66,8 @@ The `raw_to_derived` function is not a table stored in S3, instead it is used by
 | Dataset | Project filter | Columns |
 | ------- | -------------- | ------- |
 | `v1dd` | `data_description.project_name = "V1 Deep Dive"` | `project_name`, `_id`, `name`, `subject_id`, `golden_mouse`, `genotype`, `date_of_birth`, `sex`, `modality`, `session_date`, `age`, `session_time`, `column`, `volume` |
-| `bci` | `session.session_type = "BCI single neuron stim"`, `data_level = derived`, `processing >= 2025-08-03` | `project_name`, `session_type`, `_id`, `name`, `subject_id`, `genotype`, `virus`, `date_of_birth`, `sex`, `modality`, `session_date`, `age`, `session_time`, `targeted_structure`, `ophys_fov`, `session_number` |
-| `dynamic_foraging` | `project_name = "Behavior Platform"`, `data_level = derived`, `session >= 2025`, all QC passing | `project_name`, `name`, `subject_id`, `genotype`, `date_of_birth`, `sex`, `modality`, `session_type`, `session_date`, `age`, `session_time`, `trials_total`, `trials_rewarded` |
+| `bci` | `acquisition.acquisition_type = "BCI single neuron stim"`, `data_level = derived`, `processing >= 2025-08-03` | `project_name`, `session_type`, `_id`, `name`, `subject_id`, `genotype`, `virus`, `date_of_birth`, `sex`, `modality`, `session_date`, `age`, `session_time`, `targeted_structure`, `ophys_fov`, `session_number` |
+| `dynamic_foraging` | `project_name = "Behavior Platform"`, `data_level = derived`, `acquisition >= 2025`, `quality_control.status` all `"Pass"` | `project_name`, `name`, `subject_id`, `genotype`, `date_of_birth`, `sex`, `modality`, `session_type`, `session_date`, `age`, `session_time`, `trials_total`, `trials_rewarded` |
 | `np_ultra` | `project_name = "NP Ultra and Psychedelics"`, `data_level = derived` | `project_name`, `_id`, `name`, `subject_id`, `genotype`, `date_of_birth`, `sex`, `modality`, `session_date`, `age`, `session_time`, `session_type`, `stimulus_types`, `notes` |
 
 ```python
diff --git a/src/zombie_squirrel/acorn_helpers/swdb_metadata.py b/src/zombie_squirrel/acorn_helpers/swdb_metadata.py
index afaab7c..00da0ff 100644
--- a/src/zombie_squirrel/acorn_helpers/swdb_metadata.py
+++ b/src/zombie_squirrel/acorn_helpers/swdb_metadata.py
@@ -42,19 +42,12 @@
         "processing.processing_pipeline.data_processes.start_date_time": {"$gte": "2025-08-03"},
     },
     "dynamic_foraging": {
-        "session.session_start_time": {"$regex": "^2025"},
-        "data_description.modality.abbreviation": {"$nin": ["ecephys", "fib"]},
+        "acquisition.acquisition_start_time": {"$regex": "^2025"},
+        "data_description.modalities.abbreviation": {"$nin": ["ecephys", "fib"]},
         "data_description.data_level": "derived",
         "data_description.project_name": "Behavior Platform",
         "procedures": {"$ne": None},
-        "$and": [
-            {"quality_control.evaluations": {"$exists": True, "$ne": []}},
-            {
-                "quality_control.evaluations": {
-                    "$not": {"$elemMatch": {"latest_status": {"$ne": "Pass"}}}
-                }
-            },
-        ],
+        "quality_control.status": {"$exists": True, "$ne": None},
     },
     "np_ultra": {
         "data_description.project_name": "NP Ultra and Psychedelics",
@@ -168,11 +161,10 @@ def _get(obj: dict, *path, default=None):
 
 
 def _first_modality_name(record: dict) -> str | None:
-    """Return the name of the first modality, checking both 'modality' and 'modalities' keys."""
-    for key in ("modality", "modalities"):
-        entries = _get(record, "data_description", key, default=[]) or []
-        if entries and isinstance(entries[0], dict):
-            return entries[0].get("name")
+    """Return the name of the first entry in data_description.modalities."""
+    entries = _get(record, "data_description", "modalities", default=[]) or []
+    if entries and isinstance(entries[0], dict):
+        return entries[0].get("name")
     return None
 
 
@@ -327,6 +319,9 @@ def _build_dynamic_foraging(records: list[dict]) -> pd.DataFrame:
     """Extract Dynamic Foraging (Behavior Platform) fields from full records."""
     rows = []
     for record in records:
+        qc_status = _get(record, "quality_control", "status") or {}
+        if not all(v == "Pass" for v in qc_status.values()):
+            continue
         epochs = _get(record, "session", "stimulus_epochs", default=[]) or []
         first_epoch = epochs[0] if epochs else {}
         row = {
@@ -336,8 +331,8 @@ def _build_dynamic_foraging(records: list[dict]) -> pd.DataFrame:
             "genotype": _get(record, "subject", "genotype"),
             "date_of_birth": _get(record, "subject", "date_of_birth"),
             "sex": _get(record, "subject", "sex"),
-            "session_type": _get(record, "session", "session_type"),
-            "session_time": _get(record, "session", "session_start_time"),
+            "session_type": _get(record, "acquisition", "acquisition_type"),
+            "session_time": _get(record, "acquisition", "acquisition_start_time"),
             "project_name": _get(record, "data_description", "project_name"),
             "modality": _first_modality_name(record),
             "trials_total": first_epoch.get("trials_total"),
@@ -369,7 +364,7 @@ def _build_np_ultra(records: list[dict]) -> pd.DataFrame:
     """
     rows = []
     for record in records:
-        epochs = _get(record, "session", "stimulus_epochs", default=[]) or []
+        epochs = _get(record, "acquisition", "stimulus_epochs", default=[]) or []
         row = {
             "_id": record["_id"],
             "name": record.get("name"),
@@ -377,7 +372,7 @@ def _build_np_ultra(records: list[dict]) -> pd.DataFrame:
             "genotype": _get(record, "subject", "genotype"),
             "date_of_birth": _get(record, "subject", "date_of_birth"),
             "sex": _get(record, "subject", "sex"),
-            "session_time": _get(record, "session", "session_start_time"),
+            "session_time": _get(record, "acquisition", "acquisition_start_time"),
             "stimulus_epochs": [e.get("stimulus_name") for e in epochs if isinstance(e, dict)],
             "project_name": _get(record, "data_description", "project_name"),
             "modality": _first_modality_name(record),
@@ -480,426 +475,3 @@ def swdb_metadata_columns(dataset: str) -> list[Column]:
         ]
     return []
 
-    "single-plane-ophys_731015_2025-01-28_17-40-57_processed_2025-08-04_04-38-08",
-    "single-plane-ophys_772414_2025-02-04_13-21-29_processed_2025-08-12_06-14-42",
-]
-
-NP_ULTRA_SALINE_EPOCHS = [
-    "Spontaneous_0", "RFMapping_0", "OptoTagging_0", "Injection",
-    "Spontaneous_1", "RFMapping_1", "OptoTagging_1",
-    "Spontaneous_2", "RFMapping_2", "OptoTagging_2", "Anesthesia",
-    "Spontaneous_3", "RFMapping_3", "Spontaneous_4",
-]
-
-NP_ULTRA_PSILO_EPOCHS = [
-    "Spontaneous_0", "RFMapping_0", "OptoTagging_0", "Injection",
-    "Spontaneous_1", "RFMapping_1", "OptoTagging_1",
-    "Spontaneous_2", "RFMapping_2", "OptoTagging_2",
-]
-
-
-@acorns.register_acorn(acorns.NAMES["swdb"])
-def swdb_metadata(dataset: str, force_update: bool = False) -> pd.DataFrame:
-    """Build a metadata table for a SWDB project dataset.
-
-    One row per data asset (or per asset/stream for BCI) with subject, session,
-    and project-specific fields. Results are cached per dataset.
-
-    Args:
-        dataset: One of 'v1dd', 'bci', 'dynamic_foraging', 'np_ultra'.
-        force_update: If True, bypass cache and rebuild from database.
-
-    Returns:
-        DataFrame with columns specific to the requested dataset.
-
-    Raises:
-        ValueError: If dataset is not recognized or cache is empty without force_update.
-    """
-    if dataset not in DATASETS:
-        raise ValueError(f"Unknown dataset '{dataset}'. Must be one of {DATASETS}.")
-
-    cache_key = f"swdb_metadata/{dataset}"
-    df = acorns.TREE.scurry(cache_key)
-
-    if df.empty and not force_update:
-        raise ValueError(f"Cache is empty for dataset '{dataset}'. Use force_update=True to rebuild.")
-
-    if df.empty or force_update:
-        setup_logging()
-        logging.info(
-            SquirrelMessage(
-                tree=acorns.TREE.__class__.__name__,
-                acorn=acorns.NAMES["swdb"],
-                message=f"Building SWDB metadata for '{dataset}'",
-            ).to_json()
-        )
-        df = _build(dataset)
-        if not df.empty:
-            acorns.TREE.hide(cache_key, df)
-
-    return df
-
-
-def _build(dataset: str) -> pd.DataFrame:
-    """Build the metadata DataFrame for the given dataset."""
-    client = MetadataDbClient(
-        host=acorns.API_GATEWAY_HOST,
-        version="v2",
-    )
-    if dataset == "v1dd":
-        return _build_v1dd(client)
-    if dataset == "bci":
-        return _build_bci(client)
-    if dataset == "dynamic_foraging":
-        return _build_dynamic_foraging(client)
-    if dataset == "np_ultra":
-        return _build_np_ultra(client)
-    return pd.DataFrame()
-
-
-def _to_datetime(x) -> datetime | None:
-    """Coerce a value to datetime, handling strings, existing datetimes, and None."""
-    if x is None or (isinstance(x, float) and pd.isna(x)):
-        return None
-    if isinstance(x, datetime):
-        return x
-    return datetime.fromisoformat(str(x))
-
-
-def _parse_dates(df: pd.DataFrame) -> pd.DataFrame:
-    """Parse session_time into session_date and time, parse date_of_birth, compute age in days."""
-    df = df.copy()
-    df = df.dropna(subset=["session_time"]).reset_index(drop=True)
-    parsed = df["session_time"].apply(_to_datetime)
-    df["session_date"] = parsed.apply(lambda x: x.date() if x is not None else None)
-    df["session_time"] = parsed.apply(lambda x: x.time() if x is not None else None)
-    df["date_of_birth"] = df["date_of_birth"].apply(
-        lambda x: datetime.strptime(x, "%Y-%m-%d").date() if x and not (isinstance(x, float) and pd.isna(x)) else None
-    )
-    df["age"] = df.apply(
-        lambda x: (x["session_date"] - x["date_of_birth"]).days
-        if x["session_date"] is not None and x["date_of_birth"] is not None
-        else None,
-        axis=1,
-    )
-    return df
-
-
-def _reorder(df: pd.DataFrame, order: list[str]) -> pd.DataFrame:
-    """Subset and reorder columns, skipping any that are absent."""
-    return df[[c for c in order if c in df.columns]]
-
-
-def _build_v1dd(client: MetadataDbClient) -> pd.DataFrame:
-    """Build V1 Deep Dive metadata table."""
-    pipeline = [
-        {
-            "$match": {
-                "data_description.project_name": "V1 Deep Dive",
-            }
-        },
-        {
-            "$project": {
-                "name": 1,
-                "subject_id": "$data_description.subject_id",
-                "genotype": "$subject.subject_details.genotype",
-                "date_of_birth": "$subject.subject_details.date_of_birth",
-                "sex": "$subject.subject_details.sex",
-                "session_time": "$acquisition.acquisition_start_time",
-                "project_name": "$data_description.project_name",
-                "modality": "$data_description.modalities.name",
-                "column": {"$arrayElemAt": ["$data_description.tags", 0]},
-                "volume": {"$arrayElemAt": ["$data_description.tags", 1]},
-            }
-        },
-    ]
-    records = client.aggregate_docdb_records(pipeline=pipeline)
-    df = pd.DataFrame(records)
-    if df.empty:
-        return df
-
-    df = _parse_dates(df)
-    df["column"] = df["column"].apply(lambda x: int(x.split(" ")[-1]))
-    df["volume"] = df["volume"].apply(lambda x: int(x.split(" ")[-1]))
-    df["golden_mouse"] = False
-    df.loc[df["subject_id"] == "409828", "golden_mouse"] = True
-
-    order = [
-        "project_name", "_id", "name", "subject_id", "golden_mouse", "genotype",
-        "date_of_birth", "sex", "modality", "session_date", "age", "session_time",
-        "column", "volume",
-    ]
-    return _reorder(df, order)
-
-
-def _build_bci(client: MetadataDbClient) -> pd.DataFrame:
-    """Build BCI single neuron stim metadata table."""
-    pipeline = [
-        {
-            "$match": {
-                "acquisition.acquisition_type": "BCI single neuron stim",
-                "data_description.data_level": "derived",
-                "processing.processing_pipeline.data_processes.start_date_time": {"$gte": "2025-08-03"},
-            }
-        },
-        {
-            "$project": {
-                "name": 1,
-                "subject_id": "$data_description.subject_id",
-                "genotype": "$subject.genotype",
-                "virus": "$procedures.subject_procedures.procedures.injection_materials.name",
-                "date_of_birth": "$subject.date_of_birth",
-                "sex": "$subject.sex",
-                "session_type": "$acquisition.acquisition_type",
-                "session_time": "$acquisition.acquisition_start_time",
-                "stimulus_epochs": "$session.stimulus_epochs.stimulus_name",
-                "project_name": "$data_description.project_name",
-                "modality": "$data_description.modality.name",
-                "targeted_structure": "$session.data_streams.stack_parameters.targeted_structure",
-                "session_number": {
-                    "$filter": {
-                        "input": "$session.stimulus_epochs",
-                        "as": "epoch",
-                        "cond": {"$eq": ["$$epoch.stimulus_name", "single neuron BCI conditioning"]},
-                    }
-                },
-                "ophys_fov": {
-                    "$map": {
-                        "input": "$session.data_streams",
-                        "as": "stream",
-                        "in": {
-                            "$map": {
-                                "input": "$$stream.ophys_fovs",
-                                "as": "fov",
-                                "in": "$$fov.notes",
-                            }
-                        },
-                    }
-                },
-            }
-        },
-        {
-            "$project": {
-                "name": 1,
-                "subject_id": 1,
-                "genotype": 1,
-                "virus": 1,
-                "date_of_birth": 1,
-                "sex": 1,
-                "session_type": 1,
-                "session_time": 1,
-                "stimulus_epochs": 1,
-                "project_name": 1,
-                "modality": 1,
-                "targeted_structure": 1,
-                "session_number": {"$arrayElemAt": ["$session_number.session_number", 0]},
-                "ophys_fov": 1,
-            }
-        },
-        {"$unwind": {"path": "$ophys_fov", "preserveNullAndEmptyArrays": False}},
-        {"$unwind": {"path": "$ophys_fov", "preserveNullAndEmptyArrays": False}},
-        {"$unwind": {"path": "$virus", "preserveNullAndEmptyArrays": False}},
-        {"$unwind": {"path": "$virus", "preserveNullAndEmptyArrays": False}},
-        {"$unwind": {"path": "$virus", "preserveNullAndEmptyArrays": False}},
-        {"$unwind": {"path": "$modality", "preserveNullAndEmptyArrays": False}},
-        {"$unwind": {"path": "$targeted_structure", "preserveNullAndEmptyArrays": False}},
-    ]
-    records = client.aggregate_docdb_records(pipeline=pipeline)
-    df = pd.DataFrame(records)
-    if df.empty:
-        return df
-
-    df = df.drop_duplicates(subset="name")
-    df = df[~df["name"].isin(BCI_PROBLEM_ASSETS)]
-    df = _parse_dates(df)
-
-    order = [
-        "project_name", "session_type", "_id", "name", "subject_id", "genotype", "virus",
-        "date_of_birth", "sex", "modality", "session_date", "age", "session_time",
-        "targeted_structure", "ophys_fov", "session_number",
-    ]
-    return _reorder(df, order)
-
-
-def _build_dynamic_foraging(client: MetadataDbClient) -> pd.DataFrame:
-    """Build Dynamic Foraging (Behavior Platform) metadata table."""
-    pipeline = [
-        {
-            "$match": {
-                "session.session_start_time": {"$regex": "^2025"},
-                "data_description.modality.abbreviation": {"$nin": ["ecephys", "fib"]},
-                "data_description.data_level": "derived",
-                "data_description.project_name": "Behavior Platform",
-                "procedures": {"$ne": None},
-                "$and": [
-                    {"quality_control.evaluations": {"$exists": True, "$ne": []}},
-                    {
-                        "quality_control.evaluations": {
-                            "$not": {
-                                "$elemMatch": {"latest_status": {"$ne": "Pass"}}
-                            }
-                        }
-                    },
-                ],
-            }
-        },
-        {
-            "$project": {
-                "name": 1,
-                "subject_id": "$data_description.subject_id",
-                "genotype": "$subject.genotype",
-                "date_of_birth": "$subject.date_of_birth",
-                "sex": "$subject.sex",
-                "session_type": "$session.session_type",
-                "session_time": "$session.session_start_time",
-                "project_name": "$data_description.project_name",
-                "modality": "$data_description.modality.name",
-                "trials_total": "$session.stimulus_epochs.trials_total",
-                "trials_rewarded": "$session.stimulus_epochs.trials_rewarded",
-            }
-        },
-        {"$unwind": {"path": "$trials_total", "preserveNullAndEmptyArrays": False}},
-        {"$unwind": {"path": "$trials_rewarded", "preserveNullAndEmptyArrays": False}},
-        {"$unwind": {"path": "$modality", "preserveNullAndEmptyArrays": False}},
-    ]
-    records = client.aggregate_docdb_records(pipeline=pipeline)
-    df = pd.DataFrame(records)
-    if df.empty:
-        return df
-
-    df = df.drop_duplicates(subset="name")
-    df = _parse_dates(df)
-
-    order = [
-        "project_name", "name", "subject_id", "genotype", "date_of_birth", "sex",
-        "modality", "session_type", "session_date", "age", "session_time",
-        "trials_total", "trials_rewarded",
-    ]
-    return _reorder(df, order)
-
-
-def _build_np_ultra(client: MetadataDbClient) -> pd.DataFrame:
-    """Build NP Ultra and Psychedelics metadata table.
-
-    Note: stimulus_epochs are assigned manually per subject because the metadata
-    is incomplete in the database. Each subject is assumed to have exactly two
-    sessions in sorted order: saline first, then psilocybin.
-    """
-    pipeline = [
-        {
-            "$match": {
-                "data_description.project_name": "NP Ultra and Psychedelics",
-                "data_description.data_level": "derived",
-            }
-        },
-        {
-            "$project": {
-                "name": 1,
-                "subject_id": "$data_description.subject_id",
-                "genotype": "$subject.genotype",
-                "date_of_birth": "$subject.date_of_birth",
-                "sex": "$subject.sex",
-                "session_time": "$session.session_start_time",
-                "stimulus_epochs": "$session.stimulus_epochs.stimulus_name",
-                "project_name": "$data_description.project_name",
-                "modality": "$data_description.modality.name",
-                "notes": "$session.stimulus_epochs.notes",
-            }
-        },
-        {"$unwind": {"path": "$modality", "preserveNullAndEmptyArrays": False}},
-    ]
-    records = client.aggregate_docdb_records(pipeline=pipeline)
-    df = pd.DataFrame(records)
-    if df.empty:
-        return df
-
-    df = df.sort_values(by="session_time").reset_index(drop=True)
-    n_subjects = len(df["subject_id"].unique())
-
-    df["session_type"] = ["saline", "psilocybin"] * n_subjects
-    df["stimulus_epochs"] = [NP_ULTRA_SALINE_EPOCHS, NP_ULTRA_PSILO_EPOCHS] * n_subjects
-
-    sal_stim_types = sorted(set(s.split("_")[0] for s in NP_ULTRA_SALINE_EPOCHS))
-    psi_stim_types = sorted(set(s.split("_")[0] for s in NP_ULTRA_PSILO_EPOCHS))
-    df["stimulus_types"] = [sal_stim_types, psi_stim_types] * n_subjects
-
-    df = _parse_dates(df)
-
-    order = [
-        "project_name", "_id", "name", "subject_id", "genotype", "date_of_birth",
-        "sex", "modality", "session_date", "age", "session_time", "session_type",
-        "stimulus_types", "notes",
-    ]
-    return _reorder(df, order)
-
-
-def swdb_metadata_columns(dataset: str) -> list[Column]:
-    """Return column definitions for the given SWDB dataset.
-
-    Args:
-        dataset: One of 'v1dd', 'bci', 'dynamic_foraging', 'np_ultra'.
-
-    Returns:
-        List of Column definitions for the dataset.
-    """
-    common = [
-        Column(name="project_name", description="Project name from data_description"),
-        Column(name="_id", description="MongoDB document ID"),
-        Column(name="name", description="Data asset name"),
-        Column(name="subject_id", description="Subject/mouse ID"),
-        Column(name="genotype", description="Mouse genotype"),
-        Column(name="date_of_birth", description="Date of birth (date)"),
-        Column(name="sex", description="Subject sex"),
-        Column(name="modality", description="Data modality name"),
-        Column(name="session_date", description="Session date (date)"),
-        Column(name="age", description="Age at session in days"),
-        Column(name="session_time", description="Session start time (time)"),
-    ]
-    if dataset == "v1dd":
-        return common + [
-            Column(name="golden_mouse", description="True if subject_id is 409828 (golden mouse)"),
-            Column(name="column", description="V1DD column number extracted from data_description.tags[0]"),
-            Column(name="volume", description="V1DD volume number extracted from data_description.tags[1]"),
-        ]
-    if dataset == "bci":
-        return [
-            Column(name="project_name", description="Project name from data_description"),
-            Column(name="session_type", description="Session type (BCI single neuron stim)"),
-            Column(name="_id", description="MongoDB document ID"),
-            Column(name="name", description="Data asset name"),
-            Column(name="subject_id", description="Subject/mouse ID"),
-            Column(name="genotype", description="Mouse genotype"),
-            Column(name="virus", description="Injection material / virus name"),
-            Column(name="date_of_birth", description="Date of birth (date)"),
-            Column(name="sex", description="Subject sex"),
-            Column(name="modality", description="Data modality name"),
-            Column(name="session_date", description="Session date (date)"),
-            Column(name="age", description="Age at session in days"),
-            Column(name="session_time", description="Session start time (time)"),
-            Column(name="targeted_structure", description="Targeted brain structure"),
-            Column(name="ophys_fov", description="Notes from the ophys field-of-view"),
-            Column(name="session_number", description="BCI conditioning session number"),
-        ]
-    if dataset == "dynamic_foraging":
-        return [
-            Column(name="project_name", description="Project name (Behavior Platform)"),
-            Column(name="name", description="Data asset name"),
-            Column(name="subject_id", description="Subject/mouse ID"),
-            Column(name="genotype", description="Mouse genotype"),
-            Column(name="date_of_birth", description="Date of birth (date)"),
-            Column(name="sex", description="Subject sex"),
-            Column(name="modality", description="Data modality name"),
-            Column(name="session_type", description="Session type / task name"),
-            Column(name="session_date", description="Session date (date)"),
-            Column(name="age", description="Age at session in days"),
-            Column(name="session_time", description="Session start time (time)"),
-            Column(name="trials_total", description="Total number of trials in the session"),
-            Column(name="trials_rewarded", description="Number of rewarded trials in the session"),
-        ]
-    if dataset == "np_ultra":
-        return common + [
-            Column(name="session_type", description="Session type: 'saline' or 'psilocybin'"),
-            Column(name="stimulus_types", description="Unique stimulus type names for the session"),
-            Column(name="notes", description="Notes from session stimulus epochs"),
-        ]
-    return []
diff --git a/tests/acorn_helpers/test_swdb_metadata.py b/tests/acorn_helpers/test_swdb_metadata.py
index e2eee0a..1db8177 100644
--- a/tests/acorn_helpers/test_swdb_metadata.py
+++ b/tests/acorn_helpers/test_swdb_metadata.py
@@ -50,23 +50,21 @@ def test_force_update_replaces_cache(mock_client_class):
     cached = pd.DataFrame({"name": ["old_asset"], "subject_id": ["sub1"]})
     acorns.TREE.hide("swdb_metadata/v1dd", cached)
 
-    mock_client = MagicMock()
-    mock_client_class.return_value = mock_client
-    mock_client.aggregate_docdb_records.return_value = [
-        {
-            "_id": "abc",
-            "name": "v1dd_asset",
+    full_record = {
+        "_id": "abc",
+        "name": "v1dd_asset",
+        "data_description": {
             "subject_id": "sub1",
-            "genotype": "wt",
-            "date_of_birth": "2024-01-01",
-            "sex": "M",
-            "session_time": "2025-03-01T10:00:00",
             "project_name": "V1 Deep Dive",
-            "modality": "SPIM",
-            "column": "Column 1",
-            "volume": "Volume 2",
-        }
-    ]
+            "modalities": [{"name": "SPIM"}],
+            "tags": ["Column 1", "Volume 2"],
+        },
+        "subject": {"subject_details": {"genotype": "wt", "date_of_birth": "2024-01-01", "sex": "M"}},
+        "acquisition": {"acquisition_start_time": "2025-03-01T10:00:00"},
+    }
+    mock_client = MagicMock()
+    mock_client_class.return_value = mock_client
+    mock_client.retrieve_docdb_records.side_effect = [[{"_id": "abc"}], [full_record]]
 
     df = swdb_metadata("v1dd", force_update=True)
     assert len(df) == 1
@@ -78,103 +76,85 @@ def test_force_update_replaces_cache(mock_client_class):
 
 @patch("zombie_squirrel.acorn_helpers.swdb_metadata.MetadataDbClient")
 def test_v1dd_golden_mouse(mock_client_class):
-    mock_client = MagicMock()
-    mock_client_class.return_value = mock_client
-    mock_client.aggregate_docdb_records.return_value = [
-        {
-            "_id": "abc",
-            "name": "v1dd_asset",
+    full_record = {
+        "_id": "abc",
+        "name": "v1dd_asset",
+        "data_description": {
             "subject_id": "409828",
-            "genotype": "wt",
-            "date_of_birth": "2024-01-01",
-            "sex": "M",
-            "session_time": "2025-03-01T10:00:00",
             "project_name": "V1 Deep Dive",
-            "modality": "SPIM",
-            "column": "Column 3",
-            "volume": "Volume 5",
-        }
-    ]
+            "modalities": [{"name": "SPIM"}],
+            "tags": ["Column 3", "Volume 5"],
+        },
+        "subject": {"subject_details": {"genotype": "wt", "date_of_birth": "2024-01-01", "sex": "M"}},
+        "acquisition": {"acquisition_start_time": "2025-03-01T10:00:00"},
+    }
+    mock_client = MagicMock()
+    mock_client_class.return_value = mock_client
+    mock_client.retrieve_docdb_records.side_effect = [[{"_id": "abc"}], [full_record]]
     df = swdb_metadata("v1dd", force_update=True)
     assert df.iloc[0]["golden_mouse"]
 
 
 @patch("zombie_squirrel.acorn_helpers.swdb_metadata.MetadataDbClient")
 def test_dynamic_foraging_deduplication(mock_client_class):
-    mock_client = MagicMock()
-    mock_client_class.return_value = mock_client
-    mock_client.aggregate_docdb_records.return_value = [
-        {
-            "name": "asset_1",
+    record = {
+        "_id": "abc",
+        "name": "asset_1",
+        "data_description": {
             "subject_id": "sub1",
-            "genotype": "wt",
-            "date_of_birth": "2024-01-01",
-            "sex": "F",
-            "session_time": "2025-04-01T09:00:00",
             "project_name": "Behavior Platform",
-            "modality": "behavior",
-            "session_type": "Coupled Baiting",
-            "trials_total": 200,
-            "trials_rewarded": 150,
+            "modalities": [{"name": "behavior"}],
         },
-        {
-            "name": "asset_1",
-            "subject_id": "sub1",
-            "genotype": "wt",
-            "date_of_birth": "2024-01-01",
-            "sex": "F",
-            "session_time": "2025-04-01T09:00:00",
-            "project_name": "Behavior Platform",
-            "modality": "behavior",
-            "session_type": "Coupled Baiting",
-            "trials_total": 200,
-            "trials_rewarded": 150,
+        "subject": {"genotype": "wt", "date_of_birth": "2024-01-01", "sex": "F"},
+        "acquisition": {
+            "acquisition_type": "Coupled Baiting",
+            "acquisition_start_time": "2025-04-01T09:00:00",
         },
-    ]
+        "quality_control": {"status": {"video": "Pass", "behavior": "Pass"}},
+        "session": {
+            "stimulus_epochs": [{"trials_total": 200, "trials_rewarded": 150}],
+        },
+    }
+    mock_client = MagicMock()
+    mock_client_class.return_value = mock_client
+    mock_client.retrieve_docdb_records.side_effect = [[{"_id": "abc"}, {"_id": "abc"}], [record, record]]
     df = swdb_metadata("dynamic_foraging", force_update=True)
     assert len(df) == 1
 
 
 @patch("zombie_squirrel.acorn_helpers.swdb_metadata.MetadataDbClient")
 def test_bci_problem_assets_excluded(mock_client_class):
+    def _bci_record(_id, name, subject_id, session_time):
+        return {
+            "_id": _id,
+            "name": name,
+            "data_description": {
+                "subject_id": subject_id,
+                "project_name": "BCI",
+                "modalities": [{"name": "ophys"}],
+            },
+            "subject": {"genotype": "wt", "date_of_birth": "2024-06-01", "sex": "M"},
+            "acquisition": {
+                "acquisition_type": "BCI single neuron stim",
+                "acquisition_start_time": session_time,
+            },
+            "procedures": {"subject_procedures": [{"procedures": [{"injection_materials": [{"name": "AAV"}]}]}]},
+            "session": {
+                "data_streams": [{"stack_parameters": {"targeted_structure": "V1"}, "ophys_fovs": [{"notes": "note"}]}],
+                "stimulus_epochs": [],
+            },
+        }
+
+    problem = _bci_record(
+        "abc",
+        "single-plane-ophys_731015_2025-01-28_17-40-57_processed_2025-08-04_04-38-08",
+        "731015",
+        "2025-01-28T17:40:57",
+    )
+    good = _bci_record("def", "good_asset", "sub2", "2025-05-01T12:00:00")
     mock_client = MagicMock()
     mock_client_class.return_value = mock_client
-    mock_client.aggregate_docdb_records.return_value = [
-        {
-            "_id": "abc",
-            "name": "single-plane-ophys_731015_2025-01-28_17-40-57_processed_2025-08-04_04-38-08",
-            "subject_id": "731015",
-            "genotype": "wt",
-            "date_of_birth": "2024-06-01",
-            "sex": "M",
-            "session_time": "2025-01-28T17:40:57",
-            "project_name": "BCI",
-            "modality": "ophys",
-            "session_type": "BCI single neuron stim",
-            "virus": "AAV",
-            "targeted_structure": "V1",
-            "ophys_fov": "note",
-            "session_number": 1,
-            "stimulus_epochs": [],
-        },
-        {
-            "_id": "def",
-            "name": "good_asset",
-            "subject_id": "sub2",
-            "genotype": "wt",
-            "date_of_birth": "2024-06-01",
-            "sex": "F",
-            "session_time": "2025-05-01T12:00:00",
-            "project_name": "BCI",
-            "modality": "ophys",
-            "session_type": "BCI single neuron stim",
-            "virus": "AAV",
-            "targeted_structure": "V1",
-            "ophys_fov": "note",
-            "session_number": 2,
-            "stimulus_epochs": [],
-        },
-    ]
+    mock_client.retrieve_docdb_records.side_effect = [[{"_id": "abc"}, {"_id": "def"}], [problem, good]]
     df = swdb_metadata("bci", force_update=True)
     assert "single-plane-ophys_731015" not in df["name"].values
     assert "good_asset" in df["name"].values
@@ -182,36 +162,27 @@ def test_bci_problem_assets_excluded(mock_client_class):
 
 @patch("zombie_squirrel.acorn_helpers.swdb_metadata.MetadataDbClient")
 def test_np_ultra_session_types(mock_client_class):
+    def _np_record(_id, name, session_time):
+        return {
+            "_id": _id,
+            "name": name,
+            "data_description": {
+                "subject_id": "sub1",
+                "project_name": "NP Ultra and Psychedelics",
+                "modalities": [{"name": "ecephys"}],
+            },
+            "subject": {"genotype": "wt", "date_of_birth": "2024-01-15", "sex": "M"},
+            "acquisition": {"acquisition_start_time": session_time},
+            "session": {"stimulus_epochs": []},
+        }
+
+    records = [
+        _np_record("a1", "np_asset_1", "2025-02-01T10:00:00"),
+        _np_record("a2", "np_asset_2", "2025-03-01T10:00:00"),
+    ]
     mock_client = MagicMock()
     mock_client_class.return_value = mock_client
-    mock_client.aggregate_docdb_records.return_value = [
-        {
-            "_id": "a1",
-            "name": "np_asset_1",
-            "subject_id": "sub1",
-            "genotype": "wt",
-            "date_of_birth": "2024-01-15",
-            "sex": "M",
-            "session_time": "2025-02-01T10:00:00",
-            "project_name": "NP Ultra and Psychedelics",
-            "modality": "ecephys",
-            "stimulus_epochs": [],
-            "notes": [],
-        },
-        {
-            "_id": "a2",
-            "name": "np_asset_2",
-            "subject_id": "sub1",
-            "genotype": "wt",
-            "date_of_birth": "2024-01-15",
-            "sex": "M",
-            "session_time": "2025-03-01T10:00:00",
-            "project_name": "NP Ultra and Psychedelics",
-            "modality": "ecephys",
-            "stimulus_epochs": [],
-            "notes": [],
-        },
-    ]
+    mock_client.retrieve_docdb_records.side_effect = [[{"_id": "a1"}, {"_id": "a2"}], records]
     df = swdb_metadata("np_ultra", force_update=True)
     assert list(df["session_type"]) == ["saline", "psilocybin"]
     assert isinstance(df.iloc[0]["stimulus_types"], list)