Skip to content

Commit e07cc52

Browse files
committed
Add LibriBrain100 — unified loader for LibriBrain + LibriBrain2
LibriBrain100 is the user-facing dataset for the full LibriBrain release: a virtual union of pnpl/LibriBrain (sub-0 × Sherlock1..7) and pnpl/LibriBrain2 (sub-0 × Sherlock8/Sherlock9/TIMIT/MOCHATIMIT/Podcasts, plus sub-1..32 × Sherlock1 ses-11/ses-12). Manifest-driven, keeps the existing task-based API, adds two selectors: - subjects= "all"|"deep"|"new"|0|"sub-0"|[1,2,3]|range(1, 33) - corpus= "all"|"sherlock"|"timit"|"mocha"|"podcasts"|[...] Implementation notes: - manifest.py declares one RunRecord per (subject, session, task, run) tuple expected by the paper, tagging each with corpus, repo, and partition. Records that the upload hasn't reached yet are tolerated at runtime — the loader skips missing H5s with a single grouped warning, so the API is usable while LibriBrain2 fills in. - selectors.py validates obvious dead-end combinations up front (e.g. subjects='new' has only Sherlock data and only val/test by design — the paper assigns no train partition to broad subjects). - base.py overrides _schedule_download to pick the per-record primary HF repo, so most fetches hit on the first try; the inherited HFDownloadMixin retry chain falls back to the other repo. Catches huggingface_hub.errors.{Entry,Repository}NotFound in addition to FileNotFoundError so missing-but-coming records don't crash through the download stack. - LibriBrain100 + LibriBrain100Speech/Phoneme/Word wrappers are exposed at pnpl.datasets; existing LibriBrain* classes are unchanged. Tests: - 25 rudimentary tests in tests/test_libribrain100.py (selector normalization + manifest invariants + validation rules + the download-disabled error path). - A comprehensive on-VM suite at /workspace/libribrain100-tests/ (60 tests, including live HF tree fingerprints, multi-repo download, full-DataLoader walks) — not in this repo. Verified end-to-end on the validation VM: Sherlock1 ses-11 download + sample materialization, MOCHATIMIT ses-1 download (different repo), graceful skip when a corpus is not yet uploaded.
1 parent 4d4f650 commit e07cc52

10 files changed

Lines changed: 2019 additions & 1 deletion

File tree

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,8 @@
33
*.egg-info/
44
scripts/*
55
.DS_Store
6-
.idea
6+
.idea
7+
.claude/
8+
9+
# Dataset paper draft — local only.
10+
/libribrain100-paper/

pnpl/datasets/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@
2020
"LibriBrainWord": ("pnpl.datasets.libribrain2025.compat", "LibriBrainWord"),
2121
"LibriBrainSentence": ("pnpl.datasets.libribrain2025.sentence_dataset", "LibriBrainSentence"),
2222

23+
# LibriBrain100 — union of pnpl/LibriBrain + pnpl/LibriBrain2
24+
"LibriBrain100": ("pnpl.datasets.libribrain100.dataset", "LibriBrain100"),
25+
"LibriBrain100Speech": ("pnpl.datasets.libribrain100.compat", "LibriBrain100Speech"),
26+
"LibriBrain100Phoneme": ("pnpl.datasets.libribrain100.compat", "LibriBrain100Phoneme"),
27+
"LibriBrain100Word": ("pnpl.datasets.libribrain100.compat", "LibriBrain100Word"),
28+
2329
# MEG-MASC (Gwilliams et al., 2022) — auto-downloads from OSF
2430
"Gwilliams2022": ("pnpl.datasets.gwilliams2022.dataset", "Gwilliams2022"),
2531

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
"""
2+
LibriBrain100 — unified loader for LibriBrain + LibriBrain2.
3+
4+
LibriBrain100 is the user-facing PyTorch Dataset wrapping the full
5+
LibriBrain release: a virtual union of the original
6+
``pnpl/LibriBrain`` Hugging Face dataset and the
7+
``pnpl/LibriBrain2`` extension. The deep component (~80h, sub-0)
8+
covers the entire Sherlock Holmes canon plus TIMIT, MOCHA-TIMIT, and
9+
30 'The Moth' podcasts; the broad component (~44 min × 32 subjects)
10+
covers Sherlock book 1 chapters 11 and 12.
11+
12+
Public API:
13+
14+
from pnpl.datasets import (
15+
LibriBrain100,
16+
LibriBrain100Speech,
17+
LibriBrain100Phoneme,
18+
LibriBrain100Word,
19+
)
20+
"""
21+
22+
from __future__ import annotations
23+
24+
from .compat import LibriBrain100Phoneme, LibriBrain100Speech, LibriBrain100Word
25+
from .constants import (
26+
CORPORA,
27+
CORPUS_MOCHA,
28+
CORPUS_PODCASTS,
29+
CORPUS_SHERLOCK,
30+
CORPUS_TIMIT,
31+
DEEP_SUBJECT,
32+
NEW_SUBJECTS,
33+
PARTITION_TEST,
34+
PARTITION_TRAIN,
35+
PARTITION_VALIDATION,
36+
PARTITIONS,
37+
SUBJECTS,
38+
)
39+
from .dataset import LibriBrain100
40+
from .manifest import (
41+
RUN_KEYS,
42+
RUN_RECORDS,
43+
RunRecord,
44+
TEST_RUN_KEYS,
45+
VALIDATION_RUN_KEYS,
46+
get_record,
47+
select_records,
48+
)
49+
from .selectors import (
50+
normalize_corpus,
51+
normalize_partition,
52+
normalize_subjects,
53+
validate_selector_combination,
54+
)
55+
56+
__all__ = [
57+
# Dataset classes
58+
"LibriBrain100",
59+
"LibriBrain100Speech",
60+
"LibriBrain100Phoneme",
61+
"LibriBrain100Word",
62+
# Manifest
63+
"RunRecord",
64+
"RUN_RECORDS",
65+
"RUN_KEYS",
66+
"VALIDATION_RUN_KEYS",
67+
"TEST_RUN_KEYS",
68+
"get_record",
69+
"select_records",
70+
# Selectors / normalisation
71+
"normalize_subjects",
72+
"normalize_corpus",
73+
"normalize_partition",
74+
"validate_selector_combination",
75+
# Constants
76+
"DEEP_SUBJECT",
77+
"SUBJECTS",
78+
"NEW_SUBJECTS",
79+
"CORPORA",
80+
"CORPUS_SHERLOCK",
81+
"CORPUS_TIMIT",
82+
"CORPUS_MOCHA",
83+
"CORPUS_PODCASTS",
84+
"PARTITIONS",
85+
"PARTITION_TRAIN",
86+
"PARTITION_VALIDATION",
87+
"PARTITION_TEST",
88+
]

0 commit comments

Comments
 (0)