Commit f46bc88

LibriBrain100: align manifest with finalized HF upload
Fixes from the on-VM fingerprint test now that pnpl/LibriBrain2 has finished uploading sub-0 across every corpus:

- Podcasts task token is "TheMoth" on disk (not "Podcasts"). The user-facing corpus="podcasts" alias is unchanged; only the internal task token / HF folder name moves.
- Sherlock9 starts at ses-0 (the LibriVox preface track), giving 13 sessions instead of 12.
- TIMIT has 14 sessions (not 10). Per the paper, TIMIT splits are utterance-level (24-speaker core test, 50-speaker Kaldi dev), so every TIMIT session is assigned to train; finer splits will surface from event-level filtering once the per-utterance metadata in events.tsv is finalized.

After this change, the on-VM fingerprint reports 100% manifest / upload parity for sub-0 across MOCHATIMIT / Sherlock8 / Sherlock9 / TIMIT / TheMoth. The 32 broad-subject Sherlock1 ses-11/ses-12 records are still pending; the loader skips them with a single grouped warning.
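The "single grouped warning" behavior described above can be sketched as follows. This is a minimal illustration, not the actual pnpl loader API: the `filter_available` helper, the `(task, session)` tuple shape, and the warning text are all assumptions.

```python
import warnings


def filter_available(records, available):
    """Keep (task, session) records present in the upload; warn once for the rest.

    Hypothetical sketch -- the real pnpl loader's record type and API may differ.
    """
    kept = [rec for rec in records if rec in available]
    missing = [rec for rec in records if rec not in available]
    if missing:
        # Emit one grouped warning instead of one warning per missing session.
        details = ", ".join(f"{task} ses-{ses}" for task, ses in missing)
        warnings.warn(f"Skipping {len(missing)} pending records: {details}")
    return kept
```

Grouping the skips keeps the console readable when a whole block of sessions (like Sherlock1 ses-11/ses-12) is still uploading.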
1 parent f74795a commit f46bc88

2 files changed

Lines changed: 14 additions & 30 deletions

pnpl/datasets/libribrain100/constants.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@

 TIMIT_TASK = "TIMIT"
 MOCHATIMIT_TASK = "MOCHATIMIT"
-PODCASTS_TASK = "Podcasts"
+PODCASTS_TASK = "TheMoth"  # HF folder name; user-facing corpus is "podcasts"

 # Map task token → corpus.
 TASK_TO_CORPUS: dict[str, str] = {
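The token/alias split that this hunk relies on can be illustrated with a small sketch. Only `PODCASTS_TASK = "TheMoth"` and the `TASK_TO_CORPUS` name appear in the diff; the corpus strings, the `CORPUS_TO_TASK` inverse, and the `resolve_task` helper are assumptions for illustration.

```python
# Illustrative sketch of internal HF task tokens vs. user-facing corpus aliases.
# Only PODCASTS_TASK and TASK_TO_CORPUS come from the diff; the rest is assumed.
TIMIT_TASK = "TIMIT"
MOCHATIMIT_TASK = "MOCHATIMIT"
PODCASTS_TASK = "TheMoth"  # HF folder name; user-facing corpus is "podcasts"

TASK_TO_CORPUS: dict[str, str] = {
    TIMIT_TASK: "timit",
    MOCHATIMIT_TASK: "mochatimit",
    PODCASTS_TASK: "podcasts",  # alias survives the on-disk rename
}

# Hypothetical inverse map so user code never needs the on-disk token.
CORPUS_TO_TASK = {corpus: task for task, corpus in TASK_TO_CORPUS.items()}


def resolve_task(corpus: str) -> str:
    """Map a user-facing corpus name to the internal HF task token."""
    return CORPUS_TO_TASK[corpus.lower()]
```

With this indirection, callers keep writing `corpus="podcasts"` while the loader quietly resolves it to the `TheMoth` folder on HF.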

pnpl/datasets/libribrain100/manifest.py

Lines changed: 13 additions & 29 deletions
@@ -135,18 +135,14 @@ def _build_libribrain_sherlock_records() -> list[RunRecord]:
 # ---------------------------------------------------------------------------
 # Sherlock8 / Sherlock9 — sub-0, LibriBrain2 release.
 #
-# The HF tree shows Sherlock8 with ses-1..10 currently. Sherlock9 is
-# expected to follow the same one-session-per-LibriVox-track pattern as
-# the other books in the canon; we approximate with ses-1..12 and
-# tolerate gaps at runtime. Both are entirely train (all-canon Sherlock
-# val/test live in book 1 already).
+# Sherlock8 has ses-1..10. Sherlock9 starts at ses-0 (the book's preface
+# track on LibriVox) and runs through ses-12. Both are entirely train
+# (all-canon Sherlock val/test live in book 1 already).
 # ---------------------------------------------------------------------------

 _LIBRIBRAIN2_SHERLOCK_SESSION_RUNS: dict[str, tuple[tuple[str, str], ...]] = {
     "Sherlock8": tuple((str(i), "1") for i in range(1, 11)),
-    # Sherlock9 session count is approximate; will be reconciled with the
-    # final upload. Missing sessions are skipped at load time.
-    "Sherlock9": tuple((str(i), "1") for i in range(1, 13)),
+    "Sherlock9": tuple((str(i), "1") for i in range(0, 13)),
 }


@@ -171,32 +167,20 @@ def _build_libribrain2_sherlock_records() -> list[RunRecord]:
 # ---------------------------------------------------------------------------
 # TIMIT — sub-0, LibriBrain2 release.
 #
-# Per the paper (Sec. 3.2), the standard split follows TIMIT's official
-# core test set + Kaldi 50-speaker dev set, applied at the utterance
-# level. The MEG-side session count is not yet visible on HF; we
-# placeholder with ses-1..10 and assign one session each to val/test.
-# Final per-utterance filtering will happen at the events level once
-# the upload exposes the per-utterance metadata.
+# 14 sessions on HF. Per the paper (Sec. 3.2), the standard TIMIT split
+# is utterance-level (24-speaker core test, 50-speaker Kaldi dev), so
+# session-level partitioning here would be too coarse. We assign every
+# session to ``train`` and rely on event-level filtering (in events.tsv
+# rows) to surface the standard val/test subsets — to be wired up once
+# the per-utterance metadata is finalized in the released events files.
 # ---------------------------------------------------------------------------

-_TIMIT_TRAIN_SESSIONS = tuple(str(i) for i in range(1, 9))
-_TIMIT_VAL_SESSIONS = ("9",)
-_TIMIT_TEST_SESSIONS = ("10",)
-
-
-def _timit_partition(session: str) -> str:
-    if session in _TIMIT_VAL_SESSIONS:
-        return PARTITION_VALIDATION
-    if session in _TIMIT_TEST_SESSIONS:
-        return PARTITION_TEST
-    return PARTITION_TRAIN
+_TIMIT_SESSIONS = tuple(str(i) for i in range(1, 15))


 def _build_timit_records() -> list[RunRecord]:
     out: list[RunRecord] = []
-    for ses in (
-        _TIMIT_TRAIN_SESSIONS + _TIMIT_VAL_SESSIONS + _TIMIT_TEST_SESSIONS
-    ):
+    for ses in _TIMIT_SESSIONS:
         out.append(
             RunRecord(
                 subject=DEEP_SUBJECT,
@@ -205,7 +189,7 @@ def _build_timit_records() -> list[RunRecord]:
                 run="1",
                 corpus=CORPUS_TIMIT,
                 repo=REPO_KEY_LIBRIBRAIN2,
-                partition=_timit_partition(ses),
+                partition=PARTITION_TRAIN,
             )
         )
     return out
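The event-level filtering the TIMIT comment defers to could look roughly like this once the per-utterance metadata lands. The `timit_split` column name and its label values are assumptions about the not-yet-finalized events.tsv schema, not the released format.

```python
import csv
import io

# Hypothetical events.tsv fragment; the "timit_split" column and its labels
# are assumptions about the per-utterance metadata, which is not finalized.
EVENTS_TSV = (
    "onset\tduration\ttimit_split\n"
    "0.0\t2.1\ttrain\n"
    "2.5\t1.8\tcore_test\n"
    "4.6\t2.0\tkaldi_dev\n"
)

# Partition name -> hypothetical per-utterance split label.
SPLIT_LABEL = {"train": "train", "validation": "kaldi_dev", "test": "core_test"}


def events_for_partition(tsv_text: str, partition: str) -> list[dict]:
    """Return only the event rows whose utterance belongs to the partition."""
    rows = csv.DictReader(io.StringIO(tsv_text), delimiter="\t")
    return [row for row in rows if row["timit_split"] == SPLIT_LABEL[partition]]
```

Filtering at the event level keeps every session in `train` at the manifest layer while still letting a dataset class carve out the standard 24-speaker core test and 50-speaker Kaldi dev subsets utterance by utterance.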
