fix: add chunking to get past memory issues for fib_traces

dbirman · dbirman · commit 45797d2a6cbe · 2026-06-30T09:23:29.000-07:00
diff --git a/src/biodata_cache/backend.py b/src/biodata_cache/backend.py
@@ -131,12 +131,60 @@ def read(self, table_name: str | list[str]) -> pd.DataFrame:
             return self._read_multiple(table_name)
         return self._read_single(table_name)
 
+    def clear_partition(self, table_name: str) -> None:
+        """Delete all parquet chunk files in a hive partition."""
+        if "/" not in table_name:
+            return
+        base, value = table_name.split("/", 1)
+        partition_key = HIVE_PARTITION_KEYS[base]
+        prefix = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}/{partition_key}={value}/"
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        to_delete = []
+        for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix):
+            for obj in page.get("Contents", []):
+                to_delete.append({"Key": obj["Key"]})
+        for i in range(0, len(to_delete), 1000):
+            self.s3_client.delete_objects(
+                Bucket=self.bucket,
+                Delete={"Objects": to_delete[i : i + 1000]},
+            )
+
+    def write_chunk(self, table_name: str, data: pd.DataFrame, chunk_idx: int) -> None:
+        """Append one numbered parquet chunk to a hive partition."""
+        base, value = table_name.split("/", 1)
+        partition_key = HIVE_PARTITION_KEYS[base]
+        s3_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}/{partition_key}={value}/data_{chunk_idx:04d}.pqt"
+        json_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}.json"
+
+        parquet_buffer = io.BytesIO()
+        table = pa.Table.from_pandas(data, preserve_index=False)
+        float_cols = [f.name for f in table.schema if pa.types.is_floating(f.type)]
+        dict_cols = [f.name for f in table.schema if f.name not in float_cols]
+        pq.write_table(
+            table,
+            parquet_buffer,
+            compression="zstd",
+            use_dictionary=dict_cols if dict_cols else False,
+            column_encoding={col: "BYTE_STREAM_SPLIT" for col in float_cols} or None,
+        )
+        parquet_buffer.seek(0)
+        self.s3_client.put_object(Bucket=self.bucket, Key=s3_key, Body=parquet_buffer.getvalue())
+        logging.info(
+            CacheLogMessage(
+                backend="S3Backend", table=table_name, message=f"Stored chunk {chunk_idx} to s3://{self.bucket}/{s3_key}"
+            ).to_json()
+        )
+        metadata = {"columns": data.columns.tolist()}
+        self.s3_client.put_object(
+            Bucket=self.bucket, Key=json_key, Body=json.dumps(metadata)
+        )
+
     def _read_single(self, table_name: str) -> pd.DataFrame:
         """Fetch a single table from S3."""
         if "/" in table_name:
             base, value = table_name.split("/", 1)
             partition_key = HIVE_PARTITION_KEYS[base]
-            s3_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}/{partition_key}={value}/data.pqt"
+            s3_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{base}/{partition_key}={value}/data*.pqt"
         else:
             s3_key = f"{_CACHE_ROOT}/{_VERSION_FOLDER}/{table_name}.pqt"
 
@@ -315,6 +363,17 @@ def get_versions_index(self) -> list[str]:
         """Return the list of all available version folders from the in-memory index."""
         return json.loads(self._json_store.get("cache_versions.json", "[]"))
 
+    def clear_partition(self, table_name: str) -> None:
+        """Remove all chunks stored for a partitioned table."""
+        self._store.pop(table_name, None)
+
+    def write_chunk(self, table_name: str, data: pd.DataFrame, chunk_idx: int) -> None:
+        """Append one chunk to the in-memory store for a partitioned table."""
+        existing = self._store.get(table_name, pd.DataFrame())
+        self._store[table_name] = (
+            pd.concat([existing, data], ignore_index=True) if not existing.empty else data.copy()
+        )
+
     def _read_multiple(self, table_names: list[str]) -> pd.DataFrame:
         """Fetch and merge multiple tables from memory."""
         dfs = []
diff --git a/src/biodata_cache/cache_table_helpers/platform_fib_traces.py b/src/biodata_cache/cache_table_helpers/platform_fib_traces.py
@@ -27,6 +27,7 @@
 _FALLBACK_METHOD = "dff-bright"
 _S3_URI_RE = re.compile(r"^s3://([^/]+)/(.+)$")
 _MAX_WORKERS = 32
+_CHUNK_SIZE = 10
 
 
 def _log(message: str) -> None:
@@ -151,7 +152,12 @@ def _extract_session_traces(root, asset_name: str, subject_id: str) -> pd.DataFr
 
 
 def _fetch_subject_fib_traces(subject_id: str) -> pd.DataFrame:
-    """Fetch and cache all processed dF/F traces for a subject from S3 NWB files."""
+    """Fetch and cache all processed dF/F traces for a subject from S3 NWB files.
+
+    Sessions are flushed to storage every ``_CHUNK_SIZE`` sessions so the
+    in-memory footprint stays bounded. Returns an empty DataFrame; callers
+    should read back from the backend.
+    """
     setup_logging()
     cache_key = f"{registry.NAMES['fib_traces']}/{subject_id}"
     _log(f"Updating cache for subject {subject_id}")
@@ -165,8 +171,13 @@ def _fetch_subject_fib_traces(subject_id: str) -> pd.DataFrame:
     ]
     subject_assets = subject_assets[subject_assets["data_level"] == "derived"]
 
-    frames = []
+    registry.BACKEND.clear_partition(cache_key)
+
+    frames: list[pd.DataFrame] = []
+    chunk_idx = 0
+    n_sessions = 0
+
     for _, row in subject_assets.iterrows():
         location = row["location"]
         if not location:
             continue
@@ -175,16 +186,34 @@ def _fetch_subject_fib_traces(subject_id: str) -> pd.DataFrame:
             _log(f"No NWB file found for asset {row['name']}")
             continue
         session_df = _extract_session_traces(root, row["name"], subject_id)
+        del root
         if not session_df.empty:
             frames.append(session_df)
+            n_sessions += 1
 
-    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
-    if not df.empty:
-        df = df.sort_values(["asset_name", "channel", "timestamp", "fiber"]).reset_index(drop=True)
+        # Flush on buffer size only; the tail is flushed after the loop because
+        # the ``continue`` guards above can skip this point on the last asset.
+        if len(frames) >= _CHUNK_SIZE:
+            _write_sorted_chunk(cache_key, frames, chunk_idx)
+            chunk_idx += 1
 
-    _log(f"Cached fib traces for subject {subject_id} ({len(frames)} sessions, {len(df)} samples)")
-    registry.BACKEND.write(cache_key, df)
-    return df
+    if frames:
+        _write_sorted_chunk(cache_key, frames, chunk_idx)
+
+    _log(f"Cached fib traces for subject {subject_id} ({n_sessions} sessions)")
+    return pd.DataFrame()
+
+
+def _write_sorted_chunk(cache_key: str, frames: list[pd.DataFrame], chunk_idx: int) -> None:
+    """Sort and store the buffered session frames as one chunk, then empty the buffer.
+
+    ``frames`` is cleared in place as soon as it has been concatenated so the
+    per-session frames can be released before the sort and upload.
+    """
+    chunk_df = pd.concat(frames, ignore_index=True)
+    frames.clear()
+    chunk_df = chunk_df.sort_values(["asset_name", "channel", "timestamp", "fiber"]).reset_index(drop=True)
+    registry.BACKEND.write_chunk(cache_key, chunk_df, chunk_idx)
 
 
 @registry.register_table(registry.NAMES["fib_traces"])
@@ -228,6 +257,8 @@ def platform_fib_traces(
 
     if force_update:
         df = _fetch_subject_fib_traces(subject_id)
+        if df.empty:
+            df = registry.BACKEND.read(cache_key)
 
     return df
 
diff --git a/tests/cache_table_helpers/test_platform_fib_traces.py b/tests/cache_table_helpers/test_platform_fib_traces.py
@@ -191,9 +191,9 @@ def test_fetch_subject_filters_and_writes(mock_basics, mock_extract, mock_open,
 
     # Only the single derived fib asset for subject 856239 is processed.
     mock_open.assert_called_once_with("s3://bucket/abc")
-    mock_registry.BACKEND.write.assert_called_once()
-    assert mock_registry.BACKEND.write.call_args[0][0] == "platform_fib_traces/856239"
-    assert not result.empty
+    mock_registry.BACKEND.write_chunk.assert_called_once()
+    assert mock_registry.BACKEND.write_chunk.call_args[0][0] == "platform_fib_traces/856239"
+    assert result.empty
 
 
 @patch("biodata_cache.cache_table_helpers.platform_fib_traces.registry")
diff --git a/tests/test_backend.py b/tests/test_backend.py
@@ -187,7 +187,7 @@ def test_s3_scurry_partitioned_table(mock_boto3_client, mock_duckdb_query):
     mock_result.to_df.return_value = expected_df
     mock_duckdb_query.return_value = mock_result
     result = S3Backend().read("qc/subject123")
-    assert f"data-asset-cache/{_VF}/qc/subject_id=subject123/data.pqt" in mock_duckdb_query.call_args[0][0]
+    assert f"data-asset-cache/{_VF}/qc/subject_id=subject123/data*.pqt" in mock_duckdb_query.call_args[0][0]
     pd.testing.assert_frame_equal(result, expected_df)