Remove pickle use from SCDL [BIO-408] (#1545)

jstjohn · web-flow · commit dfd83a7db92a · 2026-04-03T15:27:33.000Z
### Description

Remove the `allow_pickle=True` option when loading the feature index
labels (`labels.npy`) in SCDL.

### Type of changes

<!-- Mark the relevant option with an [x] -->

- [x] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Refactor
- [ ] Documentation update
- [ ] Other (please describe):

### Pre-submit Checklist

<!--- Ensure all items are completed before submitting -->

- [x] I have tested these changes locally
- [ ] I have updated the documentation accordingly
- [x] I have added/updated tests as needed
- [x] All existing tests pass successfully


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **New Features**
  * Dataset labels are now stored in JSON format for improved portability
    and safety.

* **Bug Fixes**
  * Enhanced error handling with clearer messages for missing or corrupted
    label files.
  * Added backward compatibility with deprecation warnings for legacy
    label formats.

* **Tests**
  * Added tests for label format migration and error scenarios.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: John St John <jstjohn@nvidia.com>
Signed-off-by: John St. John <jstjohn@nvidia.com>
diff --git a/sub-packages/bionemo-scdl/src/bionemo/scdl/index/row_feature_index.py b/sub-packages/bionemo-scdl/src/bionemo/scdl/index/row_feature_index.py
@@ -30,6 +30,8 @@
 from __future__ import annotations
 
 import importlib.metadata
+import json
+import warnings
 from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Optional, Sequence, Tuple
@@ -143,7 +145,31 @@ def _load_common(datapath: str, instance: "RowFeatureIndex") -> "RowFeatureIndex
         for features in instance._feature_arr:
             instance._extend_num_entries_per_row(features)
         instance._cumulative_sum_index = np.load(Path(datapath) / "cumulative_sum_index.npy")
-        instance._labels = np.load(Path(datapath) / "labels.npy", allow_pickle=True)
+        labels_json_path = Path(datapath) / "labels.json"
+        legacy_labels_npy_path = Path(datapath) / "labels.npy"
+        if labels_json_path.exists():
+            with open(labels_json_path) as f:
+                instance._labels = json.load(f)
+        elif legacy_labels_npy_path.exists():
+            warnings.warn(
+                f"Found legacy labels.npy in '{datapath}'. This format is deprecated due to a "
+                "security vulnerability (arbitrary code execution via pickle deserialization). "
+                "To re-index, load this dataset and call .save() to write the new labels.json format. "
+                "Support for labels.npy will be removed in a future release.",
+                FutureWarning,
+                stacklevel=3,
+            )
+            try:
+                instance._labels = list(np.load(legacy_labels_npy_path, allow_pickle=False))
+            except ValueError:
+                raise ValueError(
+                    f"Cannot safely load labels.npy in '{datapath}' because it contains pickled objects. "
+                    "This is a security risk and is no longer supported. To migrate, re-create the dataset "
+                    "from source (e.g. re-run your h5ad-to-SCDL conversion) so that labels are saved in "
+                    "the new JSON format."
+                )
+        else:
+            raise FileNotFoundError(f"No labels file found in {datapath}. Expected labels.json or labels.npy.")
         instance._version = np.load(Path(datapath) / "version.npy").item()
         return instance
 
@@ -305,7 +331,8 @@ def save(self, datapath: str) -> None:
             dataframe_str_index = f"{index:0{num_digits}d}"
             pq.write_table(table, f"{datapath}/dataframe_{dataframe_str_index}.parquet")
         np.save(Path(datapath) / "cumulative_sum_index.npy", self._cumulative_sum_index)
-        np.save(Path(datapath) / "labels.npy", self._labels)
+        with open(Path(datapath) / "labels.json", "w") as f:
+            json.dump([str(label) if label is not None else None for label in self._labels], f)
         np.save(Path(datapath) / "version.npy", np.array(self._version))
 
 
diff --git a/sub-packages/bionemo-scdl/src/bionemo/scdl/io/single_cell_memmap_dataset.py b/sub-packages/bionemo-scdl/src/bionemo/scdl/io/single_cell_memmap_dataset.py
@@ -985,7 +985,7 @@ def _write_header(self):
             features_rel_path = f"{feature_index_path}"
             index_files: List[str] = [
                 f"{features_rel_path}/cumulative_sum_index.npy",
-                f"{features_rel_path}/labels.npy",
+                f"{features_rel_path}/labels.json",
                 f"{features_rel_path}/version.npy",
             ]
             if num_frames > 0:
diff --git a/sub-packages/bionemo-scdl/tests/bionemo/scdl/index/test_obs_feature_index.py b/sub-packages/bionemo-scdl/tests/bionemo/scdl/index/test_obs_feature_index.py
@@ -13,6 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
+import warnings
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -261,3 +264,67 @@ def testObeservedFetureIndex_getitem_slice_with_step_and_order_preserved(make_fe
         assert set(actual.keys()) == set(exp.keys())
         for k in exp:
             assert np.array_equal(actual[k], exp[k])
+
+
+def test_load_prefers_json_over_npy(tmp_path, make_feat_dictionary):
+    """When both labels.json and labels.npy exist, load should use labels.json and emit no FutureWarning."""
+    idx = ObservedFeatureIndex()
+    idx.append_features(make_feat_dictionary(2, 3), label="A")
+    idx.save(tmp_path / "features")
+
+    # Write an extra legacy labels.npy alongside the existing labels.json
+    legacy_labels = np.array(["WRONG_LABEL"])
+    np.save(tmp_path / "features" / "labels.npy", legacy_labels)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", FutureWarning)
+        reloaded = ObservedFeatureIndex.load(tmp_path / "features")
+
+    assert reloaded.number_of_rows() == idx.number_of_rows()
+    # Verify it used the JSON data (label "A"), not the npy data ("WRONG_LABEL")
+    _, labels = reloaded[0 : reloaded.number_of_rows()]
+    assert labels == ["A"]
+
+
+def test_load_legacy_labels_npy_emits_future_warning(tmp_path, make_feat_dictionary):
+    """Loading an index saved with the old labels.npy format should emit a FutureWarning."""
+    idx = ObservedFeatureIndex()
+    idx.append_features(make_feat_dictionary(2, 3), label="A")
+    idx.save(tmp_path / "features")
+    # Replace labels.json with a legacy labels.npy
+    labels_json = tmp_path / "features" / "labels.json"
+    labels_npy = tmp_path / "features" / "labels.npy"
+    with open(labels_json) as f:
+        labels = json.load(f)
+    np.save(labels_npy, np.array([l if l is not None else "" for l in labels]))
+    labels_json.unlink()
+
+    with pytest.warns(FutureWarning, match="legacy labels.npy"):
+        reloaded = ObservedFeatureIndex.load(tmp_path / "features")
+    assert reloaded.number_of_rows() == idx.number_of_rows()
+
+
+def test_load_legacy_labels_npy_with_pickle_raises(tmp_path, make_feat_dictionary):
+    """Loading a labels.npy that requires pickle should raise ValueError."""
+    idx = ObservedFeatureIndex()
+    idx.append_features(make_feat_dictionary(2, 3), label="A")
+    idx.save(tmp_path / "features")
+    # Replace labels.json with a labels.npy containing an object array (requires pickle)
+    labels_json = tmp_path / "features" / "labels.json"
+    labels_npy = tmp_path / "features" / "labels.npy"
+    np.save(labels_npy, np.array([None, "A"], dtype=object), allow_pickle=True)
+    labels_json.unlink()
+
+    with pytest.raises(ValueError, match="contains pickled objects"):
+        ObservedFeatureIndex.load(tmp_path / "features")
+
+
+def test_load_missing_labels_file_raises(tmp_path, make_feat_dictionary):
+    """Loading an index with no labels file should raise FileNotFoundError."""
+    idx = ObservedFeatureIndex()
+    idx.append_features(make_feat_dictionary(2, 3), label="A")
+    idx.save(tmp_path / "features")
+    (tmp_path / "features" / "labels.json").unlink()
+
+    with pytest.raises(FileNotFoundError, match="No labels file found"):
+        ObservedFeatureIndex.load(tmp_path / "features")

Original file line number	Diff line number	Diff line change
`@@ -985,7 +985,7 @@ def _write_header(self):`
`985`	`985`	`features_rel_path = f"{feature_index_path}"`
`986`	`986`	`index_files: List[str] = [`
`987`	`987`	`f"{features_rel_path}/cumulative_sum_index.npy",`
`988`		`- f"{features_rel_path}/labels.npy",`
	`988`	`+ f"{features_rel_path}/labels.json",`
`989`	`989`	`f"{features_rel_path}/version.npy",`
`990`	`990`	`]`
`991`	`991`	`if num_frames > 0:`