Merge pull request #4127 from h-mayorquin/improve_phy_and_kilosort_extractors

alejoe91 · web-flow · commit b7718219b759 · 2025-09-04T14:49:21.000+02:00
Fix KeyError when loading Kilosort 2.5 output with files missing cluster id
diff --git a/src/spikeinterface/extractors/__init__.py b/src/spikeinterface/extractors/__init__.py
@@ -56,6 +56,7 @@ def __getattr__(extractor_name):
                 "Importing classes at __init__ has been deprecated in favor of only importing function-size wrappers "
                 "and will be removed in 0.105.0. For developers that prefer working with the class versions of extractors "
                 "they can be imported from spikeinterface.extractors.extractor_classes"
+                f"For class {reading_function.__name__}"
             )
             warn(dep_msg)
             return reading_function
diff --git a/src/spikeinterface/extractors/phykilosortextractors.py b/src/spikeinterface/extractors/phykilosortextractors.py
@@ -24,6 +24,24 @@ class BasePhyKilosortSortingExtractor(BaseSorting):
         If True, empty units are removed from the sorting extractor.
     load_all_cluster_properties : bool, default: True
         If True, all cluster properties are loaded from the tsv/csv files.
+
+    Notes
+    -----
+    This extractor loads cluster properties from CSV/TSV files to enrich the sorting
+    extractor with unit metadata such as quality labels, groups, and Kilosort metrics.
+
+    Cluster information is loaded in the following priority order:
+    1. From a dedicated cluster_info.csv/.tsv file if present
+    2. From all .csv/.tsv files in the folder that contain a 'cluster_id' column
+       Typical files include cluster_group.tsv, cluster_info.tsv, cluster_KSLabel.tsv
+       Files without a 'cluster_id' column are automatically skipped
+    3. If no files are found, minimal cluster info is generated with 'unsorted' labels
+
+    The cluster_id column is used as the merge key to combine properties from multiple files.
+    All loaded properties are added to the sorting extractor as unit properties, with some
+    renamed for SpikeInterface conventions: 'group' becomes 'quality', 'cluster_id'
+    becomes 'original_cluster_id'. These properties can be accessed via
+    ``sorting.get_property()``.
     """
 
     installation_mesg = (
@@ -84,6 +102,15 @@ def __init__(
                 else:
                     delimiter = ","
                 new_property = pd.read_csv(file, delimiter=delimiter)
+
+                # Only merge files that contain a cluster_id column
+                # This prevents KeyError when extraneous files don't have cluster_id
+                # Typical aggregated files include cluster_group.tsv, cluster_info.tsv, cluster_KSLabel.tsv
+                # See Phy docs: https://phy.readthedocs.io/en/latest/sorting_user_guide/
+                # See: https://github.com/SpikeInterface/spikeinterface/issues/4124
+                if "cluster_id" not in new_property.columns:
+                    continue
+
                 if cluster_info is None:
                     cluster_info = new_property
                 else:

Original file line number	Diff line number	Diff line change
`@@ -56,6 +56,7 @@ def __getattr__(extractor_name):`
`56`	`56`	`"Importing classes at __init__ has been deprecated in favor of only importing function-size wrappers "`
`57`	`57`	`"and will be removed in 0.105.0. For developers that prefer working with the class versions of extractors "`
`58`	`58`	`"they can be imported from spikeinterface.extractors.extractor_classes"`
	`59`	`+ f"For class {reading_function.__name__}"`
`59`	`60`	`)`
`60`	`61`	`warn(dep_msg)`
`61`	`62`	`return reading_function`