Allow a metadata-aware splitter passed as cv_class to be used directly

Copilot · web-flow · commit 457e479cf0a4 · 2026-06-27T20:30:10.000Z
diff --git a/moabb/evaluations/splitters.py b/moabb/evaluations/splitters.py
@@ -28,6 +28,24 @@ def _splitter_metadata(splitter):
     return None
 
 
+def _is_metadata_aware_cv(cv_class):
+    """Return True when ``cv_class`` is a metadata-aware top-level splitter.
+
+    Such a splitter follows the moabb convention: it declares
+    ``metadata_columns`` and its ``split`` takes ``(self, y, metadata)`` instead
+    of the sklearn ``(self, X, y, groups)``; callers then use it directly rather
+    than wrapping it as the inner groups-CV. Detection is name-based: ``split``
+    only needs a ``metadata`` parameter, and an uninspectable ``split`` gives False.
+    """
+    if not hasattr(cv_class, "metadata_columns"):
+        return False
+    try:
+        params = inspect.signature(cv_class.split).parameters
+    except (TypeError, ValueError):
+        return False
+    return "metadata" in params
+
+
 class WithinSessionSplitter(BaseCrossValidator):
     """Data splitter for within session evaluation.
 
@@ -388,6 +406,11 @@ def __init__(
 
         # Detect whether the cv_class uses the groups parameter
         self._cv_uses_groups = issubclass(cv_class, GroupsConsumerMixin)
+        # Detect whether the cv_class is a metadata-aware top-level splitter
+        # (i.e. it follows the moabb convention and declares ``metadata_columns``
+        # and implements ``split(self, y, metadata)``). In that case, it is used
+        # directly instead of being wrapped as the inner groups-CV.
+        self._cv_is_metadata_aware = _is_metadata_aware_cv(cv_class)
         self._last_split_metadata = None
 
     def get_n_splits(self, metadata):
@@ -409,6 +432,8 @@ def get_n_splits(self, metadata):
         n_splits: int
             The number of splits for the cross-validation
         """
+        if self._cv_is_metadata_aware:
+            return self.cv_class(**self._cv_kwargs).get_n_splits(metadata)
         subjects = metadata["subject"].unique()
         n_splits = 0
         for subject in subjects:  # noqa: B007 — referenced via @subject in pandas query below
@@ -429,6 +454,16 @@ def split(self, y, metadata):
         # here, I am getting the index across all the subject
         all_index = metadata.index.values
         self._last_split_metadata = None
+
+        # When the cv_class is a metadata-aware top-level splitter, delegate the
+        # fold creation to it directly, forwarding the full metadata.
+        if self._cv_is_metadata_aware:
+            splitter = self.cv_class(**self._cv_kwargs)
+            for train_idx, test_idx in splitter.split(y, metadata):
+                self._last_split_metadata = _splitter_metadata(splitter)
+                yield train_idx, test_idx
+            return
+
         # I check how many subjects are here:
         subjects = metadata["subject"].unique()
 
@@ -539,6 +574,11 @@ def __init__(
 
         # Detect whether the cv_class uses the groups parameter
         self._cv_uses_groups = issubclass(cv_class, GroupsConsumerMixin)
+        # Detect whether the cv_class is a metadata-aware top-level splitter
+        # (i.e. it follows the moabb convention and declares ``metadata_columns``
+        # and implements ``split(self, y, metadata)``). In that case, it is used
+        # directly instead of being wrapped as the inner groups-CV.
+        self._cv_is_metadata_aware = _is_metadata_aware_cv(cv_class)
         self._last_split_metadata = None
 
     def get_n_splits(self, metadata):
@@ -562,6 +602,8 @@ def get_n_splits(self, metadata):
         """
 
         splitter = self.cv_class(**self._cv_kwargs)
+        if self._cv_is_metadata_aware:
+            return splitter.get_n_splits(metadata)
         get_n_splits_kwargs = {"X": metadata.index}
         if self._cv_uses_groups:
             get_n_splits_kwargs["groups"] = metadata["subject"]
@@ -575,6 +617,14 @@ def split(self, y, metadata):
         splitter = self.cv_class(**self._cv_kwargs)
         self._last_split_metadata = None
 
+        # When the cv_class is a metadata-aware top-level splitter, delegate the
+        # fold creation to it directly, forwarding the full metadata.
+        if self._cv_is_metadata_aware:
+            for train_idx, test_idx in splitter.split(y, metadata):
+                self._last_split_metadata = _splitter_metadata(splitter)
+                yield train_idx, test_idx
+            return
+
         # Only pass groups to cv_classes that actually use them
         # (detected via GroupsConsumerMixin). This avoids the
         # "The groups parameter is ignored" warning from e.g. TimeSeriesSplit.
diff --git a/moabb/tests/test_splits.py b/moabb/tests/test_splits.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 from sklearn.model_selection import (
+    BaseCrossValidator,
     GroupShuffleSplit,
     KFold,
     LeaveOneGroupOut,
@@ -673,3 +674,75 @@ def test_cross_dataset_requires_group_column(data):
     splitter = CrossDatasetSplitter(group_column="does_not_exist")
     with pytest.raises(ValueError):
         list(splitter.split(y, metadata))
+
+
+class _TargetSubjectSplitter(BaseCrossValidator):
+    """Metadata-aware splitter used to test direct ``cv_class`` delegation.
+
+    The test set is restricted to one ``(subject, session)`` and the training
+    set is all the data from the other subjects.
+    """
+
+    metadata_columns = ("subject", "session")  # columns read by split()
+
+    def __init__(self, target=None, test_session=None):
+        self.target = target
+        self.test_session = test_session
+
+    def _iter_test_masks(self, X=None, y=None, groups=None):
+        raise NotImplementedError  # split() below builds its fold without this
+
+    def get_n_splits(self, metadata):
+        return 1  # split() yields a single fold
+
+    def split(self, y, metadata):
+        all_index = metadata.index.values  # folds are yielded as index labels
+        test_mask = (metadata["subject"] == self.target) & (
+            metadata["session"] == self.test_session
+        )
+        train_mask = metadata["subject"] != self.target  # every other subject
+        yield all_index[train_mask.values], all_index[test_mask.values]
+
+
+def test_metadata_aware_cv_class_used_directly(data):
+    """A cv_class declaring ``metadata_columns`` is used as top-level splitter."""
+    _, y, metadata = data
+    target = metadata["subject"].unique()[1]  # second distinct subject
+    test_session = metadata["session"].unique()[0]  # first distinct session
+
+    splitter = CrossSubjectSplitter(
+        cv_class=_TargetSubjectSplitter, target=target, test_session=test_session
+    )
+
+    assert splitter.get_n_splits(metadata) == 1
+
+    splits = list(splitter.split(y, metadata))
+    assert len(splits) == 1
+    train, test = splits[0]
+
+    test_meta = metadata.loc[test]
+    train_meta = metadata.loc[train]
+    # Test rows all belong to the target subject at the requested session.
+    assert set(test_meta["subject"]) == {target}
+    assert set(test_meta["session"]) == {test_session}
+    # Training set excludes the target subject entirely.
+    assert target not in set(train_meta["subject"])
+    assert len(set(train) & set(test)) == 0  # no index label in both folds
+
+
+def test_metadata_aware_cv_class_cross_session(data):
+    """CrossSessionSplitter also delegates to a metadata-aware cv_class."""
+    _, y, metadata = data
+    target = metadata["subject"].unique()[1]  # second distinct subject
+    test_session = metadata["session"].unique()[0]  # first distinct session
+
+    splitter = CrossSessionSplitter(
+        cv_class=_TargetSubjectSplitter, target=target, test_session=test_session
+    )
+
+    assert splitter.get_n_splits(metadata) == 1
+    splits = list(splitter.split(y, metadata))
+    assert len(splits) == 1
+    train, test = splits[0]
+    assert set(metadata.loc[test]["subject"]) == {target}  # only target in test
+    assert len(set(train) & set(test)) == 0  # no index label in both folds