PCA flip for volumetric data now uses randomized SVD so that the SVD can be computed at all

Shrecki · Shrecki · commit 69937d2c1877 · 2025-05-16T11:33:39.000+02:00
diff --git a/mne/source_estimate.py b/mne/source_estimate.py
@@ -19,7 +19,7 @@
 from .cov import Covariance
 from .evoked import _get_peak
 from .filter import FilterMixin, _check_fun, resample
-from .fixes import _eye_array, _safe_svd
+from .fixes import _eye_array
 from .parallel import parallel_func
 from .source_space._source_space import (
     SourceSpaces,
@@ -3375,27 +3375,27 @@ def _get_ico_tris(grade, verbose=None, return_surf=False):
         return ico
 
 
+def _compute_pca_quantities(U, s, V, flip):
+    if isinstance(flip, int):
+        sign = np.sign((flip * U[:, 0]).sum())
+    else:
+        sign = np.sign(np.dot(U[:, 0], flip))
+    scale = np.linalg.norm(s) / np.sqrt(len(U))
+    result = sign * scale * V[0]
+    return result
+
+
 def _pca_flip(flip, data):
     result = None
     if flip is None:  # Case of volumetric data: flip is meaningless
         flip = 1
     if data.shape[0] < 2:
         result = data.mean(axis=0)  # Trivial accumulator
     else:
+        U, s, V = np.linalg.svd(data, full_matrices=False)
         # determine sign-flip.
         # if flip is a mere int, multiply U and sum
-        if isinstance(flip, int):
-            # We assume here that flip is thus denoting a volumetric.
-            # It means LAPACK is likely to overflow on big matrices => We use numpy
-            U, s, V = np.linalg.svd(data, full_matrices=False)
-
-            sign = np.sign((flip * U[:, 0]).sum())
-        else:
-            U, s, V = _safe_svd(data, full_matrices=False)
-            sign = np.sign(np.dot(U[:, 0], flip))
-        # use average power in label for scaling
-        scale = np.linalg.norm(s) / np.sqrt(len(data))
-        result = sign * scale * V[0]
+        result = _compute_pca_quantities(U, s, V, flip)
     return result
 
 
@@ -3678,6 +3678,7 @@ def _gen_extract_label_time_course(
     allow_empty=False,
     mri_resolution=True,
     verbose=None,
+    max_channels=400,
 ):
     # loop through source estimates and extract time series
     if src is None and mode in ["mean", "max"]:
@@ -3741,17 +3742,39 @@ def _gen_extract_label_time_course(
         else:
             # For other modes, initialize the label_tc array
             label_tc = np.zeros((n_labels,) + stc.data.shape[1:], dtype=stc.data.dtype)
+        pca_volumetric = kind == "volume" and mode == "pca_flip"
+        if pca_volumetric:
+            # Precompute randomized SVD on data
+            # Components are restricted to max_channels, which is the highest possible
+            # rank and is much smaller than the number of sources
+            from sklearn.utils.extmath import randomized_svd
+
+            u_data, s_data, vh_data = randomized_svd(
+                stc.data, n_components=max_channels
+            )
         for i, (vertidx, flip) in enumerate(zip(label_vertidx, src_flip)):
             if vertidx is not None:
-                if isinstance(vertidx, sparse.csr_array):
-                    assert mri_resolution
-                    assert vertidx.shape[1] == stc.data.shape[0]
-                    this_data = np.reshape(stc.data, (stc.data.shape[0], -1))
-                    this_data = vertidx @ this_data
-                    this_data.shape = (this_data.shape[0],) + stc.data.shape[1:]
+                if pca_volumetric:
+                    # Compute the SVD of vertidx; combined with the precomputed SVD
+                    # of the data, it yields the SVD of vertidx @ data implicitly.
+                    u_vert, s_vert, vh_Vert = np.linalg.svd(vertidx.todense())
+                    center_prod = np.diag(s_vert) @ vh_Vert @ u_data @ np.diag(s_data)
+                    u_s, s_s, vh_s = np.linalg.svd(center_prod)
+                    U = u_vert @ u_s
+                    s = s_s
+                    V = vh_s @ vh_data
+                    label_tc[i] = _compute_pca_quantities(U, s, V, flip)
                 else:
-                    this_data = stc.data[vertidx]
-                label_tc[i] = func(flip, this_data)
+                    if isinstance(vertidx, sparse.csr_array):
+                        assert mri_resolution
+                        assert vertidx.shape[1] == stc.data.shape[0]
+                        this_data = np.reshape(stc.data, (stc.data.shape[0], -1))
+
+                        this_data = vertidx @ this_data
+                        this_data.shape = (this_data.shape[0],) + stc.data.shape[1:]
+                    else:
+                        this_data = stc.data[vertidx]
+                    label_tc[i] = func(flip, this_data)
         if mode is not None:
             offset = nvert[:-n_mean].sum()  # effectively :2 or :0
             for i, nv in enumerate(nvert[2:]):