perf: pre-allocate output in _apply_sos / _parallel_reduce_axis1

galenlynch · claude · galenlynch · commit 5ba4ec33e9a8 · 2026-04-28T19:49:28.000-07:00
The previous dispatch had each parallel worker return ``(c0, c1, block)``
tuples; the calling thread then allocated the output array and copied each
block into place.  That post-collection allocate-and-copy is wasted work
since the channel/time slices are non-overlapping — workers can write
directly into a pre-allocated output.

Measured on a (30000, 384) float32 chunk with sosfiltfilt and
n_workers=5:

  pattern                              wall (ms)   speedup
  E. sequential                          173.89      1.00×
  A. submit + collect + alloc + copy      75.66      2.30×   (before this PR)
  B. pre-alloc, write in place            60.51      2.87×   (this PR)
  C. pool.map, write in place             63.55      2.74×
  D. manual threading.Thread              64.76      2.69×

So dropping the redundant copy saves ~15 ms of wall time per `_apply_sos`
call (75.66 ms → 60.51 ms), and likewise for `_parallel_reduce_axis1`.
Ideal 5× scaling would be 34.78 ms (173.89 ms / 5); the remaining gap to
ideal comes from the GIL-held Python wrapper inside scipy's sosfiltfilt.
The dispatch pattern doesn't matter for that gap (B/C/D are all within
noise of each other), so we keep the simpler submit/result form.

Same pattern applied to common_reference._parallel_reduce_axis1.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/spikeinterface/preprocessing/common_reference.py b/src/spikeinterface/preprocessing/common_reference.py
@@ -244,6 +244,9 @@ def _parallel_reduce_axis1(self, traces):
         numpy's partition-based median and BLAS-backed mean release the GIL
         during per-row work, so Python-thread parallelism delivers real
         speedup (measured ~10× on 16 threads for 1M × 384 median).
+
+        Workers write directly into a pre-allocated output array — see
+        FilterRecordingSegment._apply_sos for the same pattern.
         """
         if self.n_workers == 1:
             return self.operator_func(traces, axis=1)
@@ -258,15 +261,17 @@ def _parallel_reduce_axis1(self, traces):
         block = (T + effective - 1) // effective
         bounds = [(t0, min(t0 + block, T)) for t0 in range(0, T, block)]
 
+        # Probe dtype: median/mean of a 1×C row gives the same dtype as the
+        # full reduction.
+        out_dtype = self.operator_func(traces[:1, :], axis=1).dtype
+        out = np.empty(T, dtype=out_dtype)
+
         def _work(t0, t1):
-            return t0, t1, self.operator_func(traces[t0:t1, :], axis=1)
+            out[t0:t1] = self.operator_func(traces[t0:t1, :], axis=1)
 
         futures = [pool.submit(_work, t0, t1) for t0, t1 in bounds]
-        results = [fut.result() for fut in futures]
-        out_dtype = results[0][2].dtype
-        out = np.empty(T, dtype=out_dtype)
-        for t0, t1, block_out in results:
-            out[t0:t1] = block_out
+        for fut in futures:
+            fut.result()
         return out
 
     def get_traces(self, start_frame, end_frame, channel_indices):
diff --git a/src/spikeinterface/preprocessing/filter.py b/src/spikeinterface/preprocessing/filter.py
@@ -241,6 +241,12 @@ def _apply_sos(self, fn, traces, axis=0):
         implementations of ``sosfiltfilt``/``sosfilt`` release the GIL during
         per-column work, so Python-thread parallelism delivers real speedup
         (measured ~3× on 8 threads for a 1M × 384 float32 chunk).
+
+        Workers write directly into a pre-allocated output array — eliminating
+        the per-block tuple return + post-loop allocate-and-copy that adds
+        ~15 ms of wall time per call on a (30k, 384) float32 chunk.  Each
+        block writes into a non-overlapping channel slice, so concurrent
+        writes are safe.
         """
         if self.n_workers == 1:
             return fn(self.coeff, traces, axis=axis)
@@ -251,17 +257,18 @@ def _apply_sos(self, fn, traces, axis=0):
         block = (C + self.n_workers - 1) // self.n_workers
         bounds = [(c0, min(c0 + block, C)) for c0 in range(0, C, block)]
 
+        # Probe the output dtype on a tiny slice (longer than scipy's internal
+        # padlen of 6 * len(sos)) so we can pre-allocate.  Cost: microseconds.
+        probe_len = max(64, 6 * self.coeff.shape[0] + 1)
+        out_dtype = fn(self.coeff, traces[:probe_len, :1], axis=axis).dtype
+        out = np.empty((traces.shape[0], C), dtype=out_dtype)
+
         def _work(c0, c1):
-            return c0, c1, fn(self.coeff, traces[:, c0:c1], axis=axis)
+            out[:, c0:c1] = fn(self.coeff, traces[:, c0:c1], axis=axis)
 
         futures = [pool.submit(_work, c0, c1) for c0, c1 in bounds]
-        results = [fut.result() for fut in futures]
-        # Allocate the output using the first block's dtype (scipy may promote
-        # int input to float64).
-        out_dtype = results[0][2].dtype
-        out = np.empty((traces.shape[0], C), dtype=out_dtype)
-        for c0, c1, block_out in results:
-            out[:, c0:c1] = block_out
+        for fut in futures:
+            fut.result()
         return out
 
     def get_traces(self, start_frame, end_frame, channel_indices):