docs(sync): enrich GPU metric and kernel docstrings (#278)

Ramdam17 · claude · web-flow · commit 3deb4d8aac22 · 2026-06-09T23:53:25.000-04:00
Restore the richer module docstrings (intent + literature references) for the
9 sync metrics, base.py, and the package __init__; expand the CUDA/Metal kernel
module docstrings (kernel list, fp64-on-A100 / fp32-on-Metal rationale, why the
pairwise dispatch avoids OOM); restore the _VRAM_THRESHOLD attribute docstring
and enrich run_pairwise_kernel. mkdocstrings cross-refs are flattened to plain
text for tooltip legibility; no reference to non-existent docs files.

Docstrings only — no behavior change. sync parity tests pass (74 passed /
9 CuPy skipped on M4 Max).

Co-authored-by: Claude Fable 5 <noreply@anthropic.com>
diff --git a/hypyp/sync/__init__.py b/hypyp/sync/__init__.py
@@ -4,8 +4,41 @@
 """
 Synchrony and connectivity metrics for hyperscanning analysis.
 
-This module provides a collection of connectivity metrics that can be used
-to measure neural synchronization between participants.
+Public API
+----------
+``BaseMetric``
+    Abstract base class. Concrete metrics inherit from it and implement
+    ``BaseMetric.compute``.
+
+Concrete metric classes (one per file):
+
+- ``PLV`` (``hypyp.sync.plv``) — Phase Locking Value.
+- ``CCorr`` (``hypyp.sync.ccorr``) — Circular Correlation.
+- ``ACCorr`` (``hypyp.sync.accorr``) — Adjusted Circular Correlation.
+- ``Coh`` (``hypyp.sync.coh``) — Coherence.
+- ``ImCoh`` (``hypyp.sync.imaginary_coh``) — Imaginary Coherence.
+- ``PLI`` (``hypyp.sync.pli``) — Phase Lag Index.
+- ``WPLI`` (``hypyp.sync.wpli``) — Weighted Phase Lag Index.
+- ``EnvCorr`` (``hypyp.sync.envelope_corr``) — Envelope Correlation.
+- ``PowCorr`` (``hypyp.sync.pow_corr``) — Power Correlation.
+
+Helpers
+-------
+``multiply_conjugate``, ``multiply_conjugate_time``,
+``multiply_product``, ``multiply_conjugate_torch``,
+``multiply_conjugate_time_torch``
+    Einsum building blocks shared across the metric implementations.
+
+Dispatcher
+----------
+``METRICS``
+    Dict mapping mode strings to metric classes.
+``get_metric``
+    Lookup helper used by ``hypyp.eeg.analyses.compute_sync`` and
+    ``hypyp.eeg.analyses.pair_connectivity``.
+
+Per-metric mathematical details and references live in each metric
+class's docstring.
 """
 
 from typing import Optional
diff --git a/hypyp/sync/accorr.py b/hypyp/sync/accorr.py
@@ -4,11 +4,22 @@
 """
 Adjusted Circular Correlation (ACCorr) connectivity metric.
 
-ACCorr computes the circular correlation between two phase time-series with
-per-pair phase centering, providing a more accurate inter-brain synchrony
-estimate than standard circular correlation (ccorr).
-
-Reference: Zimmermann et al. (2024). *Imaging Neuroscience*, 2.
+ACCorr computes the circular correlation between two phase time-series
+with **per-pair** phase centering, providing a more accurate inter-brain
+synchrony estimate than standard circular correlation (ccorr).
+
+See the ``ACCorr`` class for the public API; it supports a CPU
+``precompute`` strategy (vectorised numerator + loop denominator with
+pre-computed per-pair adjustments), a numba JIT backend, and PyTorch
+GPU/MPS backends. The torch implementation switches between a fully
+vectorised 5-D broadcast and a per-pair loop based on
+``ACCorr._VRAM_THRESHOLD`` — see that attribute's docstring.
+
+References
+----------
+Zimmermann, M., Schultz-Nielsen, K., Dumas, G., & Konvalinka, I. (2024).
+Arbitrary methodological decisions skew inter-brain synchronization
+estimates in hyperscanning-EEG studies. *Imaging Neuroscience*, 2.
 https://doi.org/10.1162/imag_a_00350
 
 Credits
@@ -211,9 +222,24 @@ def _compute_numba(self, complex_signal: np.ndarray, n_samp: int,
 
         return con
 
-    # Memory threshold for vectorized denominator (bytes). If the 5D tensor
-    # (E, F, C, C, T) would exceed this, fall back to the loop-based approach.
     _VRAM_THRESHOLD = 2 * 1024**3  # 2 GB
+    """
+    Memory threshold (bytes) for the vectorised torch denominator path.
+
+    Notes
+    -----
+    The torch implementation prefers a fully-vectorised broadcast that builds
+    an intermediate 5-D tensor of shape ``(n_epochs, n_freq, n_channels,
+    n_channels, n_samples)`` for the per-pair phase centering. When the
+    estimated tensor size in bytes exceeds this threshold, ``_compute_torch``
+    falls back to a per-pair loop on the same device (CPU / MPS / CUDA).
+
+    The 2 GB default is sized to keep one such tensor comfortably under
+    Apple-Silicon MPS and Quadro-class GPU memory budgets when the rest of
+    the pipeline (data tensors, kernel state) is already resident — a 4 GB
+    threshold can OOM on high-channel-count realistic_hd benchmarks. This
+    value is empirical; re-derive if you change the upstream tensor layout.
+    """
 
     def _compute_torch(self, complex_signal: np.ndarray, n_samp: int,
                        transpose_axes: tuple) -> np.ndarray:
diff --git a/hypyp/sync/base.py b/hypyp/sync/base.py
@@ -2,13 +2,44 @@
 # coding=utf-8
 
 """
-Base classes and helper functions for connectivity metrics.
-
-| Option | Description |
-| ------ | ----------- |
-| title           | base.py |
-| authors         | HyPyP Team |
-| date            | 2026-01-30 |
+Base classes, einsum helpers, optional-dependency probing, and the
+AUTO_PRIORITY benchmark dispatch table for the connectivity metrics.
+
+This module is shared by every concrete metric in ``hypyp.sync``. It
+exposes:
+
+- ``BaseMetric`` — abstract base. Concrete metrics override
+  ``BaseMetric.compute`` and rely on the shared backend-resolution and
+  warning-fallback logic.
+- ``multiply_conjugate``, ``multiply_conjugate_time``,
+  ``multiply_product`` — vectorised einsum kernels (numpy).
+- ``multiply_conjugate_torch``, ``multiply_conjugate_time_torch`` —
+  torch equivalents (only resolvable if torch is installed).
+- ``AUTO_PRIORITY`` — benchmark-driven backend lookup table of the
+  form ``{metric_name: {platform: [gpu_backend, fallback]}}``.
+- Capability flags ``TORCH_AVAILABLE``, ``MPS_AVAILABLE``,
+  ``CUDA_AVAILABLE``, ``NUMBA_AVAILABLE``, ``METAL_AVAILABLE``,
+  ``CUPY_AVAILABLE`` — probed at import time so concrete metric classes
+  don't have to retry.
+
+Design note
+-----------
+``AUTO_PRIORITY`` is intentionally kept as a Python dict (not
+externalised to a YAML file) for three reasons:
+
+1. The values are not user-tunable — they are derived from benchmarks
+   on Mac M4 Max (131 rows) and Narval A100 (111 rows) and need
+   re-derivation if the kernels change. Putting them in YAML would
+   wrongly suggest they are configuration knobs.
+2. The per-call ``priority=`` kwarg on ``get_metric`` already provides
+   the override path users actually need.
+3. The table is short (9 entries) and sits next to its rationale
+   comment block; a YAML file would split the explanation from the
+   data.
+
+If a future benchmark sweep changes the optimal backend, the change
+should be a code edit (with a tests/benchmarks update) — not a config
+change.
 """
 
 import warnings
diff --git a/hypyp/sync/ccorr.py b/hypyp/sync/ccorr.py
@@ -3,6 +3,15 @@
 
 """
 Circular Correlation (CCorr) connectivity metric.
+
+CCorr is the circular analogue of Pearson's r: it measures the linear
+correlation between the sines of phase deviations from a global
+circular mean. See the ``CCorr`` class for the public API.
+
+References
+----------
+Fisher, N. I. (1995). *Statistical Analysis of Circular Data*.
+Cambridge University Press.
 """
 
 import numpy as np
diff --git a/hypyp/sync/coh.py b/hypyp/sync/coh.py
@@ -3,6 +3,16 @@
 
 """
 Coherence (Coh) connectivity metric.
+
+Coh measures the linear relationship between two complex analytic
+signals in the frequency domain — it is the squared modulus of the
+cross-spectrum normalised by the product of the auto-spectra. See the
+``Coh`` class for the public API.
+
+References
+----------
+Nunez, P. L., & Srinivasan, R. (2006). *Electric Fields of the Brain:
+The Neurophysics of EEG* (2nd ed.). Oxford University Press.
 """
 
 import numpy as np
diff --git a/hypyp/sync/envelope_corr.py b/hypyp/sync/envelope_corr.py
@@ -3,6 +3,16 @@
 
 """
 Envelope Correlation (EnvCorr) connectivity metric.
+
+EnvCorr is the Pearson correlation between the analytic-signal
+amplitude (envelope) of two channels — it captures slow co-modulation
+of band-limited power. See the ``EnvCorr`` class for the public API.
+
+References
+----------
+Hipp, J. F., Hawellek, D. J., Corbetta, M., Siegel, M., & Engel, A. K.
+(2012). Large-scale cortical correlation structure of spontaneous
+oscillatory activity. *Nature Neuroscience*, 15(6), 884-890.
 """
 
 import numpy as np
diff --git a/hypyp/sync/imaginary_coh.py b/hypyp/sync/imaginary_coh.py
@@ -3,6 +3,17 @@
 
 """
 Imaginary Coherence (ImCoh) connectivity metric.
+
+ImCoh isolates the imaginary part of the cross-spectrum normalised by
+the auto-spectra, which makes it insensitive to zero-lag interactions
+(volume conduction). See the ``ImCoh`` class for the public API.
+
+References
+----------
+Nolte, G., Bai, O., Wheaton, L., Mari, Z., Vorbach, S., & Hallett, M.
+(2004). Identifying true brain interaction from EEG data using the
+imaginary part of coherency. *Clinical Neurophysiology*, 115(10),
+2292-2307.
 """
 
 import numpy as np
diff --git a/hypyp/sync/kernels/_cuda_dispatch.py b/hypyp/sync/kernels/_cuda_dispatch.py
@@ -1,8 +1,13 @@
 """
 Shared CUDA dispatch logic for all pairwise sync metric kernels.
 
-Uses CuPy RawKernel for inline CUDA source. All kernels use float64
-for exact precision (A100 has 9.7 TFLOPS fp64).
+Provides ``run_pairwise_kernel``, the common launch + memory-management
+routine used by every metric-specific CUDA kernel module
+(``cuda_phase``, ``cuda_amplitude``, ``cuda_accorr``).
+
+Uses CuPy ``RawKernel`` for inline CUDA source. All kernels run in
+float64 — the NVIDIA A100 reference target has 9.7 TFLOPS of fp64
+throughput, so the precision/speed trade-off favours fp64 there.
 """
 
 import numpy as np
@@ -17,14 +22,28 @@ def run_pairwise_kernel(complex_signal, get_kernel_fn):
     """
     Shared dispatch for pairwise CUDA kernels.
 
+    Builds the upper-triangle channel-pair index list, transfers the real
+    and imaginary parts to the device as float64, launches one thread per
+    ``(epoch*freq, pair)`` tuple, and reads the result back. Computing
+    pairwise — rather than materialising the full ``(E, F, C, C, T)``
+    cross-spectrum — keeps device memory bounded at high channel counts.
+
     Parameters
     ----------
     complex_signal : np.ndarray, shape (E, F, C, T)
-    get_kernel_fn : callable -> CuPy RawKernel
+        Complex analytic signals (epochs, freqs, channels, samples).
+    get_kernel_fn : callable -> cupy.RawKernel
+        Lazily compiles (and caches) the metric-specific CUDA kernel.
 
     Returns
     -------
     np.ndarray, shape (E, F, C, C), float64
+        Connectivity matrix per (epoch, freq).
+
+    Notes
+    -----
+    The CuPy default memory pool is explicitly freed before returning, so
+    repeated calls in a tight loop don't accumulate device allocations.
     """
     kernel = get_kernel_fn()
 
diff --git a/hypyp/sync/kernels/cuda_accorr.py b/hypyp/sync/kernels/cuda_accorr.py
@@ -1,9 +1,11 @@
 """
 CUDA kernel for ACCorr (Adjusted Circular Correlation).
-Float64 for exact precision on NVIDIA GPUs.
 
-ACCorr requires a custom dispatch (not run_pairwise_kernel) because
-it needs an extra angle buffer for the sin^2 denominator in pass 2.
+ACCorr requires a **custom** dispatch rather than the shared
+``_cuda_dispatch.run_pairwise_kernel`` because it needs an extra angle
+buffer for the ``sin²`` adjusted-phase denominator in pass 2. Float64
+throughout for exact precision on NVIDIA GPUs (the A100 reference
+target has 9.7 TFLOPS of fp64).
 """
 
 import numpy as np
diff --git a/hypyp/sync/kernels/cuda_amplitude.py b/hypyp/sync/kernels/cuda_amplitude.py
@@ -1,6 +1,16 @@
 """
-CUDA kernels for amplitude-based sync metrics: Coh, ImCoh, EnvCorr, PowCorr.
-All float64 for exact precision on NVIDIA GPUs.
+CUDA kernels for amplitude-based sync metrics.
+
+Implements:
+
+- ``coh_cuda`` — magnitude-squared Coherence.
+- ``imcoh_cuda`` — Imaginary Coherence.
+- ``envcorr_cuda`` — Envelope Correlation.
+- ``powcorr_cuda`` — Power Correlation.
+
+All kernels run in float64 for exact precision on NVIDIA GPUs. They
+share the pair-iteration scaffolding from
+``_cuda_dispatch.run_pairwise_kernel``.
 """
 
 import numpy as np
diff --git a/hypyp/sync/kernels/cuda_phase.py b/hypyp/sync/kernels/cuda_phase.py
@@ -1,6 +1,16 @@
 """
-CUDA kernels for phase-based sync metrics: PLI, wPLI, PLV, CCorr.
-All float64 for exact precision on NVIDIA GPUs.
+CUDA kernels for phase-based sync metrics.
+
+Implements:
+
+- ``pli_cuda`` — Phase Lag Index.
+- ``wpli_cuda`` — Weighted Phase Lag Index.
+- ``plv_cuda`` — Phase Locking Value.
+- ``ccorr_cuda`` — Circular Correlation.
+
+All kernels run in float64 for exact precision on NVIDIA GPUs. They
+share the pair-iteration scaffolding from
+``_cuda_dispatch.run_pairwise_kernel``.
 """
 
 import numpy as np
diff --git a/hypyp/sync/kernels/metal_phase.py b/hypyp/sync/kernels/metal_phase.py
@@ -1,9 +1,18 @@
 """
-Metal kernels for sign-based sync metrics: PLI, wPLI.
+Metal kernels for sign-based sync metrics on Apple Silicon.
 
-These metrics work on the imaginary part of the cross-spectrum and
-cannot be efficiently expressed as batched einsum/BLAS operations,
-making custom kernels faster than torch on Apple Silicon.
+Implements:
+
+- ``pli_metal`` — Phase Lag Index.
+- ``wpli_metal`` — Weighted Phase Lag Index.
+
+These metrics operate on the imaginary part of the cross-spectrum and
+cannot be efficiently expressed as batched einsum / BLAS operations
+(the ``sign()`` step is not vectorisable on MPS), which makes custom
+Metal compute shaders faster than torch-on-MPS at every channel count.
+The shared launch routine ``_metal_dispatch.run_pairwise_kernel``
+handles buffer allocation, command-encoder lifecycle, and dispatch
+geometry.
 """
 
 from functools import lru_cache
diff --git a/hypyp/sync/pli.py b/hypyp/sync/pli.py
@@ -3,6 +3,18 @@
 
 """
 Phase Lag Index (PLI) connectivity metric.
+
+PLI is a sign-based phase synchrony measure that is robust against
+zero-lag couplings (volume conduction) — it captures the asymmetry of
+the imaginary-cross-spectrum sign distribution. See the ``PLI`` class
+for the public API.
+
+References
+----------
+Stam, C. J., Nolte, G., & Daffertshofer, A. (2007). Phase lag index:
+assessment of functional connectivity from multi channel EEG and MEG
+with diminished bias from common sources. *Human Brain Mapping*,
+28(11), 1178-1193.
 """
 
 import numpy as np
diff --git a/hypyp/sync/plv.py b/hypyp/sync/plv.py
@@ -3,6 +3,16 @@
 
 """
 Phase Locking Value (PLV) connectivity metric.
+
+PLV measures the consistency of phase differences between two complex
+analytic signals across time, regardless of amplitude. See the ``PLV``
+class for the public API and references.
+
+References
+----------
+Lachaux, J. P., Rodriguez, E., Martinerie, J., & Varela, F. J. (1999).
+Measuring phase synchrony in brain signals. *Human Brain Mapping*,
+8(4), 194-208.
 """
 
 import numpy as np
diff --git a/hypyp/sync/pow_corr.py b/hypyp/sync/pow_corr.py
@@ -3,6 +3,18 @@
 
 """
 Power Correlation (PowCorr) connectivity metric.
+
+PowCorr is the Pearson correlation between the **squared** envelope
+(instantaneous power) of two channels — closely related to envelope
+correlation (``EnvCorr``) but applied to power instead of amplitude.
+See the ``PowCorr`` class for the public API.
+
+Notes
+-----
+Power correlation is a standard companion to envelope correlation in
+resting-state hyperscanning analyses; the two often agree
+qualitatively but differ in their sensitivity to outlier amplitude
+bursts (the squaring in PowCorr amplifies them).
 """
 
 import numpy as np
diff --git a/hypyp/sync/wpli.py b/hypyp/sync/wpli.py