Skip to content

Commit 64e4bdc

Browse files
authored
update docstring for harmony (#647)
* update docstring for harmony * align to scanpy
1 parent 8ec14dc commit 64e4bdc

2 files changed

Lines changed: 72 additions & 46 deletions

File tree

src/rapids_singlecell/preprocessing/_harmony/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,11 @@ def harmonize(
152152
cat_offsets, cell_indices = _create_category_index_mapping(cats, n_batches)
153153

154154
# Set up parameters
155+
if max_iter_harmony < 1:
156+
raise ValueError("max_iter_harmony must be >= 1")
155157
if n_clusters is None:
156158
n_clusters = int(min(100, n_cells / 30))
159+
n_clusters = max(n_clusters, 2)
157160

158161
# TODO: Allow for multiple colsum algorithms in a list
159162
assert colsum_algo in ["columns", "atomics", "gemm", "benchmark", None]

src/rapids_singlecell/preprocessing/_harmony_integrate.py

Lines changed: 69 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,17 @@ def harmony_integrate(
3737
random_state: int = 0,
3838
verbose: bool = False,
3939
) -> None:
40-
"""
41-
Integrate different experiments using the Harmony algorithm :cite:p:`Korsunsky2019,Patikas2026`.
40+
"""Integrate different experiments using the Harmony algorithm :cite:p:`Korsunsky2019,Patikas2026`.
4241
43-
This GPU-accelerated implementation is based on the `harmony-pytorch` package.
44-
As Harmony works by adjusting the
45-
principal components, this function should be run after performing
46-
PCA but before computing the neighbor graph.
42+
This GPU-accelerated implementation is based on the harmony-pytorch package.
43+
As Harmony works by adjusting the principal components,
44+
this function should be run after performing PCA but before computing the neighbor graph.
4745
48-
By default, the Harmony2 algorithm is used, which includes a stabilized
49-
diversity penalty, dynamic per-cluster-per-batch ridge regularization,
50-
and automatic batch pruning. To revert to the original Harmony behavior::
46+
By default, the Harmony2 algorithm is used,
47+
which includes a stabilized diversity penalty,
48+
dynamic per-cluster-per-batch ridge regularization,
49+
and automatic batch pruning.
50+
To revert to the original Harmony behavior::
5151
5252
rsc.pp.harmony_integrate(adata, key, flavor="harmony1")
5353
@@ -56,75 +56,98 @@ def harmony_integrate(
5656
adata
5757
The annotated data matrix.
5858
key
59-
The key(s) of the column(s) in ``adata.obs`` that differentiates
60-
among experiments/batches.
59+
The key(s) of the column(s) in ``adata.obs`` that differentiate(s) among experiments/batches.
60+
When multiple keys are provided, a combined batch variable is created from all columns.
6161
basis
62-
The name of the field in ``adata.obsm`` where the PCA table is
63-
stored. Defaults to ``'X_pca'``, which is the default for
64-
``sc.tl.pca()``.
62+
The name of the field in ``adata.obsm`` where the PCA table is stored.
6563
adjusted_basis
66-
The name of the field in ``adata.obsm`` where the adjusted PCA
67-
table will be stored after running this function. Defaults to
68-
``X_pca_harmony``.
64+
The name of the field in ``adata.obsm`` where the adjusted PCA table will be stored.
6965
dtype
70-
The data type to use for Harmony. If you use 32-bit you may
71-
experience numerical instability.
66+
The data type to use for Harmony computation.
67+
Using 32-bit precision may lead to numerical instability.
7268
flavor
7369
Which version of the Harmony algorithm to use.
7470
``"harmony2"`` (default) enables the stabilized diversity penalty,
75-
dynamic per-cluster-per-batch ridge regularization, and automatic
76-
batch pruning from :cite:p:`Patikas2026`.
77-
``"harmony1"`` uses the original algorithm from
78-
:cite:p:`Korsunsky2019`.
71+
dynamic per-cluster-per-batch ridge regularization,
72+
and automatic batch pruning from :cite:p:`Patikas2026`.
73+
``"harmony1"`` uses the original algorithm from :cite:p:`Korsunsky2019`.
7974
n_clusters
80-
Number of clusters. If ``None``, uses ``min(100, N / 30)``.
75+
Number of clusters used for soft k-means in the Harmony algorithm.
76+
If ``None``, uses ``max(2, int(min(100, N / 30)))``.
77+
More clusters capture finer-grained structure but increase computation time.
8178
max_iter_harmony
82-
Maximum number of Harmony iterations.
79+
Maximum number of outer Harmony iterations
80+
(each consisting of a clustering step followed by a correction step). Must be at least ``1``.
8381
max_iter_clustering
84-
Maximum iterations for the clustering step within each Harmony
85-
iteration.
82+
Maximum iterations for the clustering step within each Harmony iteration.
8683
tol_harmony
8784
Convergence tolerance for the Harmony objective function.
85+
The algorithm stops when the relative change in objective falls below this value.
8886
tol_clustering
89-
Convergence tolerance for the clustering step.
87+
Convergence tolerance for the clustering step within each
88+
Harmony iteration.
9089
sigma
91-
Weight of the entropy term in the objective function.
90+
Width of the soft-clustering kernel.
91+
Controls the entropy of cluster assignments:
92+
smaller values produce harder assignments (cells assigned to fewer clusters),
93+
while larger values produce softer assignments (cells spread across more clusters).
9294
theta
93-
Weight of the diversity penalty term in the objective function.
95+
Diversity penalty weight per batch variable.
96+
Controls how strongly Harmony encourages each cluster
97+
to contain a balanced representation of all batches.
98+
Higher values (e.g. ``4``) produce more aggressive mixing;
99+
lower values (e.g. ``0.5``) allow more batch-specific clusters.
100+
Set to ``0`` to disable batch correction entirely.
101+
A list can be provided to set different weights per batch variable.
94102
tau
95-
Discounting factor on ``theta``. By default, there is no
96-
discounting.
103+
Discounting factor on ``theta``.
104+
When ``tau > 0``,
105+
the diversity penalty is down-weighted for batches with fewer cells,
106+
preventing over-correction of small batches.
107+
By default (``0``), there is no discounting.
97108
ridge_lambda
98-
Ridge regression hyperparameter for the correction step.
109+
Ridge regression regularization for the correction step.
110+
Larger values produce more conservative (smaller) corrections,
111+
preventing over-fitting.
99112
Only used with ``flavor="harmony1"``.
100113
alpha
101-
Scaling factor for dynamic lambda. Only used with
102-
``flavor="harmony2"``.
114+
Scaling factor for the dynamic per-cluster-per-batch ridge regularization.
115+
The effective regularization for each cluster-batch pair is ``alpha * E_kb``
116+
where ``E_kb`` is the expected number of cells.
117+
Larger values produce more conservative corrections.
118+
Only used with ``flavor="harmony2"``.
103119
batch_prune_threshold
104-
Fraction threshold below which a batch–cluster pair is pruned
105-
(correction suppressed). Only used with ``flavor="harmony2"``.
120+
Fraction threshold below which a batch-cluster pair is pruned (correction suppressed).
121+
When the fraction of a batch's cells assigned to a cluster (``O_kb / N_b``) falls below this threshold,
122+
that batch-cluster pair receives no correction, preventing spurious adjustments.
123+
Only used with ``flavor="harmony2"``.
106124
Set to ``None`` to disable pruning.
107125
correction_method
108-
Method for the correction step: ``"original"``, ``"fast"``, or
109-
``"batched"`` (fastest, more memory). If ``None`` (default),
110-
automatically selects ``"batched"`` unless the workspace would
111-
exceed 1 GB, in which case ``"fast"`` is used.
126+
Method for the correction step.
127+
``"original"`` uses per-cluster ridge regression with explicit matrix inversion.
128+
``"fast"`` uses a precomputed factorization that avoids the full inversion,
129+
which can be faster for datasets with many batches.
130+
``"batched"`` processes all clusters simultaneously (fastest but requires more memory).
131+
If ``None`` (default), automatically selects ``"batched"`` unless
132+
the workspace would exceed 1 GB, in which case ``"fast"`` is used.
112133
colsum_algo
113-
Algorithm for column sums. If ``None``, chosen automatically.
134+
Algorithm for column sums.
135+
If ``None``, chosen automatically.
114136
If ``"benchmark"``, benchmarks all algorithms.
115137
block_proportion
116138
Proportion of cells updated per clustering sub-iteration.
139+
Smaller values produce more stochastic updates.
140+
Larger values are faster but may converge to different solutions.
117141
random_state
118142
Random seed for reproducibility.
119143
verbose
120144
Whether to print benchmarking and convergence information.
121145
122146
Returns
123147
-------
124-
Updates adata with the field ``adata.obsm[adjusted_basis]``, \
125-
containing principal components adjusted by Harmony such that \
126-
different experiments are integrated.
127-
148+
Updates adata with the field ``adata.obsm[adjusted_basis]``,
149+
containing principal components adjusted by Harmony
150+
such that different experiments are integrated.
128151
"""
129152
from ._harmony import harmonize
130153

0 commit comments

Comments
 (0)