From ca258a5c895240852cf9d41845f51a749ad16f0e Mon Sep 17 00:00:00 2001 From: Ilia Kats Date: Thu, 14 Aug 2025 14:08:46 +0200 Subject: [PATCH 1/3] DSB: fix float overflow for large datasets (closes #130) --- muon/_prot/preproc.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/muon/_prot/preproc.py b/muon/_prot/preproc.py index 5f03b90..8907574 100644 --- a/muon/_prot/preproc.py +++ b/muon/_prot/preproc.py @@ -153,10 +153,15 @@ def dsb( else np.log(cells.X.toarray() + pseudocount) ) - cells_scaled = (cells_scaled - empty_scaled.mean(axis=0)) / empty_scaled.std(axis=0) + need_castback = cells_scaled.dtype.kind == "f" + cells_scaled = (cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64)) / empty_scaled.std( + axis=0, dtype=np.float64 + ) + if need_castback: + cells_scaled = cells_scaled.astype(cells_scaled.dtype, copy=False) if denoise_counts: - bgmeans = np.empty(cells_scaled.shape[0], np.float32) + bgmeans = np.empty(cells_scaled.shape[0], cells_scaled.dtype) # init_params needs to be random, otherwise fitted variance for one of the n_components # sometimes goes to 0 sharedvar = GaussianMixture( From c948690d0269364344a27f7dd572d41b3c5cb8f5 Mon Sep 17 00:00:00 2001 From: Ilia Kats Date: Thu, 14 Aug 2025 15:09:42 +0200 Subject: [PATCH 2/3] DSB: use ddof=1 in std calculation to match the R behavior (closes #131) --- muon/_prot/preproc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/muon/_prot/preproc.py b/muon/_prot/preproc.py index 8907574..78955a2 100644 --- a/muon/_prot/preproc.py +++ b/muon/_prot/preproc.py @@ -155,7 +155,7 @@ def dsb( need_castback = cells_scaled.dtype.kind == "f" cells_scaled = (cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64)) / empty_scaled.std( - axis=0, dtype=np.float64 + axis=0, ddof=1, dtype=np.float64 ) if need_castback: cells_scaled = cells_scaled.astype(cells_scaled.dtype, copy=False) From 48a4ad280f128deb0c479043b77fa9cb0e1f051e Mon Sep 17 00:00:00 2001 From: Ilia Kats Date: Thu, 14 Aug 2025 15:44:26 +0200 Subject: [PATCH 3/3] DSB: implement scale_factor and quantile_clipping options (closes #42) --- muon/_prot/preproc.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/muon/_prot/preproc.py b/muon/_prot/preproc.py index 78955a2..74bd24b 100644 --- a/muon/_prot/preproc.py +++ b/muon/_prot/preproc.py @@ -1,4 +1,4 @@ -from typing import Optional, Iterable, Tuple, Union +from typing import Optional, Iterable, Tuple, Union, Literal from numbers import Integral, Real from warnings import warn @@ -21,6 +21,9 @@ def dsb( isotype_controls: Optional[Iterable[str]] = None, empty_counts_range: Optional[Tuple[Real, Real]] = None, cell_counts_range: Optional[Tuple[Real, Real]] = None, + scale_factor: Literal["standardize", "mean_subtract"] = "standardize", + quantile_clipping: bool = False, + quantile_clip: Tuple[float, float] = (0.001, 0.9995), add_layer: bool = False, random_state: Optional[Union[int, np.random.RandomState, None]] = None, ) -> Union[None, MuData]: @@ -47,6 +50,11 @@ def dsb( this specifies the minimum and maximum log10-counts for a droplet to be considered empty. cell_counts_range: If ``data_raw`` is ``None``, i.e. ``data`` contains the unfiltered data, this specifies the minimum and maximum log10-counts for a droplet to be considered not empty. + scale_factor: How to perform protein level denoising. `standardize` corresponds to the method + described as Step I in the paper, i.e. subtracting the mean and dividing by the standard + deviation. `mean_subtract` subtracts the mean without dividing by the standard deviation. + quantile_clipping: Whether to clip the final result. Useful if outliers are observied. + quantile_clip: The quantiles to clip outlier values to. Only used if `quantile_clipping=True`. add_layer: Whether to add a ``'dsb'`` layer instead of assigning to the X matrix. random_state: Random seed. @@ -102,6 +110,13 @@ def dsb( if pseudocount < 0: raise ValueError("pseudocount cannot be negative") + if quantile_clipping: + if len(quantile_clip) != 2: + raise ValueError("quantile_clip must have exactly 2 values") + quantile_clip = np.asarray(quantile_clip) + if np.any((quantile_clip < 0) | (quantile_clip > 1)): + raise ValueError("quantile_clip must be between 0 and 1") + if cells.shape[1] != empty.shape[1]: # this should only be possible if mudata_raw != None raise ValueError("data and data_raw have different numbers of proteins") @@ -153,12 +168,12 @@ def dsb( else np.log(cells.X.toarray() + pseudocount) ) - need_castback = cells_scaled.dtype.kind == "f" - cells_scaled = (cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64)) / empty_scaled.std( - axis=0, ddof=1, dtype=np.float64 - ) - if need_castback: - cells_scaled = cells_scaled.astype(cells_scaled.dtype, copy=False) + cells_scaled_dtype = cells_scaled.dtype + cells_scaled = cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64) + if scale_factor == "standardize": + cells_scaled /= empty_scaled.std(axis=0, ddof=1, dtype=np.float64) + if cells_scaled_dtype.kind == "f": + cells_scaled = cells_scaled.astype(cells_scaled_dtype, copy=False) if denoise_counts: bgmeans = np.empty(cells_scaled.shape[0], cells_scaled.dtype) @@ -196,6 +211,11 @@ def dsb( reg.fit(covar, cells_scaled) cells_scaled -= reg.predict(covar) - reg.intercept_ + + if quantile_clipping: + quantiles = np.quantile(cells_scaled, quantile_clip) + np.clip(cells_scaled, a_min=quantiles.min(), a_max=quantiles.max(), out=cells_scaled) + if add_layer: cells.layers["dsb"] = cells_scaled else: