Skip to content

Commit 48a4ad2

Browse files
committed
DSB: implement scale_factor and quantile_clipping options (closes #42)
1 parent c948690 commit 48a4ad2

1 file changed

Lines changed: 27 additions & 7 deletions

File tree

muon/_prot/preproc.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Optional, Iterable, Tuple, Union
1+
from typing import Optional, Iterable, Tuple, Union, Literal
22
from numbers import Integral, Real
33
from warnings import warn
44

@@ -21,6 +21,9 @@ def dsb(
2121
isotype_controls: Optional[Iterable[str]] = None,
2222
empty_counts_range: Optional[Tuple[Real, Real]] = None,
2323
cell_counts_range: Optional[Tuple[Real, Real]] = None,
24+
scale_factor: Literal["standardize", "mean_subtract"] = "standardize",
25+
quantile_clipping: bool = False,
26+
quantile_clip: Tuple[float, float] = (0.001, 0.9995),
2427
add_layer: bool = False,
2528
random_state: Optional[Union[int, np.random.RandomState, None]] = None,
2629
) -> Union[None, MuData]:
@@ -47,6 +50,11 @@ def dsb(
4750
this specifies the minimum and maximum log10-counts for a droplet to be considered empty.
4851
cell_counts_range: If ``data_raw`` is ``None``, i.e. ``data`` contains the unfiltered data,
4952
this specifies the minimum and maximum log10-counts for a droplet to be considered not empty.
53+
scale_factor: How to perform protein level denoising. `standardize` corresponds to the method
54+
described as Step I in the paper, i.e. subtracting the mean and dividing by the standard
55+
deviation. `mean_subtract` subtracts the mean without dividing by the standard deviation.
56+
quantile_clipping: Whether to clip the final result. Useful if outliers are observied.
57+
quantile_clip: The quantiles to clip outlier values to. Only used if `quantile_clipping=True`.
5058
add_layer: Whether to add a ``'dsb'`` layer instead of assigning to the X matrix.
5159
random_state: Random seed.
5260
@@ -102,6 +110,13 @@ def dsb(
102110
if pseudocount < 0:
103111
raise ValueError("pseudocount cannot be negative")
104112

113+
if quantile_clipping:
114+
if len(quantile_clip) != 2:
115+
raise ValueError("quantile_clip must have exactly 2 values")
116+
quantile_clip = np.asarray(quantile_clip)
117+
if np.any((quantile_clip < 0) | (quantile_clip > 1)):
118+
raise ValueError("quantile_clip must be between 0 and 1")
119+
105120
if cells.shape[1] != empty.shape[1]: # this should only be possible if mudata_raw != None
106121
raise ValueError("data and data_raw have different numbers of proteins")
107122

@@ -153,12 +168,12 @@ def dsb(
153168
else np.log(cells.X.toarray() + pseudocount)
154169
)
155170

156-
need_castback = cells_scaled.dtype.kind == "f"
157-
cells_scaled = (cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64)) / empty_scaled.std(
158-
axis=0, ddof=1, dtype=np.float64
159-
)
160-
if need_castback:
161-
cells_scaled = cells_scaled.astype(cells_scaled.dtype, copy=False)
171+
cells_scaled_dtype = cells_scaled.dtype
172+
cells_scaled = cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64)
173+
if scale_factor == "standardize":
174+
cells_scaled /= empty_scaled.std(axis=0, ddof=1, dtype=np.float64)
175+
if cells_scaled_dtype.kind == "f":
176+
cells_scaled = cells_scaled.astype(cells_scaled_dtype, copy=False)
162177

163178
if denoise_counts:
164179
bgmeans = np.empty(cells_scaled.shape[0], cells_scaled.dtype)
@@ -196,6 +211,11 @@ def dsb(
196211
reg.fit(covar, cells_scaled)
197212

198213
cells_scaled -= reg.predict(covar) - reg.intercept_
214+
215+
if quantile_clipping:
216+
quantiles = np.quantile(cells_scaled, quantile_clip)
217+
np.clip(cells_scaled, a_min=quantiles.min(), a_max=quantiles.max(), out=cells_scaled)
218+
199219
if add_layer:
200220
cells.layers["dsb"] = cells_scaled
201221
else:

0 commit comments

Comments
 (0)