1- from typing import Optional , Iterable , Tuple , Union
1+ from typing import Optional , Iterable , Tuple , Union , Literal
22from numbers import Integral , Real
33from warnings import warn
44
@@ -21,6 +21,9 @@ def dsb(
2121 isotype_controls : Optional [Iterable [str ]] = None ,
2222 empty_counts_range : Optional [Tuple [Real , Real ]] = None ,
2323 cell_counts_range : Optional [Tuple [Real , Real ]] = None ,
24+ scale_factor : Literal ["standardize" , "mean_subtract" ] = "standardize" ,
25+ quantile_clipping : bool = False ,
26+ quantile_clip : Tuple [float , float ] = (0.001 , 0.9995 ),
2427 add_layer : bool = False ,
2528 random_state : Optional [Union [int , np .random .RandomState , None ]] = None ,
2629) -> Union [None , MuData ]:
@@ -47,6 +50,11 @@ def dsb(
4750 this specifies the minimum and maximum log10-counts for a droplet to be considered empty.
4851 cell_counts_range: If ``data_raw`` is ``None``, i.e. ``data`` contains the unfiltered data,
4952 this specifies the minimum and maximum log10-counts for a droplet to be considered not empty.
53+ scale_factor: How to perform protein level denoising. `standardize` corresponds to the method
54+ described as Step I in the paper, i.e. subtracting the mean and dividing by the standard
55+ deviation. `mean_subtract` subtracts the mean without dividing by the standard deviation.
56+ quantile_clipping: Whether to clip the final result. Useful if outliers are observied.
57+ quantile_clip: The quantiles to clip outlier values to. Only used if `quantile_clipping=True`.
5058 add_layer: Whether to add a ``'dsb'`` layer instead of assigning to the X matrix.
5159 random_state: Random seed.
5260
@@ -102,6 +110,13 @@ def dsb(
102110 if pseudocount < 0 :
103111 raise ValueError ("pseudocount cannot be negative" )
104112
113+ if quantile_clipping :
114+ if len (quantile_clip ) != 2 :
115+ raise ValueError ("quantile_clip must have exactly 2 values" )
116+ quantile_clip = np .asarray (quantile_clip )
117+ if np .any ((quantile_clip < 0 ) | (quantile_clip > 1 )):
118+ raise ValueError ("quantile_clip must be between 0 and 1" )
119+
105120 if cells .shape [1 ] != empty .shape [1 ]: # this should only be possible if mudata_raw != None
106121 raise ValueError ("data and data_raw have different numbers of proteins" )
107122
@@ -153,12 +168,12 @@ def dsb(
153168 else np .log (cells .X .toarray () + pseudocount )
154169 )
155170
156- need_castback = cells_scaled .dtype . kind == "f"
157- cells_scaled = ( cells_scaled - empty_scaled .mean (axis = 0 , dtype = np .float64 )) / empty_scaled . std (
158- axis = 0 , ddof = 1 , dtype = np . float64
159- )
160- if need_castback :
161- cells_scaled = cells_scaled .astype (cells_scaled . dtype , copy = False )
171+ cells_scaled_dtype = cells_scaled .dtype
172+ cells_scaled = cells_scaled - empty_scaled .mean (axis = 0 , dtype = np .float64 )
173+ if scale_factor == "standardize" :
174+ cells_scaled /= empty_scaled . std ( axis = 0 , ddof = 1 , dtype = np . float64 )
175+ if cells_scaled_dtype . kind == "f" :
176+ cells_scaled = cells_scaled .astype (cells_scaled_dtype , copy = False )
162177
163178 if denoise_counts :
164179 bgmeans = np .empty (cells_scaled .shape [0 ], cells_scaled .dtype )
@@ -196,6 +211,11 @@ def dsb(
196211 reg .fit (covar , cells_scaled )
197212
198213 cells_scaled -= reg .predict (covar ) - reg .intercept_
214+
215+ if quantile_clipping :
216+ quantiles = np .quantile (cells_scaled , quantile_clip )
217+ np .clip (cells_scaled , a_min = quantiles .min (), a_max = quantiles .max (), out = cells_scaled )
218+
199219 if add_layer :
200220 cells .layers ["dsb" ] = cells_scaled
201221 else :
0 commit comments