From ca258a5c895240852cf9d41845f51a749ad16f0e Mon Sep 17 00:00:00 2001
From: Ilia Kats <ilia-kats@gmx.net>
Date: Thu, 14 Aug 2025 14:08:46 +0200
Subject: [PATCH 1/3] DSB: fix float overflow for large datasets (closes #130)

---
 muon/_prot/preproc.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/muon/_prot/preproc.py b/muon/_prot/preproc.py
index 5f03b90..8907574 100644
--- a/muon/_prot/preproc.py
+++ b/muon/_prot/preproc.py
@@ -153,10 +153,15 @@ def dsb(
         else np.log(cells.X.toarray() + pseudocount)
     )
 
-    cells_scaled = (cells_scaled - empty_scaled.mean(axis=0)) / empty_scaled.std(axis=0)
+    need_castback = cells_scaled.dtype.kind == "f"
+    cells_scaled = (cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64)) / empty_scaled.std(
+        axis=0, dtype=np.float64
+    )
+    if need_castback:
+        cells_scaled = cells_scaled.astype(cells_scaled.dtype, copy=False)
 
     if denoise_counts:
-        bgmeans = np.empty(cells_scaled.shape[0], np.float32)
+        bgmeans = np.empty(cells_scaled.shape[0], cells_scaled.dtype)
         # init_params needs to be random, otherwise fitted variance for one of the n_components
         # sometimes goes to 0
         sharedvar = GaussianMixture(

From c948690d0269364344a27f7dd572d41b3c5cb8f5 Mon Sep 17 00:00:00 2001
From: Ilia Kats <ilia-kats@gmx.net>
Date: Thu, 14 Aug 2025 15:09:42 +0200
Subject: [PATCH 2/3] DSB: use ddof=1 in std calculation to match the R
 behavior (closes #131)

---
 muon/_prot/preproc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/muon/_prot/preproc.py b/muon/_prot/preproc.py
index 8907574..78955a2 100644
--- a/muon/_prot/preproc.py
+++ b/muon/_prot/preproc.py
@@ -155,7 +155,7 @@ def dsb(
 
     need_castback = cells_scaled.dtype.kind == "f"
     cells_scaled = (cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64)) / empty_scaled.std(
-        axis=0, dtype=np.float64
+        axis=0, ddof=1, dtype=np.float64
     )
     if need_castback:
         cells_scaled = cells_scaled.astype(cells_scaled.dtype, copy=False)

From 48a4ad280f128deb0c479043b77fa9cb0e1f051e Mon Sep 17 00:00:00 2001
From: Ilia Kats <ilia-kats@gmx.net>
Date: Thu, 14 Aug 2025 15:44:26 +0200
Subject: [PATCH 3/3] DSB: implement scale_factor and quantile_clipping options
 (closes #42)

---
 muon/_prot/preproc.py | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/muon/_prot/preproc.py b/muon/_prot/preproc.py
index 78955a2..74bd24b 100644
--- a/muon/_prot/preproc.py
+++ b/muon/_prot/preproc.py
@@ -1,4 +1,4 @@
-from typing import Optional, Iterable, Tuple, Union
+from typing import Optional, Iterable, Tuple, Union, Literal
 from numbers import Integral, Real
 from warnings import warn
 
@@ -21,6 +21,9 @@ def dsb(
     isotype_controls: Optional[Iterable[str]] = None,
     empty_counts_range: Optional[Tuple[Real, Real]] = None,
     cell_counts_range: Optional[Tuple[Real, Real]] = None,
+    scale_factor: Literal["standardize", "mean_subtract"] = "standardize",
+    quantile_clipping: bool = False,
+    quantile_clip: Tuple[float, float] = (0.001, 0.9995),
     add_layer: bool = False,
     random_state: Optional[Union[int, np.random.RandomState, None]] = None,
 ) -> Union[None, MuData]:
@@ -47,6 +50,11 @@ def dsb(
             this specifies the minimum and maximum log10-counts for a droplet to be considered empty.
         cell_counts_range: If ``data_raw`` is ``None``, i.e. ``data`` contains the unfiltered data,
             this specifies the minimum and maximum log10-counts for a droplet to be considered not empty.
+        scale_factor: How to perform protein level denoising. `standardize` corresponds to the method
+            described as Step I in the paper, i.e. subtracting the mean and dividing by the standard
+            deviation. `mean_subtract` subtracts the mean without dividing by the standard deviation.
+        quantile_clipping: Whether to clip the final result. Useful if outliers are observied.
+        quantile_clip: The quantiles to clip outlier values to. Only used if `quantile_clipping=True`.
         add_layer: Whether to add a ``'dsb'`` layer instead of assigning to the X matrix.
         random_state: Random seed.
 
@@ -102,6 +110,13 @@ def dsb(
     if pseudocount < 0:
         raise ValueError("pseudocount cannot be negative")
 
+    if quantile_clipping:
+        if len(quantile_clip) != 2:
+            raise ValueError("quantile_clip must have exactly 2 values")
+        quantile_clip = np.asarray(quantile_clip)
+        if np.any((quantile_clip < 0) | (quantile_clip > 1)):
+            raise ValueError("quantile_clip must be between 0 and 1")
+
     if cells.shape[1] != empty.shape[1]:  # this should only be possible if mudata_raw != None
         raise ValueError("data and data_raw have different numbers of proteins")
 
@@ -153,12 +168,12 @@ def dsb(
         else np.log(cells.X.toarray() + pseudocount)
     )
 
-    need_castback = cells_scaled.dtype.kind == "f"
-    cells_scaled = (cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64)) / empty_scaled.std(
-        axis=0, ddof=1, dtype=np.float64
-    )
-    if need_castback:
-        cells_scaled = cells_scaled.astype(cells_scaled.dtype, copy=False)
+    cells_scaled_dtype = cells_scaled.dtype
+    cells_scaled = cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64)
+    if scale_factor == "standardize":
+        cells_scaled /= empty_scaled.std(axis=0, ddof=1, dtype=np.float64)
+    if cells_scaled_dtype.kind == "f":
+        cells_scaled = cells_scaled.astype(cells_scaled_dtype, copy=False)
 
     if denoise_counts:
         bgmeans = np.empty(cells_scaled.shape[0], cells_scaled.dtype)
@@ -196,6 +211,11 @@ def dsb(
         reg.fit(covar, cells_scaled)
 
         cells_scaled -= reg.predict(covar) - reg.intercept_
+
+    if quantile_clipping:
+        quantiles = np.quantile(cells_scaled, quantile_clip)
+        np.clip(cells_scaled, a_min=quantiles.min(), a_max=quantiles.max(), out=cells_scaled)
+
     if add_layer:
         cells.layers["dsb"] = cells_scaled
     else: