glasgowcompbio
diff --git a/tests/test_deisotoping.py b/tests/test_deisotoping.py
Lines changed: 61 additions & 0 deletions
diff --git a/vimms/Chemicals.py b/vimms/Chemicals.py
Lines changed: 150 additions & 77 deletions
@@ -0,0 +1,61 @@
+# test deisotoping and isotope generation
+
+import numpy as np
+
+from vimms.Chemicals import Isotopes, Adducts
+from vimms.Common import Formula, ADDUCT_TERMS, POSITIVE, PROTON_MASS
+from vimms.Deisotoping import Deisotoper
+from vimms.MassSpecUtils import adduct_transformation
+
+
+def test_isotope_distribution_multi_element():
+    """The multi-element envelope has several peaks, sums to one, and is sorted by m/z."""
+    envelope = Isotopes(Formula("C10H16N2O2S")).get_isotopes(total_proportion=0.99)
+
+    mz_values = [mz for mz, _, _ in envelope]
+    abundances = [abundance for _, abundance, _ in envelope]
+
+    assert len(envelope) > 1
+    assert np.isclose(sum(abundances), 1.0, atol=1e-6)
+    # strictly increasing: every m/z is smaller than its successor
+    assert all(lower < upper for lower, upper in zip(mz_values, mz_values[1:]))
+
+
+def test_isotope_distribution_chlorine_m2_peak():
+    """Some peak of a dichlorinated formula lies ~1.997 m/z above the first peak."""
+    envelope = Isotopes(Formula("C5H10Cl2")).get_isotopes(total_proportion=0.99)
+
+    first_mz = envelope[0][0]
+    offsets = [mz - first_mz for mz, _, _ in envelope[1:]]
+
+    # 1.997 +/- 0.01 is the offset this test looks for (per its name, the chlorine M+2 peak)
+    assert any(np.isclose(offset, 1.997, atol=0.01) for offset in offsets)
+
+
+def test_deisotoper_recovers_mono():
+    """Deisotoping an M+H isotope envelope gives one cluster at formula mass + proton mass."""
+    compound = Formula("C10H16N2O2S")
+    isotope_model = Isotopes(compound)
+    adduct_model = Adducts(compound, adduct_prior_dict={POSITIVE: {"M+H": 1.0}})
+    first_adduct = adduct_model.get_adducts()[POSITIVE][0][0]
+    multiplier, offset = ADDUCT_TERMS[first_adduct]
+
+    spectrum = [
+        (adduct_transformation(mz, multiplier, offset), abundance * 1e5)
+        for mz, abundance, _ in isotope_model.get_isotopes(total_proportion=0.99)
+    ]
+
+    found = Deisotoper(ppm_tolerance=10.0, max_charge=1, min_isotopes=2).deisotope(spectrum)
+
+    assert len(found) == 1
+    target_mz = compound.mass + PROTON_MASS
+    assert np.isclose(found[0].monoisotopic_mz, target_mz, atol=1e-3)
+
+
+def test_deisotoper_handles_m_plus_2_only():
+    """Two peaks 1.997 m/z apart, with nothing in between, must still form one cluster."""
+    spectrum = [(100.0, 1e5), (101.997, 6e4)]
+    found = Deisotoper(ppm_tolerance=10.0, max_charge=1, min_isotopes=2).deisotope(spectrum)
+
+    assert len(found) == 1
+    assert np.isclose(found[0].monoisotopic_mz, 100.0, atol=1e-6)
@@ -10,8 +10,6 @@
 from collections import deque
 
 import numpy as np
-import scipy
-import scipy.stats
 from loguru import logger
 
 from vimms.ChemicalSamplers import (
@@ -26,14 +24,14 @@
     PROTON_MASS,
     POSITIVE,
     NEGATIVE,
-    C12_PROPORTION,
     C13_MZ_DIFF,
-    C,
     MONO,
-    C13,
     load_obj,
     ADDUCT_NAMES_POS,
     ADDUCT_NAMES_NEG,
+    ADDUCT_PRIOR_POS,
+    ADDUCT_PRIOR_NEG,
+    NATURAL_ISOTOPES,
 )
 from vimms.Noise import GaussianPeakNoise
 from vimms.Roi import make_roi, RoiBuilderParams
@@ -70,15 +68,21 @@ class Isotopes:
     A class to represent an isotope of a chemical
     """
 
-    def __init__(self, formula):
+    def __init__(self, formula, min_prob=1e-12, max_peaks=20, max_states=4000, mass_precision=8):
         """
         Create an Isotope object
         Args:
             formula: the formula for the given isotope
+            min_prob: default probability threshold; get_isotopes() discards states below it
+                unless a per-call value is given
+            max_peaks: default cap on the number of peaks get_isotopes() returns
+            max_states: default cap on the number of states kept after each convolution
+            mass_precision: default number of decimals used to round (and merge) mass shifts
         """
         self.formula = formula
+        self.min_prob = min_prob
+        self.max_peaks = max_peaks
+        self.max_states = max_states
+        self.mass_precision = mass_precision
 
-    def get_isotopes(self, total_proportion):
+    def get_isotopes(
+        self, total_proportion, min_prob=None, max_peaks=None, max_states=None, mass_precision=None
+    ):
         """
         Gets the isotope total proportion
 
@@ -87,93 +91,151 @@ def get_isotopes(self, total_proportion):
 
         Returns: the computed isotope total proportion
 
-        TODO: Add functionality for elements other than Carbon
         """
-        peaks = [() for i in range(len(self._get_isotope_proportions(total_proportion)))]
-        for i in range(len(peaks)):
-            peaks[i] += (self._get_isotope_mz(self._get_isotope_names(i)),)
-            peaks[i] += (self._get_isotope_proportions(total_proportion)[i],)
-            peaks[i] += (self._get_isotope_names(i),)
+        peaks = []
+        distributions = self._get_isotope_distribution(
+            total_proportion=total_proportion,
+            min_prob=self.min_prob if min_prob is None else min_prob,
+            max_peaks=self.max_peaks if max_peaks is None else max_peaks,
+            max_states=self.max_states if max_states is None else max_states,
+            mass_precision=self.mass_precision if mass_precision is None else mass_precision,
+        )
+        base_mz = self.formula._get_mz()
+        for idx, (mass_shift, proportion) in enumerate(distributions):
+            name = MONO if idx == 0 else f"M+{idx}"
+            peaks.append((base_mz + mass_shift, proportion, name))
         return peaks
 
-    def _get_isotope_proportions(self, total_proportion):
-        """
-        Get isotope proportion by sampling from a binomial pmf
-
-        Args:
-            total_proportion: the total proportion to compute
-
-        Returns: the computed isotope total proportion
-
-        """
-        proportions = []
-        while sum(proportions) < total_proportion:
-            proportions.extend(
-                [
-                    scipy.stats.binom.pmf(
-                        len(proportions), self.formula._get_n_element(C), 1 - C12_PROPORTION
-                    )
-                ]
+    def _get_isotope_distribution(
+        self, total_proportion, min_prob=1e-12, max_peaks=20, max_states=4000, mass_precision=8
+    ):
+        """
+        Build the isotope pattern of the formula as (mass shift, proportion) pairs
+
+        Args:
+            total_proportion: stop adding peaks once their summed probability reaches this
+            min_prob: states with a probability below this are discarded
+            max_peaks: maximum number of peaks to return
+            max_states: maximum number of states kept after each convolution
+            mass_precision: number of decimals used when rounding mass shifts
+
+        Returns: list of (mass shift, proportion) pairs in increasing mass-shift order,
+            with the proportions rescaled to sum to 1
+
+        """
+        # start from a single state: no mass shift, probability 1
+        distribution = [(0.0, 1.0)]
+        for element, count in self.formula.atoms.items():
+            if count <= 0:
+                continue
+            isotopes = NATURAL_ISOTOPES.get(element)
+            # elements that are unknown or have a single isotope leave the pattern unchanged
+            if not isotopes or len(isotopes) == 1:
+                continue
+            # shifts are measured from the first listed isotope
+            # NOTE(review): assumes that entry is the monoisotopic one - confirm in Common
+            mono_mass = isotopes[0][0]
+            base_distribution = [(mass - mono_mass, abundance) for mass, abundance in isotopes]
+            element_distribution = self._power_distribution(
+                base_distribution,
+                count,
+                min_prob=min_prob,
+                max_states=max_states,
+                mass_precision=mass_precision,
+            )
+            distribution = self._convolve_distributions(
+                distribution,
+                element_distribution,
+                min_prob=min_prob,
+                max_states=max_states,
+                mass_precision=mass_precision,
             )
-        normalised_proportions = [
-            proportions[i] / sum(proportions) for i in range(len(proportions))
-        ]
-        return normalised_proportions
-
-    def _get_isotope_names(self, isotope_number):
-        """
-        Get the isotope name given the number, e.g. 0 is the monoisotope
-        Args:
-            isotope_number: the isotope number
-
-        Returns: the isotope name
-
-        """
-        if isotope_number == 0:
-            return MONO
-        else:
-            return str(isotope_number) + C13
-
-    def _get_isotope_mz(self, isotope):
-        """
-        Get the isotope m/z value
-        Args:
-            isotope: the isotope name
-
-        Returns: the isotope m/z value
 
-        """
-        if isotope == MONO:
-            return self.formula._get_mz()
-        elif isotope[-3:] == C13:
-            return self.formula._get_mz() + float(isotope.split(C13)[0]) * C13_MZ_DIFF
-        else:
-            return None
+        distribution = [(shift, prob) for shift, prob in distribution if prob >= min_prob]
+        # order by mass shift so that peaks are selected from the smallest shift upwards
+        distribution.sort(key=lambda x: x[0])
+
+        selected = []
+        cumulative = 0.0
+        for mass_shift, prob in distribution:
+            selected.append((mass_shift, prob))
+            cumulative += prob
+            if cumulative >= total_proportion or len(selected) >= max_peaks:
+                break
+
+        total = sum(prob for _, prob in selected)
+        # nothing survived the filtering: fall back to a single unshifted peak
+        if total == 0:
+            return [(0.0, 1.0)]
+        return [(shift, prob / total) for shift, prob in selected]
+
+    def _power_distribution(self, base_distribution, count, min_prob, max_states, mass_precision):
+        """
+        Combine ``count`` copies of one distribution by repeated squaring
+
+        Args:
+            base_distribution: list of (mass shift, probability) pairs to be combined
+            count: how many copies of base_distribution to convolve together
+            min_prob: passed through to _convolve_distributions
+            max_states: passed through to _convolve_distributions
+            mass_precision: passed through to _convolve_distributions
+
+        Returns: list of (mass shift, probability) pairs; base_distribution itself is
+            returned unchanged when count is 1
+
+        """
+        if count == 1:
+            return base_distribution
+
+        def combine(first, second):
+            return self._convolve_distributions(
+                first,
+                second,
+                min_prob=min_prob,
+                max_states=max_states,
+                mass_precision=mass_precision,
+            )
+
+        # binary exponentiation: fold in the current square for every set bit of count
+        accumulated = [(0.0, 1.0)]
+        squared = base_distribution
+        exponent = count
+        while exponent > 0:
+            exponent, low_bit = divmod(exponent, 2)
+            if low_bit == 1:
+                accumulated = combine(accumulated, squared)
+            if exponent:
+                squared = combine(squared, squared)
+        return accumulated
+
+    def _convolve_distributions(self, left, right, min_prob, max_states, mass_precision):
+        """
+        Combine two (mass shift, probability) distributions into one
+
+        Args:
+            left: first list of (mass shift, probability) pairs
+            right: second list of (mass shift, probability) pairs
+            min_prob: pair products with a probability below this are ignored
+            max_states: maximum number of entries in the result
+            mass_precision: number of decimals the summed mass shift is rounded to
+
+        Returns: list of (rounded mass shift, probability) pairs; empty if every
+            product fell below min_prob
+
+        """
+        merged = {}
+        products = ((ls + rs, lp * rp) for ls, lp in left for rs, rp in right)
+        for shift, prob in products:
+            if prob < min_prob:
+                continue
+            # products landing on the same rounded shift are pooled together
+            bucket = round(shift, mass_precision)
+            merged[bucket] = merged.get(bucket, 0.0) + prob
+
+        states = list(merged.items())
+        if states and len(states) > max_states:
+            # over budget: keep only the max_states most probable entries
+            states = sorted(states, key=lambda state: state[1], reverse=True)[:max_states]
+        return states
 
 
 class Adducts:
     """
     A class to represent an adduct of a chemical
     """
 
-    def __init__(self, formula, adduct_proportion_cutoff=0.05, adduct_prior_dict=None):
+    def __init__(
+        self,
+        formula,
+        adduct_proportion_cutoff=0.05,
+        adduct_prior_dict=None,
+        adduct_profile=None,
+        adduct_concentration=15.0,
+    ):
         """
         Create an Adduct class
 
         Args:
             formula: the formula of this adduct
             adduct_proportion_cutoff: proportion cut-off of the adduct
-            adduct_prior_dict: custom adduct dictionary, if any
+            adduct_prior_dict: custom adduct dictionary or callable, if any
+            adduct_profile: preset profile name or dict of adduct priors
+            adduct_concentration: dirichlet concentration for adduct sampling
         """
+        if callable(adduct_prior_dict):
+            adduct_prior_dict = adduct_prior_dict(formula)
+
+        if adduct_prior_dict is None and adduct_profile is not None:
+            from vimms.Common import ADDUCT_PROFILE_PRESETS
+
+            if isinstance(adduct_profile, str):
+                adduct_prior_dict = ADDUCT_PROFILE_PRESETS.get(adduct_profile)
+                if adduct_prior_dict is None:
+                    raise ValueError(f"Unknown adduct profile '{adduct_profile}'")
+            else:
+                adduct_prior_dict = adduct_profile
+
         if adduct_prior_dict is None:
             self.adduct_names = {POSITIVE: ADDUCT_NAMES_POS, NEGATIVE: ADDUCT_NAMES_NEG}
             self.adduct_prior = {
-                POSITIVE: np.ones(len(self.adduct_names[POSITIVE])) * 0.1,
-                NEGATIVE: np.ones(len(self.adduct_names[NEGATIVE])) * 0.1,
+                POSITIVE: np.array([ADDUCT_PRIOR_POS.get(name, 0.05) for name in ADDUCT_NAMES_POS]),
+                NEGATIVE: np.array([ADDUCT_PRIOR_NEG.get(name, 0.05) for name in ADDUCT_NAMES_NEG]),
             }
-            # give more weight to the first one, i.e. M+H
-            self.adduct_prior[POSITIVE][0] = 1.0
-            self.adduct_prior[NEGATIVE][0] = 1.0
         else:
             assert POSITIVE in adduct_prior_dict or NEGATIVE in adduct_prior_dict
             self.adduct_names = {k: list(adduct_prior_dict[k].keys()) for k in adduct_prior_dict}
@@ -182,6 +244,7 @@ def __init__(self, formula, adduct_proportion_cutoff=0.05, adduct_prior_dict=Non
             }
         self.formula = formula
         self.adduct_proportion_cutoff = adduct_proportion_cutoff
+        self.adduct_concentration = adduct_concentration
 
     def get_adducts(self):
         """
@@ -204,15 +267,17 @@ def _get_adduct_proportions(self):
         Returns: adduct proportion after sampling
 
         """
-        # TODO: replace this with something proper
         proportions = {}
         for k in self.adduct_prior:
-            proportions[k] = np.random.dirichlet(self.adduct_prior[k])
-            while max(proportions[k]) < 0.2:
-                proportions[k] = np.random.dirichlet(self.adduct_prior[k])
+            alpha = self.adduct_prior[k] * self.adduct_concentration
+            alpha = np.where(alpha > 0, alpha, 0.001)
+            proportions[k] = np.random.dirichlet(alpha)
             proportions[k][np.where(proportions[k] < self.adduct_proportion_cutoff)] = 0
-            proportions[k] = proportions[k] / max(proportions[k])
-            proportions[k].tolist()
+            if proportions[k].sum() == 0:
+                proportions[k] = np.zeros_like(proportions[k])
+                proportions[k][np.argmax(alpha)] = 1.0
+            else:
+                proportions[k] = proportions[k] / proportions[k].sum()
             assert len(proportions[k]) == len(self.adduct_names[k])
         return proportions
 
@@ -625,6 +690,8 @@ def __init__(
         ms2_sampler=UniformMS2Sampler(),
         adduct_proportion_cutoff=0.05,
         adduct_prior_dict=None,
+        adduct_profile=None,
+        adduct_concentration=15.0,
     ):
         """
         Create a mixture of [vimms.Chemicals.KnownChemical][] objects.
@@ -642,13 +709,17 @@ def __init__(
                          fragmentation spectra.
             adduct_proportion_cutoff: proportion of adduct cut-off
             adduct_prior_dict: custom adduct dictionary
+            adduct_profile: preset name or dict of adduct priors
+            adduct_concentration: dirichlet concentration for adduct sampling
         """
         self.formula_sampler = formula_sampler
         self.rt_and_intensity_sampler = rt_and_intensity_sampler
         self.chromatogram_sampler = chromatogram_sampler
         self.ms2_sampler = ms2_sampler
         self.adduct_proportion_cutoff = adduct_proportion_cutoff
         self.adduct_prior_dict = adduct_prior_dict
+        self.adduct_profile = adduct_profile
+        self.adduct_concentration = adduct_concentration
 
         # if self.database is not None:
         #     logger.debug('Sorting database compounds by masses')
@@ -691,6 +762,8 @@ def sample(self, n_chemicals, ms_levels, include_adducts_isotopes=True):
                     formula,
                     self.adduct_proportion_cutoff,
                     adduct_prior_dict=self.adduct_prior_dict,
+                    adduct_profile=self.adduct_profile,
+                    adduct_concentration=self.adduct_concentration,
                 )
 
                 chemicals.append(