MannLabs
diff --git a/alphaquant/cluster/cluster_ions.py b/alphaquant/cluster/cluster_ions.py
Lines changed: 4 additions & 4 deletions
diff --git a/alphaquant/cluster/cluster_utils.py b/alphaquant/cluster/cluster_utils.py
Lines changed: 46 additions & 6 deletions
diff --git a/alphaquant/cluster/outlier_filtering.py b/alphaquant/cluster/outlier_filtering.py
Lines changed: 1 addition & 1 deletion
diff --git a/alphaquant/diffquant/background_distributions.py b/alphaquant/diffquant/background_distributions.py
Lines changed: 4 additions & 78 deletions
diff --git a/alphaquant/diffquant/condpair_analysis.py b/alphaquant/diffquant/condpair_analysis.py
Lines changed: 24 additions & 2 deletions
@@ -28,7 +28,7 @@
 
 
 
-def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge):
+def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, fragment_outlier_filtering=True):
     #typefilter = TypeFilter('successive')
 
     global FCDIFF_CUTOFF_CLUSTERMERGE
@@ -40,7 +40,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
     root_node = create_hierarchical_ion_grouping(gene_name, diffions)
     add_reduced_names_to_root(root_node)
     #LOGGER.info(anytree.RenderTree(root_node))
-    root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion)
+    root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering)
 
     level_sorted_nodes = [[node for node in children] for children in anytree.ZigZagGroupIter(root_node_clust)]
     level_sorted_nodes.reverse() #the base nodes are first
@@ -91,7 +91,7 @@ def add_reduced_names_to_root(node):
 
 
 import pandas as pd
-def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion):#~60% of overall runtime
+def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering=True):#~60% of overall runtime
     #typefilter object specifies filtering and clustering of the nodes
     aqcluster_utils.assign_properties_to_base_ions(root_node, ionname2diffion, normed_c1, normed_c2)
 
@@ -125,7 +125,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
                 aqcluster_utils.assign_clusterstats_to_type_node(type_node, childnode2clust)
                 aqcluster_utils.annotate_mainclust_leaves(childnode2clust)
                 aqcluster_utils.assign_cluster_number(type_node, childnode2clust)
-                aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False)
+                aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, fragment_outlier_filtering=fragment_outlier_filtering)
 
     return root_node
 
 
@@ -19,19 +19,21 @@
 TYPE2LEVEL = dict(zip(TYPES, LEVELS))
 
 
-def aggregate_node_properties(node, only_use_mainclust, peptide_outlier_filtering=False):
+def aggregate_node_properties(node, only_use_mainclust, peptide_outlier_filtering=False, fragment_outlier_filtering=True):
     """Goes through the children and summarizes their properties to the node
 
     Args:
         node ([type]): [description]
         only_use_mainclust (bool, optional): [description]. Defaults to True.
+        peptide_outlier_filtering (bool, optional): Whether to filter outlier peptides. Defaults to False.
+        fragment_outlier_filtering (bool, optional): Whether to filter outlier fragments. Defaults to True.
     """
     if only_use_mainclust:
         childs = [x for x in node.children if x.is_included & (x.cluster ==0)]
     else:
         childs = [x for x in node.children if x.is_included]
 
-    childs_zfiltered = get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node)
+    childs_zfiltered = get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node, fragment_outlier_filtering)
 
 
     zvals = get_feature_numpy_array_from_nodes(nodes=childs_zfiltered, feature_name="z_val")
@@ -80,11 +82,48 @@ def get_feature_numpy_array_from_nodes(nodes, feature_name ,dtype = 'float'):
     generator = (x.__dict__.get(feature_name) for x in nodes)
     return np.fromiter(generator, dtype=dtype)
 
-def get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node):
+def _select_peptides_around_median_z(peptide_nodes, max_peptides=31):
+    """
+    Restrict a peptide list to the nodes whose z-values lie nearest the median.
+
+    A list holding at most max_peptides nodes is handed back untouched (the
+    very same list object). A longer list is ranked by the absolute distance
+    between each node's z_val and the median z_val over all nodes, and only
+    the max_peptides nearest nodes are kept. The intent is that extreme
+    peptides do not bias the protein-level statistics.
+
+    Args:
+        peptide_nodes: List of peptide nodes, each exposing a z_val attribute
+        max_peptides: Upper bound on the number of nodes returned (default: 31)
+
+    Returns:
+        List with at most max_peptides nodes; when trimming happened it is
+        ordered by increasing distance from the median z-value
+    """
+    if len(peptide_nodes) <= max_peptides:
+        return peptide_nodes
+
+    # Median over the z-values of every candidate peptide
+    center_z = np.median([pep.z_val for pep in peptide_nodes])
+
+    def _distance_to_center(pep):
+        return abs(pep.z_val - center_z)
+
+    # sorted() is stable, so equally distant peptides stay in input order
+    nearest_first = sorted(peptide_nodes, key=_distance_to_center)
+
+    # Keep only the max_peptides entries at the front of the ranking
+    return nearest_first[:max_peptides]
+
+def get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node, fragment_outlier_filtering=True):
     if peptide_outlier_filtering and node.type == "gene":
-        return [x for x in childs if not x.is_outlier_peptide]
+        # Gene level: discard flagged outlier peptides, then cap the rest at the
+        # 31 peptides closest to the median z-value. The helper hands back lists
+        # of 31 or fewer unchanged, so no separate length check is needed here.
+        non_outlier_peptides = [pep for pep in childs if not pep.is_outlier_peptide]
+        return _select_peptides_around_median_z(non_outlier_peptides, max_peptides=31)
 
-    elif node.type == "frgion":
+    elif fragment_outlier_filtering and node.type == "frgion":
         return remove_outlier_fragion_childs(childs)
     else:
         return childs
@@ -189,7 +228,8 @@ def remove_outlier_fragion_childs(childs):
         idx_end = median_idx + 2
         idxs_to_use = sorted_idxs_zvals[idx_start:idx_end]
     else:
-        idxs_to_use = aq_utils_diffquant.find_non_outlier_indices_ipr(zvals, threshold=1.1, percentile_lower = 40, percentile_upper = 70)
+        # When there are 4 or fewer children, use all of them
+        idxs_to_use = list(range(len(childs)))
 
     return [childs[idx] for idx in idxs_to_use]
 
 
@@ -23,7 +23,7 @@ def calculate_regulation_score(protnodes: list[anytree.Node]):
     fraction_sig = num_sig / (num_sig + num_insig)
 
     log2fc_ratio_sig_vs_insig = np.median(abs_log2fc[sig_mask_005]) / (np.median(abs_log2fc[nonsig_mask]) + 1e-6)
-    regulation_score = min(1, log2fc_ratio_sig_vs_insig * fraction_sig/100) #merges the regulation strength and the fraction of significant proteins into one score divided by to normalize it, the normalization factor corresponds to a very stongly regulated dataset
+    regulation_score = min(1, log2fc_ratio_sig_vs_insig * fraction_sig/10) #merges the regulation strength and the fraction of significant proteins into one score, divided by 10 to normalize it; the normalization factor corresponds to a very strongly regulated dataset
     return regulation_score
 
 
 
@@ -10,65 +10,11 @@
 
 from numba import njit
 from statistics import NormalDist
+import alphaquant.diffquant.diffutils as aqdiffutils
 
-@njit
-def _compute_zscore_fast_bg(cumulative, min_fc, total):
-    """Fast computation of z-scores using Numba JIT compilation for background distributions"""
-    zscores = np.zeros(len(cumulative))
-    zero_pos = -min_fc
-
-    # Pre-calculate normalization factors
-    normfact_posvals = 1/(total-cumulative[zero_pos]+1)
-    normfact_negvals = 1/(cumulative[zero_pos-1]+1)
-
-    # Standard normal inverse CDF approximation (Beasley-Springer-Moro algorithm)
-    # This is much faster than calling NormalDist().inv_cdf()
-    for i in range(len(cumulative)):
-        if i == zero_pos or i == len(cumulative) - 1:
-            zscores[i] = 0.0
-            continue
-
-        if i < zero_pos:
-            num_more_extreme = cumulative[i]
-            normfact = normfact_negvals
-            sign = -1.0
-        else:
-            num_more_extreme = total - cumulative[i + 1]
-            normfact = normfact_posvals
-            sign = 1.0
-
-        p_val = 0.5 * max(1e-9, (num_more_extreme + 1) * normfact)
-
-        # Fast inverse normal CDF approximation
-        if p_val <= 0.5:
-            # For p <= 0.5, use symmetry: inv_cdf(p) = -inv_cdf(1-p)
-            t = np.sqrt(-2.0 * np.log(p_val))
-            z = -(((2.515517 + 0.802853*t + 0.010328*t*t) /
-                  (1.0 + 1.432788*t + 0.189269*t*t + 0.001308*t*t*t)) - t)
-        else:
-            t = np.sqrt(-2.0 * np.log(1.0 - p_val))
-            z = (((2.515517 + 0.802853*t + 0.010328*t*t) /
-                  (1.0 + 1.432788*t + 0.189269*t*t + 0.001308*t*t*t)) - t)
-
-        zscores[i] = sign * abs(z)
-
-    return zscores
 
-@njit
-def _compute_sd_fast_bg(cumulative, min_fc, mean, fc_conversion_factor):
-    """Fast computation of standard deviation using Numba JIT compilation for background distributions"""
-    sq_err = 0.0
-    previous = 0
 
-    for i in range(len(cumulative)):
-        fc = (i + min_fc) * fc_conversion_factor
-        freq = cumulative[i] - previous
-        sq_err += freq * (fc - mean) ** 2
-        previous = cumulative[i]
 
-    total = cumulative[-1]
-    var = sq_err / total
-    return math.sqrt(var)
 
 class ConditionBackgrounds():
 
@@ -284,11 +230,11 @@ def transform_cumulative_into_z_values(self, p2z: dict):
         self.max_z = abs(NormalDist().inv_cdf(max(1e-9, min_pval)))
 
         # Use the Numba-optimized function for dramatic speedup (100x+ faster)
-        return _compute_zscore_fast_bg(self.cumulative, self.min_fc, total)
+        return aqdiffutils.zscores_from_cumulative(self.cumulative, self.min_fc, total)
 
 
     def calc_zscore_from_fc(self, fc):
-        return _calc_zscore_from_fc(fc, self.fc_conversion_factor, self.fc_resolution_factor, self.min_fc, self.cumulative, self.max_z, self.zscores)
+        return aqdiffutils.z_from_fc_lookup(fc, self.fc_conversion_factor, self.fc_resolution_factor, self.min_fc, self.cumulative, self.max_z, self.zscores)
 
 
 
@@ -300,7 +246,7 @@ def calc_SD(self, mean:float, cumulative:list):
             cumulative (list[int]): cumulative distribution array
         """
         # Use the Numba-optimized function for dramatic speedup (100x+ faster)
-        self.SD = _compute_sd_fast_bg(np.asarray(cumulative), self.min_fc, mean, self.fc_conversion_factor)
+        self.SD = aqdiffutils.sd_from_cumulative(np.asarray(cumulative), self.min_fc, mean, self.fc_conversion_factor)
         self.var = self.SD ** 2
 
     def get_cache_key(self):
@@ -319,27 +265,7 @@ def get_cache_key(self):
         return (self.start_idx, self.end_idx, self.min_fc, self.max_fc,
                 len(self.cumulative), round(self.SD, 6))
 
-@njit
-def _calc_zscore_from_fc(fc, fc_conversion_factor, fc_resolution_factor, min_fc, cumulative, max_z, zscores):
-    """
-    Quick conversion function that looks up the z-value corresponding to an observed new fold change.
-    The fold change is mapped to its fc-bin in the binned fold change distribution and then the z-value of the bin is looked up
-
-    Args:
-        fc (float): [description]
 
-    Returns:
-        float: z-value of the observed fold change, based on the background distribution
-    """
-    if abs(fc)<fc_conversion_factor:
-        return 0
-    k = int(fc * fc_resolution_factor)
-    rank = k-min_fc
-    if rank <0:
-        return -max_z
-    if rank >=len(cumulative):
-        return max_z
-    return zscores[rank]
 
 
 # Cell
 
@@ -64,7 +64,10 @@ def analyze_condpair(*,runconfig, condpair):
         bg1 = normed_c1.ion2background.get(ion)
         bg2 = normed_c2.ion2background.get(ion)
         diffDist = aqbg.get_subtracted_bg(bgpair2diffDist, bg1, bg2, p2z)
-        diffIon = aqdiff.DifferentialIon(vals1, vals2, diffDist, ion, runconfig.outlier_correction)
+        if runconfig.ion_test_method == 'ttest':
+            diffIon = aqdiff.DifferentialIonTTest(vals1, vals2, ion, p2z, runconfig.outlier_correction)
+        else:
+            diffIon = aqdiff.DifferentialIon(vals1, vals2, diffDist, ion, runconfig.outlier_correction)
         protein = pep2prot.get(ion)
         if diffIon.usable:
             prot2diffions[protein].append(diffIon)
@@ -89,7 +92,8 @@ def analyze_condpair(*,runconfig, condpair):
 
         clustered_prot_node = aqclust.get_scored_clusterselected_ions(prot, ions, normed_c1, normed_c2, bgpair2diffDist, p2z, deedpair2doublediffdist,
                                                                         pval_threshold_basis = runconfig.cluster_threshold_pval, fcfc_threshold = runconfig.cluster_threshold_fcfc,
-                                                                        take_median_ion=runconfig.take_median_ion, fcdiff_cutoff_clustermerge= runconfig.fcdiff_cutoff_clustermerge)
+                                                                        take_median_ion=runconfig.take_median_ion, fcdiff_cutoff_clustermerge= runconfig.fcdiff_cutoff_clustermerge,
+                                                                        fragment_outlier_filtering=runconfig.fragment_outlier_filtering)
         protnodes.append(clustered_prot_node)
 
         if count_prots%100==0:
@@ -222,6 +226,14 @@ def write_out_tables(condpair_node, runconfig):
     has_precursor_nodes = check_if_has_precursor_nodes(condpair_node)
     if has_precursor_nodes:
         prec_df = aq_tablewriter_protein.TableFromNodeCreator(condpair_node, node_type = "mod_seq_charge").results_df
+    else:
+        prec_df = None
+
+    has_base_nodes = check_if_has_base_nodes(condpair_node)
+    if has_base_nodes and runconfig.write_base_ions:
+        base_df = aq_tablewriter_protein.TableFromNodeCreator(condpair_node, node_type = "base").results_df
+    else:
+        base_df = None
 
 
     if runconfig.runtime_plots:
@@ -254,6 +266,9 @@ def write_out_tables(condpair_node, runconfig):
         if has_precursor_nodes:
             prec_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.prec.tsv", sep = "\t", index=None)
 
+        if base_df is not None:
+            base_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.base.tsv", sep = "\t", index=None)
+
     return res_df, pep_df
 
 def check_if_has_sequence_nodes(condpair_node):
@@ -264,3 +279,10 @@ def check_if_has_precursor_nodes(condpair_node):
         return condpair_node.children[0].children[0].children[0].children[0].type == "mod_seq_charge"
     except:
         return False
+
+def check_if_has_base_nodes(condpair_node):
+    """Report whether the first leaf below condpair_node's first child is a "base" node (fragment/MS1 level)."""
+    try:
+        return condpair_node.children[0].leaves[0].type == "base"
+    except (AttributeError, IndexError, TypeError):  # missing, empty or malformed tree -> no base nodes
+        return False