Skip to content

Commit e76f5db

Browse files
authored
Merge pull request #113 from MannLabs/outlier_proteins
Outlier proteins
2 parents 2365951 + 068f7a8 commit e76f5db

10 files changed

Lines changed: 328 additions & 119 deletions

alphaquant/cluster/cluster_ions.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929

3030

31-
def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge):
31+
def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, fragment_outlier_filtering=True):
3232
#typefilter = TypeFilter('successive')
3333

3434
global FCDIFF_CUTOFF_CLUSTERMERGE
@@ -40,7 +40,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
4040
root_node = create_hierarchical_ion_grouping(gene_name, diffions)
4141
add_reduced_names_to_root(root_node)
4242
#LOGGER.info(anytree.RenderTree(root_node))
43-
root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion)
43+
root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering)
4444

4545
level_sorted_nodes = [[node for node in children] for children in anytree.ZigZagGroupIter(root_node_clust)]
4646
level_sorted_nodes.reverse() #the base nodes are first
@@ -91,7 +91,7 @@ def add_reduced_names_to_root(node):
9191

9292

9393
import pandas as pd
94-
def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion):#~60% of overall runtime
94+
def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering=True):#~60% of overall runtime
9595
#typefilter object specifies filtering and clustering of the nodes
9696
aqcluster_utils.assign_properties_to_base_ions(root_node, ionname2diffion, normed_c1, normed_c2)
9797

@@ -125,7 +125,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
125125
aqcluster_utils.assign_clusterstats_to_type_node(type_node, childnode2clust)
126126
aqcluster_utils.annotate_mainclust_leaves(childnode2clust)
127127
aqcluster_utils.assign_cluster_number(type_node, childnode2clust)
128-
aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False)
128+
aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, fragment_outlier_filtering=fragment_outlier_filtering)
129129

130130
return root_node
131131

alphaquant/cluster/cluster_utils.py

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,21 @@
1919
TYPE2LEVEL = dict(zip(TYPES, LEVELS))
2020

2121

22-
def aggregate_node_properties(node, only_use_mainclust, peptide_outlier_filtering=False):
22+
def aggregate_node_properties(node, only_use_mainclust, peptide_outlier_filtering=False, fragment_outlier_filtering=True):
2323
"""Goes through the children and summarizes their properties to the node
2424
2525
Args:
2626
node ([type]): [description]
2727
only_use_mainclust (bool, optional): [description]. Defaults to True.
28+
peptide_outlier_filtering (bool, optional): Whether to filter outlier peptides. Defaults to False.
29+
fragment_outlier_filtering (bool, optional): Whether to filter outlier fragments. Defaults to True.
2830
"""
2931
if only_use_mainclust:
3032
childs = [x for x in node.children if x.is_included & (x.cluster ==0)]
3133
else:
3234
childs = [x for x in node.children if x.is_included]
3335

34-
childs_zfiltered = get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node)
36+
childs_zfiltered = get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node, fragment_outlier_filtering)
3537

3638

3739
zvals = get_feature_numpy_array_from_nodes(nodes=childs_zfiltered, feature_name="z_val")
@@ -80,11 +82,48 @@ def get_feature_numpy_array_from_nodes(nodes, feature_name ,dtype = 'float'):
8082
generator = (x.__dict__.get(feature_name) for x in nodes)
8183
return np.fromiter(generator, dtype=dtype)
8284

83-
def get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node):
85+
def _select_peptides_around_median_z(peptide_nodes, max_peptides=31):
86+
"""
87+
Selects peptides closest to the median z-value.
88+
89+
When a protein has more than max_peptides peptides, this function selects
90+
the max_peptides peptides that have z-values closest to the median z-value.
91+
This helps to avoid biasing the protein-level statistics with extreme peptides.
92+
93+
Args:
94+
peptide_nodes: List of peptide nodes with z_val attributes
95+
max_peptides: Maximum number of peptides to keep (default: 31)
96+
97+
Returns:
98+
List of peptide nodes closest to median z-value (up to max_peptides)
99+
"""
100+
if len(peptide_nodes) <= max_peptides:
101+
return peptide_nodes
102+
103+
# Get z-values and calculate median
104+
z_values = [node.z_val for node in peptide_nodes]
105+
median_z = np.median(z_values)
106+
107+
# Calculate distance from median for each peptide
108+
peptide_distances = [(node, abs(node.z_val - median_z)) for node in peptide_nodes]
109+
110+
# Sort by distance from median (closest first)
111+
peptide_distances.sort(key=lambda x: x[1])
112+
113+
# Select the max_peptides closest to median
114+
selected_peptides = [node for node, _ in peptide_distances[:max_peptides]]
115+
116+
return selected_peptides
117+
118+
def get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node, fragment_outlier_filtering=True):
84119
if peptide_outlier_filtering and node.type == "gene":
85-
return [x for x in childs if not x.is_outlier_peptide]
120+
filtered_childs = [x for x in childs if not x.is_outlier_peptide]
121+
# Additional restriction: if more than 31 peptides, keep only 31 closest to median z-value
122+
if len(filtered_childs) > 31:
123+
filtered_childs = _select_peptides_around_median_z(filtered_childs, max_peptides=31)
124+
return filtered_childs
86125

87-
elif node.type == "frgion":
126+
elif fragment_outlier_filtering and node.type == "frgion":
88127
return remove_outlier_fragion_childs(childs)
89128
else:
90129
return childs
@@ -189,7 +228,8 @@ def remove_outlier_fragion_childs(childs):
189228
idx_end = median_idx + 2
190229
idxs_to_use = sorted_idxs_zvals[idx_start:idx_end]
191230
else:
192-
idxs_to_use = aq_utils_diffquant.find_non_outlier_indices_ipr(zvals, threshold=1.1, percentile_lower = 40, percentile_upper = 70)
231+
# When there are 4 or fewer children, use all of them
232+
idxs_to_use = list(range(len(childs)))
193233

194234
return [childs[idx] for idx in idxs_to_use]
195235

alphaquant/cluster/outlier_filtering.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def calculate_regulation_score(protnodes: list[anytree.Node]):
2323
fraction_sig = num_sig / (num_sig + num_insig)
2424

2525
log2fc_ratio_sig_vs_insig = np.median(abs_log2fc[sig_mask_005]) / (np.median(abs_log2fc[nonsig_mask]) + 1e-6)
26-
regulation_score = min(1, log2fc_ratio_sig_vs_insig * fraction_sig/100) #merges the regulation strength and the fraction of significant proteins into one score divided by to normalize it, the normalization factor corresponds to a very stongly regulated dataset
26+
regulation_score = min(1, log2fc_ratio_sig_vs_insig * fraction_sig/10) #merges the regulation strength and the fraction of significant proteins into one score divided by to normalize it, the normalization factor corresponds to a very stongly regulated dataset
2727
return regulation_score
2828

2929

alphaquant/diffquant/background_distributions.py

Lines changed: 4 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -10,65 +10,11 @@
1010

1111
from numba import njit
1212
from statistics import NormalDist
13+
import alphaquant.diffquant.diffutils as aqdiffutils
1314

14-
@njit
15-
def _compute_zscore_fast_bg(cumulative, min_fc, total):
16-
"""Fast computation of z-scores using Numba JIT compilation for background distributions"""
17-
zscores = np.zeros(len(cumulative))
18-
zero_pos = -min_fc
19-
20-
# Pre-calculate normalization factors
21-
normfact_posvals = 1/(total-cumulative[zero_pos]+1)
22-
normfact_negvals = 1/(cumulative[zero_pos-1]+1)
23-
24-
# Standard normal inverse CDF approximation (Beasley-Springer-Moro algorithm)
25-
# This is much faster than calling NormalDist().inv_cdf()
26-
for i in range(len(cumulative)):
27-
if i == zero_pos or i == len(cumulative) - 1:
28-
zscores[i] = 0.0
29-
continue
30-
31-
if i < zero_pos:
32-
num_more_extreme = cumulative[i]
33-
normfact = normfact_negvals
34-
sign = -1.0
35-
else:
36-
num_more_extreme = total - cumulative[i + 1]
37-
normfact = normfact_posvals
38-
sign = 1.0
39-
40-
p_val = 0.5 * max(1e-9, (num_more_extreme + 1) * normfact)
41-
42-
# Fast inverse normal CDF approximation
43-
if p_val <= 0.5:
44-
# For p <= 0.5, use symmetry: inv_cdf(p) = -inv_cdf(1-p)
45-
t = np.sqrt(-2.0 * np.log(p_val))
46-
z = -(((2.515517 + 0.802853*t + 0.010328*t*t) /
47-
(1.0 + 1.432788*t + 0.189269*t*t + 0.001308*t*t*t)) - t)
48-
else:
49-
t = np.sqrt(-2.0 * np.log(1.0 - p_val))
50-
z = (((2.515517 + 0.802853*t + 0.010328*t*t) /
51-
(1.0 + 1.432788*t + 0.189269*t*t + 0.001308*t*t*t)) - t)
52-
53-
zscores[i] = sign * abs(z)
54-
55-
return zscores
5615

57-
@njit
58-
def _compute_sd_fast_bg(cumulative, min_fc, mean, fc_conversion_factor):
59-
"""Fast computation of standard deviation using Numba JIT compilation for background distributions"""
60-
sq_err = 0.0
61-
previous = 0
6216

63-
for i in range(len(cumulative)):
64-
fc = (i + min_fc) * fc_conversion_factor
65-
freq = cumulative[i] - previous
66-
sq_err += freq * (fc - mean) ** 2
67-
previous = cumulative[i]
6817

69-
total = cumulative[-1]
70-
var = sq_err / total
71-
return math.sqrt(var)
7218

7319
class ConditionBackgrounds():
7420

@@ -284,11 +230,11 @@ def transform_cumulative_into_z_values(self, p2z: dict):
284230
self.max_z = abs(NormalDist().inv_cdf(max(1e-9, min_pval)))
285231

286232
# Use the Numba-optimized function for dramatic speedup (100x+ faster)
287-
return _compute_zscore_fast_bg(self.cumulative, self.min_fc, total)
233+
return aqdiffutils.zscores_from_cumulative(self.cumulative, self.min_fc, total)
288234

289235

290236
def calc_zscore_from_fc(self, fc):
291-
return _calc_zscore_from_fc(fc, self.fc_conversion_factor, self.fc_resolution_factor, self.min_fc, self.cumulative, self.max_z, self.zscores)
237+
return aqdiffutils.z_from_fc_lookup(fc, self.fc_conversion_factor, self.fc_resolution_factor, self.min_fc, self.cumulative, self.max_z, self.zscores)
292238

293239

294240

@@ -300,7 +246,7 @@ def calc_SD(self, mean:float, cumulative:list):
300246
cumulative (list[int]): cumulative distribution array
301247
"""
302248
# Use the Numba-optimized function for dramatic speedup (100x+ faster)
303-
self.SD = _compute_sd_fast_bg(np.asarray(cumulative), self.min_fc, mean, self.fc_conversion_factor)
249+
self.SD = aqdiffutils.sd_from_cumulative(np.asarray(cumulative), self.min_fc, mean, self.fc_conversion_factor)
304250
self.var = self.SD ** 2
305251

306252
def get_cache_key(self):
@@ -319,27 +265,7 @@ def get_cache_key(self):
319265
return (self.start_idx, self.end_idx, self.min_fc, self.max_fc,
320266
len(self.cumulative), round(self.SD, 6))
321267

322-
@njit
323-
def _calc_zscore_from_fc(fc, fc_conversion_factor, fc_resolution_factor, min_fc, cumulative, max_z, zscores):
324-
"""
325-
Quick conversion function that looks up the z-value corresponding to an observed new fold change.
326-
The fold change is mapped to its fc-bin in the binned fold change distribution and then the z-value of the bin is looked up
327-
328-
Args:
329-
fc (float): [description]
330268

331-
Returns:
332-
float: z-value of the observed fold change, based on the background distribution
333-
"""
334-
if abs(fc)<fc_conversion_factor:
335-
return 0
336-
k = int(fc * fc_resolution_factor)
337-
rank = k-min_fc
338-
if rank <0:
339-
return -max_z
340-
if rank >=len(cumulative):
341-
return max_z
342-
return zscores[rank]
343269

344270

345271
# Cell

alphaquant/diffquant/condpair_analysis.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,10 @@ def analyze_condpair(*,runconfig, condpair):
6464
bg1 = normed_c1.ion2background.get(ion)
6565
bg2 = normed_c2.ion2background.get(ion)
6666
diffDist = aqbg.get_subtracted_bg(bgpair2diffDist, bg1, bg2, p2z)
67-
diffIon = aqdiff.DifferentialIon(vals1, vals2, diffDist, ion, runconfig.outlier_correction)
67+
if runconfig.ion_test_method == 'ttest':
68+
diffIon = aqdiff.DifferentialIonTTest(vals1, vals2, ion, p2z, runconfig.outlier_correction)
69+
else:
70+
diffIon = aqdiff.DifferentialIon(vals1, vals2, diffDist, ion, runconfig.outlier_correction)
6871
protein = pep2prot.get(ion)
6972
if diffIon.usable:
7073
prot2diffions[protein].append(diffIon)
@@ -89,7 +92,8 @@ def analyze_condpair(*,runconfig, condpair):
8992

9093
clustered_prot_node = aqclust.get_scored_clusterselected_ions(prot, ions, normed_c1, normed_c2, bgpair2diffDist, p2z, deedpair2doublediffdist,
9194
pval_threshold_basis = runconfig.cluster_threshold_pval, fcfc_threshold = runconfig.cluster_threshold_fcfc,
92-
take_median_ion=runconfig.take_median_ion, fcdiff_cutoff_clustermerge= runconfig.fcdiff_cutoff_clustermerge)
95+
take_median_ion=runconfig.take_median_ion, fcdiff_cutoff_clustermerge= runconfig.fcdiff_cutoff_clustermerge,
96+
fragment_outlier_filtering=runconfig.fragment_outlier_filtering)
9397
protnodes.append(clustered_prot_node)
9498

9599
if count_prots%100==0:
@@ -222,6 +226,14 @@ def write_out_tables(condpair_node, runconfig):
222226
has_precursor_nodes = check_if_has_precursor_nodes(condpair_node)
223227
if has_precursor_nodes:
224228
prec_df = aq_tablewriter_protein.TableFromNodeCreator(condpair_node, node_type = "mod_seq_charge").results_df
229+
else:
230+
prec_df = None
231+
232+
has_base_nodes = check_if_has_base_nodes(condpair_node)
233+
if has_base_nodes and runconfig.write_base_ions:
234+
base_df = aq_tablewriter_protein.TableFromNodeCreator(condpair_node, node_type = "base").results_df
235+
else:
236+
base_df = None
225237

226238

227239
if runconfig.runtime_plots:
@@ -254,6 +266,9 @@ def write_out_tables(condpair_node, runconfig):
254266
if has_precursor_nodes:
255267
prec_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.prec.tsv", sep = "\t", index=None)
256268

269+
if base_df is not None:
270+
base_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.base.tsv", sep = "\t", index=None)
271+
257272
return res_df, pep_df
258273

259274
def check_if_has_sequence_nodes(condpair_node):
@@ -264,3 +279,10 @@ def check_if_has_precursor_nodes(condpair_node):
264279
return condpair_node.children[0].children[0].children[0].children[0].type == "mod_seq_charge"
265280
except:
266281
return False
282+
283+
def check_if_has_base_nodes(condpair_node):
284+
try:
285+
# Check if we have base nodes (fragments/MS1) at the leaf level
286+
return condpair_node.children[0].leaves[0].type == "base"
287+
except:
288+
return False

0 commit comments

Comments
 (0)