Skip to content

Commit e632dcc

Browse files
authored
Merge pull request #25 from MannLabs/update_ptm_processing
Update ptm processing
2 parents fd5460e + 1705740 commit e632dcc

8 files changed

Lines changed: 72 additions & 34 deletions

File tree

alphaquant/cluster/cluster_missingval.py

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77
import statistics
88

9+
PVALUE_THRESHOLD_FOR_INTENSITY_BASED_COUNTING = 0.1
910

1011
def create_protnode_from_missingval_ions(gene_name,diffions, normed_c1, normed_c2):
1112
return MissingValProtNodeCreator(gene_name, diffions, normed_c1, normed_c2).prot_node
@@ -32,7 +33,7 @@ def _define_condition_properties(self):
3233
self._all_intensities_c1 = self._normed_c1.all_intensities
3334
self._all_intensities_c2 = self._normed_c2.all_intensities
3435
self._total_intensity = (np.mean(self._all_intensities_c1) +np.mean(self._all_intensities_c2))/2
35-
36+
3637

3738
def _create_protnode_from_missingval_ions(self):
3839
#nrep_c1 and nrep_c2 are the number of replicates in the conditions in general, not the minimum required
@@ -58,9 +59,11 @@ def _assign_properties_to_missingval_base_ions(self, root_node):
5859
log2intensities_c2 = self._normed_c2.ion2nonNanvals.get(leaf.name)
5960
leaf.numvals_c1 = len(log2intensities_c1)
6061
leaf.numvals_c2 = len(log2intensities_c2)
62+
leaf.c1_has_values = leaf.numvals_c1 > 0
63+
leaf.c2_has_values = leaf.numvals_c2 > 0
6164

6265
leaf.fc = np.nan
63-
66+
6467
leaf.missingval = True
6568
leaf.total_intensity = self._total_intensity
6669
leaf.fraction_consistent = np.nan
@@ -104,6 +107,17 @@ def _propagate_properties_from_nodes_to_test_to_root(self, root_node, levelname_
104107
for level_node in level_nodes:
105108
self._aggregate_node_properties_missingval(level_node)
106109

110+
def _assign_missingvals_prob_per_node(self, nodes_to_test):
111+
for node in nodes_to_test:
112+
if node.c1_has_values and node.c2_has_values:
113+
continue
114+
missingval_node_tester = MissingValNodeTester(node, self._nrep_c1, self._nrep_c2, self._all_intensities_c1, self._all_intensities_c2)
115+
node.p_val = missingval_node_tester.pval
116+
node.fc = missingval_node_tester.fc
117+
flipped_pval = 1-0.5*node.p_val #the flipped pval is always larger than 0.5 and the closer to 1 is gets, the closer it goes to 0.5, while the smaller it gets, the closer it goes to 1. When we express this with the standard normal distribution, we are always on the right side of the distribution, so we can use the inv_cdf function to get a positive z-value equivalent to the p-value
118+
node.z_val = abs(statistics.NormalDist().inv_cdf(flipped_pval))
119+
#the p-value can be obtained again by applying the transformation: statistics.NormalDist().cdf(z)*2 - 1
120+
107121

108122
def _aggregate_node_properties_missingval(self, node):
109123
childs = node.children
@@ -117,19 +131,13 @@ def _aggregate_node_properties_missingval(self, node):
117131
node.total_intensity = np.sum([child.total_intensity for child in childs])
118132
node.intensity_c1 = np.mean([child.intensity_c1 for child in childs])
119133
node.intensity_c2 = np.mean([child.intensity_c2 for child in childs])
134+
node.c1_has_values = any(child.c1_has_values for child in childs)
135+
node.c2_has_values = any(child.c2_has_values for child in childs)
120136
if hasattr(childs[0], "z_val"):
121137
node.z_val = aq_cluster_utils.sum_and_re_scale_zvalues([child.z_val for child in childs])
122138
node.p_val = aq_cluster_utils.transform_znormed_to_pval(node.z_val)
123139

124140

125-
def _assign_missingvals_prob_per_node(self, nodes_to_test):
126-
for node in nodes_to_test:
127-
missingval_node_tester = MissingValNodeTester(node, self._nrep_c1, self._nrep_c2, self._all_intensities_c1, self._all_intensities_c2)
128-
node.p_val = missingval_node_tester.pval
129-
node.fc = missingval_node_tester.fc
130-
flipped_pval = 1-0.5*node.p_val #the flipped pval is always larger than 0.5 and the closer to 1 is gets, the closer it goes to 0.5, while the smaller it gets, the closer it goes to 1. When we express this with the standard normal distribution, we are always on the right side of the distribution, so we can use the inv_cdf function to get a positive z-value equivalent to the p-value
131-
node.z_val = abs(statistics.NormalDist().inv_cdf(flipped_pval))
132-
#the p-value can be obtained again by applying the transformation: statistics.NormalDist().cdf(z)*2 - 1
133141

134142

135143

@@ -151,8 +159,9 @@ def __init__(self, node_to_test, nrep_c1, nrep_c2, all_intensities_c1, all_inten
151159
self._define_pvalue_by_iterative_testing()
152160
self._define_matching_fc(node_to_test)
153161

154-
162+
155163
def _define_higher_and_lower_condition(self, node_to_test, nrep_c1, nrep_c2, all_intensities_c1, all_intensities_c2):
164+
156165
if node_to_test.numvals_c1 > node_to_test.numvals_c2:
157166
self._numvals_higher_condition = node_to_test.numvals_c1
158167
self._numvals_lower_condition = node_to_test.numvals_c2
@@ -161,8 +170,8 @@ def _define_higher_and_lower_condition(self, node_to_test, nrep_c1, nrep_c2, all
161170
self._nrep_higher_condition = nrep_c1
162171
self._nrep_lower_condition = nrep_c2
163172
self._all_intensities_higher_condition = all_intensities_c1
164-
165-
elif node_to_test.numvals_c2 > node_to_test.numvals_c1:
173+
174+
elif node_to_test.numvals_c1 < node_to_test.numvals_c2:
166175
self._numvals_higher_condition = node_to_test.numvals_c2
167176
self._numvals_lower_condition = node_to_test.numvals_c1
168177
self._fraction_missingval_higher_condition = node_to_test.fraction_missingval_c2
@@ -171,20 +180,23 @@ def _define_higher_and_lower_condition(self, node_to_test, nrep_c1, nrep_c2, all
171180
self._nrep_lower_condition = nrep_c1
172181
self._all_intensities_higher_condition = all_intensities_c2
173182

174-
else:
175-
raise Exception("Condition 1 and condition 2 have the same number of values. This should not be handled by the counting statistics module.")
176-
183+
184+
185+
186+
187+
188+
177189
def _define_pvalue_by_iterative_testing(self):
178-
if self._perform_binomal_test_on_higher_condition() > 0.1: #the function returns a p-value
190+
if self._perform_binomal_test_on_higher_condition() > PVALUE_THRESHOLD_FOR_INTENSITY_BASED_COUNTING: #the function returns a p-value
179191
self.pval = self._perform_binomal_test_on_lower_condition()
180-
192+
181193
else:
182194
self.pval = self._perform_fishers_exact_test()
183195

184-
def _perform_binomal_test_on_higher_condition(self): # we first test the null hypothesis that the values observed in the higher condition (e.g. 5 values are there and we have 6 measurements in total) are missing at random. If this is not the case, we can't apply the binomial test to the lower condition.
196+
def _perform_binomal_test_on_higher_condition(self): # we first test the null hypothesis that the values observed in the higher condition (e.g. 5 values are there and we have 6 measurements in total) are missing at random. If this is not the case, we can't apply the binomial test to the lower condition.
185197
pval_higher_condition = scipy.stats.binomtest(int(self._numvals_higher_condition), self._nrep_higher_condition, 1-self._fraction_missingval_higher_condition).pvalue
186198
return pval_higher_condition
187-
199+
188200
def _perform_binomal_test_on_lower_condition(self):
189201
pval_lower_condition = scipy.stats.binomtest(int(self._numvals_lower_condition), self._nrep_lower_condition, 1-self._fraction_missingval_higher_condition).pvalue
190202
return pval_lower_condition
@@ -196,7 +208,7 @@ def _perform_fishers_exact_test(self):
196208

197209
contingency_table = np.array([[self._numvals_higher_condition, num_missing_higher_condition],
198210
[self._numvals_lower_condition, num_missing_lower_condition]])
199-
211+
200212
odds_ratio, p = scipy.stats.fisher_exact(contingency_table)
201213

202214
return p
@@ -212,6 +224,5 @@ def _define_matching_fc(self, node_to_test):
212224
self.fc = intensity_lower - node_to_test.intensity_c2
213225
else:
214226
raise Exception("Condition 1 and condition 2 have the same number of values. This should not be handled by the binomial test.")
215-
216227

217-
228+

alphaquant/diffquant/condpair_analysis.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ def analyze_condpair(*,runconfig, condpair):
101101
continue
102102
ions = prot2missingval_diffions.get(prot)
103103
protnode_missingval = aq_clust_missingval.create_protnode_from_missingval_ions(gene_name=prot,diffions=ions, normed_c1=normed_c1, normed_c2=normed_c2)
104+
if (protnode_missingval.c1_has_values) and (protnode_missingval.c2_has_values): #one of the conditions has to be missing, otherwise it means that there was e.g. one fragment ion with values in c1 and other fragment ions with values in c2
105+
continue
104106
protnodes_missingval.append(protnode_missingval)
105107

106108
LOGGER.info(f"finished missing value analysis")

alphaquant/ptm/ptmsite_mapping.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def assign_dataset_chunkwise(input_file, results_dir, samplemap_df , modificatio
4848

4949

5050

51-
def assign_dataset_inmemory(input_file, results_dir, samplemap_df, modification_type = "[Phospho (STY)]", id_thresh = 0.6, excl_thresh =0.2 ,swissprot_file = None,
51+
def assign_dataset_inmemory(input_file, results_dir, samplemap_df, modification_type = "[Phospho (STY)]", id_thresh = 0.7, excl_thresh =0.1 ,swissprot_file = None,
5252
sequence_file=None, input_type = "Spectronaut", organism = "human"):
5353
if input_type == "Spectronaut":
5454
input_df = read_df_spectronaut_reduce_cols(input_file, modification_type)
@@ -74,6 +74,7 @@ def assign_dataset(input_df, samplemap_df, id_thresh = 0.6, excl_thresh =0.2, re
7474
"FG.Charge"
7575
7676
"""""
77+
print("id_thresh", id_thresh, "excl_thresh", excl_thresh)
7778
if(id_thresh < 0.5):
7879
LOGGER.info("id threshold was set below 0.5, which can lead to ambigous ID sites. Setting to 0.51")
7980
id_thresh = 0.51
@@ -83,7 +84,8 @@ def assign_dataset(input_df, samplemap_df, id_thresh = 0.6, excl_thresh =0.2, re
8384
headers_dict = headers_dicts.get(input_type)
8485
label_column = headers_dict.get("label_column")
8586
fg_id_column = headers_dict.get("fg_id_column")
86-
sample2cond = dict(zip(samplemap_df["sample"], samplemap_df["condition"]))
87+
# sample2cond = dict(zip(samplemap_df["sample"], samplemap_df["condition"]))
88+
sample2cond = {x : "cond" for x in samplemap_df["sample"]} #we now compare over all conditions.
8789
len_before = len(input_df.index)
8890
input_df = filter_input_table(input_type, modification_type, input_df)
8991
LOGGER.info(f"filtered PTM peptides from {len_before} to {len(input_df.index)}")
@@ -542,8 +544,6 @@ def add_ptm_precursor_names_spectronaut(ptm_annotated_input):
542544
# Cell
543545
def filter_input_table(input_type, modification_type,input_df):
544546
if input_type == "Spectronaut":
545-
non_fragion_columns = [x for x in input_df.columns if not x.startswith("F.")]
546-
547547
return input_df[~input_df[f"EG.PTMProbabilities {modification_type}"].isna()]
548548
if input_type == "DIANN":
549549
return input_df[[(modification_type in x) for x in input_df["Modified.Sequence"]]]

alphaquant/run_pipeline.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def run_pipeline(input_file: str,
5959
take_median_ion: bool = True,
6060
perform_ptm_mapping: bool = False,
6161
perform_phospho_inference: bool = False,
62+
enable_experimental_ptm_counting_statistics: bool = False,
6263
outlier_correction: bool = True,
6364
normalize: bool = True,
6465
use_iontree_if_possible: bool = True,
@@ -108,6 +109,7 @@ def run_pipeline(input_file: str,
108109
take_median_ion (bool): Use median-centered fragment ions for peptide comparisons. Defaults to True.
109110
perform_ptm_mapping (bool): Enable PTM site mapping analysis. Defaults to False.
110111
perform_phospho_inference (bool): Enable phosphorylation-prone region annotation. Defaults to False.
112+
enable_experimental_ptm_counting_statistics (bool): Allow experimental PTM counting statistics with "either" mode or zero min_valid_values. Defaults to False.
111113
outlier_correction (bool): Enable outlier correction in differential testing. Defaults to True.
112114
normalize (bool): Enable sample and condition normalization. Defaults to True.
113115
use_iontree_if_possible (bool): Use ion tree structure when available. Defaults to True.
@@ -158,6 +160,16 @@ def run_pipeline(input_file: str,
158160
if perform_ptm_mapping:
159161
if modification_type is None:
160162
raise Exception("modification_type is None, but perform_ptm_mapping is True. Please set perform_ptm_mapping to False or specify modification_type.")
163+
if (valid_values_filter_mode == "either") and not enable_experimental_ptm_counting_statistics:
164+
LOGGER.warning("For PTM mapping analysis, using valid_values_filter_mode='either' with counting statistics is currently experimental and may produce unreliable results. Setting to 'both' instead for stability. If you'd like to use 'either' mode anyway, set enable_experimental_ptm_counting_statistics=True.")
165+
valid_values_filter_mode = "both"
166+
if (min_valid_values_c1 == 0 or min_valid_values_c2 == 0) and not enable_experimental_ptm_counting_statistics:
167+
LOGGER.warning("For PTM mapping analysis, using min_valid_values_c1=0 or min_valid_values_c2=0 with counting statistics is currently experimental and may produce unreliable results. Setting minimum value to 2 instead for stability. If you'd like to keep the original values, set enable_experimental_ptm_counting_statistics=True.")
168+
if min_valid_values_c1 == 0:
169+
min_valid_values_c1 = 2
170+
if min_valid_values_c2 == 0:
171+
min_valid_values_c2 = 2
172+
161173
input_file_reformat = load_ptm_input_file(input_file = input_file_original, input_type_to_use = "spectronaut_ptm_fragion", results_dir = results_dir, samplemap_df = samplemap_df, modification_type = modification_type, organism = organism)
162174
if use_ml:
163175
ml_input_file = load_ml_info_file(input_file_original, input_type, modification_type)

alphaquant/ui/dashboard_parts_run_pipeline.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,11 @@ def _make_widgets(self):
418418
value=False,
419419
width=300
420420
),
421+
'enable_experimental_ptm_counting_statistics': pn.widgets.Checkbox(
422+
name='Enable counting statistics for PTM sites (experimental feature!)',
423+
value=False,
424+
width=300
425+
),
421426
'outlier_correction': pn.widgets.Checkbox(
422427
name='Enable outlier correction',
423428
value=True,
@@ -450,6 +455,7 @@ def _make_widgets(self):
450455
'take_median_ion': pn.pane.Markdown('Center ion intensities around their median values'),
451456
'perform_ptm_mapping': pn.pane.Markdown('Map post-translational modifications to proteins'),
452457
'perform_phospho_inference': pn.pane.Markdown('Infer phosphorylation sites from the data'),
458+
'enable_experimental_ptm_counting_statistics': pn.pane.Markdown('Enable experimental support for PTM counting statistics with minimum valid values "either" mode. This may produce unreliable results.'),
453459
'outlier_correction': pn.pane.Markdown('Automatically detect and correct outliers in the data'),
454460
'normalize': pn.pane.Markdown('Normalize data to account for technical variations'),
455461
'write_out_results_tree': pn.pane.Markdown('Save detailed results in a tree structure'),
@@ -494,6 +500,9 @@ def _make_widgets(self):
494500
sizing_mode='stretch_width'
495501
)
496502

503+
# Initially hide the experimental PTM counting statistics checkbox since PTM mapping is off by default
504+
self.switches['enable_experimental_ptm_counting_statistics'].visible = False
505+
497506
# Watchers
498507
self.sample_mapping_select.param.watch(self._toggle_sample_mapping_mode, 'value')
499508
self.path_analysis_file.param.watch(
@@ -529,6 +538,7 @@ def create(self):
529538
),
530539
self.modification_type,
531540
self.organism,
541+
self.switches['enable_experimental_ptm_counting_statistics'],
532542
margin=(5, 5, 5, 5)
533543
)
534544

@@ -793,6 +803,7 @@ def _run_pipeline(self, *events):
793803
'take_median_ion': self.switches['take_median_ion'].value,
794804
'perform_ptm_mapping': self.switches['perform_ptm_mapping'].value,
795805
'perform_phospho_inference': self.switches['perform_phospho_inference'].value,
806+
'enable_experimental_ptm_counting_statistics': self.switches['enable_experimental_ptm_counting_statistics'].value,
796807
'outlier_correction': self.switches['outlier_correction'].value,
797808
'normalize': self.switches['normalize'].value,
798809
'write_out_results_tree': self.switches['write_out_results_tree'].value,
@@ -1382,9 +1393,11 @@ def _toggle_ptm_fields(self, event):
13821393
if event.new:
13831394
self.modification_type.visible = True
13841395
self.organism.visible = True
1396+
self.switches['enable_experimental_ptm_counting_statistics'].visible = True
13851397
else:
13861398
self.modification_type.visible = False
13871399
self.organism.visible = False
1400+
self.switches['enable_experimental_ptm_counting_statistics'].visible = False
13881401

13891402
class Tabs(param.Parameterized):
13901403
"""

example_nbs/differential_expression_PTM.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@
9898
"import alphaquant.run_pipeline as aq_pipeline\n",
9999
"\n",
100100
"aq_pipeline.run_pipeline(input_file=PHOSPHO_FILE, samplemap_file=SAMPLEMAP_PHOSPHO, results_dir=RESULTS_DIR_PHOSPHO,\n",
101-
" condpairs_list=CONDPAIRS_LIST, perform_ptm_mapping=True,modification_type=\"[Phospho (STY)]\",organism=\"human\")"
101+
" condpairs_list=CONDPAIRS_LIST, perform_ptm_mapping=True,modification_type=\"[Phospho (STY)]\",organism=\"human\", valid_values_filter_mode=\"both\") #counting statistics together with PTM mapping is currently an experimental feature, so we set valid_values_filter_mode to \"both\""
102102
]
103103
},
104104
{

tests/e2e_tests_small/different_input_tables.ipynb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,18 +49,19 @@
4949
"}\n",
5050
"\n",
5151
"for input_file in input_files:\n",
52+
"\tprint(f\"Running pipeline for {input_file}\")\n",
5253
"\tif input_file ==\"fragpipe.tsv\":\n",
5354
"\t\taq_run_pipeline.run_pipeline(input_file=os.path.join(TEST_FILE_DIR, input_file), samplemap_file=samplemap_map[input_file], results_dir=results_dir_map[input_file], reset_progress_folder=True, multicond_median_analysis=True)\n",
5455
"\telse:\n",
55-
"\t\taq_run_pipeline.run_pipeline(input_file=os.path.join(TEST_FILE_DIR, input_file), samplemap_file=samplemap_map[input_file], results_dir=results_dir_map[input_file], reset_progress_folder=True)\n",
56+
"\t\taq_run_pipeline.run_pipeline(input_file=os.path.join(TEST_FILE_DIR, input_file), samplemap_file=samplemap_map[input_file], results_dir=results_dir_map[input_file], reset_progress_folder=True, valid_values_filter_mode=\"both\")\n",
5657
"\n",
5758
"\n"
5859
]
5960
}
6061
],
6162
"metadata": {
6263
"kernelspec": {
63-
"display_name": "test",
64+
"display_name": "alphaquant",
6465
"language": "python",
6566
"name": "python3"
6667
},
@@ -74,7 +75,7 @@
7475
"name": "python",
7576
"nbconvert_exporter": "python",
7677
"pygments_lexer": "ipython3",
77-
"version": "3.12.8"
78+
"version": "3.11.0"
7879
}
7980
},
8081
"nbformat": 4,

tests/e2e_tests_small/phospho.ipynb

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,7 @@
5858
"source": [
5959
"import alphaquant.run_pipeline as aq_run_pipeline\n",
6060
"\n",
61-
"aq_run_pipeline.run_pipeline(input_file=INPUT_FILE, samplemap_file=SAMPLEMAP_FILE, results_dir=RESULTS_DIR, min_valid_values=2, modification_type=\"[Phospho (STY)]\",\n",
62-
" perform_ptm_mapping=True,organism=\"human\", runtime_plots=True,peptides_to_exclude_file=PEPTIDES_TO_REMOVE, normalize=True)"
61+
"aq_run_pipeline.run_pipeline(input_file=INPUT_FILE, samplemap_file=SAMPLEMAP_FILE, results_dir=RESULTS_DIR, min_valid_values=2, valid_values_filter_mode=\"both\",modification_type=\"[Phospho (STY)]\", perform_ptm_mapping=True,organism=\"human\", runtime_plots=True,peptides_to_exclude_file=PEPTIDES_TO_REMOVE, normalize=True)"
6362
]
6463
},
6564
{
@@ -71,7 +70,7 @@
7170
"import pandas as pd\n",
7271
"results_df = pd.read_csv(RESULTS_DIR + \"/Y150_VS_Y200.results.tsv\", sep = \"\\t\")\n",
7372
"\n",
74-
"assert sum(results_df[\"fdr\"]<0.01)<2"
73+
"assert sum(results_df[\"fdr\"]<0.01)<3"
7574
]
7675
}
7776
],

0 commit comments

Comments
 (0)