Merge pull request #25 from MannLabs/update_ptm_processing

ammarcsj · web-flow · commit e632dcc5b8f3 · 2025-03-21T12:31:27.000-07:00
Update ptm processing
diff --git a/alphaquant/cluster/cluster_missingval.py b/alphaquant/cluster/cluster_missingval.py
@@ -6,6 +6,7 @@
 import numpy as np
 import statistics
 
+PVALUE_THRESHOLD_FOR_INTENSITY_BASED_COUNTING = 0.1
 
 def create_protnode_from_missingval_ions(gene_name,diffions, normed_c1, normed_c2):
     return MissingValProtNodeCreator(gene_name, diffions, normed_c1, normed_c2).prot_node
@@ -32,7 +33,7 @@ def _define_condition_properties(self):
         self._all_intensities_c1 = self._normed_c1.all_intensities
         self._all_intensities_c2 = self._normed_c2.all_intensities
         self._total_intensity = (np.mean(self._all_intensities_c1) +np.mean(self._all_intensities_c2))/2
-    
+
 
     def _create_protnode_from_missingval_ions(self):
         #nrep_c1 and nrep_c2 are the number of replicates in the conditions in general, not the minimum required
@@ -58,9 +59,11 @@ def _assign_properties_to_missingval_base_ions(self, root_node):
             log2intensities_c2 = self._normed_c2.ion2nonNanvals.get(leaf.name)
             leaf.numvals_c1 = len(log2intensities_c1)
             leaf.numvals_c2 = len(log2intensities_c2)
+            leaf.c1_has_values = leaf.numvals_c1 > 0
+            leaf.c2_has_values = leaf.numvals_c2 > 0
 
             leaf.fc = np.nan
-            
+
             leaf.missingval = True
             leaf.total_intensity = self._total_intensity
             leaf.fraction_consistent = np.nan
@@ -104,6 +107,17 @@ def _propagate_properties_from_nodes_to_test_to_root(self, root_node, levelname_
                 for level_node in level_nodes:
                     self._aggregate_node_properties_missingval(level_node)
 
+    def _assign_missingvals_prob_per_node(self, nodes_to_test):
+        for node in nodes_to_test:
+            if node.c1_has_values and node.c2_has_values:
+                continue
+            missingval_node_tester = MissingValNodeTester(node, self._nrep_c1, self._nrep_c2, self._all_intensities_c1, self._all_intensities_c2)
+            node.p_val = missingval_node_tester.pval
+            node.fc = missingval_node_tester.fc
+            flipped_pval = 1-0.5*node.p_val #the flipped pval is never smaller than 0.5: the closer the p-value gets to 1, the closer the flipped pval gets to 0.5, and the smaller the p-value gets, the closer the flipped pval gets to 1. When we express this with the standard normal distribution, we are always on the right side of the distribution, so we can use the inv_cdf function to get a non-negative z-value equivalent to the p-value
+            node.z_val = abs(statistics.NormalDist().inv_cdf(flipped_pval))
+            #the p-value can be obtained again by applying the transformation: 2*(1 - statistics.NormalDist().cdf(z))
+
 
     def _aggregate_node_properties_missingval(self, node):
         childs = node.children
@@ -117,19 +131,13 @@ def _aggregate_node_properties_missingval(self, node):
         node.total_intensity = np.sum([child.total_intensity for child in childs])
         node.intensity_c1 = np.mean([child.intensity_c1 for child in childs])
         node.intensity_c2 = np.mean([child.intensity_c2 for child in childs])
+        node.c1_has_values = any(child.c1_has_values for child in childs)
+        node.c2_has_values = any(child.c2_has_values for child in childs)
         if hasattr(childs[0], "z_val"):
             node.z_val = aq_cluster_utils.sum_and_re_scale_zvalues([child.z_val for child in childs])
             node.p_val = aq_cluster_utils.transform_znormed_to_pval(node.z_val)
 
 
-    def _assign_missingvals_prob_per_node(self, nodes_to_test):
-        for node in nodes_to_test:
-            missingval_node_tester = MissingValNodeTester(node, self._nrep_c1, self._nrep_c2, self._all_intensities_c1, self._all_intensities_c2)
-            node.p_val = missingval_node_tester.pval
-            node.fc = missingval_node_tester.fc
-            flipped_pval = 1-0.5*node.p_val #the flipped pval is always larger than 0.5 and the closer to 1 is gets, the closer it goes to 0.5, while the smaller it gets, the closer it goes to 1. When we express this with the standard normal distribution, we are always on the right side of the distribution, so we can use the inv_cdf function to get a positive z-value equivalent to the p-value
-            node.z_val = abs(statistics.NormalDist().inv_cdf(flipped_pval))
-            #the p-value can be obtained again by applying the transformation: statistics.NormalDist().cdf(z)*2 - 1
 
 
 
@@ -151,8 +159,9 @@ def __init__(self, node_to_test, nrep_c1, nrep_c2, all_intensities_c1, all_inten
         self._define_pvalue_by_iterative_testing()
         self._define_matching_fc(node_to_test)
 
-    
+
     def _define_higher_and_lower_condition(self, node_to_test, nrep_c1, nrep_c2, all_intensities_c1, all_intensities_c2):
+
         if node_to_test.numvals_c1 > node_to_test.numvals_c2:
             self._numvals_higher_condition = node_to_test.numvals_c1
             self._numvals_lower_condition = node_to_test.numvals_c2
@@ -161,8 +170,8 @@ def _define_higher_and_lower_condition(self, node_to_test, nrep_c1, nrep_c2, all
             self._nrep_higher_condition = nrep_c1
             self._nrep_lower_condition = nrep_c2
             self._all_intensities_higher_condition = all_intensities_c1
-        
-        elif node_to_test.numvals_c2 > node_to_test.numvals_c1:
+
+        elif node_to_test.numvals_c1 < node_to_test.numvals_c2:
             self._numvals_higher_condition = node_to_test.numvals_c2
             self._numvals_lower_condition = node_to_test.numvals_c1
             self._fraction_missingval_higher_condition = node_to_test.fraction_missingval_c2
@@ -171,20 +180,23 @@ def _define_higher_and_lower_condition(self, node_to_test, nrep_c1, nrep_c2, all
             self._nrep_lower_condition = nrep_c1
             self._all_intensities_higher_condition = all_intensities_c2
 
-        else:
-            raise Exception("Condition 1 and condition 2 have the same number of values. This should not be handled by the counting statistics module.")
-    
+
+
+
+
+
+
     def _define_pvalue_by_iterative_testing(self):
-        if self._perform_binomal_test_on_higher_condition() > 0.1: #the function returns a p-value
+        if self._perform_binomal_test_on_higher_condition() > PVALUE_THRESHOLD_FOR_INTENSITY_BASED_COUNTING: #the function returns a p-value
             self.pval = self._perform_binomal_test_on_lower_condition()
-        
+
         else:
             self.pval = self._perform_fishers_exact_test()
 
-    def _perform_binomal_test_on_higher_condition(self): # we first test the null hypothesis that the values observed in the higher condition (e.g. 5 values are there and we have 6 measurements in total) are missing at random. If this is not the case, we can't apply the binomial test to the lower condition. 
+    def _perform_binomal_test_on_higher_condition(self): # we first test the null hypothesis that the values observed in the higher condition (e.g. 5 values are there and we have 6 measurements in total) are missing at random. If this is not the case, we can't apply the binomial test to the lower condition.
             pval_higher_condition = scipy.stats.binomtest(int(self._numvals_higher_condition), self._nrep_higher_condition, 1-self._fraction_missingval_higher_condition).pvalue
             return pval_higher_condition
-    
+
     def _perform_binomal_test_on_lower_condition(self):
         pval_lower_condition = scipy.stats.binomtest(int(self._numvals_lower_condition), self._nrep_lower_condition, 1-self._fraction_missingval_higher_condition).pvalue
         return pval_lower_condition
@@ -196,7 +208,7 @@ def _perform_fishers_exact_test(self):
 
         contingency_table = np.array([[self._numvals_higher_condition, num_missing_higher_condition],
                                     [self._numvals_lower_condition, num_missing_lower_condition]])
-        
+
         odds_ratio, p = scipy.stats.fisher_exact(contingency_table)
 
         return p
@@ -212,6 +224,5 @@ def _define_matching_fc(self, node_to_test):
             self.fc = intensity_lower - node_to_test.intensity_c2
         else:
             raise Exception("Condition 1 and condition 2 have the same number of values. This should not be handled by the binomial test.")
-        
 
-        
+
diff --git a/alphaquant/diffquant/condpair_analysis.py b/alphaquant/diffquant/condpair_analysis.py
@@ -101,6 +101,8 @@ def analyze_condpair(*,runconfig, condpair):
                 continue
             ions = prot2missingval_diffions.get(prot)
             protnode_missingval = aq_clust_missingval.create_protnode_from_missingval_ions(gene_name=prot,diffions=ions, normed_c1=normed_c1, normed_c2=normed_c2)
+            if (protnode_missingval.c1_has_values) and (protnode_missingval.c2_has_values): #one of the conditions has to be missing, otherwise it means that there was e.g. one fragment ion with values in c1 and other fragment ions with values in c2
+                continue
             protnodes_missingval.append(protnode_missingval)
 
         LOGGER.info(f"finished missing value analysis")
diff --git a/alphaquant/ptm/ptmsite_mapping.py b/alphaquant/ptm/ptmsite_mapping.py
@@ -48,7 +48,7 @@ def assign_dataset_chunkwise(input_file, results_dir, samplemap_df , modificatio
 
 
 
-def assign_dataset_inmemory(input_file, results_dir, samplemap_df, modification_type = "[Phospho (STY)]", id_thresh = 0.6, excl_thresh =0.2 ,swissprot_file = None,
+def assign_dataset_inmemory(input_file, results_dir, samplemap_df, modification_type = "[Phospho (STY)]", id_thresh = 0.7, excl_thresh =0.1 ,swissprot_file = None,
 sequence_file=None, input_type = "Spectronaut", organism = "human"):
     if input_type == "Spectronaut":
         input_df = read_df_spectronaut_reduce_cols(input_file, modification_type)
@@ -74,6 +74,7 @@ def assign_dataset(input_df, samplemap_df, id_thresh = 0.6, excl_thresh =0.2, re
     "FG.Charge"
 
     """""
+    print("id_thresh", id_thresh, "excl_thresh", excl_thresh)
     if(id_thresh < 0.5):
         LOGGER.info("id threshold was set below 0.5, which can lead to ambigous ID sites. Setting to 0.51")
         id_thresh = 0.51
@@ -83,7 +84,8 @@ def assign_dataset(input_df, samplemap_df, id_thresh = 0.6, excl_thresh =0.2, re
     headers_dict = headers_dicts.get(input_type)
     label_column = headers_dict.get("label_column")
     fg_id_column = headers_dict.get("fg_id_column")
-    sample2cond = dict(zip(samplemap_df["sample"], samplemap_df["condition"]))
+   # sample2cond = dict(zip(samplemap_df["sample"], samplemap_df["condition"]))
+    sample2cond = {x : "cond" for x in samplemap_df["sample"]} #we now compare over all conditions.
     len_before = len(input_df.index)
     input_df = filter_input_table(input_type, modification_type, input_df)
     LOGGER.info(f"filtered PTM peptides from {len_before} to {len(input_df.index)}")
@@ -542,8 +544,6 @@ def add_ptm_precursor_names_spectronaut(ptm_annotated_input):
 # Cell
 def filter_input_table(input_type, modification_type,input_df):
     if input_type == "Spectronaut":
-        non_fragion_columns = [x for x in input_df.columns if not x.startswith("F.")]
-
         return input_df[~input_df[f"EG.PTMProbabilities {modification_type}"].isna()]
     if input_type == "DIANN":
         return input_df[[(modification_type in x) for x in input_df["Modified.Sequence"]]]
diff --git a/alphaquant/run_pipeline.py b/alphaquant/run_pipeline.py
@@ -59,6 +59,7 @@ def run_pipeline(input_file: str,
                 take_median_ion: bool = True,
                 perform_ptm_mapping: bool = False,
                 perform_phospho_inference: bool = False,
+                enable_experimental_ptm_counting_statistics: bool = False,
                 outlier_correction: bool = True,
                 normalize: bool = True,
                 use_iontree_if_possible: bool = True,
@@ -108,6 +109,7 @@ def run_pipeline(input_file: str,
     take_median_ion (bool): Use median-centered fragment ions for peptide comparisons. Defaults to True.
     perform_ptm_mapping (bool): Enable PTM site mapping analysis. Defaults to False.
     perform_phospho_inference (bool): Enable phosphorylation-prone region annotation. Defaults to False.
+    enable_experimental_ptm_counting_statistics (bool): Allow experimental PTM counting statistics with "either" mode or zero min_valid_values. Defaults to False.
     outlier_correction (bool): Enable outlier correction in differential testing. Defaults to True.
     normalize (bool): Enable sample and condition normalization. Defaults to True.
     use_iontree_if_possible (bool): Use ion tree structure when available. Defaults to True.
@@ -158,6 +160,16 @@ def run_pipeline(input_file: str,
     if perform_ptm_mapping:
         if modification_type is None:
             raise Exception("modification_type is None, but perform_ptm_mapping is True. Please set perform_ptm_mapping to False or specify modification_type.")
+        if (valid_values_filter_mode == "either") and not enable_experimental_ptm_counting_statistics:
+            LOGGER.warning("For PTM mapping analysis, using valid_values_filter_mode='either' with counting statistics is currently experimental and may produce unreliable results. Setting to 'both' instead for stability. If you'd like to use 'either' mode anyway, set enable_experimental_ptm_counting_statistics=True.")
+            valid_values_filter_mode = "both"
+        if (min_valid_values_c1 == 0 or min_valid_values_c2 == 0) and not enable_experimental_ptm_counting_statistics:
+            LOGGER.warning("For PTM mapping analysis, using min_valid_values_c1=0 or min_valid_values_c2=0 with counting statistics is currently experimental and may produce unreliable results. Setting minimum value to 2 instead for stability. If you'd like to keep the original values, set enable_experimental_ptm_counting_statistics=True.")
+            if min_valid_values_c1 == 0:
+                min_valid_values_c1 = 2
+            if min_valid_values_c2 == 0:
+                min_valid_values_c2 = 2
+
         input_file_reformat = load_ptm_input_file(input_file = input_file_original, input_type_to_use = "spectronaut_ptm_fragion", results_dir = results_dir, samplemap_df = samplemap_df, modification_type = modification_type, organism = organism)
         if use_ml:
             ml_input_file = load_ml_info_file(input_file_original, input_type, modification_type)
diff --git a/alphaquant/ui/dashboard_parts_run_pipeline.py b/alphaquant/ui/dashboard_parts_run_pipeline.py
@@ -418,6 +418,11 @@ def _make_widgets(self):
 				value=False,
 				width=300
 			),
+			'enable_experimental_ptm_counting_statistics': pn.widgets.Checkbox(
+				name='Enable counting statistics for PTM sites (experimental feature!)',
+				value=False,
+				width=300
+			),
 			'outlier_correction': pn.widgets.Checkbox(
 				name='Enable outlier correction',
 				value=True,
@@ -450,6 +455,7 @@ def _make_widgets(self):
 			'take_median_ion': pn.pane.Markdown('Center ion intensities around their median values'),
 			'perform_ptm_mapping': pn.pane.Markdown('Map post-translational modifications to proteins'),
 			'perform_phospho_inference': pn.pane.Markdown('Infer phosphorylation sites from the data'),
+			'enable_experimental_ptm_counting_statistics': pn.pane.Markdown('Enable experimental support for PTM counting statistics with minimum valid values "either" mode. This may produce unreliable results.'),
 			'outlier_correction': pn.pane.Markdown('Automatically detect and correct outliers in the data'),
 			'normalize': pn.pane.Markdown('Normalize data to account for technical variations'),
 			'write_out_results_tree': pn.pane.Markdown('Save detailed results in a tree structure'),
@@ -494,6 +500,9 @@ def _make_widgets(self):
 			sizing_mode='stretch_width'
 		)
 
+		# Initially hide the experimental PTM counting statistics checkbox since PTM mapping is off by default
+		self.switches['enable_experimental_ptm_counting_statistics'].visible = False
+
 		# Watchers
 		self.sample_mapping_select.param.watch(self._toggle_sample_mapping_mode, 'value')
 		self.path_analysis_file.param.watch(
@@ -529,6 +538,7 @@ def create(self):
 			),
 			self.modification_type,
 			self.organism,
+			self.switches['enable_experimental_ptm_counting_statistics'],
 			margin=(5, 5, 5, 5)
 		)
 
@@ -793,6 +803,7 @@ def _run_pipeline(self, *events):
 				'take_median_ion': self.switches['take_median_ion'].value,
 				'perform_ptm_mapping': self.switches['perform_ptm_mapping'].value,
 				'perform_phospho_inference': self.switches['perform_phospho_inference'].value,
+				'enable_experimental_ptm_counting_statistics': self.switches['enable_experimental_ptm_counting_statistics'].value,
 				'outlier_correction': self.switches['outlier_correction'].value,
 				'normalize': self.switches['normalize'].value,
 				'write_out_results_tree': self.switches['write_out_results_tree'].value,
@@ -1382,9 +1393,11 @@ def _toggle_ptm_fields(self, event):
 		if event.new:
 			self.modification_type.visible = True
 			self.organism.visible = True
+			self.switches['enable_experimental_ptm_counting_statistics'].visible = True
 		else:
 			self.modification_type.visible = False
 			self.organism.visible = False
+			self.switches['enable_experimental_ptm_counting_statistics'].visible = False
 
 class Tabs(param.Parameterized):
 	"""
diff --git a/example_nbs/differential_expression_PTM.ipynb b/example_nbs/differential_expression_PTM.ipynb
@@ -98,7 +98,7 @@
     "import alphaquant.run_pipeline as aq_pipeline\n",
     "\n",
     "aq_pipeline.run_pipeline(input_file=PHOSPHO_FILE, samplemap_file=SAMPLEMAP_PHOSPHO, results_dir=RESULTS_DIR_PHOSPHO,\n",
-    "                        condpairs_list=CONDPAIRS_LIST, perform_ptm_mapping=True,modification_type=\"[Phospho (STY)]\",organism=\"human\")"
+    "                        condpairs_list=CONDPAIRS_LIST, perform_ptm_mapping=True,modification_type=\"[Phospho (STY)]\",organism=\"human\", valid_values_filter_mode=\"both\") #counting statistics together with PTM mapping is currently an experimental feature, so we set valid_values_filter_mode to \"both\""
    ]
   },
   {
diff --git a/tests/e2e_tests_small/different_input_tables.ipynb b/tests/e2e_tests_small/different_input_tables.ipynb
@@ -49,18 +49,19 @@
     "}\n",
     "\n",
     "for input_file in input_files:\n",
+    "\tprint(f\"Running pipeline for {input_file}\")\n",
     "\tif input_file ==\"fragpipe.tsv\":\n",
     "\t\taq_run_pipeline.run_pipeline(input_file=os.path.join(TEST_FILE_DIR, input_file), samplemap_file=samplemap_map[input_file], results_dir=results_dir_map[input_file], reset_progress_folder=True, multicond_median_analysis=True)\n",
     "\telse:\n",
-    "\t\taq_run_pipeline.run_pipeline(input_file=os.path.join(TEST_FILE_DIR, input_file), samplemap_file=samplemap_map[input_file], results_dir=results_dir_map[input_file], reset_progress_folder=True)\n",
+    "\t\taq_run_pipeline.run_pipeline(input_file=os.path.join(TEST_FILE_DIR, input_file), samplemap_file=samplemap_map[input_file], results_dir=results_dir_map[input_file], reset_progress_folder=True, valid_values_filter_mode=\"both\")\n",
     "\n",
     "\n"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "test",
+   "display_name": "alphaquant",
    "language": "python",
    "name": "python3"
   },
@@ -74,7 +75,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.8"
+   "version": "3.11.0"
   }
  },
  "nbformat": 4,
diff --git a/tests/e2e_tests_small/phospho.ipynb b/tests/e2e_tests_small/phospho.ipynb
@@ -58,8 +58,7 @@
    "source": [
     "import alphaquant.run_pipeline as aq_run_pipeline\n",
     "\n",
-    "aq_run_pipeline.run_pipeline(input_file=INPUT_FILE, samplemap_file=SAMPLEMAP_FILE,  results_dir=RESULTS_DIR, min_valid_values=2, modification_type=\"[Phospho (STY)]\",\n",
-    "                             perform_ptm_mapping=True,organism=\"human\", runtime_plots=True,peptides_to_exclude_file=PEPTIDES_TO_REMOVE, normalize=True)"
+    "aq_run_pipeline.run_pipeline(input_file=INPUT_FILE, samplemap_file=SAMPLEMAP_FILE,  results_dir=RESULTS_DIR, min_valid_values=2, valid_values_filter_mode=\"both\",modification_type=\"[Phospho (STY)]\", perform_ptm_mapping=True,organism=\"human\", runtime_plots=True,peptides_to_exclude_file=PEPTIDES_TO_REMOVE, normalize=True)"
    ]
   },
   {
@@ -71,7 +70,7 @@
     "import pandas as pd\n",
     "results_df = pd.read_csv(RESULTS_DIR + \"/Y150_VS_Y200.results.tsv\", sep = \"\\t\")\n",
     "\n",
-    "assert sum(results_df[\"fdr\"]<0.01)<2"
+    "assert sum(results_df[\"fdr\"]<0.01)<3"
    ]
   }
  ],

Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,7 @@`
`98`	`98`	`"import alphaquant.run_pipeline as aq_pipeline\n",`
`99`	`99`	`"\n",`
`100`	`100`	`"aq_pipeline.run_pipeline(input_file=PHOSPHO_FILE, samplemap_file=SAMPLEMAP_PHOSPHO, results_dir=RESULTS_DIR_PHOSPHO,\n",`
`101`		`- " condpairs_list=CONDPAIRS_LIST, perform_ptm_mapping=True,modification_type=\"[Phospho (STY)]\",organism=\"human\")"`
	`101`	`+ " condpairs_list=CONDPAIRS_LIST, perform_ptm_mapping=True,modification_type=\"[Phospho (STY)]\",organism=\"human\", valid_values_filter_mode=\"both\") #counting statistics together with PTM mapping is currently an experimental feature, so we set valid_values_filter_mode to \"both\""`
`102`	`102`	`]`
`103`	`103`	`},`
`104`	`104`	`{`
Original file line number	Diff line number	Diff line change
`@@ -58,8 +58,7 @@`
`58`	`58`	`"source": [`
`59`	`59`	`"import alphaquant.run_pipeline as aq_run_pipeline\n",`
`60`	`60`	`"\n",`
`61`		`- "aq_run_pipeline.run_pipeline(input_file=INPUT_FILE, samplemap_file=SAMPLEMAP_FILE, results_dir=RESULTS_DIR, min_valid_values=2, modification_type=\"[Phospho (STY)]\",\n",`
`62`		`- " perform_ptm_mapping=True,organism=\"human\", runtime_plots=True,peptides_to_exclude_file=PEPTIDES_TO_REMOVE, normalize=True)"`
	`61`	`+ "aq_run_pipeline.run_pipeline(input_file=INPUT_FILE, samplemap_file=SAMPLEMAP_FILE, results_dir=RESULTS_DIR, min_valid_values=2, valid_values_filter_mode=\"both\",modification_type=\"[Phospho (STY)]\", perform_ptm_mapping=True,organism=\"human\", runtime_plots=True,peptides_to_exclude_file=PEPTIDES_TO_REMOVE, normalize=True)"`
`63`	`62`	`]`
`64`	`63`	`},`
`65`	`64`	`{`
`@@ -71,7 +70,7 @@`
`71`	`70`	`"import pandas as pd\n",`
`72`	`71`	`"results_df = pd.read_csv(RESULTS_DIR + \"/Y150_VS_Y200.results.tsv\", sep = \"\\t\")\n",`
`73`	`72`	`"\n",`
`74`		`- "assert sum(results_df[\"fdr\"]<0.01)<2"`
	`73`	`+ "assert sum(results_df[\"fdr\"]<0.01)<3"`
`75`	`74`	`]`
`76`	`75`	`}`
`77`	`76`	`],`