feat: add write_base_ions option and cap the max number of included peptides

ammarcsj · ammarcsj · commit 068f7a829e34 · 2025-11-07T17:11:30.000+01:00
diff --git a/alphaquant/cluster/cluster_utils.py b/alphaquant/cluster/cluster_utils.py
@@ -82,9 +82,46 @@ def get_feature_numpy_array_from_nodes(nodes, feature_name ,dtype = 'float'):
     generator = (x.__dict__.get(feature_name) for x in nodes)
     return np.fromiter(generator, dtype=dtype)
 
+def _select_peptides_around_median_z(peptide_nodes, max_peptides=31):
+    """
+    Selects the peptides whose z-values lie closest to the median z-value.
+
+    When more than max_peptides nodes are passed in, only the max_peptides nodes
+    whose z_val is closest to the median z_val of all passed nodes are kept.
+    This helps to avoid biasing the protein-level statistics with extreme peptides.
+
+    Args:
+        peptide_nodes: List of peptide nodes with z_val attributes
+        max_peptides: Maximum number of peptides to keep (default: 31)
+
+    Returns:
+        The input list itself if within the limit, else a new list ordered by distance to the median
+    """
+    if len(peptide_nodes) <= max_peptides:
+        return peptide_nodes
+
+    # Median is taken over all passed nodes, before any of them is dropped
+    z_values = [node.z_val for node in peptide_nodes]
+    median_z = np.median(z_values)
+
+    # Pair each node with its absolute distance from the median
+    peptide_distances = [(node, abs(node.z_val - median_z)) for node in peptide_nodes]
+
+    # Sort closest first; list.sort is stable, so equidistant nodes keep their input order
+    peptide_distances.sort(key=lambda x: x[1])
+
+    # Keep only the max_peptides nodes closest to the median
+    selected_peptides = [node for node, _ in peptide_distances[:max_peptides]]
+
+    return selected_peptides
+
 def get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node, fragment_outlier_filtering=True):
     if peptide_outlier_filtering and node.type == "gene":
-        return [x for x in childs if not x.is_outlier_peptide]
+        filtered_childs = [x for x in childs if not x.is_outlier_peptide]
+        # Additional restriction: if more than 31 peptides, keep only 31 closest to median z-value
+        if len(filtered_childs) > 31:
+            filtered_childs = _select_peptides_around_median_z(filtered_childs, max_peptides=31)
+        return filtered_childs
 
     elif fragment_outlier_filtering and node.type == "frgion":
         return remove_outlier_fragion_childs(childs)
diff --git a/alphaquant/cluster/outlier_filtering.py b/alphaquant/cluster/outlier_filtering.py
@@ -23,7 +23,7 @@ def calculate_regulation_score(protnodes: list[anytree.Node]):
     fraction_sig = num_sig / (num_sig + num_insig)
 
     log2fc_ratio_sig_vs_insig = np.median(abs_log2fc[sig_mask_005]) / (np.median(abs_log2fc[nonsig_mask]) + 1e-6)
-    regulation_score = min(1, log2fc_ratio_sig_vs_insig * fraction_sig/100) #merges the regulation strength and the fraction of significant proteins into one score divided by to normalize it, the normalization factor corresponds to a very stongly regulated dataset
+    regulation_score = min(1, log2fc_ratio_sig_vs_insig * fraction_sig/10) #merges the regulation strength and the fraction of significant proteins into one score, which is divided by a normalization factor; the normalization factor corresponds to a very strongly regulated dataset
     return regulation_score
 
 
diff --git a/alphaquant/diffquant/condpair_analysis.py b/alphaquant/diffquant/condpair_analysis.py
@@ -230,7 +230,7 @@ def write_out_tables(condpair_node, runconfig):
         prec_df = None
 
     has_base_nodes = check_if_has_base_nodes(condpair_node)
-    if has_base_nodes:
+    if has_base_nodes and runconfig.write_base_ions:
         base_df = aq_tablewriter_protein.TableFromNodeCreator(condpair_node, node_type = "base").results_df
     else:
         base_df = None
@@ -266,7 +266,7 @@ def write_out_tables(condpair_node, runconfig):
         if has_precursor_nodes:
             prec_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.prec.tsv", sep = "\t", index=None)
 
-        if has_base_nodes:
+        if base_df is not None:
             base_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.base.tsv", sep = "\t", index=None)
 
     return res_df, pep_df
diff --git a/alphaquant/run_pipeline.py b/alphaquant/run_pipeline.py
@@ -65,6 +65,7 @@ def run_pipeline(input_file: str,
                 normalize: bool = True,
                 use_iontree_if_possible: bool = True,
                 write_out_results_tree: bool = True,
+                write_base_ions: bool = False,
                 use_multiprocessing: bool = False,
                 runtime_plots: bool = True,
                 volcano_fdr: float = 0.05,
@@ -119,6 +120,7 @@ def run_pipeline(input_file: str,
     normalize (bool): Enable sample and condition normalization. Defaults to True.
     use_iontree_if_possible (bool): Use ion tree structure when available. Defaults to True.
     write_out_results_tree (bool): Write results in hierarchical tree format. Defaults to True.
+    write_base_ions (bool): Write base ion level results table. Defaults to False.
     use_multiprocessing (bool): Enable parallel processing. Defaults to False.
     runtime_plots (bool): Generate diagnostic plots including volcanos. Defaults to True.
     volcano_fdr (float): FDR cutoff for volcano plot significance. Defaults to 0.05.
diff --git a/alphaquant/tables/diffquant_table.py b/alphaquant/tables/diffquant_table.py
@@ -24,7 +24,7 @@ def __init__(self, condpair_tree, node_type = "gene", min_num_peptides = 1, anno
         self._filter_annotate_results_df()
 
     def _get_list_of_nodetype_nodes(self):
-        return anytree.findall(self._condpair_tree, filter_ = lambda x : x.type == self._node_type)
+        return anytree.findall(self._condpair_tree, filter_ = lambda x : x.type == self._node_type and x.is_included and hasattr(x, 'p_val'))
 
     def _get_condpair_name(self):
         return aqutils.get_condpairname(self._condpair_tree.name)
@@ -34,35 +34,35 @@ def _define_results_df(self):
         for node in self._list_of_nodetype_nodes:
             list_of_dicts.append(self._get_node_dict(node))
         self.results_df = pd.DataFrame(list_of_dicts)
-        
+
     def _get_node_dict(self, node):
-        typename_dict = {"gene" : "protein", "seq" : "sequence", "mod_seq" : "modified_sequence"} #map the short name in the node to a more descriptive name. "gene" to "protein" is a bit confusing, I plan to change everything to "gene" in the future
+        typename_dict = {"gene" : "protein", "seq" : "sequence", "mod_seq" : "modified_sequence", "base": "ion"} #map the short name in the node to a more descriptive name. "gene" to "protein" is a bit confusing, I plan to change everything to "gene" in the future
         type_name  = typename_dict.get(self._node_type, self._node_type)
         node_dict = {}
         node_dict["condition_pair"] = self._condpair_name_table
         node_dict["protein"] = aq_cluster_utils.find_node_parent_at_level(node, "gene").name
         node_dict[type_name] = node.name
         node_dict["p_value"] = node.p_val
         node_dict["log2fc"] = node.fc
-        node_dict["number_of_ions"] = len(node.leaves)
+        node_dict["number_of_ions"] = len(node.leaves) if self._node_type != "base" else 1  # Base nodes ARE the ions
         node_dict["counting_based"] = node.missingval
         if hasattr(node, "ml_score"):
             node_dict["ml_score"] = node.ml_score
         else:
-            node_dict["consistency_score"] = node.fraction_consistent * len(node.leaves)
+            node_dict["consistency_score"] = node.fraction_consistent * len(node.leaves) if self._node_type != "base" else 1.0
 
         if hasattr(node, "total_intensity"):
             node_dict["total_intensity"] = node.total_intensity
 
         if self._node_type == "gene":
             node_dict["num_peptides"] = len(node.children)
-        
+
         return node_dict
-    
+
     def _filter_annotate_results_df(self):
         self.results_df = TableAnnotatorFilterer(self.results_df, self._list_of_nodetype_nodes, self._min_num_peptides, self._annotation_file, self._condpair_tree.fraction_missingval).results_df
         self.results_df = aqtableutils.QualityScoreNormalizer(self.results_df).results_df
-    
+
 
 class TableAnnotatorFilterer():
 
@@ -76,25 +76,25 @@ def __init__(self, results_df, list_of_nodes, min_num_peptides, annotation_file,
         self._fraction_missingval = fraction_missingval
 
         self._filter_annotate_results_df()
-    
+
     def _filter_annotate_results_df(self):
         if self._level_type== "gene":
             self._filter_num_peptides()
             self._add_annotation_columns_if_applicable()
         self._scatter_pvals()
         self._add_fdr_fc_based_set()
         self._add_fdr_counting_based_set()
-    
+
     def _filter_num_peptides(self):
         self.results_df[self.results_df["num_peptides"] >= self._min_num_peptides]
 
     def _add_annotation_columns_if_applicable(self):
-        if self._annotation_file is not None:    
+        if self._annotation_file is not None:
             annotation_df = pd.read_csv(self._annotation_file, sep = "\t")
             annotation_df = annotation_df.drop_duplicates(subset = "protein", keep="first")
             self.results_df = self.results_df.merge(annotation_df, on = "protein", how = "left")
 
-    def _scatter_pvals(self): #add some scatter to the pvalues that are 1.00E-16, which we set as the lowest possible pvalue. This allows for a better visualization as there are less overlapping points. 
+    def _scatter_pvals(self): #add some scatter to the pvalues that are 1.00E-16, which we set as the lowest possible pvalue. This allows for a better visualization as there are less overlapping points.
         #Scatter is added by adding a very small random number, therefore minimally reducing significance (i.e. not artificially making significance stronger)
         rng = np.random.RandomState(123)
         number_of_cut_pvals = (self.results_df['p_value'] == 1.00E-16).sum()