Skip to content

Commit 068f7a8

Browse files
committed
feat: add write_base_ions option and cap the max number of included peptides
1 parent 6430fa6 commit 068f7a8

5 files changed

Lines changed: 55 additions & 16 deletions

File tree

alphaquant/cluster/cluster_utils.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,46 @@ def get_feature_numpy_array_from_nodes(nodes, feature_name ,dtype = 'float'):
8282
generator = (x.__dict__.get(feature_name) for x in nodes)
8383
return np.fromiter(generator, dtype=dtype)
8484

85+
def _select_peptides_around_median_z(peptide_nodes, max_peptides=31):
86+
"""
87+
Selects peptides closest to the median z-value.
88+
89+
When a protein has more than max_peptides peptides, this function selects
90+
the max_peptides peptides that have z-values closest to the median z-value.
91+
This helps to avoid biasing the protein-level statistics with extreme peptides.
92+
93+
Args:
94+
peptide_nodes: List of peptide nodes with z_val attributes
95+
max_peptides: Maximum number of peptides to keep (default: 31)
96+
97+
Returns:
98+
List of peptide nodes closest to median z-value (up to max_peptides)
99+
"""
100+
if len(peptide_nodes) <= max_peptides:
101+
return peptide_nodes
102+
103+
# Get z-values and calculate median
104+
z_values = [node.z_val for node in peptide_nodes]
105+
median_z = np.median(z_values)
106+
107+
# Calculate distance from median for each peptide
108+
peptide_distances = [(node, abs(node.z_val - median_z)) for node in peptide_nodes]
109+
110+
# Sort by distance from median (closest first)
111+
peptide_distances.sort(key=lambda x: x[1])
112+
113+
# Select the max_peptides closest to median
114+
selected_peptides = [node for node, _ in peptide_distances[:max_peptides]]
115+
116+
return selected_peptides
117+
85118
def get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node, fragment_outlier_filtering=True):
86119
if peptide_outlier_filtering and node.type == "gene":
87-
return [x for x in childs if not x.is_outlier_peptide]
120+
filtered_childs = [x for x in childs if not x.is_outlier_peptide]
121+
# Additional restriction: if more than 31 peptides, keep only 31 closest to median z-value
122+
if len(filtered_childs) > 31:
123+
filtered_childs = _select_peptides_around_median_z(filtered_childs, max_peptides=31)
124+
return filtered_childs
88125

89126
elif fragment_outlier_filtering and node.type == "frgion":
90127
return remove_outlier_fragion_childs(childs)

alphaquant/cluster/outlier_filtering.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def calculate_regulation_score(protnodes: list[anytree.Node]):
2323
fraction_sig = num_sig / (num_sig + num_insig)
2424

2525
log2fc_ratio_sig_vs_insig = np.median(abs_log2fc[sig_mask_005]) / (np.median(abs_log2fc[nonsig_mask]) + 1e-6)
26-
regulation_score = min(1, log2fc_ratio_sig_vs_insig * fraction_sig/100) #merges the regulation strength and the fraction of significant proteins into one score divided by to normalize it, the normalization factor corresponds to a very stongly regulated dataset
26+
regulation_score = min(1, log2fc_ratio_sig_vs_insig * fraction_sig/10) #merges the regulation strength and the fraction of significant proteins into one score divided by to normalize it, the normalization factor corresponds to a very stongly regulated dataset
2727
return regulation_score
2828

2929

alphaquant/diffquant/condpair_analysis.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ def write_out_tables(condpair_node, runconfig):
230230
prec_df = None
231231

232232
has_base_nodes = check_if_has_base_nodes(condpair_node)
233-
if has_base_nodes:
233+
if has_base_nodes and runconfig.write_base_ions:
234234
base_df = aq_tablewriter_protein.TableFromNodeCreator(condpair_node, node_type = "base").results_df
235235
else:
236236
base_df = None
@@ -266,7 +266,7 @@ def write_out_tables(condpair_node, runconfig):
266266
if has_precursor_nodes:
267267
prec_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.prec.tsv", sep = "\t", index=None)
268268

269-
if has_base_nodes:
269+
if base_df is not None:
270270
base_df.to_csv(f"{runconfig.results_dir}/{aqutils.get_condpairname(condpair)}.results.base.tsv", sep = "\t", index=None)
271271

272272
return res_df, pep_df

alphaquant/run_pipeline.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def run_pipeline(input_file: str,
6565
normalize: bool = True,
6666
use_iontree_if_possible: bool = True,
6767
write_out_results_tree: bool = True,
68+
write_base_ions: bool = False,
6869
use_multiprocessing: bool = False,
6970
runtime_plots: bool = True,
7071
volcano_fdr: float = 0.05,
@@ -119,6 +120,7 @@ def run_pipeline(input_file: str,
119120
normalize (bool): Enable sample and condition normalization. Defaults to True.
120121
use_iontree_if_possible (bool): Use ion tree structure when available. Defaults to True.
121122
write_out_results_tree (bool): Write results in hierarchical tree format. Defaults to True.
123+
write_base_ions (bool): Write base ion level results table. Defaults to False.
122124
use_multiprocessing (bool): Enable parallel processing. Defaults to False.
123125
runtime_plots (bool): Generate diagnostic plots including volcanos. Defaults to True.
124126
volcano_fdr (float): FDR cutoff for volcano plot significance. Defaults to 0.05.

alphaquant/tables/diffquant_table.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __init__(self, condpair_tree, node_type = "gene", min_num_peptides = 1, anno
2424
self._filter_annotate_results_df()
2525

2626
def _get_list_of_nodetype_nodes(self):
27-
return anytree.findall(self._condpair_tree, filter_ = lambda x : x.type == self._node_type)
27+
return anytree.findall(self._condpair_tree, filter_ = lambda x : x.type == self._node_type and x.is_included and hasattr(x, 'p_val'))
2828

2929
def _get_condpair_name(self):
3030
return aqutils.get_condpairname(self._condpair_tree.name)
@@ -34,35 +34,35 @@ def _define_results_df(self):
3434
for node in self._list_of_nodetype_nodes:
3535
list_of_dicts.append(self._get_node_dict(node))
3636
self.results_df = pd.DataFrame(list_of_dicts)
37-
37+
3838
def _get_node_dict(self, node):
39-
typename_dict = {"gene" : "protein", "seq" : "sequence", "mod_seq" : "modified_sequence"} #map the short name in the node to a more descriptive name. "gene" to "protein" is a bit confusing, I plan to change everything to "gene" in the future
39+
typename_dict = {"gene" : "protein", "seq" : "sequence", "mod_seq" : "modified_sequence", "base": "ion"} #map the short name in the node to a more descriptive name. "gene" to "protein" is a bit confusing, I plan to change everything to "gene" in the future
4040
type_name = typename_dict.get(self._node_type, self._node_type)
4141
node_dict = {}
4242
node_dict["condition_pair"] = self._condpair_name_table
4343
node_dict["protein"] = aq_cluster_utils.find_node_parent_at_level(node, "gene").name
4444
node_dict[type_name] = node.name
4545
node_dict["p_value"] = node.p_val
4646
node_dict["log2fc"] = node.fc
47-
node_dict["number_of_ions"] = len(node.leaves)
47+
node_dict["number_of_ions"] = len(node.leaves) if self._node_type != "base" else 1 # Base nodes ARE the ions
4848
node_dict["counting_based"] = node.missingval
4949
if hasattr(node, "ml_score"):
5050
node_dict["ml_score"] = node.ml_score
5151
else:
52-
node_dict["consistency_score"] = node.fraction_consistent * len(node.leaves)
52+
node_dict["consistency_score"] = node.fraction_consistent * len(node.leaves) if self._node_type != "base" else 1.0
5353

5454
if hasattr(node, "total_intensity"):
5555
node_dict["total_intensity"] = node.total_intensity
5656

5757
if self._node_type == "gene":
5858
node_dict["num_peptides"] = len(node.children)
59-
59+
6060
return node_dict
61-
61+
6262
def _filter_annotate_results_df(self):
6363
self.results_df = TableAnnotatorFilterer(self.results_df, self._list_of_nodetype_nodes, self._min_num_peptides, self._annotation_file, self._condpair_tree.fraction_missingval).results_df
6464
self.results_df = aqtableutils.QualityScoreNormalizer(self.results_df).results_df
65-
65+
6666

6767
class TableAnnotatorFilterer():
6868

@@ -76,25 +76,25 @@ def __init__(self, results_df, list_of_nodes, min_num_peptides, annotation_file,
7676
self._fraction_missingval = fraction_missingval
7777

7878
self._filter_annotate_results_df()
79-
79+
8080
def _filter_annotate_results_df(self):
8181
if self._level_type== "gene":
8282
self._filter_num_peptides()
8383
self._add_annotation_columns_if_applicable()
8484
self._scatter_pvals()
8585
self._add_fdr_fc_based_set()
8686
self._add_fdr_counting_based_set()
87-
87+
8888
def _filter_num_peptides(self):
8989
self.results_df[self.results_df["num_peptides"] >= self._min_num_peptides]
9090

9191
def _add_annotation_columns_if_applicable(self):
92-
if self._annotation_file is not None:
92+
if self._annotation_file is not None:
9393
annotation_df = pd.read_csv(self._annotation_file, sep = "\t")
9494
annotation_df = annotation_df.drop_duplicates(subset = "protein", keep="first")
9595
self.results_df = self.results_df.merge(annotation_df, on = "protein", how = "left")
9696

97-
def _scatter_pvals(self): #add some scatter to the pvalues that are 1.00E-16, which we set as the lowest possible pvalue. This allows for a better visualization as there are less overlapping points.
97+
def _scatter_pvals(self): #add some scatter to the pvalues that are 1.00E-16, which we set as the lowest possible pvalue. This allows for a better visualization as there are less overlapping points.
9898
#Scatter is added by adding a very small random number, therefore minimally reducing significance (i.e. not artificially making significance stronger)
9999
rng = np.random.RandomState(123)
100100
number_of_cut_pvals = (self.results_df['p_value'] == 1.00E-16).sum()

0 commit comments

Comments
 (0)