Skip to content

Commit dafec6e

Browse files
authored
Merge pull request #139 from MannLabs/add_quant_reader_and_table_updates
Merge stacked feature branches into main
2 parents 5e22dea + 8dab154 commit dafec6e

32 files changed

Lines changed: 4478 additions & 588 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,3 +229,5 @@ paper_analyses
229229

230230
alphaquant/resources/reference_databases
231231
alphaquant/resources/phosphopred_databases
232+
.claude/settings.local.json
233+
.claude/settings.local.json

alphaquant/classify/ml_info_table.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,12 @@ def _adapt_precursor_name_to_modification_type(self):
6464

6565

6666
def _define_ml_info_filename(self):
67-
self.ml_info_filename = aq_utils.get_progress_folder_filename(self._input_file, ".ml_info_table.tsv")
67+
self.ml_info_filename = aq_utils.get_progress_folder_filename(self._input_file, ".ml_info_table.tsv.zip")
6868

6969
def _write_ml_info_table(self):
70-
self._ml_info_df.to_csv(self.ml_info_filename, sep="\t", index=False)
70+
archive_name = self.ml_info_filename.split("/")[-1].removesuffix(".zip")
71+
compression = {"method": "zip", "archive_name": archive_name}
72+
self._ml_info_df.to_csv(self.ml_info_filename, sep="\t", index=False, compression=compression)
7173

7274

7375
class MLInfoTableLoader():
@@ -81,4 +83,3 @@ def __init__(self, ml_info_file, samples_used):
8183
def _subset_df_to_relevant_samples(self):
8284
self.ml_info_df = self.ml_info_df[self.ml_info_df["sample_ID"].isin(self._samples_used)]
8385
self.ml_info_df = self.ml_info_df.drop(columns=["sample_ID"])
84-

alphaquant/cluster/cluster_ions.py

Lines changed: 113 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,28 @@
1+
"""Hierarchical proteoform clustering and differential-expression aggregation.
2+
3+
This module builds a hierarchical tree for each protein (fragments → peptides
4+
→ modified peptides → unmodified peptides → protein) and performs two
5+
*independent* statistical procedures at each level:
6+
7+
1. **Proteoform clustering** — pairwise double-differential tests assess
8+
whether sibling ions share the same fold change (null hypothesis: "ions
9+
have equal regulation"). The resulting similarity p-values are corrected
10+
with Benjamini-Yekutieli (appropriate for the intrinsic dependencies among
11+
pairwise comparisons) and then used for hierarchical clustering to separate
12+
proteoforms.
13+
14+
2. **Differential-expression aggregation** — the per-ion z-values from
15+
individual differential-expression tests (null hypothesis: "no change
16+
between conditions") are combined into parent-level z-values via Stouffer's
17+
method. This aggregation propagates evidence of regulation up the tree.
18+
19+
The two sets of p-values address fundamentally different questions and are
20+
corrected separately. The Benjamini-Yekutieli correction in step 1 applies
21+
*within* a single protein to the similarity matrix; the final protein-level
22+
p-values produced by step 2 are later corrected across all proteins with
23+
Benjamini-Hochberg (see ``diffquant_table.py``).
24+
"""
25+
126
import scipy.spatial.distance
227
import scipy.cluster.hierarchy
328
import alphaquant.cluster.cluster_utils as aqcluster_utils
@@ -28,7 +53,7 @@
2853

2954

3055

31-
def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, fragment_outlier_filtering=True):
56+
def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, aggregation_mode="stouffer_decorrelation", cluster_threshold_ion_type=0.01):
3257
"""Main entry point for hierarchical clustering and tree-based quantification of a protein.
3358
3459
This function creates a hierarchical tree structure from fragment ions up to the protein level
@@ -47,7 +72,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
4772
fcfc_threshold: Fold-change difference threshold for clustering
4873
take_median_ion: If True, use median-centered ions for clustering
4974
fcdiff_cutoff_clustermerge: Fold-change threshold for merging similar clusters
50-
fragment_outlier_filtering: Whether to filter outlier fragments when aggregating to peptides
75+
aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES)
5176
5277
Returns:
5378
anytree.Node: Root node of the hierarchical tree containing all statistics and clustering results
@@ -63,7 +88,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
6388
root_node = create_hierarchical_ion_grouping(gene_name, diffions)
6489
add_reduced_names_to_root(root_node)
6590
#LOGGER.info(anytree.RenderTree(root_node))
66-
root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering)
91+
root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, aggregation_mode=aggregation_mode, cluster_threshold_ion_type=cluster_threshold_ion_type)
6792

6893
level_sorted_nodes = [[node for node in children] for children in anytree.ZigZagGroupIter(root_node_clust)]
6994
level_sorted_nodes.reverse() #the base nodes are first
@@ -114,7 +139,7 @@ def add_reduced_names_to_root(node):
114139

115140

116141
import pandas as pd
117-
def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering=True):#~60% of overall runtime
142+
def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, aggregation_mode="stouffer_decorrelation", cluster_threshold_ion_type=0.01):#~60% of overall runtime
118143
"""Performs hierarchical clustering at each level of the tree from bottom to top.
119144
120145
Starting from base ions (fragments/MS1), this function iterates through each level
@@ -123,6 +148,31 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
123148
pairwise for consistent fold-change differences, clustered hierarchically, and
124149
statistics are aggregated to parent nodes.
125150
151+
Important: this function interleaves two *independent* statistical procedures
152+
that address different questions:
153+
154+
1. **Proteoform clustering** (``find_fold_change_clusters``): tests whether
155+
pairs of sibling ions (e.g. two peptides of the same protein) have
156+
*different* fold changes, i.e. whether they belong to different
157+
proteoforms. The resulting pairwise p-values form a dependent similarity
158+
matrix that is corrected for multiple testing with Benjamini-Yekutieli
159+
(appropriate for dependent tests) *before* hierarchical clustering.
160+
161+
2. **Differential-expression aggregation** (``aggregate_node_properties``):
162+
combines the per-ion z-values (derived from individual differential
163+
expression tests) into a single parent-level z-value via Stouffer's
164+
method. These z-values quantify *how much* each ion changes between
165+
conditions and are unrelated to the proteoform-similarity p-values
166+
from step 1.
167+
168+
The Benjamini-Yekutieli correction in step 1 therefore precedes the
169+
Stouffer aggregation in step 2 by design, because they operate on
170+
different null hypotheses: step 1 asks "do these two peptides differ
171+
from each other?" while step 2 asks "does this protein change between
172+
conditions?". The final protein-level p-values produced in step 2
173+
are later corrected with Benjamini-Hochberg across all proteins (see
174+
``diffquant_table.py``).
175+
126176
Args:
127177
root_node: Root of the hierarchical tree (protein level)
128178
ionname2diffion: Dictionary mapping ion names to DifferentialIon objects
@@ -134,7 +184,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
134184
pval_threshold_basis: P-value threshold for clustering decisions
135185
fcfc_threshold: Fold-change threshold for clustering
136186
take_median_ion: Whether to use median-centered ions
137-
fragment_outlier_filtering: Whether to filter fragment outliers
187+
aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES)
138188
139189
Returns:
140190
anytree.Node: The root node with all clustering annotations and aggregated statistics
@@ -164,15 +214,15 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
164214
if take_median_ion:
165215
grouped_mainclust_leafs = aqcluster_utils.select_median_fc_leafs(grouped_mainclust_leafs)
166216
diffions = aqcluster_utils.map_grouped_leafs_to_diffions(grouped_mainclust_leafs, ionname2diffion) #the diffions are the ions that are actually compared
167-
childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold) #the clustering is performed on the child nodes
217+
childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, cluster_threshold_ion_type=cluster_threshold_ion_type) #the clustering is performed on the child nodes
168218
childnode2clust = merge_similar_clusters_if_applicable(childnode2clust, type_node, fcdiff_cutoff_clustermerge = FCDIFF_CUTOFF_CLUSTERMERGE)
169219
childnode2clust = aq_cluster_sorting.decide_cluster_order(childnode2clust)
170220

171221
aq_cluster_pfstats.add_proteoform_statistics_to_nodes(childnode2clust, take_median_ion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist)
172222
aqcluster_utils.assign_clusterstats_to_type_node(type_node, childnode2clust)
173223
aqcluster_utils.annotate_mainclust_leaves(childnode2clust)
174224
aqcluster_utils.assign_cluster_number(type_node, childnode2clust)
175-
aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, fragment_outlier_filtering=fragment_outlier_filtering)
225+
aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, aggregation_mode=aggregation_mode)
176226

177227
return root_node
178228

@@ -183,21 +233,46 @@ def get_childnode2clust_for_single_ion(type_node):
183233
return {type_node.children[0]: 0}
184234

185235

186-
def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold):
187-
"""Compares the fold changes of the ions corresponding to the nodes that are compared and returns the set of ions with consistent fold changes.
236+
def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, cluster_threshold_ion_type=0.01):
237+
"""Cluster sibling ions by the similarity of their fold changes (proteoform inference).
238+
239+
For each pair of sibling ion groups, a *double-differential* test
240+
(``evaluate_similarity``) computes a p-value for the null hypothesis that
241+
their fold changes are equal (i.e. they belong to the same proteoform).
242+
These pairwise p-values form a condensed similarity matrix that is
243+
corrected for multiple testing with the Benjamini-Yekutieli procedure
244+
(``get_multiple_testing_corrected_condensed_similarity_matrix``), which is
245+
appropriate here because the pairwise comparisons are intrinsically
246+
dependent. The corrected matrix is then converted to a distance matrix and
247+
subjected to Ward hierarchical clustering.
248+
249+
Note: the p-values computed and corrected here test *inter-ion similarity*
250+
for proteoform grouping. They are entirely separate from the per-ion
251+
differential-expression p-values that are later aggregated via Stouffer's
252+
method in ``aggregate_node_properties``.
188253
189254
Args:
190-
diffions (list[list[ionnames]]): contains the sets of ions to be tested, for example [[fragion1_precursor1, fragion2_precursor1, fragion3_precursor1],[fragion1_precursor2],[fragion1_precursor3, fragion2_precursor3]]. The ions are assumed to be similar in type (e.g. fragment, precursor)!
191-
normed_c1 (ConditionBackground): [description]
192-
normed_c2 (ConditionBackground): [description]
193-
ion2diffDist (dict(ion : SubtractedBackground)): [description]
194-
p2z ([type]): [description]
195-
deedpair2doublediffdist ([type]): [description]
196-
fc_threshold (float, optional): [description]. Defaults to 0.
197-
pval_threshold_basis (float, optional): the threshold at which to merge peptides at the gene level. Defaults to 0.01
255+
type_node: Parent node whose children are being clustered
256+
diffions: List of lists of DifferentialIon objects, one sublist per
257+
child node, e.g. ``[[fragion1_prec1, fragion2_prec1], [fragion1_prec2]]``
258+
normed_c1: ConditionBackgrounds for condition 1
259+
normed_c2: ConditionBackgrounds for condition 2
260+
ion2diffDist: Mapping of ion pairs to differential background distributions
261+
p2z: Cache for p-value to z-value conversions
262+
deedpair2doublediffdist: Cache for double-differential distributions
263+
pval_threshold_basis: P-value threshold at the gene level for cutting the
264+
clustering dendrogram; thresholds for lower levels are looked up in
265+
``LEVEL2PVALTHRESH``
266+
fcfc_threshold: Minimum fold-change difference below which two ion groups
267+
are assumed similar without a formal test
268+
cluster_threshold_ion_type: P-value threshold at the ion_type level
269+
270+
Returns:
271+
list[tuple[Node, int]]: Pairs of (child node, cluster index) sorted by
272+
node name for reproducibility
198273
"""
199274

200-
pval_threshold_basis = get_pval_threshold_basis(type_node, pval_threshold_basis)
275+
pval_threshold_basis = get_pval_threshold_basis(type_node, pval_threshold_basis, cluster_threshold_ion_type=cluster_threshold_ion_type)
201276
diffions_idxs = [[x] for x in range(len(diffions))]
202277
diffions_fcs = aqcluster_utils.get_fcs_ions(diffions)
203278
#mt_corrected_pval_thresh = pval_threshold_basis/len(diffions)
@@ -216,26 +291,36 @@ def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2dif
216291

217292
return childnode2clust
218293

219-
def get_pval_threshold_basis(type_node, pval_threshold_basis): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary
294+
def get_pval_threshold_basis(type_node, pval_threshold_basis, cluster_threshold_ion_type=0.01): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary
220295
if type_node.level == "gene":
221296
return pval_threshold_basis
222-
else:
223-
return LEVEL2PVALTHRESH.get(type_node.level, 0.2)
297+
if type_node.level == "ion_type":
298+
return cluster_threshold_ion_type
299+
return LEVEL2PVALTHRESH.get(type_node.level, 0.2)
224300

225301
def get_multiple_testing_corrected_condensed_similarity_matrix(condensed_distance_matrix: np.array):
226-
"""
227-
condensed_distance_matrix contains all p-values of the pairwise comparisons of the ions. They are by definition dependent.
302+
"""Apply Benjamini-Yekutieli FDR correction to pairwise ion-similarity p-values.
303+
304+
The condensed matrix contains p-values from *double-differential* tests
305+
between all pairs of sibling ions (see ``evaluate_similarity``). Each
306+
p-value tests the null hypothesis that two ions share the same fold
307+
change. Because every ion appears in multiple pairwise comparisons, the
308+
tests are intrinsically dependent, which is why the Benjamini-Yekutieli
309+
(``fdr_by``) procedure is used instead of Benjamini-Hochberg.
310+
311+
These corrected p-values are used exclusively for proteoform clustering
312+
(deciding which ions belong together) and are unrelated to the per-ion
313+
differential-expression p-values that are later aggregated via Stouffer's
314+
method.
228315
229316
Args:
230-
condensed_distance_matrix (np.array): Condensed distance matrix containing p-values of pairwise comparisons.
317+
condensed_distance_matrix: 1-D array of pairwise similarity p-values
318+
in scipy condensed-matrix format.
231319
232320
Returns:
233-
np.array: Corrected condensed distance matrix.
321+
np.array: Corrected p-values in the same condensed-matrix layout.
234322
"""
235-
# Apply Benjamini-Yekutieli correction
236323
_, corrected_pvalues, _, _ = multitest.multipletests(condensed_distance_matrix, method='fdr_by')
237-
238-
# Return the corrected condensed matrix
239324
return corrected_pvalues
240325

241326

@@ -341,4 +426,3 @@ def exclude_node(node):
341426
node.is_included = False
342427
for descendant in node.descendants:
343428
descendant.is_included = False
344-

0 commit comments

Comments
 (0)