1+ """Hierarchical proteoform clustering and differential-expression aggregation.
2+
3+ This module builds a hierarchical tree for each protein (fragments → peptides
4+ → modified peptides → unmodified peptides → protein) and performs two
5+ *independent* statistical procedures at each level:
6+
7+ 1. **Proteoform clustering** — pairwise double-differential tests assess
8+ whether sibling ions share the same fold change (null hypothesis: "ions
9+ have equal regulation"). The resulting similarity p-values are corrected
10+ with Benjamini-Yekutieli (appropriate for the intrinsic dependencies among
11+ pairwise comparisons) and then used for hierarchical clustering to separate
12+ proteoforms.
13+
14+ 2. **Differential-expression aggregation** — the per-ion z-values from
15+ individual differential-expression tests (null hypothesis: "no change
16+ between conditions") are combined into parent-level z-values via Stouffer's
17+ method. This aggregation propagates evidence of regulation up the tree.
18+
19+ The two sets of p-values address fundamentally different questions and are
20+ corrected separately. The Benjamini-Yekutieli correction in step 1 applies
21+ *within* a single protein to the similarity matrix; the final protein-level
22+ p-values produced by step 2 are later corrected across all proteins with
23+ Benjamini-Hochberg (see ``diffquant_table.py``).
24+ """
25+
126import scipy .spatial .distance
227import scipy .cluster .hierarchy
328import alphaquant .cluster .cluster_utils as aqcluster_utils
2853
2954
3055
31- def get_scored_clusterselected_ions (gene_name , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion , fcdiff_cutoff_clustermerge , fragment_outlier_filtering = True ):
56+ def get_scored_clusterselected_ions (gene_name , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion , fcdiff_cutoff_clustermerge , aggregation_mode = "stouffer_decorrelation" , cluster_threshold_ion_type = 0.01 ):
3257 """Main entry point for hierarchical clustering and tree-based quantification of a protein.
3358
3459 This function creates a hierarchical tree structure from fragment ions up to the protein level
@@ -47,7 +72,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
4772 fcfc_threshold: Fold-change difference threshold for clustering
4873 take_median_ion: If True, use median-centered ions for clustering
4974 fcdiff_cutoff_clustermerge: Fold-change threshold for merging similar clusters
50- fragment_outlier_filtering: Whether to filter outlier fragments when aggregating to peptides
75+ aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES)
5176
5277 Returns:
5378 anytree.Node: Root node of the hierarchical tree containing all statistics and clustering results
@@ -63,7 +88,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i
6388 root_node = create_hierarchical_ion_grouping (gene_name , diffions )
6489 add_reduced_names_to_root (root_node )
6590 #LOGGER.info(anytree.RenderTree(root_node))
66- root_node_clust = cluster_along_specified_levels (root_node , name2diffion , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion , fragment_outlier_filtering )
91+ root_node_clust = cluster_along_specified_levels (root_node , name2diffion , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion , aggregation_mode = aggregation_mode , cluster_threshold_ion_type = cluster_threshold_ion_type )
6792
6893 level_sorted_nodes = [[node for node in children ] for children in anytree .ZigZagGroupIter (root_node_clust )]
6994 level_sorted_nodes .reverse () #the base nodes are first
@@ -114,7 +139,7 @@ def add_reduced_names_to_root(node):
114139
115140
116141import pandas as pd
117- def cluster_along_specified_levels (root_node , ionname2diffion , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion , fragment_outlier_filtering = True ):#~60% of overall runtime
142+ def cluster_along_specified_levels (root_node , ionname2diffion , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion , aggregation_mode = "stouffer_decorrelation" , cluster_threshold_ion_type = 0.01 ):#~60% of overall runtime
118143 """Performs hierarchical clustering at each level of the tree from bottom to top.
119144
120145 Starting from base ions (fragments/MS1), this function iterates through each level
@@ -123,6 +148,31 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
123148 pairwise for consistent fold-change differences, clustered hierarchically, and
124149 statistics are aggregated to parent nodes.
125150
151+ Important: this function interleaves two *independent* statistical procedures
152+ that address different questions:
153+
154+ 1. **Proteoform clustering** (``find_fold_change_clusters``): tests whether
155+ pairs of sibling ions (e.g. two peptides of the same protein) have
156+ *different* fold changes, i.e. whether they belong to different
157+ proteoforms. The resulting pairwise p-values form a dependent similarity
158+ matrix that is corrected for multiple testing with Benjamini-Yekutieli
159+ (appropriate for dependent tests) *before* hierarchical clustering.
160+
161+ 2. **Differential-expression aggregation** (``aggregate_node_properties``):
162+ combines the per-ion z-values (derived from individual differential
163+ expression tests) into a single parent-level z-value via Stouffer's
164+ method. These z-values quantify *how much* each ion changes between
165+ conditions and are unrelated to the proteoform-similarity p-values
166+ from step 1.
167+
168+ The Benjamini-Yekutieli correction in step 1 therefore precedes the
169+ Stouffer aggregation in step 2 by design, because they operate on
170+ different null hypotheses: step 1 asks "do these two peptides differ
171+ from each other?" while step 2 asks "does this protein change between
172+ conditions?". The final protein-level p-values produced in step 2
173+ are later corrected with Benjamini-Hochberg across all proteins (see
174+ ``diffquant_table.py``).
175+
126176 Args:
127177 root_node: Root of the hierarchical tree (protein level)
128178 ionname2diffion: Dictionary mapping ion names to DifferentialIon objects
@@ -134,7 +184,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
134184 pval_threshold_basis: P-value threshold for clustering decisions
135185 fcfc_threshold: Fold-change threshold for clustering
136186 take_median_ion: Whether to use median-centered ions
137- fragment_outlier_filtering: Whether to filter fragment outliers
187+ aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES)
138188
139189 Returns:
140190 anytree.Node: The root node with all clustering annotations and aggregated statistics
@@ -164,15 +214,15 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
164214 if take_median_ion :
165215 grouped_mainclust_leafs = aqcluster_utils .select_median_fc_leafs (grouped_mainclust_leafs )
166216 diffions = aqcluster_utils .map_grouped_leafs_to_diffions (grouped_mainclust_leafs , ionname2diffion ) #the diffions are the ions that are actually compared
167- childnode2clust = find_fold_change_clusters (type_node , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold ) #the clustering is performed on the child nodes
217+ childnode2clust = find_fold_change_clusters (type_node , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , cluster_threshold_ion_type = cluster_threshold_ion_type ) #the clustering is performed on the child nodes
168218 childnode2clust = merge_similar_clusters_if_applicable (childnode2clust , type_node , fcdiff_cutoff_clustermerge = FCDIFF_CUTOFF_CLUSTERMERGE )
169219 childnode2clust = aq_cluster_sorting .decide_cluster_order (childnode2clust )
170220
171221 aq_cluster_pfstats .add_proteoform_statistics_to_nodes (childnode2clust , take_median_ion , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist )
172222 aqcluster_utils .assign_clusterstats_to_type_node (type_node , childnode2clust )
173223 aqcluster_utils .annotate_mainclust_leaves (childnode2clust )
174224 aqcluster_utils .assign_cluster_number (type_node , childnode2clust )
175- aqcluster_utils .aggregate_node_properties (type_node ,only_use_mainclust = True , peptide_outlier_filtering = False , fragment_outlier_filtering = fragment_outlier_filtering )
225+ aqcluster_utils .aggregate_node_properties (type_node ,only_use_mainclust = True , peptide_outlier_filtering = False , aggregation_mode = aggregation_mode )
176226
177227 return root_node
178228
@@ -183,21 +233,46 @@ def get_childnode2clust_for_single_ion(type_node):
183233 return {type_node .children [0 ]: 0 }
184234
185235
186- def find_fold_change_clusters (type_node , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold ):
187- """Compares the fold changes of the ions corresponding to the nodes that are compared and returns the set of ions with consistent fold changes.
236+ def find_fold_change_clusters (type_node , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , cluster_threshold_ion_type = 0.01 ):
237+ """Cluster sibling ions by the similarity of their fold changes (proteoform inference).
238+
239+ For each pair of sibling ion groups, a *double-differential* test
240+ (``evaluate_similarity``) computes a p-value for the null hypothesis that
241+ their fold changes are equal (i.e. they belong to the same proteoform).
242+ These pairwise p-values form a condensed similarity matrix that is
243+ corrected for multiple testing with the Benjamini-Yekutieli procedure
244+ (``get_multiple_testing_corrected_condensed_similarity_matrix``), which is
245+ appropriate here because the pairwise comparisons are intrinsically
246+ dependent. The corrected matrix is then converted to a distance matrix and
247+ subjected to Ward hierarchical clustering.
248+
249+ Note: the p-values computed and corrected here test *inter-ion similarity*
250+ for proteoform grouping. They are entirely separate from the per-ion
251+ differential-expression p-values that are later aggregated via Stouffer's
252+ method in ``aggregate_node_properties``.
188253
189254 Args:
190- diffions (list[list[ionnames]]): contains the sets of ions to be tested, for example [[fragion1_precursor1, fragion2_precursor1, fragion3_precursor1],[fragion1_precursor2],[fragion1_precursor3, fragion2_precursor3]]. The ions are assumed to be similar in type (e.g. fragment, precursor)!
191- normed_c1 (ConditionBackground): [description]
192- normed_c2 (ConditionBackground): [description]
193- ion2diffDist (dict(ion : SubtractedBackground)): [description]
194- p2z ([type]): [description]
195- deedpair2doublediffdist ([type]): [description]
196- fc_threshold (float, optional): [description]. Defaults to 0.
197- pval_threshold_basis (float, optional): the threshold at which to merge peptides at the gene level. Defaults to 0.01
255+ type_node: Parent node whose children are being clustered
256+ diffions: List of lists of DifferentialIon objects, one sublist per
257+ child node, e.g. ``[[fragion1_prec1, fragion2_prec1], [fragion1_prec2]]``
258+ normed_c1: ConditionBackgrounds for condition 1
259+ normed_c2: ConditionBackgrounds for condition 2
260+ ion2diffDist: Mapping of ion pairs to differential background distributions
261+ p2z: Cache for p-value to z-value conversions
262+ deedpair2doublediffdist: Cache for double-differential distributions
263+ pval_threshold_basis: P-value threshold at the gene level for cutting the
264+ clustering dendrogram; thresholds for lower levels are looked up in
265+ ``LEVEL2PVALTHRESH``
266+ fcfc_threshold: Minimum fold-change difference below which two ion groups
267+ are assumed similar without a formal test
268+ cluster_threshold_ion_type: P-value threshold at the ion_type level
269+
270+ Returns:
271+ list[tuple[Node, int]]: Pairs of (child node, cluster index) sorted by
272+ node name for reproducibility
198273 """
199274
200- pval_threshold_basis = get_pval_threshold_basis (type_node , pval_threshold_basis )
275+ pval_threshold_basis = get_pval_threshold_basis (type_node , pval_threshold_basis , cluster_threshold_ion_type = cluster_threshold_ion_type )
201276 diffions_idxs = [[x ] for x in range (len (diffions ))]
202277 diffions_fcs = aqcluster_utils .get_fcs_ions (diffions )
203278 #mt_corrected_pval_thresh = pval_threshold_basis/len(diffions)
@@ -216,26 +291,36 @@ def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2dif
216291
217292 return childnode2clust
218293
219- def get_pval_threshold_basis (type_node , pval_threshold_basis ): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary
294+ def get_pval_threshold_basis (type_node , pval_threshold_basis , cluster_threshold_ion_type = 0.01 ): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary
220295 if type_node .level == "gene" :
221296 return pval_threshold_basis
222- else :
223- return LEVEL2PVALTHRESH .get (type_node .level , 0.2 )
297+ if type_node .level == "ion_type" :
298+ return cluster_threshold_ion_type
299+ return LEVEL2PVALTHRESH .get (type_node .level , 0.2 )
224300
225301def get_multiple_testing_corrected_condensed_similarity_matrix (condensed_distance_matrix : np .array ):
226- """
227- condensed_distance_matrix contains all p-values of the pairwise comparisons of the ions. They are by definition dependent.
302+ """Apply Benjamini-Yekutieli FDR correction to pairwise ion-similarity p-values.
303+
304+ The condensed matrix contains p-values from *double-differential* tests
305+ between all pairs of sibling ions (see ``evaluate_similarity``). Each
306+ p-value tests the null hypothesis that two ions share the same fold
307+ change. Because every ion appears in multiple pairwise comparisons, the
308+ tests are intrinsically dependent, which is why the Benjamini-Yekutieli
309+ (``fdr_by``) procedure is used instead of Benjamini-Hochberg.
310+
311+ These corrected p-values are used exclusively for proteoform clustering
312+ (deciding which ions belong together) and are unrelated to the per-ion
313+ differential-expression p-values that are later aggregated via Stouffer's
314+ method.
228315
229316 Args:
230- condensed_distance_matrix (np.array): Condensed distance matrix containing p-values of pairwise comparisons.
317+ condensed_distance_matrix: 1-D array of pairwise similarity p-values
318+ in scipy condensed-matrix format.
231319
232320 Returns:
233- np.array: Corrected condensed distance matrix.
321+ np.array: Corrected p-values in the same condensed- matrix layout .
234322 """
235- # Apply Benjamini-Yekutieli correction
236323 _ , corrected_pvalues , _ , _ = multitest .multipletests (condensed_distance_matrix , method = 'fdr_by' )
237-
238- # Return the corrected condensed matrix
239324 return corrected_pvalues
240325
241326
@@ -341,4 +426,3 @@ def exclude_node(node):
341426 node .is_included = False
342427 for descendant in node .descendants :
343428 descendant .is_included = False
344-
0 commit comments