# Level names of the hierarchical ion tree.
# NOTE(review): ordering is presumably bottom (ion_type) to top (seq) -- confirm against the tree builder.
LEVEL_NAMES = ['ion_type', 'mod_seq_charge', 'mod_seq', 'seq']

# Maps upper-case tags to the lower-case level/type names used in this module.
# NOTE(review): the tags presumably come from the ion identifiers -- confirm against the parser.
MAPPING_DICT = {
    'SEQ': 'seq',
    'MOD': 'mod_seq',
    'CHARGE': 'mod_seq_charge',
    'MS1ISOTOPES': 'ms1_isotopes',
    'FRGION': 'frgion',
    'PRECURSOR': 'precursor',
}

# Module-level default; get_scored_clusterselected_ions overwrites it via `global`
# with the value passed in by the caller before clustering starts.
FCDIFF_CUTOFF_CLUSTERMERGE = 0

# The pval threshold is only set at the gene level; the rest of the levels are set
# as specified here. The threshold applies to the children of the node.
# Looked up by node level in get_pval_threshold_basis, which falls back to 0.2 for
# levels that are not listed.
LEVEL2PVALTHRESH = {
    'ion_type': 0.01,
    'mod_seq_charge': 0.01,
    'mod_seq': 1e-20,
    'seq': 0.2,
}
2626
2727
2828
2929
30- def get_scored_clusterselected_ions ( gene_name , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion ,
31- fcdiff_cutoff_clustermerge ):
30+
31+ def get_scored_clusterselected_ions ( gene_name , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion , fcdiff_cutoff_clustermerge ):
3232 #typefilter = TypeFilter('successive')
3333
3434 global FCDIFF_CUTOFF_CLUSTERMERGE
3535 FCDIFF_CUTOFF_CLUSTERMERGE = fcdiff_cutoff_clustermerge
3636
37+
3738 diffions = sorted (diffions , key = lambda x : x .name )
3839 name2diffion = {x .name : x for x in diffions }
3940 root_node = create_hierarchical_ion_grouping (gene_name , diffions )
@@ -87,13 +88,13 @@ def add_reduced_names_to_root(node):
8788 node .name_reduced = node .name .replace (node .parent .name , "" )
8889 else :
8990 node .name_reduced = node .name
90-
91+
9192
9293import pandas as pd
9394def cluster_along_specified_levels (root_node , ionname2diffion , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold , take_median_ion ):#~60% of overall runtime
9495 #typefilter object specifies filtering and clustering of the nodes
9596 aqcluster_utils .assign_properties_to_base_ions (root_node , ionname2diffion , normed_c1 , normed_c2 )
96-
97+
9798 for level_nodes in aqcluster_utils .iterate_through_tree_levels_bottom_to_top (root_node ):
9899 nodetypes_at_level = list (set ([node .type for node in level_nodes ]))
99100 if nodetypes_at_level == ["base" ]:
@@ -105,7 +106,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
105106 for type_node in type_nodes : #this goes through each precursor individually and clusters the children
106107 child_nodes = type_node .children
107108 grouped_mainclust_leafs = aqcluster_utils .get_grouped_mainclust_leafs (child_nodes ) #leafs are excluded if they are not in the main cluster
108-
109+
109110 if len (grouped_mainclust_leafs )== 0 : #this means the leafs were previously excluded
110111 exclude_node (type_node )
111112 continue
@@ -119,12 +120,12 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed
119120 childnode2clust = find_fold_change_clusters (type_node , diffions , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist , pval_threshold_basis , fcfc_threshold ) #the clustering is performed on the child nodes
120121 childnode2clust = merge_similar_clusters_if_applicable (childnode2clust , type_node , fcdiff_cutoff_clustermerge = FCDIFF_CUTOFF_CLUSTERMERGE )
121122 childnode2clust = aq_cluster_sorting .decide_cluster_order (childnode2clust )
122-
123+
123124 aq_cluster_pfstats .add_proteoform_statistics_to_nodes (childnode2clust , take_median_ion , normed_c1 , normed_c2 , ion2diffDist , p2z , deedpair2doublediffdist )
124125 aqcluster_utils .assign_clusterstats_to_type_node (type_node , childnode2clust )
125126 aqcluster_utils .annotate_mainclust_leaves (childnode2clust )
126127 aqcluster_utils .assign_cluster_number (type_node , childnode2clust )
127- aqcluster_utils .aggregate_node_properties (type_node ,only_use_mainclust = True , use_fewpeps_per_protein = True )
128+ aqcluster_utils .aggregate_node_properties (type_node ,only_use_mainclust = True , peptide_outlier_filtering = False )
128129
129130 return root_node
130131
@@ -153,11 +154,11 @@ def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2dif
153154 diffions_idxs = [[x ] for x in range (len (diffions ))]
154155 diffions_fcs = aqcluster_utils .get_fcs_ions (diffions )
155156 #mt_corrected_pval_thresh = pval_threshold_basis/len(diffions)
156- condensed_similarity_matrix = scipy .spatial .distance .pdist (diffions_idxs , lambda idx1 , idx2 : evaluate_similarity (idx1 [0 ], idx2 [0 ], diffions , diffions_fcs , normed_c1 , normed_c2 , ion2diffDist ,p2z ,
157+ condensed_similarity_matrix = scipy .spatial .distance .pdist (diffions_idxs , lambda idx1 , idx2 : evaluate_similarity (idx1 [0 ], idx2 [0 ], diffions , diffions_fcs , normed_c1 , normed_c2 , ion2diffDist ,p2z ,
157158 deedpair2doublediffdist , fcfc_threshold )) #gives p-values of the pairwise comparisons of the ions
158159 condensed_similarity_matrix_mt_corrected = get_multiple_testing_corrected_condensed_similarity_matrix (condensed_similarity_matrix )
159160 condensed_distance_matrix_mt_corrected = 1 / condensed_similarity_matrix_mt_corrected
160-
161+
161162 after_clust = scipy .cluster .hierarchy .ward (condensed_distance_matrix_mt_corrected )
162163 clustered = scipy .cluster .hierarchy .fcluster (after_clust , 1 / (pval_threshold_basis ), criterion = 'distance' )
163164 clustered = aqcluster_utils .exchange_cluster_idxs (clustered )
@@ -173,20 +174,20 @@ def get_pval_threshold_basis(type_node, pval_threshold_basis): #the pval thresho
173174 return pval_threshold_basis
174175 else :
175176 return LEVEL2PVALTHRESH .get (type_node .level , 0.2 )
176-
177+
def get_multiple_testing_corrected_condensed_similarity_matrix(condensed_distance_matrix: np.ndarray) -> np.ndarray:
    """
    Apply Benjamini-Yekutieli multiple-testing correction to a condensed matrix of pairwise p-values.

    Despite its name, condensed_distance_matrix holds the p-values of the pairwise
    comparisons of the ions (i.e. similarities, not distances). The pairwise
    comparisons are by definition dependent, which is why the Benjamini-Yekutieli
    procedure ('fdr_by') is used.

    Args:
        condensed_distance_matrix (np.ndarray): Condensed (1-D) matrix containing the
            p-values of the pairwise comparisons.

    Returns:
        np.ndarray: The corrected p-values, as returned by multipletests for the
            given input.
    """
    # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf);
    # only the corrected p-values are needed here.
    _, corrected_pvalues, _, _ = multitest.multipletests(condensed_distance_matrix, method='fdr_by')

    return corrected_pvalues
192193
@@ -238,25 +239,25 @@ def update_childnode2clust(childnode2clust, old_clusters, new_clusters):
238239 new_clust = old2new [old_clust ]
239240 childnode2clust_new .append ((childnode , new_clust ))
240241 return childnode2clust_new
241-
242242
243243
244244
245- def evaluate_similarity (idx1 : int , idx2 : int ,
246- diffions : list [aq_diff_analysis .DifferentialIon ],
245+
246+ def evaluate_similarity (idx1 : int , idx2 : int ,
247+ diffions : list [aq_diff_analysis .DifferentialIon ],
247248 fcs : list [list [int ]],
248- normed_c1 : aq_diff_background .BackGroundDistribution ,
249+ normed_c1 : aq_diff_background .BackGroundDistribution ,
249250 normed_c2 : aq_diff_background .BackGroundDistribution ,
250251 ion2diffDist : dict [str , aq_diff_background .SubtractedBackgrounds ],
251- p2z : dict [str , str ],
252+ p2z : dict [str , str ],
252253 deedpair2doublediffdist : dict [tuple [aq_diff_background .SubtractedBackgrounds , aq_diff_background .SubtractedBackgrounds ],aq_diff_background .SubtractedBackgrounds ],
253254 fcfc_threshold : float ) -> float :
254255 """
255256 Evaluate the statistical similarity between two sets of ions based on their properties and fold changes.
256-
257+
257258 This function calculates a p-value representing the statistical similarity between two sets of ions,
258259 testing the null hypothesis that the two sets are not significantly different.
259-
260+
260261 Args:
261262 idx1 (int): Index of the first set of ions in the diffions list.
262263 idx2 (int): Index of the second set of ions in the diffions list.
@@ -268,7 +269,7 @@ def evaluate_similarity(idx1: int, idx2: int,
268269 p2z (dict[str, str]): Dictionary for converting p-values to z-scores.
269270 deedpair2doublediffdist (dict[tuple[aq_diff_background.SubtractedBackgrounds, aq_diff_background.SubtractedBackgrounds], aq_diff_background.SubtractedBackgrounds]): Mapping of ion pairs to their double difference distributions.
270271 fcfc_threshold (float): Threshold for considering fold changes as similar.
271-
272+
272273 Returns:
273274 float: A p-value where higher values suggest greater similarity between ion sets.
274275 Returns 0.99 for fold changes below fcfc_threshold.
0 commit comments