From f8168afa68ef4e87ee4c463259ce9ddd9de25c40 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Mon, 25 May 2026 10:05:03 +0200 Subject: [PATCH 1/3] Delete unused modules (outlier_scoring, diffresults_handling, phospho_inference) Co-Authored-By: Claude Sonnet 4.6 --- alphaquant/cluster/outlier_scoring.py | 397 ------------------- alphaquant/multicond/diffresults_handling.py | 128 ------ alphaquant/ptm/phospho_inference.py | 27 -- 3 files changed, 552 deletions(-) delete mode 100644 alphaquant/multicond/diffresults_handling.py delete mode 100644 alphaquant/ptm/phospho_inference.py diff --git a/alphaquant/cluster/outlier_scoring.py b/alphaquant/cluster/outlier_scoring.py index 7b33caf2..cbc9925b 100644 --- a/alphaquant/cluster/outlier_scoring.py +++ b/alphaquant/cluster/outlier_scoring.py @@ -115,400 +115,3 @@ def __remove_peptide_if_necessary__(self, peptide_node): -class ProtNodeRetriever(): - @staticmethod - def get_protnodes_from_diffclust_list(condpair_tree, diffclust_list): - proteins_wanted = {x.protein_name for x in diffclust_list} - protnodes = list(filter(lambda x : x.name in proteins_wanted, condpair_tree.children)) - return protnodes - -# Cell - -class OutlierPeptideLoader(): - def __init__(self, condpair_tree): - self._condpair_tree = condpair_tree - self.outlier_peptides = [] - self._add_outlier_peptides() - - def _add_outlier_peptides(self): - for protnode in self._condpair_tree.children: - nodechecker = ProtnodeClusterCheckerPeptideInfos(protnode) - self.outlier_peptides += nodechecker.get_outlier_peptide_infos() - - -class ProtnodeClusterCheckerPeptideInfos(ProtnodeClusterChecker): - def __init__(self, protnode): - super().__init__(protnode) - self._outlier_peptide_infos = [] - - def get_outlier_peptide_infos(self): - diffclusts = self.get_diffclusts() - for clusterdiffinfo in diffclusts: - self._update_outlier_peptide_infos(clusterdiffinfo) - return self._outlier_peptide_infos - - def _update_outlier_peptide_infos(self, clusterdiffinfo): - peptide_nodes = self._get_outlier_peptide_nodes(clusterdiffinfo) - for peptide_node in peptide_nodes: - self._outlier_peptide_infos.append(OutlierPeptideInfo(peptide_node)) - - def _get_outlier_peptide_nodes(self, clusterdiffinfo): - peptide_names = set(clusterdiffinfo.outlier_peptide_names) - return anytree.findall(self._protnode, filter_= lambda x : x.name in peptide_names, maxlevel=2) - - -class ProteinInfo(): - def __init__(self, peptide_node): - self.protein_fc = self._get_protein_fc(peptide_node) - - def _get_protein_fc(self, peptide_node): - return aqclustutils.find_node_parent_at_level(peptide_node, "gene").fc - - -class OutlierPeptideInfo(ProteinInfo): - def __init__(self, peptide_node): - super().__init__(peptide_node) - self._peptide_node = peptide_node - self.peptide_sequence = peptide_node.name - self.fc = peptide_node.fc - self.quality_score = self._get_quality_score(peptide_node) - self.protnormed_fc = None - self.num_mainclust_peptides = self._get_number_mainclust_peptides() - self._calc_protnormed_fc() - - def _get_quality_score(self, peptide_node): - has_ml_score = hasattr(peptide_node, 'ml_score') - if has_ml_score: - return abs(peptide_node.ml_score) - else: - return 1/peptide_node.fraction_consistent - - def _calc_protnormed_fc(self): - self.protnormed_fc = self.fc - self.protein_fc - - def _get_number_mainclust_peptides(self): - samelevel_nodes = self._peptide_node.parent.children - mainclust_nodes = filter(lambda x : x.cluster ==0, samelevel_nodes) - return len(list(mainclust_nodes)) - - -# Cell -import anytree - - -class ModifiedPeptideLoader(): - def __init__(self, condpair_tree, specific_modification = "[Phospho (STY)]"): - self.specific_modification = specific_modification - self.condpair_tree = condpair_tree - self._pepname2modpep = {} - self._load_modified_peptides_from_tree() - - def get_modpep_from_sequence(self, peptide_sequence): - return self._pepname2modpep.get(peptide_sequence) - - def _load_modified_peptides_from_tree(self): - modified_pepnodes = self._get_modified_peptide_nodes() - for mod_pep_node in modified_pepnodes: - self._update_pepname2modpep(mod_pep_node) - - def _get_modified_peptide_nodes(self): - return anytree.search.findall(self.condpair_tree, lambda x : getattr(x,'type',"") == 'mod_seq', maxlevel=4) - - def _update_pepname2modpep(self, mod_pep_node): - modified_peptide = PeptideWithSpecificModification(mod_pep_node, self.specific_modification) - if modified_peptide.specific_modification_found: - self._pepname2modpep[modified_peptide.peptide_sequence] = modified_peptide - - -class PeptideWithSpecificModification(OutlierPeptideInfo): - def __init__(self, node_modpeptide, specific_modification= "[Phospho (STY)]"): - self.protein_name = self._get_protein_name(node_modpeptide) - self.modified_sequence = node_modpeptide.name - self.specific_modification_found = self._check_for_specific_modification(specific_modification) - if not self.specific_modification_found: - return - self.peptide_sequence = self._get_peptide_sequence(node_modpeptide) - self.fc = node_modpeptide.fc - self.quality_score = self._get_quality_score(node_modpeptide) - - def _check_for_specific_modification(self, specific_modification): - return specific_modification in self.modified_sequence - - def _get_peptide_sequence(self, node_modpeptide): - pepnode = aqclustutils.find_node_parent_at_level(node_modpeptide, level='seq') - return pepnode.name - - def _get_protein_name(self, node_modpeptide): - pepnode = aqclustutils.find_node_parent_at_level(node_modpeptide, level='gene') - return pepnode.name - -# Cell -import numpy as np -class ComplementedClusterLoader(): - def __init__(self, outlier_peptide_loader, modified_peptide_loader): - self._outlier_peptides = outlier_peptide_loader.outlier_peptides - self._modified_peptide_loader = modified_peptide_loader - self.complemented_clusters = [] - self._find_complemented_clusters() - - def _find_complemented_clusters(self): - for outlier_peptide in self._outlier_peptides: - modified_peptide = self._get_modified_peptide(outlier_peptide) - if modified_peptide is not None: - self.complemented_clusters.append(ComplementedCluster(outlier_peptide, modified_peptide)) - - def _get_modified_peptide(self, outlier_peptide): - return self._modified_peptide_loader.get_modpep_from_sequence(outlier_peptide.peptide_sequence) - - -class ComplementedCluster(): - def __init__(self, outlier_peptide, modified_peptide): - self.outlier_peptide = outlier_peptide - self.modified_peptide = modified_peptide - self._add_normfc_to_modpep() - - def has_opposite_regulation(self): - return np.sign(self.outlier_peptide.protnormed_fc) == -np.sign(self.modified_peptide.protnormed_fc) - - def get_quality_score(self): - return max(self.outlier_peptide.quality_score, self.modified_peptide.quality_score) - - def get_outlier_quality_score(self): - return self.outlier_peptide.quality_score - - def get_modpep_quality_score(self): - return self.modified_peptide.quality_score - - def get_min_abs_normfc(self): - return min(abs(self.outlier_peptide.protnormed_fc), abs(self.modified_peptide.protnormed_fc)) - - def get_max_abs_normfc(self): - return max(abs(self.outlier_peptide.protnormed_fc), abs(self.modified_peptide.protnormed_fc)) - - def get_outlier_abs_normfc(self): - return abs(self.outlier_peptide.protnormed_fc) - - def get_ptm_abs_normfc(self): - return abs(self.modified_peptide.protnormed_fc) - - def get_ptm_abs_fc(self): - return abs(self.modified_peptide.fc) - - def get_number_mainclust_peptides(self): - return self.outlier_peptide.num_mainclust_peptides - - def _add_normfc_to_modpep(self): - self.modified_peptide.protein_fc = self.outlier_peptide.protein_fc - self.modified_peptide._calc_protnormed_fc() - - - -# Cell -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns -import scipy.stats - - -class ComplementedClusterEvaluator(): - def __init__(self, complmented_clusters): - self._complemented_clusters = complmented_clusters - self._fcs_outliers = None - self._fcs_modpeps = None - self._assign_fold_change_lists() - - - def compare_regulation_directions(self, ax): - opposite_regulation_overview = [int(x.has_opposite_regulation()) for x in self._complemented_clusters] - self._plot_regulation_direction_histogram(ax, opposite_regulation_overview) - - def scatter_fold_changes(self,ax): - - num_opposite = sum([np.sign(x[0])==-np.sign(x[1]) for x in zip(self._fcs_outliers, self._fcs_modpeps)]) - num_same = sum([np.sign(x[0])==np.sign(x[1]) for x in zip(self._fcs_outliers, self._fcs_modpeps)]) - LOGGER.info(f"{num_same} same, {num_opposite} opposite") - sns.scatterplot(x =self._fcs_outliers, y=self._fcs_modpeps,ax=ax) - ax.set_xlabel("outliers") - ax.set_ylabel("modified_peptides") - self._set_axis_limits(ax) - self._draw_horizontal_vertical_line(ax) - - def calculate_correlation(self): - r, p = scipy.stats.pearsonr(self._fcs_outliers, self._fcs_modpeps) - LOGGER.info(f"pval is {p}") - return r - - - def _assign_fold_change_lists(self): - self._fcs_outliers = list([x.outlier_peptide.protnormed_fc for x in self._complemented_clusters]) - self._fcs_modpeps = list([x.modified_peptide.protnormed_fc for x in self._complemented_clusters]) - - def _set_axis_limits(self,ax): - all_lims = ax.get_xlim() + ax.get_ylim() #returns tuples with the lims - most_extreme_val = max((abs(x) for x in all_lims)) - ax.set_xlim(-most_extreme_val, most_extreme_val) - ax.set_ylim(-most_extreme_val, most_extreme_val) - - def _draw_horizontal_vertical_line(self, ax): - ax.hlines(y=0, xmin=ax.get_xlim()[0], xmax=ax.get_xlim()[1], colors='black') - ax.vlines(x=0, ymin = ax.get_ylim()[0], ymax = ax.get_ylim()[1], colors='black') - - - - - @staticmethod - def _plot_regulation_direction_histogram(ax, opposite_regulation_overview): - ax.hist(opposite_regulation_overview) - - - - -# Cell -class QuantileFilterer(): - def __init__(self, objects_to_filter, filterconfigs): - """Filters objects (e.g. ClusterDiffInfos) based on the - FilterConfigs defined below. The objects need to have - a function with the identical name as in FilterConfig.property_name - which returns a scalar. The quantiles are defined with respect to this scalar. - - Args: - objects_to_filter (_type_): _description_ - filterconfigs (_type_): _description_ - """ - - self._objects_to_filter = objects_to_filter - self._filterconfigs = filterconfigs - - def get_filtered_list_of_objects(self): - if self._filterconfigs is None: - return self._objects_to_filter - else: - return self._filter_objects() - - def _filter_objects(self): - individually_filtered = [] - for filterconf in self._filterconfigs: - filtered_cclusts = set(self._filter_to_property_quantile(filterconf)) - individually_filtered.append(filtered_cclusts) - return list(set.intersection(*individually_filtered)) #we filter the quantile of the COMPLETE set for - #every condition and intersect in the end, alternatively one could successively filter the quantiles - - - def _filter_to_property_quantile(self, filterconf):#get the quantiles with the best property scores i.e. quality score - property_sorted = self._sort_objects_to_filter_by_score(filterconf.property_name) - - if filterconf.quantile_starts_at_lowest: - number_to_retain = int(filterconf.quantile * len(property_sorted)) - return property_sorted[:number_to_retain] - else: - number_to_discard = int((1-filterconf.quantile) * len(property_sorted)) - return property_sorted[number_to_discard:] - - def _sort_objects_to_filter_by_score(self, property_name): - try: - return self._property_encodes_instance_variable(property_name) - except: - return self._property_encodes_function_name(property_name) - - def _property_encodes_instance_variable(self, property_name): - return sorted(self._objects_to_filter, key = lambda x : getattr(x, property_name)) - - def _property_encodes_function_name(self, property_name): - return sorted(self._objects_to_filter, key = lambda x : getattr(x, property_name)()) - - - -class FilterConfig(): - def __init__(self, property_name, quantile, quantile_starts_at_lowest): - self.property_name = property_name - self.quantile = quantile - self.quantile_starts_at_lowest = quantile_starts_at_lowest - - - -# Cell -class ComplementedClusterFilterer(QuantileFilterer): - def __init__(self, complemented_clusterloader, clusterfilterconfigs): - super().__init__(complemented_clusterloader.complemented_clusters, clusterfilterconfigs) - - def get_filtered_complemented_clusters(self): - return self.get_filtered_list_of_objects() - - - -class ComplementedClusterFilterConfigs(): - def __init__(self, min_abs_normfc_quantile = 1, ptm_abs_normfc_quantile = 1, outlier_abs_normfc_quantile = 1, ptm_absfc_quantile = 1, - modpep_quality_quantile = 1, outlier_quality_quantile = 1, number_mainclustpeps_quantile = 1): - self.filterconfigs = [] - self._min_abs_normfc_quantile =min_abs_normfc_quantile - self._ptm_abs_normfc_quantile=ptm_abs_normfc_quantile - self._number_mainclustpeps_quantile = number_mainclustpeps_quantile - self._outlier_abs_normfc_quantile= outlier_abs_normfc_quantile - self._ptm_absfc_quantile=ptm_absfc_quantile - self._outlier_quality_quantile=outlier_quality_quantile - self._modpep_quality_quantile = modpep_quality_quantile - self._number_mainclustpeps_quantile = number_mainclustpeps_quantile - self._initialize_filter_configs() - - def _initialize_filter_configs(self): - self.filterconfigs.append(FilterConfig("get_min_abs_normfc", self._min_abs_normfc_quantile, False)) - self.filterconfigs.append(FilterConfig("get_ptm_abs_normfc", self._ptm_abs_normfc_quantile, False)) - self.filterconfigs.append(FilterConfig("get_ptm_abs_fc", self._ptm_absfc_quantile, False)) - self.filterconfigs.append(FilterConfig("get_outlier_abs_normfc", self._outlier_abs_normfc_quantile, False)) - self.filterconfigs.append(FilterConfig("get_outlier_quality_score", self._outlier_quality_quantile, True)) - self.filterconfigs.append(FilterConfig("get_modpep_quality_score", self._modpep_quality_quantile, True)) - self.filterconfigs.append(FilterConfig("get_number_mainclust_peptides", self._number_mainclustpeps_quantile, False)) - - - - -# Cell - -class DiffClusterFilterer(QuantileFilterer): - def __init__(self, diffclust_list, diffclustfilterconfigs): - super().__init__(diffclust_list, diffclustfilterconfigs) - - def get_filtered_diffclust_list(self): - return self.get_filtered_list_of_objects() - - -class DiffClusterFilterConfig(FilterConfig): - def __init__(self, fcdiff_quantile = 1, quality_score_quantile = 1, num_mainclust_peptides_quantile = 1, num_outlierclust_peptides_quantile = 1): - self.filterconfigs = [] - self._fcdiff_quantile = fcdiff_quantile - self._quality_score_quantile = quality_score_quantile - self._num_mainclust_peptides_quantile = num_mainclust_peptides_quantile - self._num_outlierclust_peptides_quantile = num_outlierclust_peptides_quantile - self._initialize_filter_configs() - - def _initialize_filter_configs(self): - self.filterconfigs.append(FilterConfig("fcdiff", self._fcdiff_quantile,False)) - self.filterconfigs.append(FilterConfig("quality_score", self._quality_score_quantile, True)) - self.filterconfigs.append(FilterConfig("get_num_mainclust_peptides", self._num_mainclust_peptides_quantile, False)) - self.filterconfigs.append(FilterConfig("get_num_outlierclust_peptides", self._num_outlierclust_peptides_quantile, False)) - - - - -# Cell - -class OutlierPeptideFilterer(QuantileFilterer): - def __init__(self, outlier_peptide_list, outlierpeptide_filterconfigs): - super().__init__(outlier_peptide_list, outlierpeptide_filterconfigs) - - def get_filtered_outlier_peptide_list(self): - return self.get_filtered_list_of_objects() - - -class OutlierPeptideFilterConfigs(FilterConfig): - def __init__(self, quality_score_quantile = 1, num_mainclust_peptides_quantile = 1, protnormed_fc_quantile = 1): - self.filterconfigs = [] - self._quality_score_quantile = quality_score_quantile - self._num_mainclust_peptides_quantile = num_mainclust_peptides_quantile - self._protnormed_fc_quantile = protnormed_fc_quantile - self._initialize_filter_configs() - - def _initialize_filter_configs(self): - self.filterconfigs.append(FilterConfig("quality_score", self._quality_score_quantile, True)) - self.filterconfigs.append(FilterConfig("num_mainclust_peptides", self._num_mainclust_peptides_quantile, False)) - self.filterconfigs.append(FilterConfig("protnormed_fc", self._protnormed_fc_quantile, False)) diff --git a/alphaquant/multicond/diffresults_handling.py b/alphaquant/multicond/diffresults_handling.py deleted file mode 100644 index 43a1c2ef..00000000 --- a/alphaquant/multicond/diffresults_handling.py +++ /dev/null @@ -1,128 +0,0 @@ - - -class QuantifiedMultiConditionComparison(): - """ - class that contains all information to be used for the multi-condition - analysis. - """ - def __init__(self): - self._proteinname2multicondprots = {} - - def add_quantified_proteins_to_comparison(self, quantified_proteins): - for quantified_protein in quantified_proteins: - self._initialize_multicond_protein_if_necessary(quantified_protein) - self._add_quantified_protein_to_multicond(quantified_protein) - - def get_quantified_protein(self, protein_name, condpair): - multicondprot = self._proteinname2multicondprots.get(protein_name) - return multicondprot.condpair2quantified_protein.get(condpair) - - - def _initialize_multicond_protein_if_necessary(self, quantified_protein): - if quantified_protein.name not in self._proteinname2multicondprots: - self._proteinname2multicondprots[quantified_protein.name] = QuantifiedProteinMultiCondition(quantified_protein.name) - - def _add_quantified_protein_to_multicond(self, quantified_protein): - multicondprot = self._proteinname2multicondprots.get(quantified_protein.name) - multicondprot.condpair2quantified_protein[quantified_protein.condpair] = quantified_protein - -class QuantifiedProteinMultiCondition(): - def __init__(self, name): - self.name = name - self.condpair2quantified_protein = {} - - -class QuantifiedProteinInCondpair(): - def __init__(self, condpair, name, log2fc, p_value, fdr): - self.condpair = condpair - self.name = name - self.log2fc = log2fc - self.p_value = p_value - self.fdr = fdr - - -class QuantifiedProteinCanditateInCondpair(QuantifiedProteinInCondpair): - """stump for cluster-based protein processing""" - def __init__(self, peptide_nodes_of_cluster): - self.peptides = None - - def _get_quantprot_properties_from_peptide_nodes(self, peptide_nodes_of_cluster): - pass - - -# Cell - -class ResultsDirectoryReader(): - def __init__(self, results_dir, condpairs_selected): - self.quantified_multicondition_comparison = QuantifiedMultiConditionComparison() #initialize empty multicomparison object - - self.__condpairs_selected = condpairs_selected - self.__localizer = ResultstableLocalizer(results_dir) - self._add_all_condpairs_to_multicondition_comparison() - - def _add_all_condpairs_to_multicondition_comparison(self): - for condpair in self.__condpairs_selected: - file = self.__localizer.condpairname2file.get(condpair) - quantified_proteins = ResultsTableReader(condpair, file).quantified_proteins - self.quantified_multicondition_comparison.add_quantified_proteins_to_comparison(quantified_proteins) - -# Cell -import pandas as pd - -class ResultsTableReader(): - def __init__(self, condpair, file): - self.quantified_proteins = [] - - self.__condpair = condpair - self.__results_df = self._read_table(file) - self.__protnames = self._get_property("protein") - self.__log2fcs = self._get_property("log2fc") - self.__p_values = self._get_property("pval") - self.__fdrs = self._get_property("fdr") - - self._init_quantified_proteins() - - def _read_table(self, file): - return pd.read_csv(file, sep = "\t") - - def _get_property(self, property): - return list(self.__results_df[property]) - - def _init_quantified_proteins(self): - for idx in range(len(self.__protnames)): - self.quantified_proteins.append(QuantifiedProteinInCondpair(condpair=self.__condpair, name = self.__protnames[idx], - log2fc= self.__log2fcs[idx], p_value=self.__p_values[idx], fdr = self.__fdrs[idx])) - - - - - -# Cell -import re -class ResultstableLocalizer(): - def __init__(self, results_dir): - self._resultsfilepaths = ResultsFiles(results_dir=results_dir).filepaths - self.condpairname2file = {} - self._load_condpairname2file() - - def _load_condpairname2file(self): - for filepath in self._resultsfilepaths: - filename = self._parse_condpairname_from_filepath(filepath) - self.condpairname2file.update({filename: filepath}) - - @staticmethod - def _parse_condpairname_from_filepath(file): - pattern = "(.*\/|^)(results.*\/)(.*)(.results.tsv)" - matched = re.search(pattern, file) - return matched.group(3) - - -import glob -class ResultsFiles(): - def __init__(self, results_dir): - self._results_dir = results_dir - self.filepaths = self.__get_result_filepaths() - - def __get_result_filepaths(self): - return glob.glob(f'{self._results_dir}/*.results.tsv') - diff --git a/alphaquant/ptm/phospho_inference.py b/alphaquant/ptm/phospho_inference.py deleted file mode 100644 index 851d2c7f..00000000 --- a/alphaquant/ptm/phospho_inference.py +++ /dev/null @@ -1,27 +0,0 @@ -import os -import pathlib -import pandas as pd -import alphaquant.cluster.outlier_scoring as aqoutlier -import alphaquant.resources.database_loader as aq_resource_dbloader - -def get_inferred_phospho_peptides(results_dir, cond1, cond2): - outlier_handler = aqoutlier.OutlierHandler(results_dir = results_dir, cond1 = cond1, cond2 = cond2) - clusterdiff_list = outlier_handler.get_diffclust_overview_list() - predicted_phosphoprone_sequences = aq_resource_dbloader.load_dl_predicted_phosphoprone_sequences() - inferred_phospho_peptides = get_regulation_inferred_phosphoprone_peptides(predicted_phosphoprone_sequences, clusterdiff_list) - return inferred_phospho_peptides - - - - -def get_regulation_inferred_phosphoprone_peptides(phosphoprone_seqs, clusterdiff_list): - regulation_inferred_phosphoprone_peptides = [] - for clusterdiff in clusterdiff_list: - cluster_is_phosphoprone = False - for seq in clusterdiff.outlier_peptide_names: - if seq in phosphoprone_seqs: - cluster_is_phosphoprone = True - break - if cluster_is_phosphoprone: - regulation_inferred_phosphoprone_peptides.extend(clusterdiff.outlier_peptide_names) - return set(regulation_inferred_phosphoprone_peptides) From d482eea5f1229fb85b69fcaa9673c1598a715837 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Mon, 25 May 2026 10:05:11 +0200 Subject: [PATCH 2/3] Remove dead plotting functions Co-Authored-By: Claude Sonnet 4.6 --- alphaquant/plotting/base_functions.py | 914 +----------------- alphaquant/plotting/classify.py | 16 - alphaquant/plotting/fcviz_medianref.py | 8 - alphaquant/plotting/multicond.py | 47 - alphaquant/plotting/pairwise.py | 38 - alphaquant/ui/dashboad_parts_plots_basic.py | 7 - .../ui/dashboard_parts_plots_proteoforms.py | 22 - 7 files changed, 2 insertions(+), 1050 deletions(-) diff --git a/alphaquant/plotting/base_functions.py b/alphaquant/plotting/base_functions.py index 579272ef..762a3162 100644 --- a/alphaquant/plotting/base_functions.py +++ b/alphaquant/plotting/base_functions.py @@ -86,73 +86,6 @@ def plot_pvals(result_df): plt.plot(x, x) plt.show() -# Cell -from scipy.stats import norm -import matplotlib.pyplot as plt - -def plot_bgdist(bgdist): - fc2counts_rescaled = tranform_fc2count_to_fc_space(bgdist.fc2counts, bgdist.cumulative[-1],1/100.0) - - plt.bar(list(fc2counts_rescaled.keys()), fc2counts_rescaled.values(),width=0.01,color='g') - axes2 = plt.twinx() - x = np.linspace(-4, 4, 1000) - axes2.plot(x, norm.pdf(x, 0, bgdist.SD)/1.15) - axes2.set_ylim(0.0, 0.4) - plt.show() - -def tranform_fc2count_to_fc_space(fc2counts, num_fcs, rescale_factor): - fc2counts_fcscales = {} - for fc, count in fc2counts.items(): - fc2counts_fcscales[fc*rescale_factor] = count/num_fcs - - return fc2counts_fcscales - -# Cell - -import matplotlib.pyplot as plt -from scipy import stats -import itertools - -def plot_withincond_fcs(normed_intensity_df, cut_extremes = True): - """takes a normalized intensity dataframe and plots the fold change distribution between all samples. Column = sample, row = ion""" - - samplecombs = list(itertools.combinations(normed_intensity_df.columns, 2)) - - for spair in samplecombs:#compare all pairs of samples - s1 = spair[0] - s2 = spair[1] - diff_fcs = normed_intensity_df[s1].to_numpy() - normed_intensity_df[s2].to_numpy() #calculate fold changes by subtracting log2 intensities of both samples - - if cut_extremes: - cutoff = max(abs(np.nanquantile(diff_fcs,0.025)), abs(np.nanquantile(diff_fcs, 0.975))) #determine 2.5% - 97.5% interval, i.e. remove extremes - range = (-cutoff, cutoff) - else: - range = None - plt.hist(diff_fcs,80,density=True, histtype='step',range=range) #set the cutoffs to focus the visualization - plt.xlabel("log2 peptide fcs") - - plt.show() - -# Cell -from anytree.exporter import DotExporter -import anytree - -def get_iontree_img(root, protein,saveloc = None): - protein_node = anytree.findall_by_attr(root, protein, maxlevel=2)[0] - exporter = DotExporter(protein_node, nodenamefunc=lambda n: f"{get_nodename(n)}\ncluster{n.cluster}\n{n.is_included}") - exporter.to_picture(saveloc) - -def get_nodename(node): - parentname = node.parent.name - shortened_name = node.name.replace(parentname, "") - return shortened_name - -# Cell - -import matplotlib.pyplot as plt -from scipy import stats - - # Cell import matplotlib.pyplot as plt def scatter_df_columns(merged_df, log_axes = False): @@ -171,57 +104,6 @@ def scatter_df_columns(merged_df, log_axes = False): plt.yscale('log') plt.show() -# Cell -import matplotlib.pyplot as plt -def plot_cumhist_dfcols(merged_df): - col = (0.2, 0.4, 0.6, 0.4) - ref_columns = list(filter(lambda x : "_ref" in x, merged_df.columns.to_list())) #filter the reference columns from the merged df - - for ref in ref_columns: - compare = ref.replace("_ref", "") - plt.hist(merged_df[ref], 100, density=True, histtype='step', label='reference') - plt.hist(merged_df[compare], 100, density=True, histtype='step',label='compare') - corr = merged_df[ref].corr(merged_df[compare]) - plt.title(f"{ref} vs. {compare} corr {corr}") - plt.show() - -# Cell - -# Cell -import matplotlib.pyplot as plt -import seaborn as sns - -def plot_fold_change(df, key1, key2): - to_plot = df.copy() - to_plot[f'Ratio ({key1}/{key2})'] = np.log2(to_plot[key1] / to_plot[key2]) - to_plot[f'Inten_{key1}'] = np.log10(to_plot[key1]) - - species = 'Human' - val = to_plot.loc[to_plot['species']==species, f'Ratio ({key1}/{key2})'].values - val = val[~np.isnan(val)&~np.isinf(val)&~np.isneginf(val)] - print(f'Species={species}, n={len(val)}, median={np.median(val)}, dev={np.std(val)}') - species='Ecoli' - val = to_plot.loc[to_plot['species']==species, f'Ratio ({key1}/{key2})'].values - val = val[~np.isnan(val)&~np.isinf(val)&~np.isneginf(val)] - print(f'species={species}, n={len(val)}, median={np.median(val)}, dev={np.std(val)}') - - plt.figure(figsize=(7,7)) - ax = sns.scatterplot(x=f'Ratio ({key1}/{key2})', y=f'Inten_{key1}', hue="species", data=to_plot, alpha=0.5) - plt.title('Fold Change') - plt.xlim([-4.5, 6.5]) - #plt.ylim([6,11.5]) - plt.show() - - -# Cell - -import matplotlib.pyplot as plt - -import numpy as np - - - - # Cell import anytree import time @@ -251,263 +133,6 @@ def get_melted_protein_ion_intensity_table(protein, diffresults_df, normed_df, s print(f"times melted protein intensities:\n t_melted: {t_melted - t_start} \n t_annotated: {t_annotated - t_melted}") return df_melted, diffresults_line -# Cell - -import time -def get_betweencond_fcs_table(melted_df, c1, c2, ion_header = "quant_id"): - t_start = time.time() - has_clust_info = "is_included" in melted_df.columns - melted_df = melted_df.set_index(["condition"]) - sorted_ions = melted_df[aq_variables.QUANT_ID] - c1_df = melted_df.loc[c1] - c2_df = melted_df.loc[c2] - ions = set(c1_df[ion_header]).intersection(set(c2_df[ion_header])) - sorted_ions = [x for x in sorted_ions if x in ions] - c1_df = c1_df.set_index([ion_header]).sort_index() - c2_df = c2_df.set_index([ion_header]).sort_index() - result_ions = [] - result_fcs = [] - result_included = [] - t_localized_index_set = time.time() - for ion in sorted_ions: - is_included = c1_df.loc[ion]["is_included"][0] if has_clust_info else True - - ions1 = c1_df.loc[[ion]]["intensity"] - ions2 = c2_df.loc[[ion]]["intensity"] - fcs = [x-y for x,y in itertools.product(ions1, ions2)] - result_ions.extend([ion for x in range(len(fcs))]) - result_fcs.extend(fcs) - result_included.extend([is_included for x in range(len(fcs))]) - t_list_created = time.time() - res_df = pd.DataFrame({ion_header: result_ions, "log2fc": result_fcs, "is_included" : result_included}) - print(f"times betweencond fcs table:\n t_localized_index_set: {t_localized_index_set - t_start} \n t_list_created: {t_list_created - t_localized_index_set}") - return res_df - -# Cell - -def beeswarm_ion_plot(df_melted, diffresults_protein, only_boxplot = False,saveloc = None): - """takes pre-formatted long-format dataframe which contains all ion intensities for a given protein. - Columns are QUANT_ID, "intensity", "condition". Also takes results of the protein differential analysis as a series - to annotate the plot""" - - #get annotations from diffresults - fdr = float(diffresults_protein.at["fdr"]) - log2fc = float(diffresults_protein.at["log2fc"]) - protein = diffresults_protein.name - - #define greyscale color palette for the two conditions - pal2 = [(0.94, 0.94, 0.94),(1.0, 1.0, 1.0)] - - #searborn standard functions - if only_boxplot: - ax = sns.boxplot(x=aq_variables.QUANT_ID, y="intensity", hue="condition", data=df_melted, palette="Set2", dodge=True) - else: - ax = sns.boxplot(x=aq_variables.QUANT_ID, y="intensity", hue="condition", data=df_melted, palette=pal2) - ax = sns.stripplot(x=aq_variables.QUANT_ID, y="intensity", hue="condition", data=df_melted, palette="Set2", dodge=True)#size = 10/len(protein_df.index) - - #annotate and format - handles, labels = ax.get_legend_handles_labels() - - l = plt.legend(handles[2:4], labels[2:4]) - - plt.xticks(rotation=90) - if "gene" in diffresults_protein.index: - gene = diffresults_line.at["gene"] - plt.title(f"{gene} ({protein}) FDR: {fdr:.1e}") - else: - plt.title(f"{protein} FDR: {fdr:.1e} log2FC: {log2fc:.1f}") - if saveloc is not None: - plt.savefig(saveloc) - - plt.show() - - -# Cell - -import seaborn as sns -import matplotlib.pyplot as plt - -def foldchange_ion_plot(df_melted, diffresults_protein, saveloc = None): - """takes pre-formatted long-format dataframe which contains all between condition fold changes. All ions of a given protein - are visualized, the columns are aq_variables.QUANT_ID and "log2fc". Also takes results of the protein differential analysis as a series - to annotate the plot""" - #get annotations from diffresults - fdr = float(diffresults_protein.at["fdr"]) - protein = diffresults_protein.name - - #define greyscale color palette - pal2 = [(0.94, 0.94, 0.94),(1.0, 1.0, 1.0)] - - #specify color for included proteins - my_pal = {row[aq_variables.QUANT_ID]: "lightblue" if row["is_included"] else "grey" for _,row in df_melted.iterrows()} - - #plot with seaborn standard functions - ax = sns.boxplot(x=aq_variables.QUANT_ID, y="log2fc", data=df_melted, color = "white") - ax = sns.stripplot(x=aq_variables.QUANT_ID, y="log2fc", data=df_melted, palette= my_pal) - - #annotate and format - handles, labels = ax.get_legend_handles_labels() - ax.axhline(y = 0, color='black', linewidth=2, alpha=.7, linestyle = "dashed") - l = plt.legend(handles[2:4], labels[2:4]) - plt.xticks(rotation=90) - if "gene" in diffresults_protein.index: - gene = diffresults_line.at["gene"] - plt.title(f"{gene} ({protein}) FDR: {fdr:.1e}") - else: - plt.title(f"{protein} FDR: {fdr:.1e}") - if saveloc is not None: - plt.savefig(saveloc) - - plt.show() - - - -# Cell -import numpy.ma as ma -import scipy.cluster.hierarchy as hierarchy - -def compare_direction(array1, array2): - identical_elements = array1 == array2 - num_same_direction = np.sum(identical_elements) - return num_same_direction - -def compare_correlation(array1, array2): - corr = ma.corrcoef(ma.masked_invalid(array1), ma.masked_invalid(array2))[0][1] - return corr - -def get_condensed_distance_matrix(arrays, compare_function): - - res = np.ones(int(len(arrays) * (len(arrays)-1)/2)) - count = 0 - for i in range(len(arrays)): - for j in range(i+1, len(arrays)): - array1 = arrays[i] - array2 = arrays[j] - distance = 1/compare_function(array1, array2) - res[count] = distance - count+=1 - - return res - -def clustersort_numerical_arrays(arrays, names , cluster_method ='average',compare_function = compare_direction): - condensed_distance_matrix = get_condensed_distance_matrix(arrays, compare_function) - linkage_matrix = hierarchy.linkage(condensed_distance_matrix, method = cluster_method) - sorted_array_idxs = hierarchy.leaves_list(linkage_matrix) - - sorted_array = [arrays[x] for x in sorted_array_idxs] - sorted_names = [names[x] for x in sorted_array_idxs] - return sorted_array, sorted_names, linkage_matrix - -# Cell -import numpy as np -def compare_direction(array1, array2): - identical_elements = array1 == array2 - num_same_direction = np.sum(identical_elements) - return num_same_direction - -# Cell -import numpy.ma as ma -def compare_correlation(array1, array2): - corr = ma.corrcoef(ma.masked_invalid(array1), ma.masked_invalid(array2))[0][1] - return corr - -# Cell - -import scipy.cluster.hierarchy as hierarchy -import scipy.spatial.distance as dist -def clustersort_numerical_arrays(arrays, names , cluster_method ='average',compare_function = compare_direction): - #condensed_distance_matrix = get_condensed_distance_matrix(arrays, compare_function) - condensed_distance_matrix = dist.pdist(arrays, lambda u, v: 1/(compare_function(u,v)+1)) - linkage_matrix = hierarchy.linkage(condensed_distance_matrix, method = cluster_method) - sorted_array_idxs = hierarchy.leaves_list(linkage_matrix) - - sorted_array = np.array([arrays[x] for x in sorted_array_idxs]) - sorted_names = np.array([names[x] for x in sorted_array_idxs]) - return sorted_array, sorted_names, linkage_matrix - -# Cell - -import pandas as pd -import numpy as np -import os -def get_clustered_dataframe(overview_df, cluster_method ='average',compare_function = compare_direction, clust_rows = True, clust_columns = True): - - df_numbered = overview_df.select_dtypes(include=np.number) - contains_floats = ['float' in str(x) for x in df_numbered.dtypes] - type = 'float' if True in contains_floats else 'int' - df_numbered = df_numbered.astype(type) #ensure that the df has no mixed types - - rows = df_numbered.to_numpy() - rownames = list(df_numbered.index) - colnames = list(df_numbered.columns) - - if clust_rows: - if(len(rownames)>10000): - print(f"large number of rows, skipping cluster step of rows to avoid long runtime.") - else: - print(f"clustering on {len(rownames)} rows") - rows, rownames, _ = clustersort_numerical_arrays(rows, rownames, cluster_method, compare_function) - if clust_columns: - if(len(colnames)>10000): - print(f"large number of columns, skipping cluster step of columns to avoid long runtime") - else: - print(f"clustering on {len(colnames)} columns") - columns, colnames,_ = clustersort_numerical_arrays(rows.T, colnames, cluster_method, compare_function) - rows = columns.T - print("finished clustering") - df_clustered = pd.DataFrame(rows, index= rownames) - df_clustered.columns = colnames - return df_clustered - -# Cell -import re -import os -def get_sample_overview_dataframe(results_folder = os.path.join(".", "results"), regulated_object = "protein",condpairs_to_compare = [], show_fcs = False, name_transformation_function = None): - """ - goes through the results folder and extracts up- and downregulated genes for each (specified) condition comparison - """ - - if len(condpairs_to_compare) == 0: - condpairs_to_compare = [f.replace(".results.tsv", "").split(aq_variables.CONDITION_PAIR_SEPARATOR) for f in os.listdir(results_folder) if re.match(r'.*results.tsv', f)] - - dfs = [] - count = 0 - for row in condpairs_to_compare: - c1 = row[0] - c2 = row[1] - results = get_diffresult_dataframe(c1, c2, results_folder) - if(type(results) == type(None)): - continue - site_df = results - prot2fc = dict(zip(site_df[regulated_object], site_df["log2fc"])) - positive_sites = list(set(site_df[(site_df["fdr"]<0.05) & (site_df["log2fc"]>0.5)][regulated_object])) - negative_sites = list(set(site_df[(site_df["fdr"]<0.05) & (site_df["log2fc"]<-0.5)][regulated_object])) - - if show_fcs: - df_local = pd.DataFrame([[prot2fc.get(x) for x in positive_sites]+[prot2fc.get(x) for x in negative_sites]],columns=positive_sites+negative_sites) - else: - df_local = pd.DataFrame([[1 for x in positive_sites]+[-1 for x in negative_sites]],columns=positive_sites+negative_sites) - df_local["condpair"] = utils.get_condpairname([c1, c2]) - #df_loc["num_regulated"] = len(positive_sites) + len(negative_sites) - dfs.append(df_local) - #print(count) - count+=1 - - result_df = pd.concat(dfs) - result_df = result_df.replace(np.nan, 0).set_index("condpair") - if name_transformation_function is not None: - result_df = name_transformation_function(result_df) - - return result_df - -def reformat_dataframe_header(df): - header_genes = [x.split("_")[0] for x in df.columns] - header_proteins = [x.split("_")[1] for x in df.columns] - header_sitenums = [x.split("_")[2] for x in df.columns] - - header_df = pd.DataFrame.from_dict(data= {'gene_name': header_genes, 'protein' : header_proteins, 'site_id': header_sitenums}, columns=df.columns, orient='index') - return pd.concat([header_df, df]) - # Cell import pandas as pd import os @@ -532,31 +157,6 @@ def get_diffresult_dataframe(cond1, cond2, results_folder = os.path.join(".", "r return diffprots -# Cell -import alphaquant.utils.utils as aqutils -import alphaquant.diffquant.diffutils as aqdiffutils -import numpy as np - -def get_diffresult_dict_ckg_format(cond1, cond2, results_folder = os.path.join(".", "results")): - """ - ckg wrapper, reads the results dataframe for a given condpair and reformats it to the ckg volcano plot input format - """ - result_df = get_diffresult_dataframe(cond1, cond2, results_folder) - log2fcs = result_df["log2fc"].to_numpy() - logged_fdrs = result_df["-log10fdr"].to_numpy() - min_fdr = np.min(logged_fdrs) - result_ckg_dict = {} - result_ckg_dict[("volcano", aqutils.get_condpairname([cond1, cond2]))] = {"x": log2fcs, "y" : np.array([1.2,3.3,1,3,4,5,6]), "pvalue" : min_fdr, "text" : "testext", "color" : "grey",'is_samr':False, 'annotations' : []} - return result_ckg_dict - -# Cell -import pandas as pd -def subset_normed_peptides_df_to_condition(cond, sample2cond_df, normed_df): - columns_to_keep = set(sample2cond_df[sample2cond_df["condition"]==cond]["sample"]).intersection(set(normed_df.columns)) - columns_to_drop = set(normed_df.columns) - columns_to_keep - subset_df = normed_df.drop(columns = columns_to_drop) - return subset_df - # Cell import pandas as pd import os @@ -712,421 +312,10 @@ def plot_volcano_plotly( return fig -# Cell -import pandas as pd -import numpy as np -import plotly.graph_objects as go -import matplotlib.pyplot as plt -import itertools - -def plot_withincond_fcs_plotly( - normed_intensity_df, - title, - cut_extremes = True -): - """takes a normalized intensity dataframe and plots the fold change distribution between all samples. Column = sample, row = ion""" - fig = go.Figure() - samplecombs = list(itertools.combinations(normed_intensity_df.columns, 2)) - cutoff_common = 0 - for spair in samplecombs:#compare all pairs of samples - s1 = spair[0] - s2 = spair[1] - diff_fcs = normed_intensity_df[s1].to_numpy() - normed_intensity_df[s2].to_numpy() #calculate fold changes by subtracting log2 intensities of both samples - - if cut_extremes: - cutoff_cur = max(abs(np.nanquantile(diff_fcs,0.025)), abs(np.nanquantile(diff_fcs, 0.975))) #determine 2.5% - 97.5% interval, i.e. remove extremes - if cutoff_cur and cutoff_cur > cutoff_common: - cutoff_common = cutoff_cur - - n, bins, _ = plt.hist(diff_fcs, 250, density=True, histtype='step') - fig.add_trace( - go.Scatter( - x=bins, - y=n, - mode='lines', - line=dict( - shape='hvh', - width = 0.5 - ) - ) - ) - fig.update_layout( - xaxis = dict( - title="log2 peptide fcs", - range=(-cutoff_common, cutoff_common) - ), - barmode='stack', - template='plotly_white', - showlegend=False, - title = dict( - text=title, - font=dict(size=16, color='black', family='Arial, sans-serif'), - y=0.91, - x=0.5, - xanchor='center', - yanchor='middle', - ), - height=400, - width=870, - ) - return fig - -# Cell -import pandas as pd -import numpy as np -import plotly.graph_objects as go -import matplotlib.pyplot as plt - -def plot_betweencond_fcs_plotly( - df_c1_normed, - df_c2_normed, - title, - merge_samples = True -): - """takes normalized intensity dataframes of each condition and plots the distribution of direct peptide fold changes between conditions""" - fig = go.Figure() - if merge_samples: #samples can be merged to median intensity - df_c1_normed = df_c1_normed.median(axis = 1, skipna = True).to_frame() - df_c2_normed = df_c2_normed.median(axis = 1, skipna = True).to_frame() - - both_idx = df_c1_normed.index.intersection(df_c2_normed.index) - df1 = df_c1_normed.loc[both_idx] - df2 = df_c2_normed.loc[both_idx] - cutoff_common = 0 - - fig.add_vline( - x=0, - line_width=1, - line_dash="dash", - line_color="red" - ) #the data is normalized around 0, draw in helper line - - for col1 in df1.columns: - for col2 in df2.columns: - diff_fcs = df1[col1].to_numpy() - df2[col2].to_numpy() #calculate fold changes by subtracting log2 intensities of both conditions - cutoff_cur = max(abs(np.nanquantile(diff_fcs,0.025)), abs(np.nanquantile(diff_fcs, 0.975))) #determine 2.5% - 97.5% interval, i.e. remove extremes - if cutoff_cur and cutoff_cur > cutoff_common: - cutoff_common = cutoff_cur - - n, bins, _ = plt.hist(diff_fcs, 100, density=True, histtype='step') - fig.add_trace( - go.Scatter( - x=bins, - y=n, - mode='lines', - line=dict( - shape='hvh' - ) - ) - ) - - fig.update_layout( - xaxis = dict( - title="log2(fc)", - range=(-cutoff_common, cutoff_common) - ), - barmode='stack', - template='plotly_white', - showlegend=False, - title = dict( - text=title, - y=0.90, - x=0.5, - xanchor='center', - yanchor='middle', - ), - height=400, - width=870, - ) - return fig - -# Cell -def beeswarm_ion_plot_plotly( - df_melted, - diffresults_protein, -): - """takes pre-formatted long-format dataframe which contains all ion intensities for a given protein. - Columns are QUANT_ID, "intensity", "condition". Also takes results of the protein differential analysis as a series - to annotate the plot""" - - fig = go.Figure() - - #get annotations from diffresults - fdr = float(diffresults_protein.at["fdr"]) - fc = float(diffresults_protein.at["log2fc"]) - protein = diffresults_protein.name - - for cond, color, line_color in zip(df_melted.condition.unique(), ['#FF7F0E', '#2CA02C'], ['#808080', '#a6a6a6']): - data = df_melted[df_melted.condition == cond] - - fig.add_trace( - go.Box( - x=data.ion, - y=data.intensity, - boxpoints='all', - name=cond, - line=dict( - color=line_color - ), - marker=dict( - color=color, - opacity=0.3 - ), - pointpos=0 - ) - ) - - - if "gene" in diffresults_protein.index: - gene = diffresults_line.at["gene"] - title = f"{gene} ({protein}) FDR: {fdr:.1e} log2FC: {fc:.2f}" - else: - title = f"{protein} FDR: {fdr:.1e} log2FC: {fc:.2f}" - - fig.update_layout( - xaxis_title='ion', - yaxis_title='intensity', - boxmode='group', # group together boxes of the different traces for each value of x - template='plotly_white', - title = dict( - text=title, - #y=0.85, - x=0.5, - xanchor='center', - yanchor='middle', - ), - titlefont=dict( - size=14, - color='black', - family='Arial, sans-serif' - ), - legend=dict( - title='conditions:', - orientation="h", - yanchor="bottom", - y=1.03, - xanchor="right", - x=1 - ), - height=300, - ) - - return fig - -# Cell -from matplotlib.pyplot import cm -import anytree - -def foldchange_ion_plot_plotly( - df_melted, - diffresults_protein, - protein_node = None, - level = 'seq' -): - """takes pre-formatted long-format dataframe which contains all between condition fold changes. All ions of a given protein - are visualized, the columns are QUANT_ID and "log2fc". Also takes results of the protein differential analysis as a series - to annotate the plot""" - - fig = go.Figure() - - #get annotations from diffresults - fdr = float(diffresults_protein.at["fdr"]) - fc = float(diffresults_protein.at["log2fc"]) - protein = diffresults_protein.name - - fig.add_hline( - y=0, - line_width=2, - opacity=.7, - line_dash="dash", - line_color="black" - ) - - if protein_node is not None: - clust2col, clust2ions = get_color2ions(protein_node, level) - else: - clust2col = {-1 : 'lightblue'} - clust2ions = {-1 : [x for x in df_melted[aq_variables.QUANT_ID].drop_duplicates()]} - - - - for clust in sorted(clust2ions.keys()): - ions = clust2ions.get(clust) - color = clust2col.get(clust) - - df_subset = df_melted[[x in ions for x in df_melted[aq_variables.QUANT_ID]]] - - fig.add_trace( - go.Box( - x=df_subset.ion, - y=df_subset.log2fc, - - boxpoints='all', - line=dict( - color=color - ), - marker=dict( - color=color, - opacity=0.7 - ), - pointpos=0, - name = clust - ) - ) - - if "gene" in diffresults_protein.index: - gene = diffresults_line.at["gene"] - title = f"{gene} ({protein}) FDR: {fdr:.1e} log2FC: {fc:.2f}" - else: - title = f"{protein} FDR: {fdr:.1e} log2FC: {fc:.2f}" - - fig.update_layout( - xaxis_title='ion', - yaxis_title='log2fc', - template='plotly_white', - title = dict( - text=title, - y=0.85, - x=0.5, - xanchor='center', - yanchor='middle', - ), - titlefont=dict( - size=14, - color='black', - family='Arial, sans-serif' - ), - height=300, - ) - - return fig - -def get_color2ions(protein_node, level): - clust2ions = {} - - relevant_subnodes = anytree.findall(protein_node,filter_ = lambda x : x.type == level) - excluded_leaves = [] - - for subnode in relevant_subnodes: - clust = subnode.cluster - clust2ions[clust] = clust2ions.get(clust, []) + [x.name for x in subnode.leaves if subnode.type in x.inclusion_levels] - excluded_leaves.extend([x.name for x in subnode.leaves if subnode.type not in x.inclusion_levels]) - - colors= cm.rainbow(np.linspace(0, 1, len(clust2ions.keys()))) - clust2col = {clust:convert_to_plotly_color_format(col) for clust, col in zip(sorted(clust2ions.keys()), colors)} - clust2ions[-1] = excluded_leaves - clust2col[-1] = 'lightgrey' - #color2ions = {convert_to_plotly_color_format(clust2col.get(x)) : clust2ions.get(x) for x in clust2ions.keys()} - - return clust2col, clust2ions - -def convert_to_plotly_color_format(color_array): - return f"rgb({int(color_array[0]*250)},{int(color_array[1]*250)},{int(color_array[2]*250)})" - - -# Cell - -import seaborn as sns -def make_mz_fc_boxplot(merged_df, ion_nodes): - merged_df = merged_df.copy() - ion2fc = {node.name:node.fc for node in ion_nodes} - merged_df["log2fc"] = [ion2fc.get(ion) for ion in merged_df[aq_variables.QUANT_ID]] - #merged_df = merged_df[merged_df["log2fc"] >0] - #merged_df = merged_df[merged_df["EG.ApexRT"] <80] - transf_fc = list(2**merged_df["log2fc"]) - merged_df["fc"] = transf_fc - merged_df['mz_cathegory'] = [assign_mz_cathegory(mz) for mz in merged_df["FG.PrecMz"]] - sns.boxplot(x = 'mz_cathegory', y = "log2fc",data = merged_df) - plt.ylim(-2, 2) - plt.show() - - -def assign_mz_cathegory(mz_val): - if mz_val<400: - return 400 - if mz_val < 500: - return 500 - if mz_val < 600: - return 600 - if mz_val < 700: - return 700 - if mz_val < 800: - return 800 - if mz_val < 900: - return 900 - if mz_val < 1000: - return 1000 - if mz_val < 1100: - return 1100 - else: - return 1200 - -# Cell -import matplotlib.pyplot as plt - -def plot_fc_histogram_ml_filtered(y_test, y_pred, fc_cutoff, show_filtered): - if show_filtered: - fcs = [y_test[i] for i in range(len(y_test)) if y_pred[i]>fc_cutoff] - if len(fcs)==0: - return - else: - fcs = [y_test[i] for i in range(len(y_test)) if y_pred[i] 0: - fcs = [y_test[i] for i in range(len(y_test)) if loss_score[i]>loss_score_cutoff] - print(f"{len(fcs)} retained of {len(y_test)}") - plt.hist(fcs, 60, density=True, histtype='step',cumulative=False) - plt.show() - plt.scatter(y_test, loss_score, color='blue', alpha=0.3) - plt.show() - # Cell from sklearn.metrics import mean_squared_error, r2_score import seaborn as sns -def scatter_ml_regression_perturbation_aware(y_test, y_pred, ionnames, nodes, results_dir = None): - y_test, y_pred = [np.array(y_test), np.array(y_pred)] - perturbed_ions = {x.name for x in nodes if hasattr(x, "perturbation_added")} - is_unperturbed_vec = [x for x in range(len(ionnames)) if ionnames[x] not in perturbed_ions] - y_unpert_test = y_test[is_unperturbed_vec] - y_unpert_pred = y_pred[is_unperturbed_vec] - - fig_perturb, ax_perturb = plt.subplots() - - - sns.regplot(x = y_unpert_test, y = y_unpert_pred, scatter_kws=dict(alpha=0.1), ax = ax_perturb) - - if len(perturbed_ions)>0: - is_perturbed_vec = [x for x in range(len(ionnames)) if ionnames[x] in perturbed_ions] - y_pert_test = y_test[is_perturbed_vec] - y_pert_pred = y_pred[is_perturbed_vec] - sns.scatterplot(x = y_pert_test, y = y_pert_pred, alpha=0.1, label="perturbed", color = sns.color_palette()[1], ax = ax_perturb) - plot_perturbed_unperturbed_fcs(fcs_perturbed=y_pert_test,fcs_unperturbed=y_unpert_test, results_dir = results_dir) - plot_perturbed_unperturbed_fcs(fcs_perturbed=[x.fc for x in nodes if hasattr(x, "perturbation_added")], fcs_unperturbed=[x.fc for x in nodes if not hasattr(x, "perturbation_added")]) - plot_perturbation_histogram([x for x in nodes if hasattr(x, "perturbation_added")], results_dir) - - - err = mean_squared_error(y_test, y_pred) - r2 = r2_score(y_test, y_pred) - - ax_perturb.set_title(f"MSE: {err:.2f}, R2: {r2:.2f}") - - if results_dir is not None: - fig_perturb.savefig(f"{results_dir}/ml_regression.pdf") - plt.show() - - def scatter_ml_regression(y_test, y_pred, results_dir = None): fig, ax = plt.subplots() @@ -1142,13 +331,6 @@ def scatter_ml_regression(y_test, y_pred, results_dir = None): -def plot_perturbation_histogram(perturbed_nodes, results_dir): - fig, ax = plt.subplots() - perturbations = [x.perturbation_added for x in perturbed_nodes] - ax.hist(perturbations, 60, density=True, histtype='step',cumulative=True) - if results_dir is not None: - ax.figure.savefig(f"{results_dir}/perturbation_histogram.pdf") - def plot_perturbed_unperturbed_fcs(fcs_perturbed, fcs_unperturbed, results_dir = None): fig, ax = plt.subplots() ax.hist(bins = 60, x=fcs_unperturbed, label= 'unperturbed',density=True, histtype='step') @@ -1157,17 +339,6 @@ def plot_perturbed_unperturbed_fcs(fcs_perturbed, fcs_unperturbed, results_dir = ax.figure.savefig(f"{results_dir}/compare_pertubed_unperturbed.pdf") -def plot_ml_fc_histograms(y_test, y_pred, cutoff, results_dir = None): - - plot_fc_histogram_ml_filtered(y_test, y_pred, cutoff, show_filtered=False) - if results_dir is not None: - plt.savefig(f"{results_dir}/observed_offsets_ml_filtered.pdf") - plt.show() - plot_fc_histogram_ml_filtered(y_test, y_pred, cutoff, show_filtered=True) - if results_dir is not None: - plt.savefig(f"{results_dir}/observed_offsets_nofilt.pdf") - plt.show() - # Cell import seaborn as sns import numpy as np @@ -1246,89 +417,6 @@ def filter_sort_top_n(imp, names, top_n): names = [x[1] for x in tuplelist] return imp, names -# Cell - -import numpy as np -import matplotlib.pyplot as plt - -def visualize_gaussian_mixture_fit(gmm, y_pred): - - data = np.array(y_pred) ##loading univariate data. - - plt.figure() - plt.hist(data, bins=50, histtype='stepfilled', density=True, alpha=0.5) - plt.xlim(min(y_pred), max(y_pred)) - f_axis = data.copy().ravel() - f_axis.sort() - a = [] - weight_mean_cov = list(zip(gmm.weights_, gmm.means_, gmm.covariances_)) - weight_mean_cov.sort(key = lambda x : x[0], reverse=True) - for weight, mean, covar in weight_mean_cov: - a.append(weight*norm.pdf(f_axis, mean, np.sqrt(covar)).ravel()) - plt.plot(f_axis, a[-1]) - plt.plot(f_axis, np.array(a).sum(axis =0), 'k-') - plt.xlabel('Variable') - plt.ylabel('PDF') - plt.tight_layout() - plt.show() - -# Cell - -import numpy as np -import matplotlib.pyplot as plt - -def visualize_gaussian_nomix_subfit(mean, var, y_pred, y_subset): - - data = np.array(y_pred) ##loading univariate data. - - plt.figure() - plt.hist(data, bins=50, histtype='stepfilled', density=True, alpha=0.5) - plt.xlim(min(y_pred), max(y_pred)) - f_axis = data.copy().ravel() - f_axis.sort() - a = [] - weight = len(y_subset)/len(y_pred) - - a.append(weight*norm.pdf(f_axis, mean, np.sqrt(var)).ravel()) - plt.plot(f_axis, a[-1]) - plt.plot(f_axis, np.array(a).sum(axis =0), 'k-') - plt.xlabel('Variable') - plt.ylabel('PDF') - plt.tight_layout() - plt.show() - -# Cell -import matplotlib.pyplot as plt - -def visualize_filtered_non_filtered_precursors(all_precursors, ml_score = None): - - fcs_unfilt = [x.fc for x in all_precursors] - if ml_score is not None: - fcs_filt = [x.fc for x in all_precursors if abs(x.ml_score)_changed in order to be recognized by the state manager""" - self._update_fc_visualizer() - if self.protein_input.value: - self._update_protein_plot(self.protein_input.value) - def _on_show_plots_clicked(self, event): """Handle show plots button click.""" if self.cond1 and self.cond2: diff --git a/alphaquant/ui/dashboard_parts_plots_proteoforms.py b/alphaquant/ui/dashboard_parts_plots_proteoforms.py index 2ed1a9dd..68542ba0 100644 --- a/alphaquant/ui/dashboard_parts_plots_proteoforms.py +++ b/alphaquant/ui/dashboard_parts_plots_proteoforms.py @@ -2,7 +2,6 @@ import panel as pn import pandas as pd import os -import itertools import glob import re @@ -293,27 +292,6 @@ def _on_protein_selected(self, event): self.proteoform_plot_pane.clear() self.proteoform_plot_pane.append(pn.pane.Markdown("### Click 'Plot Protein' to visualize the data")) - def _update_condition_pairs_from_df(self, df): - """Update condition pairs based on the samplemap DataFrame.""" - if 'condition' in df.columns: - unique_conditions = df['condition'].dropna().unique() - pairs = [(c1, c2) for c1, c2 in itertools.permutations(unique_conditions, 2)] - pairs_str = [f"{c1}{aq_variables.CONDITION_PAIR_SEPARATOR}{c2}" for c1, c2 in pairs] - self.condpairname_select.options = ["No conditions"] + pairs_str - - def _load_protein_identifiers(self, results_file): - """Load protein identifiers from results file and update the protein input widget.""" - try: - proteoforms_df = pd.read_csv(results_file, sep='\t') - protein_ids = sorted(proteoforms_df['gene_symbol'].unique().tolist()) - self.protein_input.options = protein_ids - self.protein_input.disabled = False - return protein_ids - except Exception as e: - self.protein_input.disabled = True - self.protein_input.options = [] - raise Exception(f"Failed to load protein identifiers: {str(e)}") - def _on_proteoform_selected(self, event): """Handle proteoform selection from table.""" print(f"_on_proteoform_selected called with event: {event}") From a3ce7436b6c22d55d4687a05b47646aab602cfa6 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Mon, 25 May 2026 10:05:19 +0200 Subject: [PATCH 3/3] Remove dead utility and analysis functions Co-Authored-By: Claude Sonnet 4.6 --- alphaquant/diffquant/diffutils.py | 79 ------------------- alphaquant/norm/normalization.py | 5 -- .../quant_reader/quant_reader_manager.py | 2 - alphaquant/resources/database_loader.py | 9 --- alphaquant/utils/benchmark_utils.py | 14 ---- alphaquant/utils/diffquant_utils.py | 17 ---- alphaquant/utils/utils.py | 13 --- 7 files changed, 139 deletions(-) diff --git a/alphaquant/diffquant/diffutils.py b/alphaquant/diffquant/diffutils.py index df5d02cf..59ea900e 100644 --- a/alphaquant/diffquant/diffutils.py +++ b/alphaquant/diffquant/diffutils.py @@ -56,32 +56,6 @@ def get_samples_used_from_samplemap_df(samplemap_df, cond1, cond2): samples_c2 = samplemap_df[[cond2 == x for x in samplemap_df["condition"]]]["sample"] return list(samples_c1), list(samples_c2) -def get_all_samples_from_samplemap_df(samplemap_df): - return list(samplemap_df["sample"]) - -# Cell -import pandas as pd - -def get_samplenames_from_input_df(data): - """extracts the names of the samples of the AQ input dataframe""" - names = list(data.columns) - names.remove('protein') - names.remove(QUANT_ID) - return names - -# Cell -import numpy as np -def filter_df_to_min_valid_values(quant_df_wideformat, samples_c1, samples_c2, min_valid_values): - """filters dataframe in alphaquant format such that each column has a minimum number of replicates - """ - quant_df_wideformat = quant_df_wideformat.replace(0, np.nan) - df_c1_min_valid_values = quant_df_wideformat[samples_c1].dropna(thresh = min_valid_values, axis = 0) - df_c2_min_valid_values = quant_df_wideformat[samples_c2].dropna(thresh = min_valid_values, axis = 0) - idxs_both = df_c1_min_valid_values.index.intersection(df_c2_min_valid_values.index) - quant_df_reduced = quant_df_wideformat.iloc[idxs_both].reset_index() - return quant_df_reduced - - # Cell def get_condpairname(condpair): return f"{condpair[0]}_VS_{condpair[1]}" @@ -102,39 +76,6 @@ def make_dir_w_existcheck(dir): if not os.path.exists(dir): os.makedirs(dir) -# Cell -import os -def get_results_plot_dir_condpair(results_dir, condpair): - results_dir_plots = f"{results_dir}/{condpair}_plots" - make_dir_w_existcheck(results_dir_plots) - return results_dir_plots - -# Cell -def get_middle_elem(sorted_list): - nvals = len(sorted_list) - if nvals==1: - return sorted_list[0] - middle_idx = nvals//2 - if nvals%2==1: - return sorted_list[middle_idx] - return 0.5* (sorted_list[middle_idx] + sorted_list[middle_idx-1]) - -# Cell -import numpy as np -def get_nonna_array(array_w_nas): - res = [] - isnan_arr = np.isnan(array_w_nas) - - for idx in range(len(array_w_nas)): - sub_res = [] - sub_array = array_w_nas[idx] - na_array = isnan_arr[idx] - for idx2 in range(len(sub_array)): - if not na_array[idx2]: - sub_res.append(sub_array[idx2]) - res.append(np.array(sub_res)) - return np.array(res) - # Cell import numpy as np def get_non_nas_from_pd_df(df): @@ -152,12 +93,6 @@ def get_ionints_from_pd_df(df): } # Cell -def invert_dictionary(my_map): - inv_map = {} - for k, v in my_map.items(): - inv_map[v] = inv_map.get(v, []) + [k] - return inv_map - from collections import defaultdict def invert_tuple_list_w_nonunique_values(tuple_list): inverted_dict = defaultdict(list) @@ -373,20 +308,6 @@ def get_path_to_unformatted_file(input_file_name): -# Cell - -# Cell -import os -def check_for_processed_runs_in_results_folder(results_folder): - contained_condpairs = [] - folder_files = os.listdir(results_folder) - result_files = list(filter(lambda x: "results.tsv" in x ,folder_files)) - for result_file in result_files: - res_name = result_file.replace(".results.tsv", "") - if ((f"{res_name}.normed.tsv" in folder_files) and (f"{res_name}.results.ions.tsv" in folder_files)): - contained_condpairs.append(res_name) - return contained_condpairs - # Cell import pandas as pd import os diff --git a/alphaquant/norm/normalization.py b/alphaquant/norm/normalization.py index 1f80bb70..4a7dd726 100644 --- a/alphaquant/norm/normalization.py +++ b/alphaquant/norm/normalization.py @@ -136,11 +136,6 @@ def determine_anchor_and_shift_sample(sample2counts, i_min, j_min, min_distance) flip = 1 if anchor_idx == i_min else -1 return anchor_idx, shift_idx, flip*min_distance -# Cell -def shift_samples(samples, sampleidx2anchoridx, sample2shift): - for sample_idx in range(samples.shape[0]): - samples[sample_idx] = samples[sample_idx]+get_total_shift(sampleidx2anchoridx, sample2shift, sample_idx) - # Cell def get_total_shift(sampleidx2anchoridx, sample2shift,sample_idx): diff --git a/alphaquant/quant_reader/quant_reader_manager.py b/alphaquant/quant_reader/quant_reader_manager.py index 2f5dd4b3..c9865c36 100644 --- a/alphaquant/quant_reader/quant_reader_manager.py +++ b/alphaquant/quant_reader/quant_reader_manager.py @@ -72,5 +72,3 @@ def reformat_and_save_input_file( return outfile_name -def set_quanttable_config_location(quanttable_config_file): - config_dict_loader.INTABLE_CONFIG = quanttable_config_file diff --git a/alphaquant/resources/database_loader.py b/alphaquant/resources/database_loader.py index efdeee39..d0eb13f3 100644 --- a/alphaquant/resources/database_loader.py +++ b/alphaquant/resources/database_loader.py @@ -28,15 +28,6 @@ def get_genename2sequence_dict( organism = "human"): return gene2sequence_dict -def get_swissprot2sequence_dict( organism = "human"): - swissprot_file = get_swissprot_path(organism) - swissprot_df = pd.read_csv(swissprot_file, sep = '\t', usecols=['Entry', 'Sequence']) - swissprot_ids = swissprot_df['Entry'].astype(str).tolist() - sequences = swissprot_df['Sequence'].astype(str).tolist() - - swissprot2sequence_dict = dict(zip(swissprot_ids, sequences)) - return swissprot2sequence_dict - def get_uniprot2sequence_dict( organism = "human"): swissprot_file = get_swissprot_path(organism) swissprot_df = pd.read_csv(swissprot_file, sep = '\t', usecols=['Entry', 'Sequence']) diff --git a/alphaquant/utils/benchmark_utils.py b/alphaquant/utils/benchmark_utils.py index bb6e29c9..e69de29b 100644 --- a/alphaquant/utils/benchmark_utils.py +++ b/alphaquant/utils/benchmark_utils.py @@ -1,14 +0,0 @@ -import numpy as np -def subset_df_to_n_most_complete_proteins(proteome_df_aq_reformat, proteome_df_original, n = 100, protein_header = "PG.ProteinGroups", - protein_subset_to_use = None, use_only_complete_columns = False): - proteome_df_aq_reformat = proteome_df_aq_reformat.set_index(["protein", "quant_id"]).replace(0, np.nan) - if use_only_complete_columns: - proteome_df_aq_reformat = proteome_df_aq_reformat.dropna() - - proteome_df_aq_reformat = proteome_df_aq_reformat.reset_index() - - set_of_proteins = set(proteome_df_aq_reformat["protein"].unique()) - if protein_subset_to_use is not None: - set_of_proteins = protein_subset_to_use.intersection(set_of_proteins) - - return np.random.choice(list(set_of_proteins), n) \ No newline at end of file diff --git a/alphaquant/utils/diffquant_utils.py b/alphaquant/utils/diffquant_utils.py index 0d4f65ba..5a49abe3 100644 --- a/alphaquant/utils/diffquant_utils.py +++ b/alphaquant/utils/diffquant_utils.py @@ -1,19 +1,2 @@ import pandas as pd import numpy as np - - - -def find_non_outlier_indices_ipr(data, threshold=1.5, percentile_lower = 25, percentile_upper = 75): - - value_lower, value_upper = np.percentile(data, [percentile_lower, percentile_upper]) - iqr = value_upper - value_lower - - # Calculate the bounds for non-outliers - cut_off = iqr * threshold - lowest_tolerated_value = value_lower - cut_off - highest_tolerated_value = value_upper + cut_off - - # Identify non-outlier indices - non_outlier_indices = np.where((data >= lowest_tolerated_value) & (data <= highest_tolerated_value))[0] - - return non_outlier_indices \ No newline at end of file diff --git a/alphaquant/utils/utils.py b/alphaquant/utils/utils.py index dacc3e62..feada3e7 100644 --- a/alphaquant/utils/utils.py +++ b/alphaquant/utils/utils.py @@ -55,19 +55,6 @@ def cut_trailing_parts_seqstring(seqstring): def get_condpairname(condpair): return f"{condpair[0]}_VS_{condpair[1]}" -def get_condpair_from_condpairname(condpairname): - return condpairname.split(aq_variables.CONDITION_PAIR_SEPARATOR) - - -def convert_ion_string_to_node_type(ionstring, node_type): #for example I have a full quant_id that describes a fragment ion, I want to shorten it to the specified leve, e.g. sequence - regex = NODETYPE2REGEX[node_type] - match = re.match(regex, ionstring) - if match: - return match.group(1) - else: - raise ValueError(f"Could not match {ionstring} to {node_type}. This function only works for the following node types: seq, mod_seq, mod_seq_charge") - - def get_progress_folder_filename(input_file, file_ending, remove_extension = True): #file ending needs to include all dots, e.g. ".aq_reformat.tsv" input_file = os.path.abspath(input_file) #to make sure that the path is absolute dirname_input_file = os.path.dirname(input_file)