Remove dead utility and analysis functions

ammarcsj · claude · ammarcsj · commit a3ce7436b6c2 · 2026-05-25T10:05:19.000+02:00
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/alphaquant/diffquant/diffutils.py b/alphaquant/diffquant/diffutils.py
@@ -56,32 +56,6 @@ def get_samples_used_from_samplemap_df(samplemap_df, cond1, cond2):
     samples_c2 = samplemap_df[[cond2 == x for x in samplemap_df["condition"]]]["sample"]
     return list(samples_c1), list(samples_c2)
 
-def get_all_samples_from_samplemap_df(samplemap_df):
-    return list(samplemap_df["sample"])
-
-# Cell
-import pandas as pd
-
-def get_samplenames_from_input_df(data):
-    """extracts the names of the samples of the AQ input dataframe"""
-    names = list(data.columns)
-    names.remove('protein')
-    names.remove(QUANT_ID)
-    return names
-
-# Cell
-import numpy as np
-def filter_df_to_min_valid_values(quant_df_wideformat, samples_c1, samples_c2, min_valid_values):
-    """filters dataframe in alphaquant format such that each column has a minimum number of replicates
-    """
-    quant_df_wideformat = quant_df_wideformat.replace(0, np.nan)
-    df_c1_min_valid_values = quant_df_wideformat[samples_c1].dropna(thresh = min_valid_values, axis = 0)
-    df_c2_min_valid_values = quant_df_wideformat[samples_c2].dropna(thresh = min_valid_values, axis = 0)
-    idxs_both = df_c1_min_valid_values.index.intersection(df_c2_min_valid_values.index)
-    quant_df_reduced = quant_df_wideformat.iloc[idxs_both].reset_index()
-    return quant_df_reduced
-
-
 # Cell
 def get_condpairname(condpair):
     return f"{condpair[0]}_VS_{condpair[1]}"
@@ -102,39 +76,6 @@ def make_dir_w_existcheck(dir):
     if not os.path.exists(dir):
         os.makedirs(dir)
 
-# Cell
-import os
-def get_results_plot_dir_condpair(results_dir, condpair):
-    results_dir_plots = f"{results_dir}/{condpair}_plots"
-    make_dir_w_existcheck(results_dir_plots)
-    return results_dir_plots
-
-# Cell
-def get_middle_elem(sorted_list):
-    nvals = len(sorted_list)
-    if nvals==1:
-        return sorted_list[0]
-    middle_idx = nvals//2
-    if nvals%2==1:
-        return sorted_list[middle_idx]
-    return 0.5* (sorted_list[middle_idx] + sorted_list[middle_idx-1])
-
-# Cell
-import numpy as np
-def get_nonna_array(array_w_nas):
-    res = []
-    isnan_arr = np.isnan(array_w_nas)
-
-    for idx in range(len(array_w_nas)):
-        sub_res = []
-        sub_array = array_w_nas[idx]
-        na_array = isnan_arr[idx]
-        for idx2 in range(len(sub_array)):
-            if not na_array[idx2]:
-               sub_res.append(sub_array[idx2])
-        res.append(np.array(sub_res))
-    return np.array(res)
-
 # Cell
 import numpy as np
 def get_non_nas_from_pd_df(df):
@@ -152,12 +93,6 @@ def get_ionints_from_pd_df(df):
     }
 
 # Cell
-def invert_dictionary(my_map):
-    inv_map = {}
-    for k, v in my_map.items():
-        inv_map[v] = inv_map.get(v, []) + [k]
-    return inv_map
-
 from collections import defaultdict
 def invert_tuple_list_w_nonunique_values(tuple_list):
     inverted_dict = defaultdict(list)
@@ -373,20 +308,6 @@ def get_path_to_unformatted_file(input_file_name):
 
 
 
-# Cell
-
-# Cell
-import os
-def check_for_processed_runs_in_results_folder(results_folder):
-    contained_condpairs = []
-    folder_files = os.listdir(results_folder)
-    result_files = list(filter(lambda x: "results.tsv" in x ,folder_files))
-    for result_file in result_files:
-        res_name = result_file.replace(".results.tsv", "")
-        if ((f"{res_name}.normed.tsv" in folder_files) and (f"{res_name}.results.ions.tsv" in folder_files)):
-            contained_condpairs.append(res_name)
-    return contained_condpairs
-
 # Cell
 import pandas as pd
 import os
diff --git a/alphaquant/norm/normalization.py b/alphaquant/norm/normalization.py
@@ -136,11 +136,6 @@ def determine_anchor_and_shift_sample(sample2counts, i_min, j_min, min_distance)
     flip = 1 if anchor_idx == i_min else -1
     return anchor_idx, shift_idx, flip*min_distance
 
-# Cell
-def shift_samples(samples, sampleidx2anchoridx, sample2shift):
-    for sample_idx in range(samples.shape[0]):
-        samples[sample_idx] = samples[sample_idx]+get_total_shift(sampleidx2anchoridx, sample2shift, sample_idx)
-
 # Cell
 def get_total_shift(sampleidx2anchoridx, sample2shift,sample_idx):
 
diff --git a/alphaquant/quant_reader/quant_reader_manager.py b/alphaquant/quant_reader/quant_reader_manager.py
@@ -72,5 +72,3 @@ def reformat_and_save_input_file(
     return outfile_name
 
 
-def set_quanttable_config_location(quanttable_config_file):
-    config_dict_loader.INTABLE_CONFIG = quanttable_config_file
diff --git a/alphaquant/resources/database_loader.py b/alphaquant/resources/database_loader.py
@@ -28,15 +28,6 @@ def get_genename2sequence_dict( organism = "human"):
 
     return gene2sequence_dict
 
-def get_swissprot2sequence_dict( organism = "human"):
-    swissprot_file = get_swissprot_path(organism)
-    swissprot_df = pd.read_csv(swissprot_file, sep = '\t', usecols=['Entry', 'Sequence'])
-    swissprot_ids = swissprot_df['Entry'].astype(str).tolist()
-    sequences = swissprot_df['Sequence'].astype(str).tolist()
-
-    swissprot2sequence_dict = dict(zip(swissprot_ids, sequences))
-    return swissprot2sequence_dict
-
 def get_uniprot2sequence_dict( organism = "human"):
     swissprot_file = get_swissprot_path(organism)
     swissprot_df = pd.read_csv(swissprot_file, sep = '\t', usecols=['Entry', 'Sequence'])
diff --git a/alphaquant/utils/benchmark_utils.py b/alphaquant/utils/benchmark_utils.py
@@ -1,14 +0,0 @@
-import numpy as np
-def subset_df_to_n_most_complete_proteins(proteome_df_aq_reformat, proteome_df_original, n = 100, protein_header = "PG.ProteinGroups", 
-                                          protein_subset_to_use = None, use_only_complete_columns = False):
-    proteome_df_aq_reformat = proteome_df_aq_reformat.set_index(["protein", "quant_id"]).replace(0, np.nan)
-    if use_only_complete_columns:
-        proteome_df_aq_reformat = proteome_df_aq_reformat.dropna()
-    
-    proteome_df_aq_reformat = proteome_df_aq_reformat.reset_index()
-
-    set_of_proteins = set(proteome_df_aq_reformat["protein"].unique())
-    if protein_subset_to_use is not None:
-        set_of_proteins = protein_subset_to_use.intersection(set_of_proteins)
-    
-    return np.random.choice(list(set_of_proteins), n)
diff --git a/alphaquant/utils/diffquant_utils.py b/alphaquant/utils/diffquant_utils.py
@@ -1,19 +1,2 @@
 import pandas as pd
 import numpy as np
-
-
-
-def find_non_outlier_indices_ipr(data, threshold=1.5, percentile_lower = 25, percentile_upper = 75):
-    
-    value_lower, value_upper = np.percentile(data, [percentile_lower, percentile_upper])
-    iqr = value_upper - value_lower
-
-    # Calculate the bounds for non-outliers
-    cut_off = iqr * threshold
-    lowest_tolerated_value = value_lower - cut_off
-    highest_tolerated_value = value_upper + cut_off
-
-    # Identify non-outlier indices
-    non_outlier_indices = np.where((data >= lowest_tolerated_value) & (data <= highest_tolerated_value))[0]
-
-    return non_outlier_indices
diff --git a/alphaquant/utils/utils.py b/alphaquant/utils/utils.py
@@ -55,19 +55,6 @@ def cut_trailing_parts_seqstring(seqstring):
 def get_condpairname(condpair):
     return f"{condpair[0]}_VS_{condpair[1]}"
 
-def get_condpair_from_condpairname(condpairname):
-    return condpairname.split(aq_variables.CONDITION_PAIR_SEPARATOR)
-
-
-def convert_ion_string_to_node_type(ionstring, node_type): #for example I have a full quant_id that describes a fragment ion, I want to shorten it to the specified leve, e.g. sequence
-    regex = NODETYPE2REGEX[node_type]
-    match = re.match(regex, ionstring)
-    if match:
-        return match.group(1)
-    else:
-        raise ValueError(f"Could not match {ionstring} to {node_type}. This function only works for the following node types: seq, mod_seq, mod_seq_charge")
-
-
 def get_progress_folder_filename(input_file, file_ending, remove_extension = True): #file ending needs to include all dots, e.g. ".aq_reformat.tsv"
     input_file = os.path.abspath(input_file) #to make sure that the path is absolute
     dirname_input_file = os.path.dirname(input_file)

Original file line number	Diff line number	Diff line change
`@@ -72,5 +72,3 @@ def reformat_and_save_input_file(`
`72`	`72`	`return outfile_name`
`73`	`73`
`74`	`74`
`75`		`-def set_quanttable_config_location(quanttable_config_file):`
`76`		`- config_dict_loader.INTABLE_CONFIG = quanttable_config_file`