Skip to content

Commit fd5460e

Browse files
authored
Merge pull request #100 from MannLabs/change_minrep_semantics
Fix filtering options in GUI
2 parents f11cb4e + 794a9dd commit fd5460e

12 files changed

Lines changed: 254 additions & 130 deletions

alphaquant/diffquant/condpair_analysis.py

Lines changed: 27 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def analyze_condpair(*,runconfig, condpair):
3737
c1_samples, c2_samples = aqutils.get_samples_used_from_samplemap_df(runconfig.samplemap_df, condpair[0], condpair[1])
3838

3939
try:
40-
df_c1, df_c2 = get_per_condition_dataframes(c1_samples, c2_samples, input_df_local,runconfig.minrep_both, runconfig.minrep_either, runconfig.minrep_c1, runconfig.minrep_c2)
40+
df_c1, df_c2 = get_per_condition_dataframes(c1_samples, c2_samples, input_df_local, min_valid_values=runconfig.min_valid_values, valid_values_filter_mode=runconfig.valid_values_filter_mode, min_valid_values_c1=runconfig.min_valid_values_c1, min_valid_values_c2=runconfig.min_valid_values_c2)
4141
except Exception as e:
4242
LOGGER.info(e)
4343
return
@@ -153,53 +153,49 @@ def write_out_normed_df(normed_df_1, normed_df_2, pep2prot, results_dir, condpai
153153
merged_df.to_csv(f"{results_dir}/{aqutils.get_condpairname(condpair)}.normed.tsv", sep = "\t")
154154

155155

156-
def get_per_condition_dataframes(samples_c1, samples_c2, unnormed_df, minrep_both =None, minrep_either = None, minrep_c1 = None, minrep_c2 = None):
156+
def get_per_condition_dataframes(samples_c1, samples_c2, unnormed_df, min_valid_values, valid_values_filter_mode, min_valid_values_c1, min_valid_values_c2):
157157

158158
min_samples = min(len(samples_c1), len(samples_c2))
159159

160160
if min_samples<2:
161161
raise Exception(f"condpair has not enough samples: c1:{len(samples_c1)} c2: {len(samples_c2)}, skipping")
162162

163-
if (minrep_either is not None) or ((minrep_c1 is not None) and (minrep_c2 is not None)): #minrep_both was set as default and should be overruled by minrep_either or minrep_c1 and minrep_c2
164-
minrep_both = None
165-
166-
if minrep_either is not None:
167-
minrep_either = np.min([get_minrep_for_cond(samples_c1, minrep_either), get_minrep_for_cond(samples_c2, minrep_either)])
168-
passes_minrep_c1 = unnormed_df.loc[:, samples_c1].notna().sum(axis=1) >= minrep_either
169-
passes_minrep_c2 = unnormed_df.loc[:, samples_c2].notna().sum(axis=1) >= minrep_either
170-
passes_minrep_either = passes_minrep_c1 | passes_minrep_c2
171-
unnormed_df = unnormed_df[passes_minrep_either]
163+
if valid_values_filter_mode == "either":
164+
min_valid_values = np.min([get_min_valid_values_for_cond(samples_c1, min_valid_values), get_min_valid_values_for_cond(samples_c2, min_valid_values)])
165+
passes_min_valid_values_c1 = unnormed_df.loc[:, samples_c1].notna().sum(axis=1) >= min_valid_values
166+
passes_min_valid_values_c2 = unnormed_df.loc[:, samples_c2].notna().sum(axis=1) >= min_valid_values
167+
passes_min_valid_values = passes_min_valid_values_c1 | passes_min_valid_values_c2
168+
unnormed_df = unnormed_df[passes_min_valid_values]
172169
df_c1 = unnormed_df.loc[:, samples_c1]
173170
df_c2 = unnormed_df.loc[:, samples_c2]
174171

172+
elif valid_values_filter_mode == "both":
173+
min_valid_values_c1 = get_min_valid_values_for_cond(samples_c1, min_valid_values)
174+
min_valid_values_c2 = get_min_valid_values_for_cond(samples_c2, min_valid_values)
175+
df_c1 = unnormed_df.loc[:, samples_c1].dropna(thresh=min_valid_values_c1, axis=0)
176+
df_c2 = unnormed_df.loc[:, samples_c2].dropna(thresh=min_valid_values_c2, axis=0)
177+
178+
elif valid_values_filter_mode == "per_condition":
179+
min_valid_values_c1 = get_min_valid_values_for_cond(samples_c1, min_valid_values_c1)
180+
min_valid_values_c2 = get_min_valid_values_for_cond(samples_c2, min_valid_values_c2)
181+
df_c1 = unnormed_df.loc[:, samples_c1].dropna(thresh=min_valid_values_c1, axis=0)
182+
df_c2 = unnormed_df.loc[:, samples_c2].dropna(thresh=min_valid_values_c2, axis=0)
183+
else:
184+
raise Exception(f"invalid value set for the variable valid_values_filter_mode: {valid_values_filter_mode}, please ensure that is set to: 'either', 'both' or 'per_condition'")
175185

176-
elif minrep_both is not None:
177-
minrep_c1 = minrep_both
178-
minrep_c2 = minrep_both
179-
180-
if (minrep_c1 is not None) and (minrep_c2 is not None):
181-
minrep_c1 = get_minrep_for_cond(samples_c1, minrep_c1)
182-
minrep_c2 = get_minrep_for_cond(samples_c2, minrep_c2)
183-
df_c1 = unnormed_df.loc[:, samples_c1].dropna(thresh=minrep_c1, axis=0)
184-
df_c2 = unnormed_df.loc[:, samples_c2].dropna(thresh=minrep_c2, axis=0)
185-
if (len(df_c1.index)<5) | (len(df_c2.index)<5):
186-
raise Exception(f"condpair has not enough data for processing c1: {len(df_c1.index)} c2: {len(df_c2.index)}, skipping")
187-
188-
if (minrep_both is None) and (minrep_either is None) and (minrep_c1 is None) and (minrep_c2 is None):
189-
raise Exception("no minrep set, please specify!")
190-
191-
186+
if (len(df_c1.index)<5) | (len(df_c2.index)<5):
187+
raise Exception(f"condpair has not enough data for processing c1: {len(df_c1.index)} c2: {len(df_c2.index)}, skipping")
192188

193189
return df_c1, df_c2
194190

195-
def get_minrep_for_cond(c_samples, minrep):
196-
if minrep is None: #in the case of None, no nans will be allowed
191+
def get_min_valid_values_for_cond(c_samples, min_valid_values):
192+
if min_valid_values is None: #in the case of None, no nans will be allowed
197193
return None
198194
num_samples = len(c_samples)
199-
if num_samples<minrep:
195+
if num_samples<min_valid_values:
200196
return num_samples
201197
else:
202-
return minrep
198+
return min_valid_values
203199

204200

205201

alphaquant/diffquant/diffutils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,13 @@ def get_samplenames_from_input_df(data):
6969

7070
# Cell
7171
import numpy as np
72-
def filter_df_to_minrep(quant_df_wideformat, samples_c1, samples_c2, minrep):
72+
def filter_df_to_min_valid_values(quant_df_wideformat, samples_c1, samples_c2, min_valid_values):
7373
"""filters a dataframe in alphaquant format such that each retained row has at least min_valid_values non-missing values in each of the two conditions (zeros are treated as missing)
7474
"""
7575
quant_df_wideformat = quant_df_wideformat.replace(0, np.nan)
76-
df_c1_minrep = quant_df_wideformat[samples_c1].dropna(thresh = minrep, axis = 0)
77-
df_c2_minrep = quant_df_wideformat[samples_c2].dropna(thresh = minrep, axis = 0)
78-
idxs_both = df_c1_minrep.index.intersection(df_c2_minrep.index)
76+
df_c1_min_valid_values = quant_df_wideformat[samples_c1].dropna(thresh = min_valid_values, axis = 0)
77+
df_c2_min_valid_values = quant_df_wideformat[samples_c2].dropna(thresh = min_valid_values, axis = 0)
78+
idxs_both = df_c1_min_valid_values.index.intersection(df_c2_min_valid_values.index)
7979
quant_df_reduced = quant_df_wideformat.iloc[idxs_both].reset_index()
8080
return quant_df_reduced
8181

alphaquant/ptm/ptmsite_mapping.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ def initialize_ptmsite_df(ptmsite_file, samplemap_file):
673673
ptmsite_df = pd.read_csv(ptmsite_file, sep = "\t")
674674
return ptmsite_df, samplemap_df
675675

676-
def detect_site_occupancy_change(cond1, cond2, ptmsite_df ,samplemap_df, minrep = 2, threshold_prob = 0.05):
676+
def detect_site_occupancy_change(cond1, cond2, ptmsite_df ,samplemap_df, min_valid_values = 2, threshold_prob = 0.05):
677677
"""
678678
uses a PTMsite df with headers "REFPROT", "gene","site", and headers for sample1, sample2, etc and determines
679679
whether a site appears/disappears between conditions based on some probability threshold
@@ -708,7 +708,7 @@ def detect_site_occupancy_change(cond1, cond2, ptmsite_df ,samplemap_df, minrep
708708
numrep_c1 = len(cond1_vals)
709709
numrep_c2 = len(cond2_vals)
710710

711-
if(numrep_c1<minrep) | (numrep_c2 < minrep):
711+
if(numrep_c1<min_valid_values) | (numrep_c2 < min_valid_values):
712712
continue
713713

714714
cond1_prob = np.mean(cond1_vals)
@@ -741,7 +741,7 @@ def detect_site_occupancy_change(cond1, cond2, ptmsite_df ,samplemap_df, minrep
741741
import numpy as np
742742
import re
743743

744-
def check_site_occupancy_changes_all_diffresults(results_folder = os.path.join(".","results"), siteprobs_filename = "siteprobs.tsv",samplemap_file = "samples.map",condpairs_to_compare = [], threshold_prob = 0.05, minrep = 2):
744+
def check_site_occupancy_changes_all_diffresults(results_folder = os.path.join(".","results"), siteprobs_filename = "siteprobs.tsv",samplemap_file = "samples.map",condpairs_to_compare = [], threshold_prob = 0.05, min_valid_values = 2):
745745

746746
samplemap_df, _ = get_sample2cond_dataframe(samplemap_file)
747747
ptmsite_map = os.path.join(results_folder, siteprobs_filename)
@@ -765,7 +765,7 @@ def check_site_occupancy_changes_all_diffresults(results_folder = os.path.join("
765765
ptmsite_df_cpair = ptmsite_df_cpair.sort_index()
766766

767767
condpairname = utils.get_condpairname(condpair)
768-
df_occupancy = detect_site_occupancy_change(cond1, cond2, ptmsite_df_cpair, samplemap_df, minrep = minrep, threshold_prob = threshold_prob)
768+
df_occupancy = detect_site_occupancy_change(cond1, cond2, ptmsite_df_cpair, samplemap_df, min_valid_values = min_valid_values, threshold_prob = threshold_prob)
769769
df_occupancy.to_csv(os.path.join(results_folder, f"{condpairname}.ptm_occupancy_changes.tsv"), sep = "\t", index = None)
770770

771771

alphaquant/run_pipeline.py

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@ def run_pipeline(input_file: str,
4545
multicond_median_analysis: bool = False,
4646
condpairs_list: Optional[List[Tuple[str, str]]] = None,
4747
file_has_alphaquant_format: bool = False,
48-
minrep_both: int = 2,
49-
minrep_either: Optional[int] = None,
50-
minrep_c1: Optional[int] = None,
51-
minrep_c2: Optional[int] = None,
48+
min_valid_values: int = 2,
49+
valid_values_filter_mode: str = "either", #options: "either", "both", "per_condition"
50+
min_valid_values_c1: int = 0,
51+
min_valid_values_c2: int = 0,
5252
min_num_ions: int = 1,
5353
minpep: int = 1,
5454
organism: Optional[str] = None,
@@ -71,7 +71,12 @@ def run_pipeline(input_file: str,
7171
protein_subset_for_normalization_file: Optional[str] = None,
7272
protnorm_peptides: bool = True,
7373
peptides_to_exclude_file: Optional[str] = None,
74-
reset_progress_folder: bool = False) -> None:
74+
reset_progress_folder: bool = False,
75+
minrep_both: Optional[int] = None, #deprecated
76+
minrep_either: Optional[int] = None, #deprecated
77+
minrep_c1: Optional[int] = None, #deprecated
78+
minrep_c2: Optional[int] = None, #deprecated
79+
) -> None:
7580
"""Run differential analyses following the AlphaQuant pipeline. This function processes proteomics data through multiple steps including
7681
preprocessing, if applicable PTM site mapping, if applicable median condition creation, normalization, statistical testing, visualizations
7782
and writing of results tables.
@@ -86,10 +91,13 @@ def run_pipeline(input_file: str,
8691
multicond_median_analysis (bool): Whether to compare all conditions to a median condition. Defaults to False.
8792
condpairs_list (list): Specific condition pairs to compare. If None, performs all pairwise comparisons.
8893
file_has_alphaquant_format (bool): Whether the input file is already in AlphaQuant matrix format. Defaults to False.
89-
minrep_both (int): Minimum replicate count required in both conditions. Defaults to 2.
90-
minrep_either (int): Minimum replicate count required in either condition.
91-
minrep_c1 (int): Minimum replicate count required in condition 1.
92-
minrep_c2 (int): Minimum replicate count required in condition 2.
94+
min_valid_values (int): Minimum number of valid values required across conditions. Defaults to 2.
95+
valid_values_filter_mode (str): Strategy for filtering based on valid values. Options:
96+
- "either": Include features that have at least 'min_valid_values' valid values in at least one condition.
97+
- "both": Include only features that have at least 'min_valid_values' valid values in all conditions.
98+
- "per_condition": Include only features that have at least 'min_valid_values_c1' valid values in condition 1 and 'min_valid_values_c2' valid values in condition 2.
99+
min_valid_values_c1 (int): Minimum number of valid values required specifically in condition 1.
100+
min_valid_values_c2 (int): Minimum number of valid values required specifically in condition 2.
93101
min_num_ions (int): Minimum number of ions required per peptide. Defaults to 1.
94102
minpep (int): Minimum number of peptides required per protein. Defaults to 1.
95103
organism (str): Organism name for PTM mapping (e.g., 'human', 'mouse'). Required if perform_ptm_mapping is True.
@@ -115,6 +123,27 @@ def run_pipeline(input_file: str,
115123
reset_progress_folder (bool): Clear and recreate the progress folder. Defaults to False.
116124
"""
117125
LOGGER.info("Starting AlphaQuant")
126+
127+
#########################################################
128+
# TODO: this backwards compatibility can be removed at the beginning of 2026
129+
# to ensure backwards compatibility: in case the minrep parameters are set, we need to convert them to the min_valid_values and valid_values_filter_mode parameters
130+
if minrep_both is not None:
131+
min_valid_values = minrep_both
132+
valid_values_filter_mode = "both"
133+
LOGGER.warning("you set the parameter 'minrep_both', which is deprecated. Please use 'min_valid_values' and 'valid_values_filter_mode' instead.")
134+
if minrep_either is not None:
135+
min_valid_values = minrep_either
136+
valid_values_filter_mode = "either"
137+
LOGGER.warning("you set the parameter 'minrep_either', which is deprecated. Please use 'min_valid_values' and 'valid_values_filter_mode' instead.")
138+
if minrep_c1 is not None and minrep_c2 is not None:
139+
min_valid_values_c1 = minrep_c1
140+
min_valid_values_c2 = minrep_c2
141+
valid_values_filter_mode = "per_condition"
142+
LOGGER.warning("you set the parameter 'minrep_c1' and 'minrep_c2', which is deprecated. Please use 'min_valid_values_c1' and 'min_valid_values_c2' instead.")
143+
#########################################################
144+
145+
146+
118147
input_file_original = input_file
119148
check_input_consistency(input_file_original, samplemap_file, samplemap_df)
120149
create_progress_folder_if_applicable(input_file_original, reset_progress_folder)

alphaquant/ui/dashboad_parts_plots_basic.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,12 @@ def _extract_condpairs(self):
176176
self.condpairname_select.options = ["No conditions"]
177177
return
178178

179-
pattern = os.path.join(self.results_dir, "*_VS_*.results.tsv")
179+
# Ensure directory path ends with separator for Windows compatibility
180+
dir_path = self.results_dir
181+
if not dir_path.endswith(os.sep):
182+
dir_path += os.sep
183+
184+
pattern = os.path.join(dir_path, "*_VS_*.results.tsv")
180185
files = glob.glob(pattern)
181186

182187
for f in files:

0 commit comments

Comments
 (0)