Skip to content

Commit cb9306d

Browse files
committed
pmgen similarity threshold benchmark review #1
1 parent 85ca2bf commit cb9306d

4 files changed

Lines changed: 107 additions & 19 deletions

File tree

PANDORA/PANDORA/Pandora/Modelling_functions.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,8 @@ def score_peptide_alignment(target, template, substitution_matrix='PAM30'):
490490
return aligned.score
491491

492492

493-
def find_template(target, database, best_n_templates=1, benchmark=False,
493+
def find_template(target, database, best_n_templates=1,
494+
benchmark=False, benchmark_similarity_threshold=None, # added for pmgen benchmarking # added after review --> similarity threshold
494495
blastdb=PANDORA.PANDORA_data + '/BLAST_databases/templates_blast_db/templates_blast_db'):
495496
''' Selects the template structure that is best suited as template for homology modelling of the target
496497
@@ -620,6 +621,31 @@ def find_template(target, database, best_n_templates=1, benchmark=False,
620621
putative_templates = {k: v for k, v in putative_templates.items() if
621622
len(database.MHCII_data[k].anchors) == 4}
622623

624+
# Added for benchmark of similarity, # added after review --> similarity threshold
625+
similarity_info = None
626+
if benchmark and benchmark_similarity_threshold is not None:
627+
score_key = class_variables[2] # 'M_score' or 'Avg_score'
628+
# Build (id, similarity_fraction) list for what's left after target removal
629+
sim_list = [(k, v[score_key] / 100.0) for k, v in putative_templates.items()
630+
if score_key in v]
631+
if len(sim_list) == 0:
632+
raise Exception('No putative templates with similarity scores after target removal.')
633+
min_similarity = min(s for _, s in sim_list)
634+
below = [(k, s) for k, s in sim_list if s <= benchmark_similarity_threshold]
635+
at_least_one_below = len(below) > 0
636+
if at_least_one_below:
637+
keep_ids = set(k for k, _ in below)
638+
else:
639+
# all above threshold -> take the lowest-similarity ones (up to best_n_templates)
640+
sim_list_sorted = sorted(sim_list, key=lambda x: x[1])
641+
keep_ids = set(k for k, _ in sim_list_sorted[:best_n_templates])
642+
putative_templates = {k: v for k, v in putative_templates.items() if k in keep_ids}
643+
similarity_info = {
644+
'min_similarity': min_similarity,
645+
'at_least_one_below_threshold': at_least_one_below,
646+
'all_similarities': dict(sim_list), # for later lookup of selected templates
647+
}
648+
623649
# For both chains
624650
# Sort for average score
625651
putative_templates = sorted(putative_templates.items(),
@@ -650,7 +676,7 @@ def find_template(target, database, best_n_templates=1, benchmark=False,
650676

651677
templates = [getattr(database, class_variables[1])[tmpl] for tmpl in template_id]
652678
keep_IL = any(check_target_template(target, tmpl) for tmpl in templates)
653-
return templates, scores, keep_IL
679+
return templates, scores, keep_IL, similarity_info # added after review --> similarity threshold
654680

655681

656682
def write_ini_script(target, template, alignment_file, output_dir, clip_C_domain=False):

PANDORA/PANDORA/Pandora/Pandora.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,14 @@ def __init__(self, target, database=None, template=None, no_modelling=False): #
7171
self.no_modelling_output_dict = {}
7272
self.keep_IL = False
7373
self.logfile = f'{self.target.output_dir}/{target.id}.log'
74+
self.similarity_info = None # added after review --> similarity threshold
7475

7576
if database is None and template is None:
7677
raise Exception('Provide a Database object so Pandora can find the best suitable template structure for '
7778
'modelling. Alternatively, you can specify a user defined Template object.')
7879

7980

80-
def find_template(self, best_n_templates=1, benchmark=False, verbose=True):
81+
def find_template(self, best_n_templates=1, benchmark=False, benchmark_similarity_threshold=None, verbose=True,): # added after review --> similarity threshold
8182
''' Find the best template structure given a Target object
8283
8384
Args:
@@ -106,10 +107,11 @@ def find_template(self, best_n_templates=1, benchmark=False, verbose=True):
106107
print('\tLooking for a template...')
107108
# Find the best template. If the target already exists in the database,
108109
# also consider the initial loop model as a model
109-
self.template, self.pept_ali_scores, self.keep_IL = Modelling_functions.find_template(self.target,
110-
self.database,
111-
best_n_templates=best_n_templates,
112-
benchmark=benchmark)
110+
self.template, self.pept_ali_scores, self.keep_IL, self.similarity_info = Modelling_functions.find_template(self.target,
111+
self.database,
112+
best_n_templates=best_n_templates,
113+
benchmark=benchmark,
114+
benchmark_similarity_threshold=benchmark_similarity_threshold) # added after review --> similarity threshold
113115
self.target.templates = [i.id for i in self.template]
114116
if verbose:
115117
print('\tSelected template structure (%s): %s' %(len(self.template), [i.id for i in self.template]))
@@ -378,7 +380,8 @@ def __log(self, target_id, template_id, error, verbose=True):
378380

379381
def model(self, n_loop_models=20, n_homology_models=1,
380382
best_n_templates=1, n_jobs=None, loop_refinement='slow', pickle_out=False,
381-
benchmark=False, verbose=True, helix=False, sheet=False,
383+
benchmark=False, benchmark_similarity_threshold=None, # added after review --> similarity threshold
384+
verbose=True, helix=False, sheet=False,
382385
RMSD_atoms=['C', 'CA', 'N', 'O'], clip_C_domain=False, restraints_stdev=False):
383386
'''Wrapper function that combines all modelling steps.
384387
@@ -438,7 +441,7 @@ def model(self, n_loop_models=20, n_homology_models=1,
438441
# Find the best template structure given the Target
439442
if self.template==None:
440443
try:
441-
self.find_template(best_n_templates=best_n_templates, benchmark=benchmark, verbose=verbose)
444+
self.find_template(best_n_templates=best_n_templates, benchmark=benchmark, benchmark_similarity_threshold=benchmark_similarity_threshold, verbose=verbose) # added after review --> similarity threshold
442445
except:
443446
self.__log(self.target.id, 'None', 'Could not find a template')
444447
raise Exception('Could not find a template')

run_PMGen.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ def main():
5555
default='AFfine/af_params/params_finetune/params/model_ft_mhc_20640.pkl',
5656
help='Path to fine-tuned model')
5757
parser.add_argument('--benchmark', action='store_true', help='Enable benchmarking')
58+
parser.add_argument('--benchmark_similarity_threshold', type=float, default=0.95,help='When --benchmark is set, exclude templates with MHC sequence similarity '
59+
'above this fraction (0-1). Default: 0.95.') # added after review --> similarity threshold
5860
parser.add_argument('--best_n_templates', type=int, default=4, help='Best N templates')
5961
parser.add_argument('--n_homology_models', type=int, default=1, help='Number of homology models')
6062
parser.add_argument('--max_ram', type=int, default=3, help='Maximum RAM GB per job (only for parallel mode)')
@@ -209,8 +211,9 @@ def main():
209211
fine_tuned_model_path=args.fine_tuned_model_path,
210212
benchmark=args.benchmark, best_n_templates=args.best_n_templates,
211213
n_homology_models=args.n_homology_models, pandora_force_run=args.no_pandora,
212-
no_modelling=args.initial_guess, return_all_outputs=args.return_all_outputs)
213-
if args.run == 'parallel' and not args.only_protein_mpnn and not args.only_mutation_screen:
214+
no_modelling=args.initial_guess, return_all_outputs=args.return_all_outputs,
215+
benchmark_similarity_threshold=args.benchmark_similarity_threshold,) # added after review --> similarity threshold
216+
if args.run == 'parallel' and not args.only_protein_mpnn and not args.only_mutation_screen:
214217
runner.run_wrapper_parallel(max_ram=args.max_ram, max_cores=args.max_cores, run_alphafold=args.no_alphafold)
215218
elif args.run == 'single' and not args.only_protein_mpnn and not args.only_mutation_screen:
216219
runner.run_wrapper(run_alphafold=args.no_alphafold)
@@ -244,7 +247,8 @@ def main():
244247
benchmark=args.benchmark, best_n_templates=args.best_n_templates,
245248
n_homology_models=args.n_homology_models,
246249
pandora_force_run=args.no_pandora,
247-
return_all_outputs=args.return_all_outputs)
250+
return_all_outputs=args.return_all_outputs,
251+
benchmark_similarity_threshold=args.benchmark_similarity_threshold,) # added after review --> similarity threshold
248252
if not args.only_protein_mpnn and not args.only_mutation_screen:
249253
runner.run_PMGen(run_alphafold=args.no_alphafold)
250254
else:

run_utils.py

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
3232
fine_tuned_model_path='AFfine/af_params/params_finetune/params/model_ft_mhc_20640.pkl',
3333
benchmark=False, n_homology_models=1, best_n_templates=4,
3434
pandora_force_run=True, no_modelling=False,
35-
return_all_outputs=False):
35+
return_all_outputs=False, benchmark_similarity_threshold=0.95): # added after review --> similarity threshold
3636
"""
3737
Initializes the PMGen modeling pipeline.
3838
@@ -56,6 +56,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
5656
pandora_force_run (bool): Whether to force-run PANDORA or not, default=True.
5757
no_modelling (bool): If active, no modeller homology modeling happens and only PANDORA is used for template search and alignment.
5858
return_all_outputs (bool): If active, all alphafold outputs are saved.
59+
benchmark_similarity_threshold (float): Only used during benchmarking; excludes sequences above this similarity threshold.
5960
"""
6061
super().__init__()
6162
self.peptide = peptide
@@ -77,6 +78,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
7778
self.pandora_force_run = pandora_force_run
7879
self.no_modelling = no_modelling
7980
self.return_all_outputs = return_all_outputs
81+
self.benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
8082
self.input_assertion()
8183
if len(self.models) > 1:
8284
print(f'\n #### Warning! You are running for multiple models {self.models}'
@@ -167,9 +169,25 @@ def run_pandora(self, force_run=True):
167169
use_netmhcpan=self.predict_anchor, anchors=anchor)
168170
case = Pandora.Pandora(target, self.db, no_modelling=self.no_modelling)
169171
case.model(n_loop_models=self.num_templates, benchmark=self.benchmark,
170-
n_homology_models=self.n_homology_models,
171-
best_n_templates=self.best_n_templates)
172+
benchmark_similarity_threshold=(self.benchmark_similarity_threshold if self.benchmark else None), # added after review --> similarity threshold
173+
n_homology_models=self.n_homology_models,
174+
best_n_templates=self.best_n_templates)
172175
print("Pandora modeling completed successfully.")
176+
# Persist similarity info for later CSV aggregation # added after review --> similarity threshold
177+
if self.benchmark and getattr(case, 'similarity_info', None) is not None:
178+
selected_ids = [t.id[:4] for t in case.template]
179+
sims = case.similarity_info.get('all_similarities', {})
180+
template_sims = [(tid, sims.get(tid[:4], None)) for tid in selected_ids]
181+
info = {
182+
'id': self.id,
183+
'min_similarity': case.similarity_info['min_similarity'],
184+
'at_least_one_below_threshold': case.similarity_info['at_least_one_below_threshold'],
185+
'n_templates_used': len(selected_ids),
186+
'template_similarities': template_sims, # list of (template_id, similarity_fraction)
187+
}
188+
json_path = os.path.join(self.pandora_output, self.id, 'benchmark_similarity.json')
189+
with open(json_path, 'w') as fjs:
190+
json.dump(info, fjs)
173191
except Exception as e:
174192
print(f"❌ An error occurred during template engineering {self.id}: {str(e)}", file=sys.stderr)
175193
raise
@@ -380,7 +398,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
380398
alphafold_param_folder='AFfine/af_params/params_original/',
381399
fine_tuned_model_path='AFfine/af_params/params_finetune/params/model_ft_mhc_20640.pkl',
382400
max_ram_per_job=3, num_cpu=1, benchmark=False, best_n_templates=1, n_homology_models=1,
383-
pandora_force_run=True, no_modelling=False, return_all_outputs=False):
401+
pandora_force_run=True, no_modelling=False, return_all_outputs=False, benchmark_similarity_threshold=0.95): # added after review --> similarity threshold
384402
"""
385403
Initializes the run_PMGen_wrapper class.
386404
:param df: pandas DataFrame containing input data. Required columns:
@@ -405,6 +423,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
405423
:param no_modelling (bool): If active, no modeller homology modeling happens and only PANDORA is used for template search and alignment.
406424
:param pandora_force_run (bool): If active, PANDORA will be forced to run even if files already exist.
407425
:param return_all_outputs (bool): If active, all alphafold outputs are saved.
426+
:param benchmark_similarity_threshold (float): Only used during benchmarking; excludes sequences above this similarity threshold.
408427
The function `input_assertion()` checks if all inputs are correctly formatted and whether required files and directories exist.
409428
410429
Raises:
@@ -425,6 +444,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
425444
self.pandora_force_run = pandora_force_run
426445
self.no_modelling = no_modelling
427446
self.return_all_outputs = return_all_outputs
447+
self.benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
428448
self.input_assertion()
429449

430450
def run_wrapper(self, run_alphafold=True):
@@ -444,7 +464,7 @@ def run_wrapper(self, run_alphafold=True):
444464
fine_tuned_model_path=self.fine_tuned_model_path, benchmark=self.benchmark,
445465
n_homology_models=self.n_homology_models, best_n_templates=self.best_n_templates,
446466
pandora_force_run=self.pandora_force_run, no_modelling=self.no_modelling,
447-
return_all_outputs=self.return_all_outputs)
467+
return_all_outputs=self.return_all_outputs, benchmark_similarity_threshold=self.benchmark_similarity_threshold,) # added after review --> similarity threshold
448468
runner.run_PMGen(run_alphafold=False)
449469
input_df = pd.read_csv(runner.alphafold_input_file, sep='\t', header=0)
450470
input_df['targetid'] = [str(row['id']) + '/' + str(row['id'])] # id/id
@@ -453,6 +473,7 @@ def run_wrapper(self, run_alphafold=True):
453473
alphafold_out = self.output_dir + '/alphafold'
454474
pd.concat(INPUT_DF).to_csv(f'{alphafold_out}/alphafold_input_file.tsv', sep='\t', index=False)
455475
runner.run_alphafold(input_file=f'{alphafold_out}/alphafold_input_file.tsv', output_prefix=alphafold_out + '/')
476+
self._aggregate_benchmark_similarity_csv() # added after review --> similarity threshold
456477

457478

458479
def get_available_memory(self):
@@ -478,7 +499,7 @@ def process_row(self, row):
478499
fine_tuned_model_path=self.fine_tuned_model_path, benchmark=self.benchmark,
479500
n_homology_models=self.n_homology_models, best_n_templates=self.best_n_templates,
480501
pandora_force_run=self.pandora_force_run, no_modelling=self.no_modelling,
481-
return_all_outputs=self.return_all_outputs)
502+
return_all_outputs=self.return_all_outputs, benchmark_similarity_threshold=self.benchmark_similarity_threshold,) # added after review --> similarity threshold
482503
runner.run_PMGen(run_alphafold=False)
483504
input_df = pd.read_csv(runner.alphafold_input_file, sep='\t', header=0)
484505
input_df['targetid'] = [str(row['id']) + '/' + str(row['id'])] # id/id
@@ -537,7 +558,41 @@ def run_wrapper_parallel(self, max_ram=3, max_cores=4, run_alphafold=True):
537558
return_all_outputs=self.return_all_outputs)
538559
if run_alphafold:
539560
runner.run_alphafold(input_file=f'{alphafold_out}/alphafold_input_file.tsv', output_prefix=alphafold_out + '/')
540-
561+
self._aggregate_benchmark_similarity_csv() # added after review --> similarity threshold
562+
563+
def _aggregate_benchmark_similarity_csv(self):
564+
if not self.benchmark:
565+
return
566+
rows = []
567+
max_t = 0
568+
for _, row in self.df.iterrows():
569+
json_path = os.path.join(self.output_dir, 'pandora', str(row['id']), 'benchmark_similarity.json')
570+
if not os.path.exists(json_path):
571+
continue
572+
with open(json_path) as fjs:
573+
info = json.load(fjs)
574+
max_t = max(max_t, len(info['template_similarities']))
575+
rows.append(info)
576+
if not rows:
577+
return
578+
out_rows = []
579+
for info in rows:
580+
d = {
581+
'id': info['id'],
582+
'min_similarity': info['min_similarity'],
583+
'at_least_one_below_threshold': info['at_least_one_below_threshold'],
584+
'n_templates_used': info['n_templates_used'],
585+
}
586+
for i in range(max_t):
587+
if i < len(info['template_similarities']):
588+
tid, sim = info['template_similarities'][i]
589+
d[f'template_{i+1}_id'] = tid
590+
d[f'template_{i+1}_similarity'] = sim
591+
else:
592+
d[f'template_{i+1}_id'] = None
593+
d[f'template_{i+1}_similarity'] = None
594+
out_rows.append(d)
595+
pd.DataFrame(out_rows).to_csv(os.path.join(self.output_dir, 'benchmark_similarity.csv'), index=False)
541596

542597
def input_assertion(self):
543598
"""

0 commit comments

Comments
 (0)