Skip to content

Commit cb9306d

Browse files
committed
pmgen similarity threshold benchmark review #1
1 parent 85ca2bf commit cb9306d

4 files changed

Lines changed: 107 additions & 19 deletions

File tree

PANDORA/PANDORA/Pandora/Modelling_functions.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,8 @@ def score_peptide_alignment(target, template, substitution_matrix='PAM30'):
490490
return aligned.score
491491

492492

493-
def find_template(target, database, best_n_templates=1, benchmark=False,
493+
def find_template(target, database, best_n_templates=1,
494+
benchmark=False, benchmark_similarity_threshold=None, # added for pmgen benchmarking # added after review --> similarity threshold
494495
blastdb=PANDORA.PANDORA_data + '/BLAST_databases/templates_blast_db/templates_blast_db'):
495496
''' Selects the template structure that is best suited as template for homology modelling of the target
496497
@@ -620,6 +621,31 @@ def find_template(target, database, best_n_templates=1, benchmark=False,
620621
putative_templates = {k: v for k, v in putative_templates.items() if
621622
len(database.MHCII_data[k].anchors) == 4}
622623

624+
# Added for benchmark of similarity, # added after review --> similarity threshold
625+
similarity_info = None
626+
if benchmark and benchmark_similarity_threshold is not None:
627+
score_key = class_variables[2] # 'M_score' or 'Avg_score'
628+
# Build (id, similarity_fraction) list for what's left after target removal
629+
sim_list = [(k, v[score_key] / 100.0) for k, v in putative_templates.items()
630+
if score_key in v]
631+
if len(sim_list) == 0:
632+
raise Exception('No putative templates with similarity scores after target removal.')
633+
min_similarity = min(s for _, s in sim_list)
634+
below = [(k, s) for k, s in sim_list if s <= benchmark_similarity_threshold]
635+
at_least_one_below = len(below) > 0
636+
if at_least_one_below:
637+
keep_ids = set(k for k, _ in below)
638+
else:
639+
# all above threshold -> take the lowest-similarity ones (up to best_n_templates)
640+
sim_list_sorted = sorted(sim_list, key=lambda x: x[1])
641+
keep_ids = set(k for k, _ in sim_list_sorted[:best_n_templates])
642+
putative_templates = {k: v for k, v in putative_templates.items() if k in keep_ids}
643+
similarity_info = {
644+
'min_similarity': min_similarity,
645+
'at_least_one_below_threshold': at_least_one_below,
646+
'all_similarities': dict(sim_list), # for later lookup of selected templates
647+
}
648+
623649
# For both chains
624650
# Sort for average score
625651
putative_templates = sorted(putative_templates.items(),
@@ -650,7 +676,7 @@ def find_template(target, database, best_n_templates=1, benchmark=False,
650676

651677
templates = [getattr(database, class_variables[1])[tmpl] for tmpl in template_id]
652678
keep_IL = any(check_target_template(target, tmpl) for tmpl in templates)
653-
return templates, scores, keep_IL
679+
return templates, scores, keep_IL, similarity_info # added after review --> similarity threshold
654680

655681

656682
def write_ini_script(target, template, alignment_file, output_dir, clip_C_domain=False):

PANDORA/PANDORA/Pandora/Pandora.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,14 @@ def __init__(self, target, database=None, template=None, no_modelling=False): #
7171
self.no_modelling_output_dict = {}
7272
self.keep_IL = False
7373
self.logfile = f'{self.target.output_dir}/{target.id}.log'
74+
self.similarity_info = None # added after review --> similarity threshold
7475

7576
if database is None and template is None:
7677
raise Exception('Provide a Database object so Pandora can find the best suitable template structure for '
7778
'modelling. Alternatively, you can specify a user defined Template object.')
7879

7980

80-
def find_template(self, best_n_templates=1, benchmark=False, verbose=True):
81+
def find_template(self, best_n_templates=1, benchmark=False, benchmark_similarity_threshold=None, verbose=True,): # added after review --> similarity threshold
8182
''' Find the best template structure given a Target object
8283
8384
Args:
@@ -106,10 +107,11 @@ def find_template(self, best_n_templates=1, benchmark=False, verbose=True):
106107
print('\tLooking for a template...')
107108
# Find the best template. If the target already exists in the database,
108109
# also consider the initial loop model as a model
109-
self.template, self.pept_ali_scores, self.keep_IL = Modelling_functions.find_template(self.target,
110-
self.database,
111-
best_n_templates=best_n_templates,
112-
benchmark=benchmark)
110+
self.template, self.pept_ali_scores, self.keep_IL, self.similarity_info = Modelling_functions.find_template(self.target,
111+
self.database,
112+
best_n_templates=best_n_templates,
113+
benchmark=benchmark,
114+
benchmark_similarity_threshold=benchmark_similarity_threshold) # added after review --> similarity threshold
113115
self.target.templates = [i.id for i in self.template]
114116
if verbose:
115117
print('\tSelected template structure (%s): %s' %(len(self.template), [i.id for i in self.template]))
@@ -378,7 +380,8 @@ def __log(self, target_id, template_id, error, verbose=True):
378380

379381
def model(self, n_loop_models=20, n_homology_models=1,
380382
best_n_templates=1, n_jobs=None, loop_refinement='slow', pickle_out=False,
381-
benchmark=False, verbose=True, helix=False, sheet=False,
383+
benchmark=False, benchmark_similarity_threshold=None, # added after review --> similarity threshold
384+
verbose=True, helix=False, sheet=False,
382385
RMSD_atoms=['C', 'CA', 'N', 'O'], clip_C_domain=False, restraints_stdev=False):
383386
'''Wrapper function that combines all modelling steps.
384387
@@ -438,7 +441,7 @@ def model(self, n_loop_models=20, n_homology_models=1,
438441
# Find the best template structure given the Target
439442
if self.template==None:
440443
try:
441-
self.find_template(best_n_templates=best_n_templates, benchmark=benchmark, verbose=verbose)
444+
self.find_template(best_n_templates=best_n_templates, benchmark=benchmark, benchmark_similarity_threshold=benchmark_similarity_threshold, verbose=verbose) # added after review --> similarity threshold
442445
except:
443446
self.__log(self.target.id, 'None', 'Could not find a template')
444447
raise Exception('Could not find a template')

run_PMGen.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ def main():
5555
default='AFfine/af_params/params_finetune/params/model_ft_mhc_20640.pkl',
5656
help='Path to fine-tuned model')
5757
parser.add_argument('--benchmark', action='store_true', help='Enable benchmarking')
58+
parser.add_argument('--benchmark_similarity_threshold', type=float, default=0.95,help='When --benchmark is set, exclude templates with MHC sequence similarity '
59+
'above this fraction (0-1). Default: 0.95.') # added after review --> similarity threshold
5860
parser.add_argument('--best_n_templates', type=int, default=4, help='Best N templates')
5961
parser.add_argument('--n_homology_models', type=int, default=1, help='Number of homology models')
6062
parser.add_argument('--max_ram', type=int, default=3, help='Maximum RAM GB per job (only for parallel mode)')
@@ -209,8 +211,9 @@ def main():
209211
fine_tuned_model_path=args.fine_tuned_model_path,
210212
benchmark=args.benchmark, best_n_templates=args.best_n_templates,
211213
n_homology_models=args.n_homology_models, pandora_force_run=args.no_pandora,
212-
no_modelling=args.initial_guess, return_all_outputs=args.return_all_outputs)
213-
if args.run == 'parallel' and not args.only_protein_mpnn and not args.only_mutation_screen:
214+
no_modelling=args.initial_guess, return_all_outputs=args.return_all_outputs,
215+
benchmark_similarity_threshold=args.benchmark_similarity_threshold,) # added after review --> similarity threshold
216+
if args.run == 'parallel' and not args.only_protein_mpnn and not args.only_mutation_screen:
214217
runner.run_wrapper_parallel(max_ram=args.max_ram, max_cores=args.max_cores, run_alphafold=args.no_alphafold)
215218
elif args.run == 'single' and not args.only_protein_mpnn and not args.only_mutation_screen:
216219
runner.run_wrapper(run_alphafold=args.no_alphafold)
@@ -244,7 +247,8 @@ def main():
244247
benchmark=args.benchmark, best_n_templates=args.best_n_templates,
245248
n_homology_models=args.n_homology_models,
246249
pandora_force_run=args.no_pandora,
247-
return_all_outputs=args.return_all_outputs)
250+
return_all_outputs=args.return_all_outputs,
251+
benchmark_similarity_threshold=args.benchmark_similarity_threshold,) # added after review --> similarity threshold
248252
if not args.only_protein_mpnn and not args.only_mutation_screen:
249253
runner.run_PMGen(run_alphafold=args.no_alphafold)
250254
else:

run_utils.py

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
3232
fine_tuned_model_path='AFfine/af_params/params_finetune/params/model_ft_mhc_20640.pkl',
3333
benchmark=False, n_homology_models=1, best_n_templates=4,
3434
pandora_force_run=True, no_modelling=False,
35-
return_all_outputs=False):
35+
return_all_outputs=False, benchmark_similarity_threshold=0.95): # added after review --> similarity threshold
3636
"""
3737
Initializes the PMGen modeling pipeline.
3838
@@ -56,6 +56,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
5656
pandora_force_run (bool): Whether to force-run PANDORA or not, default=True.
5757
no_modelling (bool): If active, no modeller homology modeling happens and only PANDORA is used for template search and alignment.
5858
return_all_outputs (bool): If active, all alphafold outputs are saved.
59+
benchmark_similarity_threshold (float): Only used during benchmarking; excludes sequences above this similarity threshold.
5960
"""
6061
super().__init__()
6162
self.peptide = peptide
@@ -77,6 +78,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
7778
self.pandora_force_run = pandora_force_run
7879
self.no_modelling = no_modelling
7980
self.return_all_outputs = return_all_outputs
81+
self.benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
8082
self.input_assertion()
8183
if len(self.models) > 1:
8284
print(f'\n #### Warning! You are running for multiple models {self.models}'
@@ -167,9 +169,25 @@ def run_pandora(self, force_run=True):
167169
use_netmhcpan=self.predict_anchor, anchors=anchor)
168170
case = Pandora.Pandora(target, self.db, no_modelling=self.no_modelling)
169171
case.model(n_loop_models=self.num_templates, benchmark=self.benchmark,
170-
n_homology_models=self.n_homology_models,
171-
best_n_templates=self.best_n_templates)
172+
benchmark_similarity_threshold=(self.benchmark_similarity_threshold if self.benchmark else None), # added after review --> similarity threshold
173+
n_homology_models=self.n_homology_models,
174+
best_n_templates=self.best_n_templates)
172175
print("Pandora modeling completed successfully.")
176+
# Persist similarity info for later CSV aggregation # added after review --> similarity threshold
177+
if self.benchmark and getattr(case, 'similarity_info', None) is not None:
178+
selected_ids = [t.id[:4] for t in case.template]
179+
sims = case.similarity_info.get('all_similarities', {})
180+
template_sims = [(tid, sims.get(tid[:4], None)) for tid in selected_ids]
181+
info = {
182+
'id': self.id,
183+
'min_similarity': case.similarity_info['min_similarity'],
184+
'at_least_one_below_threshold': case.similarity_info['at_least_one_below_threshold'],
185+
'n_templates_used': len(selected_ids),
186+
'template_similarities': template_sims, # list of (template_id, similarity_fraction)
187+
}
188+
json_path = os.path.join(self.pandora_output, self.id, 'benchmark_similarity.json')
189+
with open(json_path, 'w') as fjs:
190+
json.dump(info, fjs)
173191
except Exception as e:
174192
print(f"❌ An error occurred during template engineering {self.id}: {str(e)}", file=sys.stderr)
175193
raise
@@ -380,7 +398,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
380398
alphafold_param_folder='AFfine/af_params/params_original/',
381399
fine_tuned_model_path='AFfine/af_params/params_finetune/params/model_ft_mhc_20640.pkl',
382400
max_ram_per_job=3, num_cpu=1, benchmark=False, best_n_templates=1, n_homology_models=1,
383-
pandora_force_run=True, no_modelling=False, return_all_outputs=False):
401+
pandora_force_run=True, no_modelling=False, return_all_outputs=False, benchmark_similarity_threshold=0.95): # added after review --> similarity threshold
384402
"""
385403
Initializes the run_PMGen_wrapper class.
386404
:param df: pandas DataFrame containing input data. Required columns:
@@ -405,6 +423,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
405423
:param no_modelling (bool): If active, no modeller homology modeling happens and only PANDORA is used for template search and alignment.
406424
:param pandora_force_run (bool): If active, PANDORA will be forced to run even if files already exist.
407425
:param return_all_outputs (bool): If active, all alphafold outputs are saved.
426+
:param benchmark_similarity_threshold (float): Only used during benchmarking; excludes sequences above this similarity threshold.
408427
The function `input_assertion()` checks if all inputs are correctly formatted and whether required files and directories exist.
409428
410429
Raises:
@@ -425,6 +444,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
425444
self.pandora_force_run = pandora_force_run
426445
self.no_modelling = no_modelling
427446
self.return_all_outputs = return_all_outputs
447+
self.benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
428448
self.input_assertion()
429449

430450
def run_wrapper(self, run_alphafold=True):
@@ -444,7 +464,7 @@ def run_wrapper(self, run_alphafold=True):
444464
fine_tuned_model_path=self.fine_tuned_model_path, benchmark=self.benchmark,
445465
n_homology_models=self.n_homology_models, best_n_templates=self.best_n_templates,
446466
pandora_force_run=self.pandora_force_run, no_modelling=self.no_modelling,
447-
return_all_outputs=self.return_all_outputs)
467+
return_all_outputs=self.return_all_outputs, benchmark_similarity_threshold=self.benchmark_similarity_threshold,) # added after review --> similarity threshold
448468
runner.run_PMGen(run_alphafold=False)
449469
input_df = pd.read_csv(runner.alphafold_input_file, sep='\t', header=0)
450470
input_df['targetid'] = [str(row['id']) + '/' + str(row['id'])] # id/id
@@ -453,6 +473,7 @@ def run_wrapper(self, run_alphafold=True):
453473
alphafold_out = self.output_dir + '/alphafold'
454474
pd.concat(INPUT_DF).to_csv(f'{alphafold_out}/alphafold_input_file.tsv', sep='\t', index=False)
455475
runner.run_alphafold(input_file=f'{alphafold_out}/alphafold_input_file.tsv', output_prefix=alphafold_out + '/')
476+
self._aggregate_benchmark_similarity_csv() # added after review --> similarity threshold
456477

457478

458479
def get_available_memory(self):
@@ -478,7 +499,7 @@ def process_row(self, row):
478499
fine_tuned_model_path=self.fine_tuned_model_path, benchmark=self.benchmark,
479500
n_homology_models=self.n_homology_models, best_n_templates=self.best_n_templates,
480501
pandora_force_run=self.pandora_force_run, no_modelling=self.no_modelling,
481-
return_all_outputs=self.return_all_outputs)
502+
return_all_outputs=self.return_all_outputs, benchmark_similarity_threshold=self.benchmark_similarity_threshold,) # added after review --> similarity threshold
482503
runner.run_PMGen(run_alphafold=False)
483504
input_df = pd.read_csv(runner.alphafold_input_file, sep='\t', header=0)
484505
input_df['targetid'] = [str(row['id']) + '/' + str(row['id'])] # id/id
@@ -537,7 +558,41 @@ def run_wrapper_parallel(self, max_ram=3, max_cores=4, run_alphafold=True):
537558
return_all_outputs=self.return_all_outputs)
538559
if run_alphafold:
539560
runner.run_alphafold(input_file=f'{alphafold_out}/alphafold_input_file.tsv', output_prefix=alphafold_out + '/')
540-
561+
self._aggregate_benchmark_similarity_csv() # added after review --> similarity threshold
562+
563+
def _aggregate_benchmark_similarity_csv(self):
564+
if not self.benchmark:
565+
return
566+
rows = []
567+
max_t = 0
568+
for _, row in self.df.iterrows():
569+
json_path = os.path.join(self.output_dir, 'pandora', str(row['id']), 'benchmark_similarity.json')
570+
if not os.path.exists(json_path):
571+
continue
572+
with open(json_path) as fjs:
573+
info = json.load(fjs)
574+
max_t = max(max_t, len(info['template_similarities']))
575+
rows.append(info)
576+
if not rows:
577+
return
578+
out_rows = []
579+
for info in rows:
580+
d = {
581+
'id': info['id'],
582+
'min_similarity': info['min_similarity'],
583+
'at_least_one_below_threshold': info['at_least_one_below_threshold'],
584+
'n_templates_used': info['n_templates_used'],
585+
}
586+
for i in range(max_t):
587+
if i < len(info['template_similarities']):
588+
tid, sim = info['template_similarities'][i]
589+
d[f'template_{i+1}_id'] = tid
590+
d[f'template_{i+1}_similarity'] = sim
591+
else:
592+
d[f'template_{i+1}_id'] = None
593+
d[f'template_{i+1}_similarity'] = None
594+
out_rows.append(d)
595+
pd.DataFrame(out_rows).to_csv(os.path.join(self.output_dir, 'benchmark_similarity.csv'), index=False)
541596

542597
def input_assertion(self):
543598
"""

0 commit comments

Comments
 (0)