@@ -32,7 +32,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
3232 fine_tuned_model_path = 'AFfine/af_params/params_finetune/params/model_ft_mhc_20640.pkl' ,
3333 benchmark = False , n_homology_models = 1 , best_n_templates = 4 ,
3434 pandora_force_run = True , no_modelling = False ,
35- return_all_outputs = False ):
35+ return_all_outputs = False , benchmark_similarity_threshold = 0.95 ): # added after review --> similarity threshold
3636 """
3737 Initializes the PMGen modeling pipeline.
3838
@@ -56,6 +56,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
5656 pandora_force_run (bool): Whether to force run pandora or not, default=True.
5757 no_modelling (bool): If active, no modeller homology modeling happens and only PANDORA is used for template search and alignment.
5858 return_all_outputs (bool): If active, all alphafold outputs are saved.
59+ benchmark_similarity_threshold (float): Only used during benchmarking, excludes sequences above this similarity threshold.
5960 """
6061 super ().__init__ ()
6162 self .peptide = peptide
@@ -77,6 +78,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
7778 self .pandora_force_run = pandora_force_run
7879 self .no_modelling = no_modelling
7980 self .return_all_outputs = return_all_outputs
81+ self .benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
8082 self .input_assertion ()
8183 if len (self .models ) > 1 :
8284 print (f'\n #### Warning! You are running for multiple models { self .models } '
@@ -167,9 +169,25 @@ def run_pandora(self, force_run=True):
167169 use_netmhcpan = self .predict_anchor , anchors = anchor )
168170 case = Pandora .Pandora (target , self .db , no_modelling = self .no_modelling )
169171 case .model (n_loop_models = self .num_templates , benchmark = self .benchmark ,
170- n_homology_models = self .n_homology_models ,
171- best_n_templates = self .best_n_templates )
172+ benchmark_similarity_threshold = (self .benchmark_similarity_threshold if self .benchmark else None ), # added after review --> similarity threshold
173+ n_homology_models = self .n_homology_models ,
174+ best_n_templates = self .best_n_templates )
172175 print ("Pandora modeling completed successfully." )
176+ # Persist similarity info for later CSV aggregation # added after review --> similarity threshold
177+ if self .benchmark and getattr (case , 'similarity_info' , None ) is not None :
178+ selected_ids = [t .id [:4 ] for t in case .template ]
179+ sims = case .similarity_info .get ('all_similarities' , {})
180+ template_sims = [(tid , sims .get (tid [:4 ], None )) for tid in selected_ids ]
181+ info = {
182+ 'id' : self .id ,
183+ 'min_similarity' : case .similarity_info ['min_similarity' ],
184+ 'at_least_one_below_threshold' : case .similarity_info ['at_least_one_below_threshold' ],
185+ 'n_templates_used' : len (selected_ids ),
186+ 'template_similarities' : template_sims , # list of (template_id, similarity_fraction)
187+ }
188+ json_path = os .path .join (self .pandora_output , self .id , 'benchmark_similarity.json' )
189+ with open (json_path , 'w' ) as fjs :
190+ json .dump (info , fjs )
173191 except Exception as e :
174192 print (f"❌ An error occurred during template engineering { self .id } : { str (e )} " , file = sys .stderr )
175193 raise
@@ -380,7 +398,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
380398 alphafold_param_folder = 'AFfine/af_params/params_original/' ,
381399 fine_tuned_model_path = 'AFfine/af_params/params_finetune/params/model_ft_mhc_20640.pkl' ,
382400 max_ram_per_job = 3 , num_cpu = 1 , benchmark = False , best_n_templates = 1 , n_homology_models = 1 ,
383- pandora_force_run = True , no_modelling = False , return_all_outputs = False ):
401+ pandora_force_run = True , no_modelling = False , return_all_outputs = False , benchmark_similarity_threshold = 0.95 ): # added after review --> similarity threshold
384402 """
385403 Initializes the run_PMGen_wrapper class.
386404 :param df: pandas DataFrame containing input data. Required columns:
@@ -405,6 +423,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
405423 :param no_modelling (bool): If active, no modeller homology modeling happens and only PANDORA is used for template search and alignment.
406424 :param pandora_force_run (bool): If active, PANDORA will be forced to run even if files already exist.
407425 :param return_all_outputs (bool): If active, all alphafold outputs are saved.
426+ :param benchmark_similarity_threshold (float): Only used during benchmarking, excludes sequences above this similarity threshold.
408427 The function `input_assertion()` checks if all inputs are correctly formatted and whether required files and directories exist.
409428
410429 Raises:
@@ -425,6 +444,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
425444 self .pandora_force_run = pandora_force_run
426445 self .no_modelling = no_modelling
427446 self .return_all_outputs = return_all_outputs
447+ self .benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
428448 self .input_assertion ()
429449
430450 def run_wrapper (self , run_alphafold = True ):
@@ -444,7 +464,7 @@ def run_wrapper(self, run_alphafold=True):
444464 fine_tuned_model_path = self .fine_tuned_model_path , benchmark = self .benchmark ,
445465 n_homology_models = self .n_homology_models , best_n_templates = self .best_n_templates ,
446466 pandora_force_run = self .pandora_force_run , no_modelling = self .no_modelling ,
447- return_all_outputs = self .return_all_outputs )
467+ return_all_outputs = self .return_all_outputs , benchmark_similarity_threshold = self . benchmark_similarity_threshold ,) # added after review --> similarity threshold
448468 runner .run_PMGen (run_alphafold = False )
449469 input_df = pd .read_csv (runner .alphafold_input_file , sep = '\t ' , header = 0 )
450470 input_df ['targetid' ] = [str (row ['id' ]) + '/' + str (row ['id' ])] # id/id
@@ -453,6 +473,7 @@ def run_wrapper(self, run_alphafold=True):
453473 alphafold_out = self .output_dir + '/alphafold'
454474 pd .concat (INPUT_DF ).to_csv (f'{ alphafold_out } /alphafold_input_file.tsv' , sep = '\t ' , index = False )
455475 runner .run_alphafold (input_file = f'{ alphafold_out } /alphafold_input_file.tsv' , output_prefix = alphafold_out + '/' )
476+ self ._aggregate_benchmark_similarity_csv () # added after review --> similarity threshold
456477
457478
458479 def get_available_memory (self ):
@@ -478,7 +499,7 @@ def process_row(self, row):
478499 fine_tuned_model_path = self .fine_tuned_model_path , benchmark = self .benchmark ,
479500 n_homology_models = self .n_homology_models , best_n_templates = self .best_n_templates ,
480501 pandora_force_run = self .pandora_force_run , no_modelling = self .no_modelling ,
481- return_all_outputs = self .return_all_outputs )
502+ return_all_outputs = self .return_all_outputs , benchmark_similarity_threshold = self . benchmark_similarity_threshold ,) # added after review --> similarity threshold
482503 runner .run_PMGen (run_alphafold = False )
483504 input_df = pd .read_csv (runner .alphafold_input_file , sep = '\t ' , header = 0 )
484505 input_df ['targetid' ] = [str (row ['id' ]) + '/' + str (row ['id' ])] # id/id
@@ -537,7 +558,41 @@ def run_wrapper_parallel(self, max_ram=3, max_cores=4, run_alphafold=True):
537558 return_all_outputs = self .return_all_outputs )
538559 if run_alphafold :
539560 runner .run_alphafold (input_file = f'{ alphafold_out } /alphafold_input_file.tsv' , output_prefix = alphafold_out + '/' )
540-
561+ self ._aggregate_benchmark_similarity_csv () # added after review --> similarity threshold
562+
563+ def _aggregate_benchmark_similarity_csv (self ):
564+ if not self .benchmark :
565+ return
566+ rows = []
567+ max_t = 0
568+ for _ , row in self .df .iterrows ():
569+ json_path = os .path .join (self .output_dir , 'pandora' , str (row ['id' ]), 'benchmark_similarity.json' )
570+ if not os .path .exists (json_path ):
571+ continue
572+ with open (json_path ) as fjs :
573+ info = json .load (fjs )
574+ max_t = max (max_t , len (info ['template_similarities' ]))
575+ rows .append (info )
576+ if not rows :
577+ return
578+ out_rows = []
579+ for info in rows :
580+ d = {
581+ 'id' : info ['id' ],
582+ 'min_similarity' : info ['min_similarity' ],
583+ 'at_least_one_below_threshold' : info ['at_least_one_below_threshold' ],
584+ 'n_templates_used' : info ['n_templates_used' ],
585+ }
586+ for i in range (max_t ):
587+ if i < len (info ['template_similarities' ]):
588+ tid , sim = info ['template_similarities' ][i ]
589+ d [f'template_{ i + 1 } _id' ] = tid
590+ d [f'template_{ i + 1 } _similarity' ] = sim
591+ else :
592+ d [f'template_{ i + 1 } _id' ] = None
593+ d [f'template_{ i + 1 } _similarity' ] = None
594+ out_rows .append (d )
595+ pd .DataFrame (out_rows ).to_csv (os .path .join (self .output_dir , 'benchmark_similarity.csv' ), index = False )
541596
542597 def input_assertion (self ):
543598 """
0 commit comments