before release date added

AmirAsgary · AmirAsgary · commit 041c6d2e8d76 · 2026-04-24T14:17:07.000+02:00
diff --git a/PANDORA/PANDORA/Pandora/Modelling_functions.py b/PANDORA/PANDORA/Pandora/Modelling_functions.py
@@ -23,6 +23,30 @@
 import re
 import json
 
+def _parse_release_date(s):
+    """Parse a release date (str/datetime/pandas value) into a datetime, or
+    None if missing or unparseable. Explicit formats are tried in order (so
+    ambiguous slash dates read month-first) before pandas.to_datetime."""
+    from datetime import datetime
+    if s is None or (isinstance(s, datetime) and s != s):
+        return None  # pandas.NaT subclasses datetime and is never equal to itself
+    if isinstance(s, datetime):
+        return s
+    s = str(s).strip()
+    if not s or s.lower() in ('nan', 'none', 'nat'):
+        return None
+    for fmt in ('%m/%d/%Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d', '%m-%d-%Y'):
+        try:
+            return datetime.strptime(s, fmt)
+        except (ValueError, TypeError):
+            continue
+    try:
+        import pandas as pd  # local import: fallback must not rely on a module-level `pd`
+        parsed = pd.to_datetime(s, errors='coerce')
+        return None if pd.isna(parsed) else parsed.to_pydatetime()
+    except Exception:  # deliberate best-effort: any fallback failure means "no date"
+        return None
+
 def check_target_template(target, template):
     """ Checks if the target and the template are the same. If the user gave sequence info in the target, use that, else
         use the allele type.
@@ -511,8 +535,12 @@ def _peptide_identity(target_pept, template_pept, target_anchors, template_ancho
     aln_len = max(len(aligned_t.replace('-', '')), len(aligned_p.replace('-', '')))
     return matches, max(aln_len, 1)
 
+        
+
 def find_template(target, database, best_n_templates=1, 
                   benchmark=False, benchmark_similarity_threshold=None, benchmark_exclude_ids=None, # added for pmgen benchmarking # added after review --> similarity threshold
+                  benchmark_release_dates_map=None,   
+                  target_release_date=None,  
                   blastdb=PANDORA.PANDORA_data + '/BLAST_databases/templates_blast_db/templates_blast_db'):
     ''' Selects the template structure that is best suited as template for homology modelling of the target
 
@@ -634,6 +662,34 @@ def find_template(target, database, best_n_templates=1,
         for excl in benchmark_exclude_ids:
             putative_templates.pop(excl[:4].upper(), None)
             putative_templates.pop(excl[:4].lower(), None)
+    
+    if (benchmark
+            and benchmark_release_dates_map is not None
+            and target_release_date is not None):
+        tgt_date = _parse_release_date(target_release_date)
+        if tgt_date is None:
+            print(f'[benchmark_before_date] WARNING: could not parse target '
+                  f'release date "{target_release_date}". Skipping date filter.')
+        else:
+            excluded_by_date = []
+            for ID in list(putative_templates.keys()):
+                key = ID[:4].upper()
+                if key not in benchmark_release_dates_map:
+                    # Unknown date -> keep (assumed pre-2018)
+                    continue
+                t_date = _parse_release_date(benchmark_release_dates_map[key])
+                if t_date is None:
+                    continue
+                if t_date >= tgt_date:
+                    excluded_by_date.append(ID)
+                    putative_templates.pop(ID, None)
+            print(f'[benchmark_before_date] Target {target.id} '
+                  f'(date={tgt_date.date()}) — excluded '
+                  f'{len(excluded_by_date)} templates with date >= target.')
+            if not putative_templates:
+                raise Exception(
+                    f'No candidate templates remain after benchmark_before_date '
+                    f'filter for target {target.id} (date={tgt_date.date()}).')
 
     if target.MHC_class == 'II':
         for ID in putative_templates:
diff --git a/PANDORA/PANDORA/Pandora/Pandora.py b/PANDORA/PANDORA/Pandora/Pandora.py
@@ -83,7 +83,7 @@ def __init__(self, target, database=None, template=None, no_modelling=False): #
 
     def find_template(self, best_n_templates=1, benchmark=False, 
                       benchmark_similarity_threshold=None, benchmark_exclude_ids=None, # added after review --> similarity threshold
-                      verbose=True,): 
+                      verbose=True, benchmark_release_dates_map=None, target_release_date=None,): 
         ''' Find the best template structure given a Target object
 
         Args:
@@ -116,8 +116,10 @@ def find_template(self, best_n_templates=1, benchmark=False,
                                                                                         self.database,
                                                                                         best_n_templates=best_n_templates,
                                                                                         benchmark=benchmark,
-                                                                                        benchmark_similarity_threshold=benchmark_similarity_threshold,
-                                                                                        benchmark_exclude_ids=benchmark_exclude_ids) # added after review --> similarity threshold
+                                                                                        benchmark_similarity_threshold=benchmark_similarity_threshold,  # added after review --> similarity threshold
+                                                                                        benchmark_exclude_ids=benchmark_exclude_ids,
+                                                                                        benchmark_release_dates_map=benchmark_release_dates_map,
+                                                                                        target_release_date=target_release_date)
             self.target.templates = [i.id for i in self.template]
             if verbose:
                 print('\tSelected template structure (%s): %s' %(len(self.template), [i.id for i in self.template]))
@@ -387,6 +389,8 @@ def __log(self, target_id, template_id, error, verbose=True):
     def model(self, n_loop_models=20, n_homology_models=1,
               best_n_templates=1, n_jobs=None, loop_refinement='slow', pickle_out=False,
               benchmark=False, benchmark_similarity_threshold=None, benchmark_exclude_ids=None, # added after review --> similarity threshold
+              benchmark_release_dates_map=None,
+              target_release_date=None,
               verbose=True, helix=False, sheet=False, 
               RMSD_atoms=['C', 'CA', 'N', 'O'], clip_C_domain=False, restraints_stdev=False):
         '''Wrapper function that combines all modelling steps.
diff --git a/run_PMGen.py b/run_PMGen.py
@@ -145,8 +145,13 @@ def main():
             'similarity above this fraction (0-1). pMHC similarity is the length-weighted '
             'average of MHC G-domain identity and peptide identity. Default: 1.0.') # added after review --> similarity benchmark
     parser.add_argument('--benchmark_exclude_ids', action="store_true", help="If activated, none of the ids in df are used as templates for benchmarking.")
+    parser.add_argument('--benchmark_before_date', action='store_true', default=False, help='Only valid with --benchmark. When set, templates whose PDB release '
+         'date is >= the target structure\'s release date are excluded. '
+         'Requires a "release_date" (or "PDB release date") column in --df. '
+         'Templates whose PDB IDs are not in --df are kept (assumed pre-2018).')
 
     args = parser.parse_args()
+    if args.benchmark_before_date: assert args.benchmark, "--benchmark_before_date requires --benchmark to be set."
     assert(args.proteinmpnn_model_name) in allowed_mpnn_models, f"Allowed models: {allowed_mpnn_models}"
     bioemu_assertions(args)
     for iteration in range(args.iterative_peptide_gen + 1):
@@ -206,6 +211,25 @@ def main():
 
 
             df['mhc_seq'] = [''.join([aa.upper() for aa in seq if aa.upper() in AMINO_ACIDS]) for seq in df['mhc_seq'].tolist()]  # remove gaps from df:
+            # Build PDB -> release_date map for --benchmark_before_date
+            benchmark_release_dates_map = None
+            if args.benchmark and args.benchmark_before_date:
+                _date_col_candidates = ['release_date', 'PDB release date',
+                                        'PDB_release_date', 'PDB release\xa0date']
+                _date_col = next((c for c in _date_col_candidates if c in df.columns), None)
+                if _date_col is None:
+                    raise ValueError(
+                        f"--benchmark_before_date requires a release-date column in --df. "
+                        f"Expected one of {_date_col_candidates}. Found: {list(df.columns)}"
+                    )
+                benchmark_release_dates_map = {}
+                for _, _r in df.iterrows():
+                    _pdb_key = str(_r['id']).split('_')[0][:4].upper()
+                    _d = _r[_date_col]
+                    if pd.notna(_d):
+                        benchmark_release_dates_map[_pdb_key] = str(_d)
+                print(f'[benchmark_before_date] Built date map for '
+                    f'{len(benchmark_release_dates_map)} PDB IDs (column "{_date_col}").')
             # Build benchmark exclusion list (4-letter PDB prefixes from all test ids) # added after review --> similarity threshold
             benchmark_exclude_ids = None
             if args.benchmark_exclude_ids: # not activated for main benchmarking, bcz other methods used all templates and the comparision was not fair if we do it. Instead done in supplementary to asses model performance.
@@ -233,6 +257,7 @@ def main():
                                            n_homology_models=args.n_homology_models, pandora_force_run=args.no_pandora,
                                             no_modelling=args.initial_guess, return_all_outputs=args.return_all_outputs,
                                             benchmark_similarity_threshold=args.benchmark_similarity_threshold, benchmark_exclude_ids=benchmark_exclude_ids,  # added after review --> similarity threshold
+                                            benchmark_release_dates_map=benchmark_release_dates_map,
                                             sampling_mode=args.sampling_mode,
                                             n_times_sampling=args.n_times_sampling,
                                             sampling_fraction_IG=args.sampling_fraction_IG,
@@ -279,6 +304,7 @@ def main():
                                             return_all_outputs=args.return_all_outputs,
                                             benchmark_similarity_threshold=args.benchmark_similarity_threshold, # added after review --> similarity threshold
                                             benchmark_exclude_ids=[args.id.split('_')[0][:4]] if args.benchmark and args.id else None,
+                                            benchmark_release_dates_map=None,
                                             sampling_mode=args.sampling_mode,
                                             n_times_sampling=args.n_times_sampling,
                                             sampling_fraction_IG=args.sampling_fraction_IG,
diff --git a/run_utils.py b/run_utils.py
@@ -36,7 +36,8 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
                  benchmark_similarity_threshold=0.95, benchmark_exclude_ids=None, # added after review --> similarity threshold
                 sampling_mode=False, n_times_sampling=200, sampling_fraction_IG=0.5, # Sampling mode added
                 sampling_fraction_evo=0.3, sampling_dropout_rate=0.5, sampling_seed=42,
-                radius=8.0, pep_sampling=None,
+                radius=8.0, pep_sampling=None, benchmark_release_dates_map=None, 
+                target_release_date=None,
     ):
         """
         Initializes the PMGen modeling pipeline.
@@ -86,6 +87,8 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
         self.return_all_outputs = return_all_outputs
         self.benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
         self.benchmark_exclude_ids = benchmark_exclude_ids # added after review --> similarity threshold
+        self.benchmark_release_dates_map = benchmark_release_dates_map   # NEW
+        self.target_release_date = target_release_date                   # NEW
         self.sampling_mode = sampling_mode
         self.n_times_sampling = n_times_sampling
         self.sampling_fraction_IG = sampling_fraction_IG
@@ -94,6 +97,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
         self.sampling_seed = sampling_seed
         self.radius = radius
         self.pep_sampling = pep_sampling
+        
         self.input_assertion()
         if len(self.models) > 1:
             print(f'\n #### Warning! You are running for multiple models {self.models}'
@@ -186,6 +190,8 @@ def run_pandora(self, force_run=True):
                     case.model(n_loop_models=self.num_templates, benchmark=self.benchmark,
                             benchmark_similarity_threshold=(self.benchmark_similarity_threshold if self.benchmark else None), # added after review --> similarity threshold
                             benchmark_exclude_ids=(self.benchmark_exclude_ids if self.benchmark else None),
+                            benchmark_release_dates_map=(self.benchmark_release_dates_map if self.benchmark else None),
+                            target_release_date=(self.target_release_date if self.benchmark else None),    
                             n_homology_models=self.n_homology_models,
                             best_n_templates=self.best_n_templates) 
                     print("Pandora modeling completed successfully.")
@@ -439,7 +445,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
                  benchmark_similarity_threshold=0.95, benchmark_exclude_ids=None,  # added after review --> similarity threshold
                  sampling_mode=False, n_times_sampling=200, sampling_fraction_IG=0.5, # Sampling mode added
                 sampling_fraction_evo=0.3, sampling_dropout_rate=0.5, sampling_seed=42,
-                radius=8.0, pep_sampling=None,
+                radius=8.0, pep_sampling=None, benchmark_release_dates_map=None,
                  ):
         """
         Initializes the run_PMGen_wrapper class.
@@ -466,6 +472,9 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
         :param pandora_force_run (bool): If active, PANDORA will be forced to run even if files already exist.
         :param return_all_outputs (bool): If active, all alphafold outputs are saved.
         :param benchmark_similarity_threshold (float): Only used during benchmarking, exludes sequences above this similarity threshold.
+        :param benchmark_release_dates_map (dict or None): Only used during benchmarking. Maps 4-character upper-case
+                PDB IDs to release-date strings. Each row's own release date is looked up in this map, and templates whose
+                release date is >= that date are excluded. Templates whose PDB IDs are not in the map are kept.
         The function `input_assertion()` checks if all inputs are correctly formatted and whether required files and directories exist.
 
         Raises:
@@ -488,6 +497,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
         self.return_all_outputs = return_all_outputs
         self.benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
         self.benchmark_exclude_ids = benchmark_exclude_ids
+        self.benchmark_release_dates_map = benchmark_release_dates_map 
         self.sampling_mode = sampling_mode
         self.n_times_sampling = n_times_sampling
         self.sampling_fraction_IG = sampling_fraction_IG
@@ -517,6 +527,11 @@ def run_wrapper(self, run_alphafold=True):
                                             pandora_force_run=self.pandora_force_run, no_modelling=self.no_modelling,
                                             return_all_outputs=self.return_all_outputs, 
                                             benchmark_similarity_threshold=self.benchmark_similarity_threshold, benchmark_exclude_ids=self.benchmark_exclude_ids, # added after review --> similarity threshold
+                                            benchmark_release_dates_map=self.benchmark_release_dates_map,
+                                            target_release_date=(
+                                                self.benchmark_release_dates_map.get(str(row['id']).split('_')[0][:4].upper())
+                                                if self.benchmark_release_dates_map is not None else None
+                                            ),
                                             sampling_mode=self.sampling_mode, #v2
                                             n_times_sampling=self.n_times_sampling,
                                             sampling_fraction_IG=self.sampling_fraction_IG,
@@ -563,6 +578,11 @@ def process_row(self, row):
                                         pandora_force_run=self.pandora_force_run, no_modelling=self.no_modelling,
                                         return_all_outputs=self.return_all_outputs, 
                                         benchmark_similarity_threshold=self.benchmark_similarity_threshold, benchmark_exclude_ids=self.benchmark_exclude_ids, # added after review --> similarity threshold
+                                        benchmark_release_dates_map=self.benchmark_release_dates_map,
+                                        target_release_date=(
+                                            self.benchmark_release_dates_map.get(str(row['id']).split('_')[0][:4].upper())
+                                            if self.benchmark_release_dates_map is not None else None
+                                        ),
                                         sampling_mode=self.sampling_mode, #v2
                                         n_times_sampling=self.n_times_sampling,
                                         sampling_fraction_IG=self.sampling_fraction_IG,
@@ -629,6 +649,11 @@ def run_wrapper_parallel(self, max_ram=3, max_cores=4, run_alphafold=True):
                                             models=self.models, alphafold_param_folder=self.alphafold_param_folder,
                                             fine_tuned_model_path=self.fine_tuned_model_path, no_modelling=self.no_modelling,
                                             return_all_outputs=self.return_all_outputs,
+                                            benchmark_release_dates_map=self.benchmark_release_dates_map,
+                                            target_release_date=(
+                                                self.benchmark_release_dates_map.get(str(row['id']).split('_')[0][:4].upper())
+                                                if self.benchmark_release_dates_map is not None else None
+                                            ),
                                             sampling_mode=self.sampling_mode, #v2
                                             n_times_sampling=self.n_times_sampling,
                                             sampling_fraction_IG=self.sampling_fraction_IG,