Skip to content

Commit 041c6d2

Browse files
committed
before release date added
1 parent 5f42027 commit 041c6d2

4 files changed

Lines changed: 121 additions & 5 deletions

File tree

PANDORA/PANDORA/Pandora/Modelling_functions.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,30 @@
2323
import re
2424
import json
2525

26+
def _parse_release_date(s):
    """Parse a release-date value into a timezone-naive ``datetime``.

    Accepted inputs are ``None``, ``datetime`` instances (including
    subclasses such as ``pandas.Timestamp``) and anything whose ``str()``
    is a date in one of the explicit formats below.  The explicit formats
    are tried in order, so an ambiguous slash date such as ``01/02/2020``
    is read month-first (January 2nd).  Anything the explicit formats do
    not match is handed to ``pd.to_datetime`` as a best-effort fallback.

    The result is always timezone-naive: if the input carries a UTC offset
    the offset is dropped and the stated wall-clock date is kept.  This
    keeps every parsed date mutually comparable; mixing naive and aware
    datetimes in a comparison raises ``TypeError``.

    Args:
        s: Release date as a string, ``datetime``, or ``None``.

    Returns:
        A naive ``datetime``, or ``None`` if the input is missing
        (``None``, empty, ``'nan'``/``'none'``/``'nat'``, ``NaT``) or
        cannot be parsed.
    """
    from datetime import datetime

    def _naive(dt):
        # Strip tzinfo without shifting the clock so the calendar date
        # written in the source data is preserved.
        return dt.replace(tzinfo=None) if dt.tzinfo is not None else dt

    if s is None:
        return None

    if isinstance(s, datetime):
        # pandas.NaT is a datetime subclass; it is the only such value that
        # is not equal to itself, so this detects it without needing pandas.
        if s != s:
            return None
        return _naive(s)

    s = str(s).strip()
    if not s or s.lower() in ('nan', 'none', 'nat'):
        return None

    for fmt in ('%m/%d/%Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d', '%m-%d-%Y'):
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            continue

    # Best-effort fallback for anything else pandas can understand (ISO
    # timestamps, month names, ...).  Deliberately broad: any failure here
    # simply means "unparseable".
    try:
        parsed = pd.to_datetime(s, errors='coerce')
        if pd.isna(parsed):
            return None
        return _naive(parsed.to_pydatetime())
    except Exception:
        return None
49+
2650
def check_target_template(target, template):
2751
""" Checks if the target and the template are the same. If the user gave sequence info in the target, use that, else
2852
use the allele type.
@@ -511,8 +535,12 @@ def _peptide_identity(target_pept, template_pept, target_anchors, template_ancho
511535
aln_len = max(len(aligned_t.replace('-', '')), len(aligned_p.replace('-', '')))
512536
return matches, max(aln_len, 1)
513537

538+
539+
514540
def find_template(target, database, best_n_templates=1,
515541
benchmark=False, benchmark_similarity_threshold=None, benchmark_exclude_ids=None, # added for pmgen benchmarking # added after review --> similarity threshold
542+
benchmark_release_dates_map=None,
543+
target_release_date=None,
516544
blastdb=PANDORA.PANDORA_data + '/BLAST_databases/templates_blast_db/templates_blast_db'):
517545
''' Selects the template structure that is best suited as template for homology modelling of the target
518546
@@ -634,6 +662,34 @@ def find_template(target, database, best_n_templates=1,
634662
for excl in benchmark_exclude_ids:
635663
putative_templates.pop(excl[:4].upper(), None)
636664
putative_templates.pop(excl[:4].lower(), None)
665+
666+
if (benchmark
667+
and benchmark_release_dates_map is not None
668+
and target_release_date is not None):
669+
tgt_date = _parse_release_date(target_release_date)
670+
if tgt_date is None:
671+
print(f'[benchmark_before_date] WARNING: could not parse target '
672+
f'release date "{target_release_date}". Skipping date filter.')
673+
else:
674+
excluded_by_date = []
675+
for ID in list(putative_templates.keys()):
676+
key = ID[:4].upper()
677+
if key not in benchmark_release_dates_map:
678+
# Unknown date -> keep (assumed pre-2018)
679+
continue
680+
t_date = _parse_release_date(benchmark_release_dates_map[key])
681+
if t_date is None:
682+
continue
683+
if t_date >= tgt_date:
684+
excluded_by_date.append(ID)
685+
putative_templates.pop(ID, None)
686+
print(f'[benchmark_before_date] Target {target.id} '
687+
f'(date={tgt_date.date()}) — excluded '
688+
f'{len(excluded_by_date)} templates with date >= target.')
689+
if not putative_templates:
690+
raise Exception(
691+
f'No candidate templates remain after benchmark_before_date '
692+
f'filter for target {target.id} (date={tgt_date.date()}).')
637693

638694
if target.MHC_class == 'II':
639695
for ID in putative_templates:

PANDORA/PANDORA/Pandora/Pandora.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def __init__(self, target, database=None, template=None, no_modelling=False): #
8383

8484
def find_template(self, best_n_templates=1, benchmark=False,
8585
benchmark_similarity_threshold=None, benchmark_exclude_ids=None, # added after review --> similarity threshold
86-
verbose=True,):
86+
verbose=True, benchmark_release_dates_map=None, target_release_date=None,):
8787
''' Find the best template structure given a Target object
8888
8989
Args:
@@ -116,8 +116,15 @@ def find_template(self, best_n_templates=1, benchmark=False,
116116
self.database,
117117
best_n_templates=best_n_templates,
118118
benchmark=benchmark,
119-
benchmark_similarity_threshold=benchmark_similarity_threshold,
120-
benchmark_exclude_ids=benchmark_exclude_ids) # added after review --> similarity threshold
119+
benchmark_similarity_threshold=benchmark_similarity_threshold, # added after review --> similarity threshold
120+
benchmark_exclude_ids=benchmark_exclude_ids,
121+
benchmark_release_dates_map=benchmark_release_dates_map,
122+
target_release_date=target_release_date,
123+
benchmark_release_dates_map=benchmark_release_dates_map,
124+
target_release_date=target_release_date,
125+
verbose=verbose
126+
)
127+
121128
self.target.templates = [i.id for i in self.template]
122129
if verbose:
123130
print('\tSelected template structure (%s): %s' %(len(self.template), [i.id for i in self.template]))
@@ -387,6 +394,8 @@ def __log(self, target_id, template_id, error, verbose=True):
387394
def model(self, n_loop_models=20, n_homology_models=1,
388395
best_n_templates=1, n_jobs=None, loop_refinement='slow', pickle_out=False,
389396
benchmark=False, benchmark_similarity_threshold=None, benchmark_exclude_ids=None, # added after review --> similarity threshold
397+
benchmark_release_dates_map=None,
398+
target_release_date=None,
390399
verbose=True, helix=False, sheet=False,
391400
RMSD_atoms=['C', 'CA', 'N', 'O'], clip_C_domain=False, restraints_stdev=False):
392401
'''Wrapper function that combines all modelling steps.

run_PMGen.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,13 @@ def main():
145145
'similarity above this fraction (0-1). pMHC similarity is the length-weighted '
146146
'average of MHC G-domain identity and peptide identity. Default: 1.0.') # added after review --> similarity benchmark
147147
parser.add_argument('--benchmark_exclude_ids', action="store_true", help="If activated, none of the ids in df are used as templates for benchmarking.")
148+
parser.add_argument('--benchmark_before_date', action='store_true', default=False, help='Only valid with --benchmark. When set, templates whose PDB release '
149+
'date is >= the target structure\'s release date are excluded. '
150+
'Requires a "release_date" (or "PDB release date") column in --df. '
151+
'Templates whose PDB IDs are not in --df are kept (assumed pre-2018).')
148152

149153
args = parser.parse_args()
154+
if args.benchmark_before_date: assert args.benchmark, "--benchmark_before_date requires --benchmark to be set."
150155
assert(args.proteinmpnn_model_name) in allowed_mpnn_models, f"Allowed models: {allowed_mpnn_models}"
151156
bioemu_assertions(args)
152157
for iteration in range(args.iterative_peptide_gen + 1):
@@ -206,6 +211,25 @@ def main():
206211

207212

208213
df['mhc_seq'] = [''.join([aa.upper() for aa in seq if aa.upper() in AMINO_ACIDS]) for seq in df['mhc_seq'].tolist()] # remove gaps from df:
214+
# Build PDB -> release_date map for --benchmark_before_date
215+
benchmark_release_dates_map = None
216+
if args.benchmark and args.benchmark_before_date:
217+
_date_col_candidates = ['release_date', 'PDB release date',
218+
'PDB_release_date', 'PDB release\xa0date']
219+
_date_col = next((c for c in _date_col_candidates if c in df.columns), None)
220+
if _date_col is None:
221+
raise ValueError(
222+
f"--benchmark_before_date requires a release-date column in --df. "
223+
f"Expected one of {_date_col_candidates}. Found: {list(df.columns)}"
224+
)
225+
benchmark_release_dates_map = {}
226+
for _, _r in df.iterrows():
227+
_pdb_key = str(_r['id']).split('_')[0][:4].upper()
228+
_d = _r[_date_col]
229+
if pd.notna(_d):
230+
benchmark_release_dates_map[_pdb_key] = str(_d)
231+
print(f'[benchmark_before_date] Built date map for '
232+
f'{len(benchmark_release_dates_map)} PDB IDs (column "{_date_col}").')
209233
# Build benchmark exclusion list (4-letter PDB prefixes from all test ids) # added after review --> similarity threshold
210234
benchmark_exclude_ids = None
211235
if args.benchmark_exclude_ids: # not activated for main benchmarking, bcz other methods used all templates and the comparision was not fair if we do it. Instead done in supplementary to asses model performance.
@@ -233,6 +257,7 @@ def main():
233257
n_homology_models=args.n_homology_models, pandora_force_run=args.no_pandora,
234258
no_modelling=args.initial_guess, return_all_outputs=args.return_all_outputs,
235259
benchmark_similarity_threshold=args.benchmark_similarity_threshold, benchmark_exclude_ids=benchmark_exclude_ids, # added after review --> similarity threshold
260+
benchmark_release_dates_map=benchmark_release_dates_map,
236261
sampling_mode=args.sampling_mode,
237262
n_times_sampling=args.n_times_sampling,
238263
sampling_fraction_IG=args.sampling_fraction_IG,
@@ -279,6 +304,7 @@ def main():
279304
return_all_outputs=args.return_all_outputs,
280305
benchmark_similarity_threshold=args.benchmark_similarity_threshold, # added after review --> similarity threshold
281306
benchmark_exclude_ids=[args.id.split('_')[0][:4]] if args.benchmark and args.id else None,
307+
benchmark_release_dates_map=None,
282308
sampling_mode=args.sampling_mode,
283309
n_times_sampling=args.n_times_sampling,
284310
sampling_fraction_IG=args.sampling_fraction_IG,

run_utils.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
3636
benchmark_similarity_threshold=0.95, benchmark_exclude_ids=None, # added after review --> similarity threshold
3737
sampling_mode=False, n_times_sampling=200, sampling_fraction_IG=0.5, # Sampling mode added
3838
sampling_fraction_evo=0.3, sampling_dropout_rate=0.5, sampling_seed=42,
39-
radius=8.0, pep_sampling=None,
39+
radius=8.0, pep_sampling=None, benchmark_release_dates_map=None,
40+
target_release_date=None,
4041
):
4142
"""
4243
Initializes the PMGen modeling pipeline.
@@ -86,6 +87,8 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
8687
self.return_all_outputs = return_all_outputs
8788
self.benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
8889
self.benchmark_exclude_ids = benchmark_exclude_ids # added after review --> similarity threshold
90+
self.benchmark_release_dates_map = benchmark_release_dates_map # NEW
91+
self.target_release_date = target_release_date # NEW
8992
self.sampling_mode = sampling_mode
9093
self.n_times_sampling = n_times_sampling
9194
self.sampling_fraction_IG = sampling_fraction_IG
@@ -94,6 +97,7 @@ def __init__(self, peptide, mhc_seq, mhc_type, id, output_dir='output',
9497
self.sampling_seed = sampling_seed
9598
self.radius = radius
9699
self.pep_sampling = pep_sampling
100+
97101
self.input_assertion()
98102
if len(self.models) > 1:
99103
print(f'\n #### Warning! You are running for multiple models {self.models}'
@@ -186,6 +190,8 @@ def run_pandora(self, force_run=True):
186190
case.model(n_loop_models=self.num_templates, benchmark=self.benchmark,
187191
benchmark_similarity_threshold=(self.benchmark_similarity_threshold if self.benchmark else None), # added after review --> similarity threshold
188192
benchmark_exclude_ids=(self.benchmark_exclude_ids if self.benchmark else None),
193+
benchmark_release_dates_map=(self.benchmark_release_dates_map if self.benchmark else None),
194+
target_release_date=(self.target_release_date if self.benchmark else None),
189195
n_homology_models=self.n_homology_models,
190196
best_n_templates=self.best_n_templates)
191197
print("Pandora modeling completed successfully.")
@@ -439,7 +445,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
439445
benchmark_similarity_threshold=0.95, benchmark_exclude_ids=None, # added after review --> similarity threshold
440446
sampling_mode=False, n_times_sampling=200, sampling_fraction_IG=0.5, # Sampling mode added
441447
sampling_fraction_evo=0.3, sampling_dropout_rate=0.5, sampling_seed=42,
442-
radius=8.0, pep_sampling=None,
448+
radius=8.0, pep_sampling=None, benchmark_release_dates_map=None,
443449
):
444450
"""
445451
Initializes the run_PMGen_wrapper class.
@@ -466,6 +472,9 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
466472
:param pandora_force_run (bool): If active, PANDORA will be forced to run even if files already exist.
467473
:param return_all_outputs (bool): If active, all alphafold outputs are saved.
468474
:param benchmark_similarity_threshold (float): Only used during benchmarking, exludes sequences above this similarity threshold.
475+
:param benchmark_release_dates_map (dict or None): Only used during benchmarking. Maps 4-letter upper-case PDB IDs to release-date strings, built from the "release_date" (or "PDB release date") column of --df.
476+
When set, templates whose PDB release date is >= the target structure's release date are excluded.
477+
Templates whose PDB IDs are not in the map are kept (assumed pre-2018).
469478
The function `input_assertion()` checks if all inputs are correctly formatted and whether required files and directories exist.
470479
471480
Raises:
@@ -488,6 +497,7 @@ def __init__(self, df, output_dir, num_templates=4, num_recycles=3, models=['mod
488497
self.return_all_outputs = return_all_outputs
489498
self.benchmark_similarity_threshold = benchmark_similarity_threshold # added after review --> similarity threshold
490499
self.benchmark_exclude_ids = benchmark_exclude_ids
500+
self.benchmark_release_dates_map = benchmark_release_dates_map
491501
self.sampling_mode = sampling_mode
492502
self.n_times_sampling = n_times_sampling
493503
self.sampling_fraction_IG = sampling_fraction_IG
@@ -517,6 +527,11 @@ def run_wrapper(self, run_alphafold=True):
517527
pandora_force_run=self.pandora_force_run, no_modelling=self.no_modelling,
518528
return_all_outputs=self.return_all_outputs,
519529
benchmark_similarity_threshold=self.benchmark_similarity_threshold, benchmark_exclude_ids=self.benchmark_exclude_ids, # added after review --> similarity threshold
530+
benchmark_release_dates_map=self.benchmark_release_dates_map,
531+
target_release_date=(
532+
self.benchmark_release_dates_map.get(str(row['id']).split('_')[0][:4].upper())
533+
if self.benchmark_release_dates_map is not None else None
534+
),
520535
sampling_mode=self.sampling_mode, #v2
521536
n_times_sampling=self.n_times_sampling,
522537
sampling_fraction_IG=self.sampling_fraction_IG,
@@ -563,6 +578,11 @@ def process_row(self, row):
563578
pandora_force_run=self.pandora_force_run, no_modelling=self.no_modelling,
564579
return_all_outputs=self.return_all_outputs,
565580
benchmark_similarity_threshold=self.benchmark_similarity_threshold, benchmark_exclude_ids=self.benchmark_exclude_ids, # added after review --> similarity threshold
581+
benchmark_release_dates_map=self.benchmark_release_dates_map,
582+
target_release_date=(
583+
self.benchmark_release_dates_map.get(str(row['id']).split('_')[0][:4].upper())
584+
if self.benchmark_release_dates_map is not None else None
585+
),
566586
sampling_mode=self.sampling_mode, #v2
567587
n_times_sampling=self.n_times_sampling,
568588
sampling_fraction_IG=self.sampling_fraction_IG,
@@ -629,6 +649,11 @@ def run_wrapper_parallel(self, max_ram=3, max_cores=4, run_alphafold=True):
629649
models=self.models, alphafold_param_folder=self.alphafold_param_folder,
630650
fine_tuned_model_path=self.fine_tuned_model_path, no_modelling=self.no_modelling,
631651
return_all_outputs=self.return_all_outputs,
652+
benchmark_release_dates_map=self.benchmark_release_dates_map,
653+
target_release_date=(
654+
self.benchmark_release_dates_map.get(str(row['id']).split('_')[0][:4].upper())
655+
if self.benchmark_release_dates_map is not None else None
656+
),
632657
sampling_mode=self.sampling_mode, #v2
633658
n_times_sampling=self.n_times_sampling,
634659
sampling_fraction_IG=self.sampling_fraction_IG,

0 commit comments

Comments
 (0)