Skip to content

Commit 8134cb1

Browse files
authored
Merge pull request #59 from DeepRank/issue55
Issue55
2 parents afeabef + bab86ff commit 8134cb1

4 files changed

Lines changed: 3070 additions & 68 deletions

File tree

pdb2sql/StructureSimilarity.py

Lines changed: 96 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,23 @@ def __init__(self, decoy, ref, verbose=False):
6262
def __repr__(self):
6363
return f'{self.__module__}.{self.__class__.__name__}({self.decoy}, {self.ref}, {self.verbose})'
6464

65+
def check_residues(self):
66+
"""Check if the residue numbering matches."""
67+
68+
res_ref = pdb2sql(self.ref).get_residues()
69+
res_dec = pdb2sql(self.decoy).get_residues()
70+
71+
if res_ref != res_dec:
72+
print('Residues are different in the reference and decoy')
73+
print('Residues found in %s and not in %s' %
74+
(self.ref, self.decoy))
75+
print(set(res_ref).difference(set(res_dec)))
76+
print('Residues found in %s and not in %s' %
77+
(self.decoy, self.ref))
78+
print(set(res_dec).difference(set(res_ref)))
79+
raise ValueError(
80+
'Residue numbering not identical in ref and decoy')
81+
6582
##########################################################################
6683
#
6784
# FAST ROUTINE TO COMPUTE THE L-RMSD
@@ -71,16 +88,14 @@ def __repr__(self):
7188
#
7289
##########################################################################
7390

74-
# TODO add output zone files
75-
7691
# compute the L-RMSD
77-
def compute_lrmsd_fast(self, lzone=None, method='svd', check=True):
92+
def compute_lrmsd_fast(self, lzone=None, method='svd', check=True, name=['C', 'CA', 'N', 'O']):
7893
"""Fast routine to compute the L-RMSD.
7994
8095
L-RMSD is computed by aligning the longest chain of the decoy to
8196
the one of the reference and computing the RMSD of the shortest
82-
chain between decoy and reference. Both fitting and rms calculation
83-
use only backbone atoms. See reference:
97+
chain between decoy and reference. By default, both fitting and
98+
rms calculation use only backbone atoms. See reference:
8499
85100
DockQ: A Quality Measure for Protein-Protein Docking Models
86101
https://doi.org/10.1371/journal.pone.0161879
@@ -92,6 +107,8 @@ def compute_lrmsd_fast(self, lzone=None, method='svd', check=True):
92107
'svd' or 'quaternion'.
93108
check (bool, optional): Check if the sequences are aligned
94109
and fix it if not. Defaults to True.
110+
name (list, optional): atom name to include in the zone.
111+
Defaults to ['C', 'CA', 'N', 'O']
95112
96113
Returns:
97114
float: L-RMSD value of the conformation
@@ -115,30 +132,39 @@ def compute_lrmsd_fast(self, lzone=None, method='svd', check=True):
115132
# 1. get_data_zone_backbone returns in_zone and not_in_zone
116133
# here the in_zone defines the zone for fitting,
117134
# and not_in_zone defines the zone for rms calculation.
118-
# 2. the decoy and ref pdb must have consistent residue
119-
# numbering, otherwise e.g. shifted numbering can also give
120-
# results which is totally wrong, because the code here does
121-
# not do sequence alignment.
135+
136+
self.check_residues()
122137

123138
data_decoy_long, data_decoy_short = self.get_data_zone_backbone(
124-
self.decoy, resData, return_not_in_zone=True)
139+
self.decoy, resData, return_not_in_zone=True, name=name)
140+
125141
data_ref_long, data_ref_short = self.get_data_zone_backbone(
126-
self.ref, resData, return_not_in_zone=True)
142+
self.ref, resData, return_not_in_zone=True, name=name)
143+
144+
if data_decoy_long.symmetric_difference(data_ref_long) != set():
145+
raise ValueError(
146+
'Issue in the calculation of the l-rmsd')
127147

128148
atom_long = data_ref_long.intersection(data_decoy_long)
129149
xyz_decoy_long = self._get_xyz(self.decoy, atom_long)
130150
xyz_ref_long = self._get_xyz(self.ref, atom_long)
131151

152+
if data_decoy_short.symmetric_difference(data_ref_short) != set():
153+
raise ValueError(
154+
'Issue in the calculation of the l-rmsd')
155+
132156
atom_short = data_ref_short.intersection(data_decoy_short)
133157
xyz_decoy_short = self._get_xyz(self.decoy, atom_short)
134158
xyz_ref_short = self._get_xyz(self.ref, atom_short)
135159

136160
# extract the xyz
137161
else:
162+
138163
xyz_decoy_long, xyz_decoy_short = self.get_xyz_zone_backbone(
139-
self.decoy, resData, return_not_in_zone=True)
164+
self.decoy, resData, return_not_in_zone=True, name=name)
165+
140166
xyz_ref_long, xyz_ref_short = self.get_xyz_zone_backbone(
141-
self.ref, resData, return_not_in_zone=True)
167+
self.ref, resData, return_not_in_zone=True, name=name)
142168

143169
xyz_decoy_short = superpose_selection(
144170
xyz_decoy_short, xyz_decoy_long, xyz_ref_long, method)
@@ -218,12 +244,8 @@ def compute_lzone(self, save_file=True, filename=None):
218244
#
219245
##########################################################################
220246

221-
def compute_irmsd_fast(
222-
self,
223-
izone=None,
224-
method='svd',
225-
cutoff=10,
226-
check=True):
247+
def compute_irmsd_fast(self, izone=None, method='svd',
248+
cutoff=10, check=True):
227249
"""Fast method to compute the i-rmsd.
228250
229251
i-RMSD is computed by selecting the backbone atoms of reference
@@ -254,16 +276,25 @@ def compute_irmsd_fast(
254276
# read the izone file
255277
if izone is None:
256278
resData = self.compute_izone(cutoff, save_file=False)
279+
elif not os.path.isfile(izone):
280+
resData = self.compute_izone(
281+
cutoff, save_file=True, filename=izone)
257282
else:
258283
resData = self.read_zone(izone)
259284

260285
if check:
261286

287+
self.check_residues()
288+
262289
data_decoy = self.get_data_zone_backbone(
263290
self.decoy, resData, return_not_in_zone=False)
264291
data_ref = self.get_data_zone_backbone(
265292
self.ref, resData, return_not_in_zone=False)
266293

294+
if data_ref.symmetric_difference(data_decoy) != set():
295+
raise ValueError(
296+
'Issue in the calculation of the i-rmsd')
297+
267298
atom_common = data_ref.intersection(data_decoy)
268299
xyz_contact_decoy = self._get_xyz(self.decoy, atom_common)
269300
xyz_contact_ref = self._get_xyz(self.ref, atom_common)
@@ -277,7 +308,8 @@ def compute_irmsd_fast(
277308

278309
# superpose the fragments
279310
xyz_contact_decoy = superpose_selection(xyz_contact_decoy,
280-
xyz_contact_decoy, xyz_contact_ref, method)
311+
xyz_contact_decoy,
312+
xyz_contact_ref, method)
281313

282314
# return the RMSD
283315
return self.get_rmsd(xyz_contact_decoy, xyz_contact_ref)
@@ -473,7 +505,7 @@ def compute_residue_pairs_ref(
473505
##########################################################################
474506

475507
# compute the L-RMSD
476-
def compute_lrmsd_pdb2sql(self, exportpath=None, method='svd'):
508+
def compute_lrmsd_pdb2sql(self, exportpath=None, method='svd', **kwargs):
477509
"""Slow routine to compute the L-RMSD.
478510
479511
L-RMSD is computed by aligning the longest chain of the decoy to
@@ -490,13 +522,25 @@ def compute_lrmsd_pdb2sql(self, exportpath=None, method='svd'):
490522
method (str, optional): Method to align the fragments,
491523
'svd' or 'quaternion'.
492524
525+
Kwargs: selection keywords used in the pdb2sql.get() method :
526+
'rowID', 'serial', 'name', 'altLoc',
527+
'resName', 'resSeq', 'iCode',
528+
'x', 'y', 'z', 'occ', 'temp', 'element', 'model'
529+
530+
493531
Returns:
494532
float: L-RMSD value of the conformation
495533
496534
See also:
497535
:meth:`compute_lrmsd_fast`
498536
"""
499537
backbone = ['CA', 'C', 'N', 'O']
538+
if 'name' not in kwargs:
539+
kwargs['name'] = backbone
540+
541+
if 'chainID' in kwargs:
542+
raise ValueError(
543+
'do not specify chainID in compute_lrmsd_pdb2sql')
500544

501545
# create the sql
502546
sql_decoy = pdb2sql(self.decoy, sqlfile='decoy.db')
@@ -515,24 +559,24 @@ def compute_lrmsd_pdb2sql(self, exportpath=None, method='svd'):
515559

516560
# extract the pos of chains A
517561
xyz_decoy_A = np.array(
518-
sql_decoy.get('x,y,z', chainID=chain1, name=backbone))
562+
sql_decoy.get('x,y,z', chainID=chain1, **kwargs))
519563
xyz_ref_A = np.array(sql_ref.get(
520-
'x,y,z', chainID=chain1, name=backbone))
564+
'x,y,z', chainID=chain1, **kwargs))
521565

522566
# extract the pos of chains B
523567
xyz_decoy_B = np.array(
524-
sql_decoy.get('x,y,z', chainID=chain2, name=backbone))
568+
sql_decoy.get('x,y,z', chainID=chain2, **kwargs))
525569
xyz_ref_B = np.array(sql_ref.get(
526-
'x,y,z', chainID=chain2, name=backbone))
570+
'x,y,z', chainID=chain2, **kwargs))
527571

528572
# check the lengthes
529573
if len(xyz_decoy_A) != len(xyz_ref_A):
530574
xyz_decoy_A, xyz_ref_A = self.get_identical_atoms(
531-
sql_decoy, sql_ref, chain1)
575+
sql_decoy, sql_ref, chain1, **kwargs)
532576

533577
if len(xyz_decoy_B) != len(xyz_ref_B):
534578
xyz_decoy_B, xyz_ref_B = self.get_identical_atoms(
535-
sql_decoy, sql_ref, chain2)
579+
sql_decoy, sql_ref, **kwargs)
536580

537581
# detect which chain is the longest
538582
nA, nB = len(xyz_decoy_A), len(xyz_decoy_B)
@@ -611,23 +655,28 @@ def compute_lrmsd_pdb2sql(self, exportpath=None, method='svd'):
611655
# RETURN THE ATOMS THAT ARE SHARED BY THE TWO DB
612656
# FOR A GIVEN CHAINID
613657
@staticmethod
614-
def get_identical_atoms(db1, db2, chain):
658+
def get_identical_atoms(db1, db2, chain, **kwargs):
615659
"""Return that atoms shared by both databse for a specific chain.
616660
617661
Args:
618662
db1 (TYPE): pdb2sql database of the first conformation
619663
db2 (TYPE): pdb2sql database of the 2nd conformation
620664
chain (str): chain name
621665
666+
Kwargs: selection keywords used in the pdb2sql.get() method :
667+
'rowID', 'serial', 'name', 'altLoc',
668+
'resName', 'chainID', 'resSeq', 'iCode',
669+
'x', 'y', 'z', 'occ', 'temp', 'element', 'model'
670+
622671
Returns:
623672
list, list: list of xyz for both database
624673
"""
625-
backbone = ['CA', 'C', 'N', 'O']
674+
626675
# get data
627676
data1 = db1.get('chainID,resSeq,name',
628-
chainID=chain, name=backbone)
677+
chainID=chain, **kwargs)
629678
data2 = db2.get('chainID,resSeq,name',
630-
chainID=chain, name=backbone)
679+
chainID=chain, **kwargs)
631680

632681
# tuplify
633682
data1 = [tuple(d1) for d1 in data1]
@@ -922,7 +971,7 @@ def compute_fnat_pdb2sql(self, cutoff=5.0):
922971
#
923972
##########################################################################
924973
@staticmethod
925-
def get_xyz_zone_backbone(pdb_file, resData, return_not_in_zone=False):
974+
def get_xyz_zone_backbone(pdb_file, resData, return_not_in_zone=False, name=['C', 'CA', 'N', 'O']):
926975
"""Get the xyz of zone backbone atoms.
927976
928977
Args:
@@ -950,18 +999,17 @@ def get_xyz_zone_backbone(pdb_file, resData, return_not_in_zone=False):
950999
chainID = line[72]
9511000

9521001
resSeq = int(line[22:26])
953-
name = line[12:16].strip()
1002+
atname = line[12:16].strip()
9541003

9551004
x = float(line[30:38])
9561005
y = float(line[38:46])
9571006
z = float(line[46:54])
9581007

959-
backbone = ['C', 'CA', 'N', 'O']
960-
if chainID in resData.keys():
961-
if resSeq in resData[chainID] and name in backbone:
962-
xyz_in_zone.append([x, y, z])
963-
else:
964-
if name in backbone:
1008+
if atname in name:
1009+
if chainID in resData.keys():
1010+
if resSeq in resData[chainID]:
1011+
xyz_in_zone.append([x, y, z])
1012+
else:
9651013
xyz_not_in_zone.append([x, y, z])
9661014

9671015
if return_not_in_zone:
@@ -971,7 +1019,7 @@ def get_xyz_zone_backbone(pdb_file, resData, return_not_in_zone=False):
9711019
return xyz_in_zone
9721020

9731021
@staticmethod
974-
def get_data_zone_backbone(pdb_file, resData, return_not_in_zone=False):
1022+
def get_data_zone_backbone(pdb_file, resData, return_not_in_zone=False, name=['C', 'CA', 'N', 'O']):
9751023
"""Get the data (chainID, resSeq, name) of backbone atoms in the zone.
9761024
9771025
Args:
@@ -999,19 +1047,14 @@ def get_data_zone_backbone(pdb_file, resData, return_not_in_zone=False):
9991047
chainID = line[72]
10001048

10011049
resSeq = int(line[22:26])
1002-
name = line[12:16].strip()
1003-
1004-
backbone = ['C', 'CA', 'N', 'O']
1005-
1006-
if chainID in resData.keys():
1007-
1008-
if resSeq in resData[chainID] and name in backbone:
1009-
data_in_zone.append((chainID, resSeq, name))
1010-
1011-
else:
1012-
if name in backbone:
1013-
data_not_in_zone.append(
1014-
(chainID, resSeq, name))
1050+
atname = line[12:16].strip()
1051+
1052+
if atname in name:
1053+
if chainID in resData.keys():
1054+
if resSeq in resData[chainID]:
1055+
data_in_zone.append((chainID, resSeq, atname))
1056+
else:
1057+
data_not_in_zone.append((chainID, resSeq, atname))
10151058

10161059
if return_not_in_zone:
10171060
return set(data_in_zone), set(data_not_in_zone)

test/pdb/1AK4/1AK4_5w.pdb

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2953,12 +2953,4 @@ ATOM 2912 OH TYR B 145 -31.997 15.593 0.288 1.00 10.00 B
29532953
ATOM 2913 HH TYR B 145 -31.415 16.340 0.449 1.00 10.00 B
29542954
ATOM 2914 C TYR B 145 -35.691 15.026 7.243 1.00 10.00 B
29552955
ATOM 2915 O TYR B 145 -36.878 14.913 6.937 1.00 10.00 B
2956-
ATOM 2916 N SER B 146 -35.253 14.903 8.489 1.00 10.00 B
2957-
ATOM 2917 HN SER B 146 -34.306 15.070 8.683 1.00 10.00 B
2958-
ATOM 2918 CA SER B 146 -36.149 14.514 9.572 1.00 10.00 B
2959-
ATOM 2919 CB SER B 146 -35.350 14.103 10.810 1.00 10.00 B
2960-
ATOM 2920 OG SER B 146 -34.583 15.188 11.306 1.00 10.00 B
2961-
ATOM 2921 HG SER B 146 -33.992 15.513 10.613 1.00 10.00 B
2962-
ATOM 2922 C SER B 146 -37.134 15.618 9.937 1.00 10.00 B
2963-
ATOM 2923 O SER B 146 -38.105 15.361 10.650 1.00 10.00 B
2964-
END
2956+
END

0 commit comments

Comments
 (0)