Skip to content

Commit f758567

Browse files
authored
Merge pull request #346 from bigict/data
refactor: use standard 3 letter res_name
2 parents 2f68349 + 090c48a commit f758567

3 files changed

Lines changed: 57 additions & 50 deletions

File tree

profold2/common/residue_constants.py

Lines changed: 43 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -80,52 +80,52 @@
8080
['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
8181
'VAL': [['CA', 'C', 'N', 'CA'], ['C', 'N', 'CA', 'C'], ['N', 'CA', 'C', 'O'],
8282
['N', 'CA', 'CB', 'CG1']],
83-
' DA': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
83+
'DA': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
8484
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
8585
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
8686
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
8787
['O4\'', 'C1\'', 'N9', 'C2']],
88-
' DC': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
88+
'DC': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
8989
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
9090
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
9191
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
9292
['O4\'', 'C1\'', 'N1', 'C2']],
93-
' DG': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
93+
'DG': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
9494
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
9595
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
9696
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
9797
['O4\'', 'C1\'', 'N9', 'C2']],
98-
' DT': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
98+
'DT': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
9999
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
100100
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
101101
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
102102
['O4\'', 'C1\'', 'N1', 'C2']],
103-
' DX': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
103+
'DX': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
104104
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
105105
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
106106
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
107107
[]],
108-
' A': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
108+
'A': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
109109
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
110110
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
111111
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
112112
['O4\'', 'C1\'', 'N9', 'C2'], ['O4\'', 'C1\'', 'C2\'', 'C3\'']],
113-
' C': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
113+
'C': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
114114
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
115115
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
116116
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
117117
['O4\'', 'C1\'', 'N1', 'C2'], ['O4\'', 'C1\'', 'C2\'', 'C3\'']],
118-
' G': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
118+
'G': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
119119
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
120120
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
121121
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
122122
['O4\'', 'C1\'', 'N9', 'C2'], ['O4\'', 'C1\'', 'C2\'', 'C3\'']],
123-
' U': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
123+
'U': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
124124
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
125125
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
126126
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
127127
['O4\'', 'C1\'', 'N1', 'C2'], ['O4\'', 'C1\'', 'C2\'', 'C3\'']],
128-
' X': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
128+
'X': [['P', 'OP2', 'OP1', 'P'], ['OP2', 'OP1', 'P', 'OP2'], [],
129129
['OP1', 'P', 'O5\'', 'C5\''], ['P', 'O5\'', 'C5\'', 'C4\''],
130130
['O5\'', 'C5\'', 'C4\'', 'C3\''], ['C5\'', 'C4\'', 'C3\'', 'O3\''],
131131
['C5\'', 'C4\'', 'O4\'', 'C1\''], ['C4\'', 'O4\'', 'C1\'', 'C2\''],
@@ -479,7 +479,7 @@
479479
['CG1', 4, (0.540, 1.429, -0.000)],
480480
['CG2', 4, (0.533, -0.776, 1.203)],
481481
],
482-
' DA': [
482+
'DA': [
483483
['OP1', 0, (-0.7319, 1.2920, 0.000)],
484484
['P', 0, (0.000, 0.000, 0.000)],
485485
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -502,7 +502,7 @@
502502
['C8', 10, (0.8438, -1.0825, 0.0000)],
503503
['N6', 10, (4.4402, 1.2598, 0.0000)],
504504
],
505-
' DC': [
505+
'DC': [
506506
['OP1', 0, (-0.7319, 1.2920, 0.000)],
507507
['P', 0, (0.000, 0.000, 0.000)],
508508
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -523,7 +523,7 @@
523523
['C5', 10, (2.0394, -1.1794, 0.0000)],
524524
['C6', 10, (0.7007, -1.1745, 0.0000)],
525525
],
526-
' DG': [
526+
'DG': [
527527
['OP1', 0, (-0.7319, 1.2920, 0.000)],
528528
['P', 0, (0.000, 0.000, 0.000)],
529529
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -547,7 +547,7 @@
547547
['N2', 10, (1.2085, 4.5537, 0.0000)],
548548
['O6', 10, (4.4017, 1.2743, 0.0000)],
549549
],
550-
' DT': [
550+
'DT': [
551551
['OP1', 0, (-0.7319, 1.2920, 0.000)],
552552
['P', 0, (0.000, 0.000, 0.000)],
553553
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -569,7 +569,7 @@
569569
['C7', 10, (2.7845, -2.5550, 0.0000)],
570570
['C6', 10, (0.7021, -1.1863, 0.0000)],
571571
],
572-
' DX': [
572+
'DX': [
573573
['OP1', 0, (-0.7319, 1.2920, 0.000)],
574574
['P', 0, (0.000, 0.000, 0.000)],
575575
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -582,7 +582,7 @@
582582
['C1\'', 8, (0.4913, 1.3316, 0.0000)],
583583
['C2\'', 9, (0.4167, 1.4603, 0.0000)],
584584
],
585-
' A': [
585+
'A': [
586586
['OP1', 0, (-0.7319, 1.2920, 0.000)],
587587
['P', 0, (0.000, 0.000, 0.000)],
588588
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -606,7 +606,7 @@
606606
['N7', 10, (2.1146, -0.7630, 0.0000)],
607607
['C8', 10, (0.8442, -1.0830, 0.0000)],
608608
],
609-
' C': [
609+
'C': [
610610
['OP1', 0, (-0.7319, 1.2920, 0.000)],
611611
['P', 0, (0.000, 0.000, 0.000)],
612612
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -628,7 +628,7 @@
628628
['C5', 10, (2.0635, -1.1476, 0.0000)],
629629
['C6', 10, (0.7250, -1.1627, 0.0000)],
630630
],
631-
' G': [
631+
'G': [
632632
['OP1', 0, (-0.7319, 1.2920, 0.000)],
633633
['P', 0, (0.000, 0.000, 0.000)],
634634
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -653,7 +653,7 @@
653653
['N7', 10, (2.0980, -0.7759, 0.0000)],
654654
['C8', 10, (0.8317, -1.0936, 0.0000)],
655655
],
656-
' U': [
656+
'U': [
657657
['OP1', 0, (-0.7319, 1.2920, 0.000)],
658658
['P', 0, (0.000, 0.000, 0.000)],
659659
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -675,7 +675,7 @@
675675
['C5', 10, (2.0746, -1.1833, 0.0000)],
676676
['C6', 10, (0.7378, -1.1648, 0.0000)],
677677
],
678-
' X': [
678+
'X': [
679679
['OP1', 0, (-0.7319, 1.2920, 0.000)],
680680
['P', 0, (0.000, 0.000, 0.000)],
681681
['OP2', 0, (1.4855, 0.000, 0.000)],
@@ -882,16 +882,16 @@ def make_bond_key(atom1_name, atom2_name):
882882
'TRP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2'],
883883
'TYR': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH', '', ''],
884884
'VAL': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '', '', '', '', '', '', ''],
885-
' DA': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N6', 'N7', 'C8', 'N9', '', ''],
886-
' DC': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', 'O2', 'N3', 'C4', 'C5', 'C6', 'N4', '', '', '', ''],
887-
' DG': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', 'N2', 'N3', 'C4', 'C5', 'C6', 'O6', 'N7', 'C8', 'N9', ''],
888-
' DT': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', 'O2', 'N3', 'C4', 'O4', 'C5', 'C6', 'C7', '', '', ''],
889-
' DX': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', '', '', '', '', '', '', '', '', '', ''],
890-
' A': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N6', 'N7', 'C8', 'N9', ''],
891-
' C': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', 'O2', 'N3', 'C4', 'C5', 'C6', 'N4', '', '', ''],
892-
' G': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', 'N2', 'N3', 'C4', 'C5', 'C6', 'O6', 'N7', 'C8', 'N9'],
893-
' U': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', 'O2', 'N3', 'C4', 'O4', 'C5', 'C6', '', '', ''],
894-
' X': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', '', '', '', '', '', '', '', '', ''],
885+
'DA': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N6', 'N7', 'C8', 'N9', '', ''],
886+
'DC': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', 'O2', 'N3', 'C4', 'C5', 'C6', 'N4', '', '', '', ''],
887+
'DG': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', 'N2', 'N3', 'C4', 'C5', 'C6', 'O6', 'N7', 'C8', 'N9', ''],
888+
'DT': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', 'O2', 'N3', 'C4', 'O4', 'C5', 'C6', 'C7', '', '', ''],
889+
'DX': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','C1\'','N1', 'C2', '', '', '', '', '', '', '', '', '', ''],
890+
'A': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N6', 'N7', 'C8', 'N9', ''],
891+
'C': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', 'O2', 'N3', 'C4', 'C5', 'C6', 'N4', '', '', ''],
892+
'G': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', 'N2', 'N3', 'C4', 'C5', 'C6', 'O6', 'N7', 'C8', 'N9'],
893+
'U': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', 'O2', 'N3', 'C4', 'O4', 'C5', 'C6', '', '', ''],
894+
'X': ['OP1', 'P', 'OP2', 'O5\'', 'C5\'','C4\'','O4\'','C3\'','O3\'','C2\'','O2\'','C1\'','N1', 'C2', '', '', '', '', '', '', '', '', ''],
895895
'UNK': ['N', 'CA', 'C', 'O', '', '', '', '', '', '', '', '', '', ''],
896896
}
897897
# pylint: enable=line-too-long
@@ -1052,16 +1052,17 @@ def sequence_to_onehot(
10521052
('W', PROT): 'TRP',
10531053
('Y', PROT): 'TYR',
10541054
('V', PROT): 'VAL',
1055-
('A', DNA ): ' DA',
1056-
('C', DNA ): ' DC',
1057-
('G', DNA ): ' DG',
1058-
('T', DNA ): ' DT',
1059-
('X', DNA ): ' DX',
1060-
('A', RNA ): ' A',
1061-
('C', RNA ): ' C',
1062-
('G', RNA ): ' G',
1063-
('U', RNA ): ' U',
1064-
('X', RNA ): ' X',
1055+
('X', PROT): 'UNK',
1056+
('A', DNA ): 'DA',
1057+
('C', DNA ): 'DC',
1058+
('G', DNA ): 'DG',
1059+
('T', DNA ): 'DT',
1060+
('X', DNA ): 'DX',
1061+
('A', RNA ): 'A',
1062+
('C', RNA ): 'C',
1063+
('G', RNA ): 'G',
1064+
('U', RNA ): 'U',
1065+
('X', RNA ): 'X',
10651066
}
10661067

10671068
# NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
@@ -1072,8 +1073,8 @@ def sequence_to_onehot(
10721073

10731074
# Define a restype name for all unknown residues.
10741075
unk_restype = 'UNK'
1075-
unk_dnatype = ' DX'
1076-
unk_rnatype = ' X'
1076+
unk_dnatype = 'DX'
1077+
unk_rnatype = 'X'
10771078

10781079
resnames = [restype_1to3[(r, moltype(i))]
10791080
for i, r in enumerate(restypes)] + [(unk_restype, PROT)]

tools/dataset_from_pdb.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,8 @@ def mmcif_yield_chain(mmcif_dict, args): # pylint: disable=redefined-outer-name
104104

105105
def _get_residue_id(residue_id, chain_type):
106106
del chain_type
107-
while len(residue_id) < 3:
108-
residue_id = f' {residue_id}'
107+
# while len(residue_id) < 3:
108+
# residue_id = f' {residue_id}'
109109
return fix_residue_id(residue_id)
110110

111111
def _get_unktype(chain_type):
@@ -144,7 +144,7 @@ def _make_npz(coord_list, coord_mask_list, bfactor_list):
144144
def _make_domain(start, end, delta=0):
145145
return (start + delta, end + delta)
146146

147-
chain_id, seq, domains = None, [], []
147+
chain_id, seq, domains, chain_skip = None, [], [], False
148148
int_resseq_start, int_resseq_end = None, None
149149
int_resseq_delta, int_resseq_offset = 0, 0
150150
coord_list, coord_mask_list, bfactor_list = [], [], []
@@ -160,7 +160,7 @@ def _make_domain(start, end, delta=0):
160160
continue
161161

162162
if chain_id_list[i] != chain_id:
163-
if exists(chain_id) and seq: # FIX: 5wj3
163+
if exists(chain_id) and seq and not chain_skip: # FIX: 5wj3
164164
domains += [
165165
_make_domain(
166166
int_resseq_start, int_resseq_end + int_resseq_offset, int_resseq_delta
@@ -172,7 +172,7 @@ def _make_domain(start, end, delta=0):
172172
bfactor_list.append(bfactors)
173173
npz = _make_npz(coord_list, coord_mask_list, bfactor_list)
174174
yield chain_id, chain_type_dict[chain_id], seq, domains, npz
175-
chain_id, seq, domains = chain_id_list[i], [], []
175+
chain_id, seq, domains, chain_skip = chain_id_list[i], [], [], False
176176
# assert chain_id in chain_type_dict
177177
int_resseq_start, int_resseq_end = None, None
178178
int_resseq_delta, int_resseq_offset = 0, 0
@@ -256,8 +256,14 @@ def _make_domain(start, end, delta=0):
256256
logger.debug(
257257
'residue_id: %s, chain_type: %s, exception: %s', residue_id, chain_type, e
258258
)
259+
except IndexError as e: # FIX: 5HT2
260+
logger.error(
261+
'entry.id: %s, residue_id: %s, chain_type: %s, exception: %s',
262+
mmcif_dict['_entry.id'], residue_id, chain_type, e
263+
)
264+
chain_skip = True
259265

260-
if exists(chain_id) and seq:
266+
if exists(chain_id) and seq and not chain_skip:
261267
domains += [
262268
_make_domain(
263269
int_resseq_start, int_resseq_end + int_resseq_offset, int_resseq_delta

tools/pdb_extract_comp_id.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,8 @@ def mmcif_yield_chain(mmcif_dict, _): # pylint: disable=redefined-outer-name
9191

9292
def _get_residue_id(residue_id, chain_type):
9393
del chain_type
94-
while len(residue_id) < 3:
95-
residue_id = f' {residue_id}'
94+
# while len(residue_id) < 3:
95+
# residue_id = f' {residue_id}'
9696
if residue_id == 'MSE':
9797
return 'MET'
9898
return residue_id

0 commit comments

Comments
 (0)