Skip to content

Commit d446fd4

Browse files
committed
added support for templating a residue through an SDF file
1 parent 027d9a8 commit d446fd4

3 files changed

Lines changed: 126 additions & 12 deletions

File tree

moleculekit/molecule.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3113,17 +3113,24 @@ def templateResidueFromMolecule(
31133113
_logger=True,
31143114
):
31153115
"""Assign bonds, bond orders, formal charges and (optionally) hydrogens
3116-
to a residue from a reference Molecule template, matched by atom name.
3116+
to a residue from a reference Molecule template.
31173117
31183118
Like :meth:`templateResidueFromSmiles`, but the template is a reference
31193119
Molecule (e.g. loaded from a CIF) that already carries bonds, bond
3120-
orders and formal charges. The selected residue's heavy atoms are
3121-
mapped onto the reference by NAME and the reference's bond orders and
3122-
formal charges are transferred verbatim, so they must already be
3123-
correct in the reference (this function does not re-derive them). The
3124-
reference is used only as a template and is never appended. The
3125-
molecule is mutated in place. The residue and reference must share the
3126-
same set of heavy-atom names.
3120+
orders and formal charges. When the reference's heavy-atom names are
3121+
unique and equal to the residue's, the atoms are mapped by NAME and the
3122+
reference's bond orders and formal charges are transferred verbatim
3123+
(ideal for CIF references: unambiguous under molecular symmetry). When
3124+
the names do not match (for example a reference read from an SDF file,
3125+
whose atom names are only element symbols), the reference is converted
3126+
to a SMILES and matched by element and connectivity instead; for a
3127+
symmetric residue this can place a charge or double bond on an
3128+
equivalent atom differently while giving the same molecule. Either way
3129+
the reference is used only as a template and is never appended, and the
3130+
molecule is mutated in place. The reference must describe the same
3131+
residue: it may carry extra terminal atoms (such as a free amino acid's
3132+
OXT or a covalent leaving group), which are stripped, but a reference
3133+
missing heavy atoms of the residue raises.
31273134
31283135
Parameters
31293136
----------
@@ -3132,9 +3139,9 @@ def templateResidueFromMolecule(
31323139
of the residue(s) to template. May span multiple copies.
31333140
refmol : Molecule
31343141
The reference template. Its bonds, bond orders and formal charges
3135-
are copied onto the residue verbatim, so they must already be
3136-
correct; heavy-atom names must be unique and must match the
3137-
residue's.
3142+
must already be correct. When its heavy-atom names are unique and
3143+
match the residue's they are used directly; otherwise it is matched
3144+
by SMILES, so unnamed references (e.g. from SDF) are supported.
31383145
addHs : bool
31393146
If True, add hydrogens after bond orders are transferred.
31403147
onlyOnAtoms : str or np.ndarray
@@ -3145,7 +3152,7 @@ def templateResidueFromMolecule(
31453152
Examples
31463153
--------
31473154
>>> mol = Molecule("complex.pdb") # doctest: +SKIP
3148-
>>> mol.templateResidueFromMolecule("resname LIG", Molecule("LIG.cif"), addHs=True, guessBonds=True)
3155+
>>> mol.templateResidueFromMolecule("resname LIG", Molecule("LIG.cif"), addHs=True, guessBonds=True) # doctest: +SKIP
31493156
"""
31503157
from moleculekit.rdkittools import template_residue_from_molecule
31513158

moleculekit/rdkittools.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,23 @@ def template_residue_from_smiles(
746746
"mol.guessBonds() before templating, or rely on LINK/CONECT records)."
747747
)
748748

749+
# Every residue heavy atom must be covered by the match. The check above
750+
# handles a template that is a superset of the residue (extra template
751+
# atoms are stripped); this handles the opposite case, an incomplete
752+
# template that would leave a residue heavy atom with its guessed bonds.
753+
# Erroring here keeps a reference missing heavy atoms (routed in from
754+
# template_residue_from_molecule) from producing structurally wrong output.
755+
res_heavy_idx = [a.GetIdx() for a in rmol.GetAtoms() if a.GetSymbol() != "H"]
756+
unmatched_res = sorted(set(res_heavy_idx) - set(at1))
757+
if unmatched_res:
758+
unmatched_names = [str(residue.name[i]) for i in unmatched_res]
759+
raise RuntimeError(
760+
f"The SMILES template '{smiles}' does not cover all heavy atoms of "
761+
f"the residue; unmatched residue atoms: {unmatched_names}. The "
762+
"template must describe the same residue (it may carry extra "
763+
"terminal atoms, but must not be missing any)."
764+
)
765+
749766
_apply_template_mapping(
750767
mol,
751768
selidx,
@@ -859,6 +876,42 @@ def template_residue_from_molecule(
859876
"The selection contains gaps in the atom indexes. Please select a single molecule residue only."
860877
)
861878

879+
# Decide the matching strategy for this single residue. Verbatim name
880+
# matching is only viable when both sides have unique heavy-atom names and
881+
# the name sets are equal (e.g. a CIF reference); it is unambiguous under
882+
# molecular symmetry. Otherwise (e.g. an SDF reference, whose atom names are
883+
# only element symbols) convert the reference to a SMILES and match by
884+
# element and connectivity through the SMILES path.
885+
res_heavy_names = mol.name[selidx][mol.element[selidx] != "H"]
886+
ref_heavy_names = refmol.name[refmol.element != "H"]
887+
name_match_viable = (
888+
len(set(res_heavy_names)) == len(res_heavy_names)
889+
and len(set(ref_heavy_names)) == len(ref_heavy_names)
890+
and set(res_heavy_names) == set(ref_heavy_names)
891+
)
892+
if not name_match_viable:
893+
ref_rdkit = refmol.toRDKitMol(
894+
sanitize=True, kekulize=False, assignStereo=False, _logger=False
895+
)
896+
smiles = Chem.MolToSmiles(ref_rdkit)
897+
if _logger:
898+
logger.info(
899+
"Reference atom names do not match the residue (e.g. an SDF "
900+
"reference); matching by SMILES instead. Reference SMILES: "
901+
f"{smiles!r}"
902+
)
903+
template_residue_from_smiles(
904+
mol,
905+
selidx,
906+
smiles,
907+
sanitizeSmiles=True,
908+
addHs=addHs,
909+
onlyOnAtoms=onlyOnAtoms,
910+
guessBonds=guessBonds,
911+
_logger=_logger,
912+
)
913+
return
914+
862915
cross_bonds, sel_start, sel_end = _detect_cross_residue_bonds(mol, selidx)
863916

864917
residue = mol.copy(sel=selidx)

tests/test_rdkittools.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,50 @@ def _cmp(mol, ref_mol):
106106
)
107107

108108

109+
def test_templateResidueFromMolecule_sdf_reference(tmp_path):
110+
"""A reference Molecule without matching atom names (e.g. read from an SDF
111+
file, whose atom names are only element symbols) is matched by converting it
112+
to SMILES and using MCS, giving the same result as the name-matched path."""
113+
import numpy as np
114+
115+
testdir = os.path.join(curr_dir, "test_molecule", "test_templating")
116+
ref_named = Molecule(os.path.join(testdir, "BEN_pH7.4.cif"))
117+
118+
# Round-trip the reference through SDF. SDF keeps bonds, bond orders and
119+
# formal charges but assigns element-only atom names, so name matching is no
120+
# longer viable and the SMILES route is taken.
121+
sdf_path = os.path.join(tmp_path, "ref.sdf")
122+
ref_named.write(sdf_path)
123+
ref_sdf = Molecule(sdf_path)
124+
# The SDF reference's names are not unique (element symbols repeat), so the
125+
# name path cannot apply and the SMILES route must be exercised.
126+
assert len(np.unique(ref_sdf.name)) < ref_sdf.numAtoms
127+
128+
mol = Molecule(os.path.join(testdir, "BEN.pdb"))
129+
mol.templateResidueFromMolecule(
130+
"resname BEN", ref_sdf, addHs=True, guessBonds=True
131+
)
132+
133+
# The SMILES / MCS path is symmetry-tolerant: benzamidine's two amidine
134+
# nitrogens are interchangeable, so which one carries the +1 and the double
135+
# bond can differ from the name-matched reference by a resonance swap. It is
136+
# the same molecule, so compare by canonical SMILES (invariant under that
137+
# automorphism) plus atom count, composition and net charge.
138+
from rdkit import Chem
139+
140+
def _canonical(m):
141+
rd = m.toRDKitMol(
142+
sanitize=True, kekulize=False, assignStereo=False, _logger=False
143+
)
144+
return Chem.MolToSmiles(Chem.RemoveHs(rd))
145+
146+
ben = mol.copy(sel="resname BEN")
147+
assert ben.numAtoms == ref_named.numAtoms
148+
assert sorted(ben.element) == sorted(ref_named.element)
149+
assert int(ben.formalcharge.sum()) == int(ref_named.formalcharge.sum())
150+
assert _canonical(ben) == _canonical(ref_named)
151+
152+
109153
def test_templateResidueFromSmiles_multiresidue():
110154
"""A selection that spans multiple residues with the same resname
111155
(e.g. ``resname BEN`` when there are several BEN copies in the
@@ -199,6 +243,16 @@ def test_templateResidueFromSmiles_incorrect_smiles():
199243
assert "2" in ben.bondtype
200244

201245

246+
def test_templateResidueFromSmiles_incomplete_template_errors():
247+
"""A SMILES that does not cover every residue heavy atom must raise rather
248+
than silently leaving atoms untemplated. Benzene matches only BEN's ring,
249+
leaving the amidine C and two N atoms unmatched."""
250+
testdir = os.path.join(curr_dir, "test_molecule", "test_templating")
251+
mol = Molecule(os.path.join(testdir, "BEN.pdb"))
252+
with pytest.raises(RuntimeError):
253+
mol.templateResidueFromSmiles("resname BEN", "c1ccccc1", guessBonds=True)
254+
255+
202256
@pytest.mark.parametrize(
203257
"smiles",
204258
(

0 commit comments

Comments
 (0)