Skip to content

Commit 433aebb

Browse files
committed
better detecting of cross-residue bonds both by distance and by deposited bonds
1 parent 8493c09 commit 433aebb

6 files changed

Lines changed: 515 additions & 81 deletions

File tree

moleculekit/rdkittools.py

Lines changed: 35 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def _try_strip_unmatched_terminals(
316316
_BONDTYPE_ORDER = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "ar": 1, "un": 1, "mc": 1}
317317

318318

319-
def _detect_cross_residue_bonds(mol, selidx):
319+
def _detect_interresidue_bonds(mol, selidx):
320320
"""Return ``(cross_bonds, sel_start, sel_end)`` for covalent bonds linking
321321
the selected residue to the rest of ``mol``.
322322
@@ -342,55 +342,39 @@ def _detect_cross_residue_bonds(mol, selidx):
342342
else:
343343
cross_bonds.append((b - sel_start, a, bt))
344344

345-
# PDB inputs often arrive without explicit peptide / phosphodiester bonds
346-
# (CONECT records cover only HET groups). Without them, the boundary
347-
# atoms look free-standing and AddHs over-protonates them. Mirror the
348-
# proximity check used by ``_has_peptide_neighbour`` in
349-
# nonstandard_residues so a caller doesn't have to run ``mol.guessBonds``
350-
# first.
351-
sel_names = mol.name[selidx]
352-
sel_elems = mol.element[selidx]
353-
354-
def _add_cross_by_proximity(local_idx, other_mask_full, threshold):
355-
if local_idx in {li for li, _, _ in cross_bonds}:
356-
return
357-
own_pos = mol.coords[selidx[local_idx], :, mol.frame]
358-
other_mask = other_mask_full.copy()
359-
other_mask[selidx] = False
360-
candidates = np.where(other_mask)[0]
361-
if not len(candidates):
362-
return
363-
d = np.linalg.norm(
364-
mol.coords[candidates, :, mol.frame] - own_pos, axis=1
365-
)
366-
within = d < threshold
367-
if not within.any():
368-
return
369-
partner = int(candidates[np.argmin(np.where(within, d, np.inf))])
370-
cross_bonds.append((local_idx, partner, "1"))
371-
372-
# Peptide N-C bonds (~1.33 A, threshold 1.6 A)
373-
if {"N", "CA", "C"}.issubset(sel_names):
374-
for own_side, other_name in (("N", "C"), ("C", "N")):
375-
hits = np.where(sel_names == own_side)[0]
376-
if len(hits):
377-
_add_cross_by_proximity(
378-
int(hits[0]), mol.name == other_name, threshold=1.6
379-
)
380-
381-
# Nucleic acid phosphodiester P-O3' bonds (~1.6 A, threshold 1.8 A).
382-
# Two directions: (1) own P to external O3' of previous residue,
383-
# (2) own O3' to external P of next residue. The O3' check also runs for
384-
# 5'-terminal residues that lack their own P but still bond to next.
385-
if "P" in sel_elems:
386-
for own_p_idx in np.where(sel_elems == "P")[0]:
387-
_add_cross_by_proximity(
388-
int(own_p_idx), mol.element == "O", threshold=1.8
389-
)
390-
for own_o3_idx in np.where(sel_names == "O3'")[0]:
391-
_add_cross_by_proximity(
392-
int(own_o3_idx), mol.element == "P", threshold=1.8
345+
# PDB inputs often arrive without explicit peptide / phosphodiester /
346+
# isopeptide bonds (CONECT records cover only HET groups). Without them the
347+
# boundary atoms look free-standing and AddHs over-protonates them (or leaves
348+
# an isopeptide carbon over-valent). Recover them from geometry against the
349+
# file-adjacent residues using the shared ``geometric_interresidue_links`` primitive, so
350+
# the definition and thresholds match autoSegment / detect / systemPrepare.
351+
# This covers the standard peptide / phosphodiester backbone AND the
352+
# non-standard side-chain isopeptide (e.g. microcystin's ACB.CG -> ARG.N) in
353+
# one pass.
354+
from moleculekit.tools.nonstandard_residues import geometric_interresidue_links
355+
356+
def _residue_atoms_of(rep):
357+
mask = (
358+
(mol.resid == mol.resid[rep])
359+
& (mol.chain == mol.chain[rep])
360+
& (mol.insertion == mol.insertion[rep])
361+
& (mol.segid == mol.segid[rep])
393362
)
363+
return np.where(mask)[0]
364+
365+
neighbors = []
366+
if sel_start > 0:
367+
neighbors.append(_residue_atoms_of(sel_start - 1))
368+
if sel_end + 1 < mol.numAtoms:
369+
neighbors.append(_residue_atoms_of(sel_end + 1))
370+
371+
have_cross = {li for li, _, _ in cross_bonds}
372+
for neighbor in neighbors:
373+
for ia, ib, _kind in geometric_interresidue_links(mol, selidx, neighbor):
374+
local_idx = int(ia) - sel_start
375+
if local_idx not in have_cross:
376+
cross_bonds.append((local_idx, int(ib), "1"))
377+
have_cross.add(local_idx)
394378

395379
return cross_bonds, sel_start, sel_end
396380

@@ -645,7 +629,7 @@ def template_residue_from_smiles(
645629
"The selection contains gaps in the atom indexes. Please select a single molecule residue only."
646630
)
647631

648-
cross_bonds, sel_start, sel_end = _detect_cross_residue_bonds(mol, selidx)
632+
cross_bonds, sel_start, sel_end = _detect_interresidue_bonds(mol, selidx)
649633

650634
residue = mol.copy(sel=selidx)
651635
if guessBonds:
@@ -912,7 +896,7 @@ def template_residue_from_molecule(
912896
)
913897
return
914898

915-
cross_bonds, sel_start, sel_end = _detect_cross_residue_bonds(mol, selidx)
899+
cross_bonds, sel_start, sel_end = _detect_interresidue_bonds(mol, selidx)
916900

917901
residue = mol.copy(sel=selidx)
918902
if guessBonds:

moleculekit/tools/autosegment.py

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -102,24 +102,42 @@ def _polymer_linked(
102102
``nucleic_fallback_cutoff``. If no usable atom pair exists, returns False
103103
(treated as a break).
104104
"""
105+
from moleculekit.tools.nonstandard_residues import (
106+
geometric_interresidue_links,
107+
_CANONICAL_RESNAMES,
108+
)
109+
105110
if category == "protein":
106-
c = _residue_atom_coord(mol, prev_idx, "C")
107-
n = _residue_atom_coord(mol, curr_idx, "N")
108-
if c is not None and n is not None:
109-
if float(np.linalg.norm(c - n)) <= protein_cutoff:
111+
# Geometric backbone continuation via the shared primitive: a peptide
112+
# bond always links; a side-chain isopeptide links only at a NON-canonical
113+
# junction (geometric inference is less certain than a deposited bond, so
114+
# two canonical residues are not merged on a coincidental side-chain
115+
# contact - e.g. microcystin's ACB.CG->ARG.N links, a Gln-Lys crosslink
116+
# between two standard residues does not).
117+
noncanon = (
118+
str(mol.resname[prev_idx[0]]) not in _CANONICAL_RESNAMES
119+
or str(mol.resname[curr_idx[0]]) not in _CANONICAL_RESNAMES
120+
)
121+
for _, _, k in geometric_interresidue_links(
122+
mol, prev_idx, curr_idx, amide_dist=protein_cutoff
123+
):
124+
if k == "peptide" or (k == "isopeptide" and noncanon):
110125
return True
111-
else:
126+
# CA-CA fallback ONLY when the backbone C/N needed for the precise check
127+
# is missing (an incomplete residue) - never for present-but-far C/N,
128+
# which is a real chain gap.
129+
if (
130+
_residue_atom_coord(mol, prev_idx, "C") is None
131+
or _residue_atom_coord(mol, curr_idx, "N") is None
132+
):
112133
ca0 = _residue_atom_coord(mol, prev_idx, "CA")
113134
ca1 = _residue_atom_coord(mol, curr_idx, "CA")
114135
if ca0 is not None and ca1 is not None:
115136
if float(np.linalg.norm(ca0 - ca1)) <= ca_fallback_cutoff:
116137
return True
117-
# Distance alone misses NON-standard backbone links - the backbone N of
118-
# ``curr`` acylated by a side-chain carbonyl of ``prev`` (e.g. the
119-
# gamma-glutamyl isopeptide FGA.CD->DAM.N in microcystin), or prev's
120-
# backbone C bonded into curr. Honor such an explicit bond so the
121-
# connected polymer stays one segment.
122-
return _has_interresidue_backbone_bond(
138+
# Last resort: honor an explicit deposited backbone bond the geometry
139+
# above did not catch (a stretched or non-amide CONECT link).
140+
return _has_deposited_backbone_bond(
123141
mol, prev_idx, curr_idx, prev_names=("C",), curr_names=("N",)
124142
)
125143

@@ -134,23 +152,23 @@ def _polymer_linked(
134152
if c3 is not None and p is not None:
135153
if float(np.linalg.norm(c3 - p)) <= nucleic_fallback_cutoff:
136154
return True
137-
return _has_interresidue_backbone_bond(
155+
return _has_deposited_backbone_bond(
138156
mol, prev_idx, curr_idx,
139157
prev_names=("O3'", "O3*", "C3'", "C3*"), curr_names=("P",),
140158
)
141159

142160
return False
143161

144162

145-
def _has_interresidue_backbone_bond(mol, prev_idx, curr_idx, prev_names, curr_names):
146-
"""Return True if an explicit bond joins the two residues at the backbone:
147-
a bond from ``curr``'s backbone atom (any of ``curr_names``) to any atom of
148-
``prev``, or from ``prev``'s backbone atom (any of ``prev_names``) to any
149-
atom of ``curr``. This catches non-standard backbone linkages (e.g. an
150-
isopeptide acylating curr's N) that a pure distance check misses, while
151-
NOT merging side-chain-only crosslinks (e.g. a disulfide), which touch
152-
neither backbone atom. Only consulted as a fallback when the distance
153-
checks fail, so the per-pair bond scan runs at most at chain breaks.
163+
def _has_deposited_backbone_bond(mol, prev_idx, curr_idx, prev_names, curr_names):
164+
"""Return True if an EXPLICIT (deposited) bond joins the two residues at the
165+
backbone: a bond from ``curr``'s backbone atom (any of ``curr_names``) to any
166+
atom of ``prev``, or from ``prev``'s backbone atom (any of ``prev_names``) to
167+
any atom of ``curr``. This honors a deposited CONECT link the geometric
168+
:func:`geometric_interresidue_links` check could miss (a stretched or non-amide bond),
169+
while NOT merging side-chain-only crosslinks (e.g. a disulfide), which touch
170+
neither backbone atom. Only consulted as a fallback when the geometric checks
171+
in :func:`_polymer_linked` fail, so the bond scan runs at most at chain breaks.
154172
"""
155173
if mol.bonds is None or len(mol.bonds) == 0:
156174
return False

0 commit comments

Comments
 (0)