Skip to content

Commit 0ef861c

Browse files
committed
fetching residue SMILES
1 parent 243f0b0 commit 0ef861c

2 files changed

Lines changed: 194 additions & 0 deletions

File tree

moleculekit/rcsb.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
# No redistribution in whole or part
55
#
66
import urllib.request
7+
import urllib.error
8+
import json
79
import logging
810

911
logger = logging.getLogger(__name__)
@@ -123,3 +125,136 @@ def rcsbFindLigands(pdbid: str) -> list:
123125
name = td[0].find_all("a")[0].text.strip()
124126
ligands.append(name)
125127
return ligands
128+
129+
130+
def _getRCSBjson(url, attempts=3, delay=5):
    """Fetch ``url`` and return its body parsed as JSON.

    HTTP error responses (e.g. a 404 for an unknown code) are not retried and
    are raised immediately. Any other failure is retried until ``attempts``
    requests have been made.

    Parameters
    ----------
    url : str
        The URL to request.
    attempts : int
        Maximum number of requests to make. With a value below 1 no request is
        made and a RuntimeError is raised.
    delay : float
        Seconds to sleep between two consecutive attempts.

    Returns
    -------
    data : dict or list
        The decoded JSON payload.

    Raises
    ------
    RuntimeError
        If the server answers with an HTTP error status, or if every attempt
        failed. The underlying exception is attached as ``__cause__``.
    """
    import time

    last_err = None
    for attempt in range(1, attempts + 1):
        try:
            # The context manager closes the response even if reading or JSON
            # decoding raises.
            with urllib.request.urlopen(url) as response:
                return json.loads(response.read())
        except urllib.error.HTTPError as err:
            # 404 etc. — the code is wrong, do not retry
            raise RuntimeError(f"RCSB request failed for {url}: {err}") from err
        except Exception as err:
            last_err = err
            if attempt >= attempts:
                # Nothing left to retry: do not log "retrying" or sleep for nothing.
                break
            logger.warning(
                f"Failed to connect to URL {url} with error {err}. Sleeping {delay}s and retrying."
            )
            time.sleep(delay)
    raise RuntimeError(f"Failed to connect to URL {url}: {last_err}") from last_err
148+
149+
150+
def rcsbFetchLigandInfo(comp_id: str) -> dict:
    """Fetch the full RCSB Chemical Component Dictionary record for a ligand.

    Queries the RCSB data API for a 3-letter chemical component (CCD) code and
    returns the complete record, including identifiers, formula, weight and all
    descriptor variants (InChI plus SMILES from RCSB, CACTVS, OpenEye and ACDLabs).

    Parameters
    ----------
    comp_id : str
        The chemical component (CCD) 3-letter code, e.g. ``"BEN"``. Case-insensitive.

    Returns
    -------
    info : dict
        The parsed JSON record. The curated descriptors live under
        ``info["rcsb_chem_comp_descriptor"]`` (``SMILES``, ``SMILES_stereo``,
        ``InChI``, ``InChIKey``); per-program variants live under
        ``info["pdbx_chem_comp_descriptor"]``.

    Raises
    ------
    RuntimeError
        If ``comp_id`` is empty after stripping whitespace, or if the request
        fails (e.g. the code is unknown to RCSB).

    Examples
    --------
    >>> info = rcsbFetchLigandInfo('BEN')
    >>> info['rcsb_chem_comp_descriptor']['comp_id']
    'BEN'
    """
    from urllib.parse import quote

    comp_id = comp_id.strip().upper()
    if not comp_id:
        # Fail fast: an empty code would build a URL with no component in it.
        raise RuntimeError("RCSB chemical component code must not be empty")
    # Percent-encode the code so stray characters (spaces, slashes, ...) cannot
    # produce a malformed URL or change the request path.
    url = f"https://data.rcsb.org/rest/v1/core/chemcomp/{quote(comp_id, safe='')}"
    return _getRCSBjson(url)
179+
180+
181+
def rcsbFetchLigandSmiles(
    comp_id: str, stereo: bool = True, program: str = "OpenEye"
) -> str:
    """Fetch a SMILES string for a ligand by its RCSB CCD code.

    Thin wrapper over :func:`rcsbFetchLigandInfo`. RCSB stores SMILES computed by
    several toolkits (OpenEye, CACTVS, ACDLabs). By default this returns the
    OpenEye descriptor, which RCSB also curates into its top-level
    ``rcsb_chem_comp_descriptor`` block. Pass ``program`` to pick a different
    toolkit; for full control read ``pdbx_chem_comp_descriptor`` off
    :func:`rcsbFetchLigandInfo` directly.

    Parameters
    ----------
    comp_id : str
        The chemical component (CCD) 3-letter code, e.g. ``"BEN"``. Case-insensitive.
    stereo : bool
        If True (default) return the isomeric SMILES (stereochemistry included);
        if False return the plain SMILES. RCSB labels the isomeric variant
        ``SMILES_CANONICAL``. Falls back to the other variant when the preferred
        one is absent for the chosen program.
    program : str
        Which toolkit's descriptor to return. ``"OpenEye"`` (default) uses RCSB's
        curated descriptor. Other typical values are ``"CACTVS"`` and
        ``"ACDLabs"``. Matched case-insensitively as a substring of the program
        name reported by RCSB; raises if the component has no SMILES from a
        matching program.

    Returns
    -------
    smiles : str
        The SMILES string.

    Raises
    ------
    RuntimeError
        If the record cannot be fetched, if no SMILES row matches ``program``,
        or if the matching rows carry no usable descriptor.

    Examples
    --------
    >>> rcsbFetchLigandSmiles('BEN', stereo=False)
    '[H]N=C(c1ccccc1)N'
    >>> rcsbFetchLigandSmiles('BEN', program='CACTVS')
    'NC(=N)c1ccccc1'
    """
    # Normalize the inputs before going to the network.
    code = comp_id.strip().upper()
    want = program.strip().lower()
    info = rcsbFetchLigandInfo(comp_id)

    # OpenEye is the default, and is what RCSB curates into the top-level
    # ``rcsb_chem_comp_descriptor`` block — use it directly when available.
    if want in ("openeye", "openeye oetoolkits", "oe"):
        # ``or {}`` also covers a block that is present but null in the JSON.
        desc = info.get("rcsb_chem_comp_descriptor") or {}
        primary, secondary = (
            ("SMILES_stereo", "SMILES") if stereo else ("SMILES", "SMILES_stereo")
        )
        smiles = desc.get(primary) or desc.get(secondary)
        if smiles:
            return smiles
        # else fall through to the per-program rows below

    # Per-program descriptors (CACTVS, ACDLabs, or an OpenEye fallback).
    # ``or []`` keeps a null list from raising TypeError during iteration.
    rows = [
        r
        for r in (info.get("pdbx_chem_comp_descriptor") or [])
        if "SMILES" in (r.get("type") or "")
    ]
    available = sorted({r.get("program") for r in rows if r.get("program")})
    matches = [r for r in rows if want and want in (r.get("program") or "").lower()]
    if not matches:
        raise RuntimeError(
            f"RCSB has no SMILES for component '{code}' from program '{program}'. "
            f"Available programs: {available}"
        )
    # ``SMILES_CANONICAL`` is the isomeric (stereo-bearing) variant.
    primary, secondary = (
        ("SMILES_CANONICAL", "SMILES") if stereo else ("SMILES", "SMILES_CANONICAL")
    )
    by_type = {r.get("type"): r.get("descriptor") for r in matches}
    smiles = by_type.get(primary) or by_type.get(secondary)
    if not smiles:
        raise RuntimeError(
            f"RCSB returned no SMILES descriptor for component '{code}' from program '{program}'"
        )
    return smiles

tests/test_rcsb.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import pytest
2+
from moleculekit.rcsb import rcsbFetchLigandInfo, rcsbFetchLigandSmiles
3+
4+
5+
def test_fetch_ligand_info_returns_record():
    """The BEN record is a dict exposing both descriptor blocks."""
    curated_key = "rcsb_chem_comp_descriptor"
    record = rcsbFetchLigandInfo("BEN")
    assert isinstance(record, dict)
    assert curated_key in record
    assert record[curated_key]["comp_id"] == "BEN"
    # the full record exposes per-program SMILES variants too
    assert "pdbx_chem_comp_descriptor" in record
12+
13+
14+
def test_fetch_ligand_smiles_stereo_default():
    """The default call returns a non-empty string containing the phenyl ring."""
    # benzamidine, stereo SMILES carries the /N=C(\...)/ double-bond geometry
    smiles = rcsbFetchLigandSmiles("BEN")
    assert isinstance(smiles, str)
    assert len(smiles) > 0
    assert "c1ccccc1" in smiles
19+
20+
21+
def test_fetch_ligand_smiles_non_stereo():
    """``stereo=False`` returns the plain SMILES for BEN."""
    expected = "[H]N=C(c1ccccc1)N"
    plain = rcsbFetchLigandSmiles("BEN", stereo=False)
    assert plain == expected
24+
25+
26+
def test_fetch_ligand_info_unknown_code_raises():
    """Requesting the code ``ZZZZ`` raises RuntimeError."""
    pytest.raises(RuntimeError, rcsbFetchLigandInfo, "ZZZZ")
29+
30+
31+
def test_lowercase_code_is_accepted():
    """A lower-case component code still yields the benzamidine SMILES."""
    phenyl_ring = "c1ccccc1"
    smiles = rcsbFetchLigandSmiles("ben")
    assert phenyl_ring in smiles
34+
35+
36+
def test_default_program_is_openeye():
    """Passing ``program="OpenEye"`` gives the same result as the default call."""
    # explicit OpenEye matches the default (curated rcsb_chem_comp_descriptor)
    explicit = rcsbFetchLigandSmiles("BEN", program="OpenEye")
    implicit = rcsbFetchLigandSmiles("BEN")
    assert explicit == implicit
39+
40+
41+
def test_program_cactvs():
    """``program="CACTVS"`` returns the CACTVS SMILES, which differs from the default."""
    # Fetch the CACTVS descriptor once and reuse it, instead of repeating the
    # same network request for the second assertion.
    cactvs = rcsbFetchLigandSmiles("BEN", program="CACTVS")
    assert cactvs == "NC(=N)c1ccccc1"
    # CACTVS differs from the OpenEye default for this ligand
    assert cactvs != rcsbFetchLigandSmiles("BEN")
45+
46+
47+
def test_program_match_is_case_insensitive():
    """A lower-case program name selects the same CACTVS descriptor."""
    smiles = rcsbFetchLigandSmiles("BEN", program="cactvs")
    assert smiles == "NC(=N)c1ccccc1"
49+
50+
51+
def test_program_falls_back_to_other_type_when_canonical_absent():
    """A default (stereo) request for ACDLabs returns its plain SMILES."""
    # ACDLabs only provides a plain SMILES row for BEN (no SMILES_CANONICAL),
    # so a stereo request falls back to it rather than raising.
    expected = "[N@H]=C(N)c1ccccc1"
    smiles = rcsbFetchLigandSmiles("BEN", program="ACDLabs")
    assert smiles == expected
55+
56+
57+
def test_unknown_program_raises():
    """A program name that matches no descriptor row raises RuntimeError."""
    pytest.raises(RuntimeError, rcsbFetchLigandSmiles, "BEN", program="Nonesuch")

0 commit comments

Comments
 (0)