|
4 | 4 | # No redistribution in whole or part |
5 | 5 | # |
6 | 6 | import urllib.request |
| 7 | +import urllib.error |
| 8 | +import json |
7 | 9 | import logging |
8 | 10 |
|
9 | 11 | logger = logging.getLogger(__name__) |
@@ -123,3 +125,136 @@ def rcsbFindLigands(pdbid: str) -> list: |
123 | 125 | name = td[0].find_all("a")[0].text.strip() |
124 | 126 | ligands.append(name) |
125 | 127 | return ligands |
| 128 | + |
| 129 | + |
def _getRCSBjson(url, attempts=3, delay=5, timeout=30):
    """Download ``url`` and return its body parsed as JSON, retrying on failure.

    Parameters
    ----------
    url : str
        The URL to request.
    attempts : int
        Maximum number of requests to make before giving up.
    delay : float
        Seconds to wait between two consecutive attempts.
    timeout : float
        Timeout in seconds handed to ``urllib.request.urlopen`` so that a
        stalled connection cannot block forever.

    Returns
    -------
    data : object
        The decoded JSON document.

    Raises
    ------
    RuntimeError
        Immediately for HTTP client errors (4xx other than 429), since
        repeating the same request cannot succeed; otherwise once all
        attempts have been used up.
    """
    import time

    last_err = None
    for attempt in range(1, attempts + 1):
        try:
            # The context manager closes the response even if read/parse fails.
            with urllib.request.urlopen(url, timeout=timeout) as response:
                return json.loads(response.read())
        except urllib.error.HTTPError as err:
            # 404 etc. mean the request itself is wrong, so do not retry.
            # 5xx and 429 are server-side/transient conditions and are retried.
            if err.code < 500 and err.code != 429:
                raise RuntimeError(f"RCSB request failed for {url}: {err}") from err
            last_err = err
        except Exception as err:
            # Deliberately broad: connection errors, truncated reads and
            # undecodable bodies are all treated as retryable.
            last_err = err

        # Only log and sleep when another attempt is actually going to happen.
        if attempt < attempts:
            logger.warning(
                "Failed to connect to URL %s with error %s. Sleeping %ss and retrying.",
                url,
                last_err,
                delay,
            )
            if delay > 0:
                time.sleep(delay)
    raise RuntimeError(f"Failed to connect to URL {url}: {last_err}") from last_err
| 148 | + |
| 149 | + |
def rcsbFetchLigandInfo(comp_id: str) -> dict:
    """Fetch the full RCSB Chemical Component Dictionary record for a ligand.

    Queries the RCSB data API for a chemical component (CCD) code and returns
    the complete record, including identifiers, formula, weight and all
    descriptor variants (InChI plus SMILES from RCSB, CACTVS, OpenEye and ACDLabs).

    Parameters
    ----------
    comp_id : str
        The chemical component (CCD) code, typically three characters,
        e.g. ``"BEN"``. Case-insensitive; surrounding whitespace is ignored.

    Returns
    -------
    info : dict
        The parsed JSON record. The curated descriptors live under
        ``info["rcsb_chem_comp_descriptor"]`` (``SMILES``, ``SMILES_stereo``,
        ``InChI``, ``InChIKey``); per-program variants live under
        ``info["pdbx_chem_comp_descriptor"]``.

    Raises
    ------
    RuntimeError
        If ``comp_id`` is blank, or if the request made through
        ``_getRCSBjson`` fails.

    Examples
    --------
    >>> info = rcsbFetchLigandInfo('BEN')
    >>> info['rcsb_chem_comp_descriptor']['comp_id']
    'BEN'
    """
    from urllib.parse import quote

    comp_id = comp_id.strip().upper()
    if not comp_id:
        # RuntimeError (not ValueError) so handlers written around the failed
        # request that a blank code used to produce keep working.
        raise RuntimeError("Cannot query RCSB: empty chemical component (CCD) code")
    # Percent-encode the code so characters such as "/", "?" or spaces cannot
    # alter the endpoint being requested.
    url = f"https://data.rcsb.org/rest/v1/core/chemcomp/{quote(comp_id, safe='')}"
    return _getRCSBjson(url)
| 179 | + |
| 180 | + |
def rcsbFetchLigandSmiles(
    comp_id: str,
    stereo: bool = True,
    program: str = "OpenEye",
    *,
    info: "dict | None" = None,
) -> str:
    """Fetch a SMILES string for a ligand by its RCSB CCD code.

    Thin wrapper over :func:`rcsbFetchLigandInfo`. RCSB stores SMILES computed by
    several toolkits (OpenEye, CACTVS, ACDLabs). By default this returns the
    OpenEye descriptor, which RCSB also curates into its top-level
    ``rcsb_chem_comp_descriptor`` block. Pass ``program`` to pick a different
    toolkit; for full control read ``pdbx_chem_comp_descriptor`` off
    :func:`rcsbFetchLigandInfo` directly.

    Parameters
    ----------
    comp_id : str
        The chemical component (CCD) code, typically three characters,
        e.g. ``"BEN"``. Case-insensitive.
    stereo : bool
        If True (default) return the isomeric SMILES (stereochemistry included);
        if False return the plain SMILES. RCSB labels the isomeric variant
        ``SMILES_CANONICAL``. Falls back to the other variant when the preferred
        one is absent for the chosen program.
    program : str
        Which toolkit's descriptor to return. ``"OpenEye"`` (default) uses RCSB's
        curated descriptor. Other typical values are ``"CACTVS"`` and
        ``"ACDLabs"``. Matched case-insensitively as a substring of the program
        name reported by RCSB; raises if the component has no SMILES from a
        matching program.
    info : dict, optional
        A record previously returned by :func:`rcsbFetchLigandInfo` for this
        component. When given, no request is made, so several variants can be
        read from a single download.

    Returns
    -------
    smiles : str
        The SMILES string.

    Raises
    ------
    RuntimeError
        If the record holds no SMILES from a program matching ``program``.

    Examples
    --------
    >>> rcsbFetchLigandSmiles('BEN', stereo=False)
    '[H]N=C(c1ccccc1)N'
    >>> rcsbFetchLigandSmiles('BEN', program='CACTVS')
    'NC(=N)c1ccccc1'
    """
    if info is None:
        info = rcsbFetchLigandInfo(comp_id)
    code = comp_id.strip().upper()
    want = program.strip().lower()

    # OpenEye is the default, and is what RCSB curates into the top-level
    # ``rcsb_chem_comp_descriptor`` block, so use that block directly.
    if want in ("openeye", "openeye oetoolkits", "oe"):
        # ``or {}`` covers both a missing key and an explicit JSON null.
        desc = info.get("rcsb_chem_comp_descriptor") or {}
        primary, secondary = (
            ("SMILES_stereo", "SMILES") if stereo else ("SMILES", "SMILES_stereo")
        )
        smiles = desc.get(primary) or desc.get(secondary)
        if smiles:
            return smiles
        # else fall through to the per-program rows below

    # Per-program descriptors (CACTVS, ACDLabs, or an OpenEye fallback).
    rows = [
        r
        for r in info.get("pdbx_chem_comp_descriptor") or []
        if "SMILES" in (r.get("type") or "")
    ]
    available = sorted({r.get("program") for r in rows if r.get("program")})
    matches = [r for r in rows if want and want in (r.get("program") or "").lower()]
    if not matches:
        raise RuntimeError(
            f"RCSB has no SMILES for component '{code}' from program '{program}'. "
            f"Available programs: {available}"
        )
    # ``SMILES_CANONICAL`` is the isomeric (stereo-bearing) variant.
    primary, secondary = (
        ("SMILES_CANONICAL", "SMILES") if stereo else ("SMILES", "SMILES_CANONICAL")
    )
    # Skip rows with an empty descriptor so they cannot overwrite a usable
    # descriptor of the same type.
    by_type = {
        r.get("type"): r.get("descriptor") for r in matches if r.get("descriptor")
    }
    smiles = by_type.get(primary) or by_type.get(secondary)
    if not smiles:
        raise RuntimeError(
            f"RCSB returned no SMILES descriptor for component '{code}' from program '{program}'"
        )
    return smiles
0 commit comments