Skip to content

Commit 2645583

Browse files
authored
Merge pull request #22 from abdullahwaqar/feature/get-molecular-weight
added molecular weight calculation and fallback api Thanks so much @abdullahwaqar!
2 parents 9df6974 + 5001c9f commit 2645583

5 files changed

Lines changed: 343 additions & 17 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "drug-named-entity-recognition"
3-
version = "2.0.8"
3+
version = "2.0.9"
44
description = "Drug Named Entity Recognition library to find and resolve drug names in a string (drug named entity linking)"
55
readme = "README.md"
66
keywords = ['drug', 'bio', 'biomedical', 'medical', 'pharma', 'pharmaceutical', 'ner', 'nlp', 'named entity recognition', 'natural language processing', 'named entity linking']

src/drug_named_entity_recognition/drugs_finder.py

Lines changed: 65 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
'''
1+
"""
22
33
MIT License
44
@@ -26,14 +26,17 @@
2626
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2727
SOFTWARE.
2828
29-
'''
29+
"""
3030

3131
import bz2
3232
import os
3333
import pathlib
3434
import pickle as pkl
3535
from collections import Counter
3636

37+
from drug_named_entity_recognition.molecular_properties import (
38+
get_molecular_weight,
39+
)
3740
from drug_named_entity_recognition.omop_api import get_omop_id_from_drug
3841
from drug_named_entity_recognition.structure_file_downloader import download_structures
3942
from drug_named_entity_recognition.util import stopwords
@@ -79,7 +82,7 @@ def get_ngrams(text):
7982
n = 3
8083
ngrams = set()
8184
for i in range(0, len(text) - n + 1, 1):
82-
ngrams.add(text[i:i + n])
85+
ngrams.add(text[i : i + n])
8386
return ngrams
8487

8588

@@ -110,7 +113,9 @@ def reset_drugs_data():
110113
ngram_to_variant[ngram].append(drug_variant)
111114

112115

113-
def add_custom_drug_synonym(drug_variant: str, canonical_name: str, optional_variant_data: dict = None):
116+
def add_custom_drug_synonym(
117+
drug_variant: str, canonical_name: str, optional_variant_data: dict = None
118+
):
114119
drug_variant = drug_variant.lower()
115120
canonical_name = canonical_name.lower()
116121
drug_variant_to_canonical[drug_variant] = [canonical_name]
@@ -168,19 +173,37 @@ def get_fuzzy_match(surface_form: str):
168173
if len(candidate_to_num_matching_ngrams) > 0:
169174
top_candidate = max(candidate_to_jaccard, key=candidate_to_jaccard.get)
170175
jaccard = candidate_to_jaccard[top_candidate]
171-
query_ngrams_missing_in_candidate = query_ngrams.difference(variant_to_ngrams[top_candidate])
172-
candidate_ngrams_missing_in_query = variant_to_ngrams[top_candidate].difference(query_ngrams)
176+
query_ngrams_missing_in_candidate = query_ngrams.difference(
177+
variant_to_ngrams[top_candidate]
178+
)
179+
candidate_ngrams_missing_in_query = variant_to_ngrams[top_candidate].difference(
180+
query_ngrams
181+
)
173182

174183
candidate_length = len(top_candidate)
175184
length_diff = abs(query_length - candidate_length)
176-
if max([len(query_ngrams_missing_in_candidate), len(candidate_ngrams_missing_in_query)]) <= 3 \
177-
and length_diff <= 2:
185+
if (
186+
max(
187+
[
188+
len(query_ngrams_missing_in_candidate),
189+
len(candidate_ngrams_missing_in_query),
190+
]
191+
)
192+
<= 3
193+
and length_diff <= 2
194+
):
178195
return top_candidate, jaccard
179196
return None, None
180197

181198

182-
def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_include_structure=False,
183-
is_use_omop_api=False):
199+
def find_drugs(
200+
tokens: list,
201+
is_fuzzy_match=False,
202+
is_ignore_case=None,
203+
is_include_structure=False,
204+
is_use_omop_api=False,
205+
use_pub_chem_api=False,
206+
):
184207
if is_include_structure and len(dbid_to_mol_lookup) == 0:
185208
dbid_to_mol_lookup["downloading"] = True
186209
if not os.path.exists(structures_file):
@@ -211,10 +234,17 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
211234
match = drug_variant_to_canonical.get(cand_norm, None)
212235
if match:
213236
for m in match:
214-
match_data = dict(drug_canonical_to_data.get(m, {})) | drug_variant_to_variant_data.get(cand_norm, {})
237+
match_data = dict(
238+
drug_canonical_to_data.get(m, {})
239+
) | drug_variant_to_variant_data.get(cand_norm, {})
215240
match_data["match_type"] = "exact"
216241
match_data["matching_string"] = cand
217242
lookup_name = match_data.get("name") or m
243+
244+
match_data = get_molecular_weight(
245+
match_data, lookup_name, use_pub_chem_api
246+
)
247+
218248
if is_use_omop_api:
219249
match_data["omop_id"] = cached_get_omop_id(lookup_name)
220250
drug_matches.append((match_data, token_idx, token_idx + 2))
@@ -226,12 +256,18 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
226256
if fuzzy_matched_variant is not None:
227257
match = drug_variant_to_canonical[fuzzy_matched_variant]
228258
for m in match:
229-
match_data = dict(drug_canonical_to_data.get(m, {})) | drug_variant_to_variant_data.get(
230-
fuzzy_matched_variant, {})
259+
match_data = dict(
260+
drug_canonical_to_data.get(m, {})
261+
) | drug_variant_to_variant_data.get(fuzzy_matched_variant, {})
231262
match_data["match_type"] = "fuzzy"
232263
match_data["match_similarity"] = similarity
233264
match_data["match_variant"] = fuzzy_matched_variant
234265
match_data["matching_string"] = cand
266+
267+
match_data = get_molecular_weight(
268+
match_data, lookup_name, use_pub_chem_api
269+
)
270+
235271
if is_use_omop_api:
236272
lookup_name = match_data.get("name") or m
237273
match_data["omop_id"] = cached_get_omop_id(lookup_name)
@@ -245,10 +281,17 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
245281
match = drug_variant_to_canonical.get(cand_norm, None)
246282
if match:
247283
for m in match:
248-
match_data = dict(drug_canonical_to_data.get(m, {})) | drug_variant_to_variant_data.get(cand_norm, {})
284+
match_data = dict(
285+
drug_canonical_to_data.get(m, {})
286+
) | drug_variant_to_variant_data.get(cand_norm, {})
249287
match_data["match_type"] = "exact"
250288
match_data["matching_string"] = token
251289
lookup_name = match_data.get("name") or m
290+
291+
match_data = get_molecular_weight(
292+
match_data, lookup_name, use_pub_chem_api
293+
)
294+
252295
if is_use_omop_api:
253296
match_data["omop_id"] = cached_get_omop_id(lookup_name)
254297
drug_matches.append((match_data, token_idx, token_idx + 1))
@@ -259,13 +302,19 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
259302
if fuzzy_matched_variant is not None:
260303
match = drug_variant_to_canonical[fuzzy_matched_variant]
261304
for m in match:
262-
match_data = dict(drug_canonical_to_data.get(m, {})) | drug_variant_to_variant_data.get(
263-
fuzzy_matched_variant, {})
305+
match_data = dict(
306+
drug_canonical_to_data.get(m, {})
307+
) | drug_variant_to_variant_data.get(fuzzy_matched_variant, {})
264308
match_data["match_type"] = "fuzzy"
265309
match_data["match_similarity"] = similarity
266310
match_data["match_variant"] = fuzzy_matched_variant
267311
match_data["matching_string"] = token
268312
lookup_name = match_data.get("name") or m
313+
314+
match_data = get_molecular_weight(
315+
match_data, lookup_name, use_pub_chem_api
316+
)
317+
269318
if is_use_omop_api:
270319
match_data["omop_id"] = cached_get_omop_id(lookup_name)
271320
drug_matches.append((match_data, token_idx, token_idx + 1))
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
"""
2+
3+
MIT License
4+
5+
Copyright (c) 2023 Fast Data Science Ltd (https://fastdatascience.com)
6+
7+
Maintainer: Thomas Wood
8+
9+
Tutorial at https://fastdatascience.com/drug-named-entity-recognition-python-library/
10+
11+
Permission is hereby granted, free of charge, to any person obtaining a copy
12+
of this software and associated documentation files (the "Software"), to deal
13+
in the Software without restriction, including without limitation the rights
14+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15+
copies of the Software, and to permit persons to whom the Software is
16+
furnished to do so, subject to the following conditions:
17+
18+
The above copyright notice and this permission notice shall be included in all
19+
copies or substantial portions of the Software.
20+
21+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27+
SOFTWARE.
28+
29+
"""
30+
31+
import re
32+
from typing import Dict, Optional, Tuple, Union
33+
34+
import requests
35+
36+
# * IUPAC 2023 atomic weights for all elements
37+
ATOMIC_WEIGHTS = {
38+
"H": 1.00794,
39+
"He": 4.002602,
40+
"Li": 6.941,
41+
"Be": 9.012182,
42+
"B": 10.811,
43+
"C": 12.0107,
44+
"N": 14.0067,
45+
"O": 15.9994,
46+
"F": 18.9984032,
47+
"Ne": 20.1797,
48+
"Na": 22.98976928,
49+
"Mg": 24.3050,
50+
"Al": 26.9815386,
51+
"Si": 28.0855,
52+
"P": 30.973762,
53+
"S": 32.065,
54+
"Cl": 35.453,
55+
"Ar": 39.948,
56+
"K": 39.0983,
57+
"Ca": 40.078,
58+
"Sc": 44.955912,
59+
"Ti": 47.867,
60+
"V": 50.9415,
61+
"Cr": 51.9961,
62+
"Mn": 54.938045,
63+
"Fe": 55.845,
64+
"Co": 58.933195,
65+
"Ni": 58.6934,
66+
"Cu": 63.546,
67+
"Zn": 65.38,
68+
"Ga": 69.723,
69+
"Ge": 72.64,
70+
"As": 74.92160,
71+
"Se": 78.96,
72+
"Br": 79.904,
73+
"Kr": 83.798,
74+
"Rb": 85.4678,
75+
"Sr": 87.62,
76+
"Y": 88.90585,
77+
"Zr": 91.224,
78+
"Nb": 92.90638,
79+
"Mo": 95.96,
80+
"Tc": 98.0,
81+
"Ru": 101.07,
82+
"Rh": 102.90550,
83+
"Pd": 106.42,
84+
"Ag": 107.8682,
85+
"Cd": 112.411,
86+
"In": 114.818,
87+
"Sn": 118.710,
88+
"Sb": 121.760,
89+
"Te": 127.60,
90+
"I": 126.90447,
91+
"Xe": 131.293,
92+
"Cs": 132.9054519,
93+
"Ba": 137.327,
94+
"La": 138.90547,
95+
"Ce": 140.116,
96+
"Pr": 140.90765,
97+
"Nd": 144.24,
98+
"Pm": 145.0,
99+
"Sm": 150.36,
100+
"Eu": 151.964,
101+
"Gd": 157.25,
102+
"Tb": 158.92534,
103+
"Dy": 162.500,
104+
"Ho": 164.93032,
105+
"Er": 167.259,
106+
"Tm": 168.93421,
107+
"Yb": 173.04,
108+
"Lu": 174.967,
109+
"Hf": 178.49,
110+
"Ta": 180.9479,
111+
"W": 183.84,
112+
"Re": 186.207,
113+
"Os": 190.23,
114+
"Ir": 192.217,
115+
"Pt": 195.084,
116+
"Au": 196.966569,
117+
"Hg": 200.59,
118+
"Tl": 204.3833,
119+
"Pb": 207.2,
120+
"Bi": 208.98040,
121+
"Po": 209.0,
122+
"At": 210.0,
123+
"Rn": 222.0,
124+
"Fr": 223.0,
125+
"Ra": 226.0,
126+
"Ac": 227.0,
127+
"Th": 232.03806,
128+
"Pa": 231.03588,
129+
"U": 238.02891,
130+
"Np": 237.0,
131+
"Pu": 244.0,
132+
"Am": 243.0,
133+
"Cm": 247.0,
134+
"Bk": 247.0,
135+
"Cf": 251.0,
136+
"Es": 252.0,
137+
"Fm": 257.0,
138+
"Md": 258.0,
139+
"No": 259.0,
140+
"Lr": 262.0,
141+
"Rf": 267.0,
142+
"Db": 270.0,
143+
"Sg": 271.0,
144+
"Bh": 270.0,
145+
"Hs": 277.0,
146+
"Mt": 278.0,
147+
"Ds": 281.0,
148+
"Rg": 282.0,
149+
"Cn": 285.0,
150+
"Fl": 289.0,
151+
"Lv": 293.0,
152+
"Ts": 294.0,
153+
"Og": 294.0,
154+
}
155+
156+
157+
def fetch_pub_chem_properties(
    drug_name: str,
) -> Union[Tuple[Optional[float], Optional[str]], Tuple[None, None]]:
    """
    Fetch MolecularWeight and CanonicalSMILES from the PubChem PUG REST API.

    Args:
        drug_name: Compound name to look up. It is percent-encoded before being
            placed in the URL path, so names containing spaces, slashes or other
            reserved characters do not corrupt the request.

    Returns:
        A ``(molecular_weight, smiles)`` tuple. ``molecular_weight`` is a float
        (or None if PubChem returned no usable number) and ``smiles`` is a
        string (or None). ``(None, None)`` is returned when the name is empty,
        the request fails, or the response does not have the expected shape.

    This is a best-effort lookup: network and parsing failures are reported as
    ``(None, None)`` instead of being raised.
    """
    # Local import keeps this block self-contained; only the stdlib is needed.
    from urllib.parse import quote

    name = drug_name.strip() if isinstance(drug_name, str) else ""
    if not name:
        # Nothing to look up; avoid a pointless network round trip.
        return None, None

    url = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
        f"{quote(name, safe='')}"
        "/property/MolecularWeight,CanonicalSMILES/JSON"
    )
    try:
        response = requests.get(url, timeout=10)
        if not response.ok:
            return None, None
        props = response.json()["PropertyTable"]["Properties"][0]
        raw_weight = props.get("MolecularWeight")
        # NOTE(review): newer PubChem responses may report this property under
        # the key "ConnectivitySMILES" — confirm against the PUG REST docs.
        smiles = props.get("CanonicalSMILES") or props.get("ConnectivitySMILES")
    except (
        requests.RequestException,  # connection problems, timeouts, bad URLs
        ValueError,  # body is not valid JSON
        KeyError,
        IndexError,
        TypeError,
        AttributeError,  # JSON did not have the expected structure
    ):
        return None, None

    # The declared contract is a float, but the API value may arrive as text;
    # convert here so callers can safely do arithmetic on it.
    try:
        weight = float(raw_weight) if raw_weight is not None else None
    except (TypeError, ValueError):
        weight = None
    return weight, smiles
176+
177+
178+
def calculate_molecular_weight(formula: str) -> float:
    """
    Calculate the average molecular weight from a chemical formula string.

    Supported syntax:
      * element symbols with optional counts, e.g. ``C9H8O4``;
      * groups in ``()`` or ``[]`` with an optional multiplier, e.g.
        ``Ca(OH)2``;
      * parts joined by ``.`` or ``·`` with an optional leading coefficient,
        e.g. ``CuSO4.5H2O``.
    Whitespace is ignored.

    Args:
        formula: The chemical formula to evaluate.

    Returns:
        The molecular weight rounded to two decimals.

    Raises:
        TypeError: If ``formula`` is not a string.
        ValueError: If the formula is empty, contains an unknown element,
            has unbalanced brackets, or contains characters that cannot be
            interpreted. Failing loudly avoids silently returning a wrong
            (or zero) weight for input the parser does not understand.
    """
    if not isinstance(formula, str):
        raise TypeError(f"formula must be a string, got {type(formula).__name__}")

    compact = "".join(formula.split())
    if not compact:
        raise ValueError("Empty formula")

    # One token is an element with count, an opening bracket, or a closing
    # bracket with its multiplier.
    token_pattern = re.compile(r"([A-Z][a-z]?)(\d*)|([(\[])|([)\]])(\d*)")
    matching_opener = {")": "(", "]": "["}

    weight = 0.0
    for part in re.split(r"[.·]", compact):
        # A leading integer multiplies the whole part (e.g. the 5 in "5H2O").
        leading = re.match(r"\d+", part)
        multiplier = int(leading.group()) if leading else 1
        position = leading.end() if leading else 0
        if position == len(part):
            raise ValueError(f"Missing elements in formula: {formula}")

        # totals[-1] accumulates the innermost open group; totals[0] is the part.
        totals = [0.0]
        openers = []
        while position < len(part):
            token = token_pattern.match(part, position)
            if token is None:
                raise ValueError(
                    f"Unexpected character {part[position]!r} in formula: {formula}"
                )
            element, count, opener, closer, group_count = token.groups()
            if element:
                if element not in ATOMIC_WEIGHTS:
                    raise ValueError(f"Unknown element: {element}")
                totals[-1] += ATOMIC_WEIGHTS[element] * (int(count) if count else 1)
            elif opener:
                openers.append(opener)
                totals.append(0.0)
            else:
                if not openers or openers.pop() != matching_opener[closer]:
                    raise ValueError(f"Unbalanced brackets in formula: {formula}")
                group_total = totals.pop()
                totals[-1] += group_total * (int(group_count) if group_count else 1)
            position = token.end()

        if openers:
            raise ValueError(f"Unbalanced brackets in formula: {formula}")
        weight += multiplier * totals[0]

    return round(weight, 2)
191+
192+
193+
def get_molecular_weight(
    match_data: dict, lookup_name: str, use_pub_chem_api=False
) -> Dict:
    """
    Add a 'molecular_weight' entry to match_data when one can be determined.

    The weight is taken from, in order of preference:
      1. an existing ``match_data["molecular_weight"]`` (left untouched);
      2. a calculation from ``match_data["formula"]``;
      3. the PubChem API, only when ``use_pub_chem_api`` is true and
         ``lookup_name`` is non-empty.

    Only 'molecular_weight' is written; the SMILES string returned by the
    PubChem lookup is not stored.

    Args:
        match_data: Drug match dictionary; modified in place.
        lookup_name: Drug name used for the PubChem lookup.
        use_pub_chem_api: Whether the network fallback is allowed.

    Returns:
        The same ``match_data`` object, for call-site convenience.
    """
    if "molecular_weight" in match_data:
        return match_data

    # * Try formula-based calculation first
    formula = match_data.get("formula")
    if formula:
        try:
            match_data["molecular_weight"] = calculate_molecular_weight(formula)
        except (ValueError, TypeError):
            # * Formula is invalid or uses unknown elements: fall back to API.
            pass

    # * Fetch from PubChem if still missing molecular_weight
    if "molecular_weight" not in match_data and use_pub_chem_api and lookup_name:
        mw, _ = fetch_pub_chem_properties(lookup_name)
        # The API value may be numeric text; coerce before rounding so a
        # string does not raise TypeError in round().
        try:
            weight = float(mw) if mw is not None else None
        except (TypeError, ValueError):
            weight = None
        if weight:
            match_data["molecular_weight"] = round(weight, 2)

    return match_data

0 commit comments

Comments
 (0)