fastdatascience
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/drug_named_entity_recognition/drugs_finder.py‎
Lines changed: 65 additions & 16 deletions b/‎src/drug_named_entity_recognition/drugs_finder.py‎
Lines changed: 65 additions & 16 deletions
diff --git a/‎src/drug_named_entity_recognition/molecular_properties.py‎
Lines changed: 217 additions & 0 deletions b/‎src/drug_named_entity_recognition/molecular_properties.py‎
Lines changed: 217 additions & 0 deletions
@@ -1,6 +1,6 @@
 [project]
 name = "drug-named-entity-recognition"
-version = "2.0.8"
+version = "2.0.9"
 description = "Drug Named Entity Recognition library to find and resolve drug names in a string (drug named entity linking)"
 readme = "README.md"
 keywords = ['drug', 'bio', 'biomedical', 'medical', 'pharma', 'pharmaceutical', 'ner', 'nlp', 'named entity recognition', 'natural language processing', 'named entity linking']
 
@@ -1,4 +1,4 @@
-'''
+"""
 
 MIT License
 
@@ -26,14 +26,17 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 
-'''
+"""
 
 import bz2
 import os
 import pathlib
 import pickle as pkl
 from collections import Counter
 
+from drug_named_entity_recognition.molecular_properties import (
+    get_molecular_weight,
+)
 from drug_named_entity_recognition.omop_api import get_omop_id_from_drug
 from drug_named_entity_recognition.structure_file_downloader import download_structures
 from drug_named_entity_recognition.util import stopwords
@@ -79,7 +82,7 @@ def get_ngrams(text):
     n = 3
     ngrams = set()
     for i in range(0, len(text) - n + 1, 1):
-        ngrams.add(text[i:i + n])
+        ngrams.add(text[i : i + n])
     return ngrams
 
 
@@ -110,7 +113,9 @@ def reset_drugs_data():
             ngram_to_variant[ngram].append(drug_variant)
 
 
-def add_custom_drug_synonym(drug_variant: str, canonical_name: str, optional_variant_data: dict = None):
+def add_custom_drug_synonym(
+    drug_variant: str, canonical_name: str, optional_variant_data: dict = None
+):
     drug_variant = drug_variant.lower()
     canonical_name = canonical_name.lower()
     drug_variant_to_canonical[drug_variant] = [canonical_name]
@@ -168,19 +173,37 @@ def get_fuzzy_match(surface_form: str):
     if len(candidate_to_num_matching_ngrams) > 0:
         top_candidate = max(candidate_to_jaccard, key=candidate_to_jaccard.get)
         jaccard = candidate_to_jaccard[top_candidate]
-        query_ngrams_missing_in_candidate = query_ngrams.difference(variant_to_ngrams[top_candidate])
-        candidate_ngrams_missing_in_query = variant_to_ngrams[top_candidate].difference(query_ngrams)
+        query_ngrams_missing_in_candidate = query_ngrams.difference(
+            variant_to_ngrams[top_candidate]
+        )
+        candidate_ngrams_missing_in_query = variant_to_ngrams[top_candidate].difference(
+            query_ngrams
+        )
 
         candidate_length = len(top_candidate)
         length_diff = abs(query_length - candidate_length)
-        if max([len(query_ngrams_missing_in_candidate), len(candidate_ngrams_missing_in_query)]) <= 3 \
-                and length_diff <= 2:
+        if (
+            max(
+                [
+                    len(query_ngrams_missing_in_candidate),
+                    len(candidate_ngrams_missing_in_query),
+                ]
+            )
+            <= 3
+            and length_diff <= 2
+        ):
             return top_candidate, jaccard
     return None, None
 
 
-def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_include_structure=False,
-               is_use_omop_api=False):
+def find_drugs(
+    tokens: list,
+    is_fuzzy_match=False,
+    is_ignore_case=None,
+    is_include_structure=False,
+    is_use_omop_api=False,
+    use_pub_chem_api=False,
+):
     if is_include_structure and len(dbid_to_mol_lookup) == 0:
         dbid_to_mol_lookup["downloading"] = True
         if not os.path.exists(structures_file):
@@ -211,10 +234,17 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
         match = drug_variant_to_canonical.get(cand_norm, None)
         if match:
             for m in match:
-                match_data = dict(drug_canonical_to_data.get(m, {})) | drug_variant_to_variant_data.get(cand_norm, {})
+                match_data = dict(
+                    drug_canonical_to_data.get(m, {})
+                ) | drug_variant_to_variant_data.get(cand_norm, {})
                 match_data["match_type"] = "exact"
                 match_data["matching_string"] = cand
                 lookup_name = match_data.get("name") or m
+
+                match_data = get_molecular_weight(
+                    match_data, lookup_name, use_pub_chem_api
+                )
+
                 if is_use_omop_api:
                     match_data["omop_id"] = cached_get_omop_id(lookup_name)
                 drug_matches.append((match_data, token_idx, token_idx + 2))
@@ -226,12 +256,18 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
                 if fuzzy_matched_variant is not None:
                     match = drug_variant_to_canonical[fuzzy_matched_variant]
                     for m in match:
-                        match_data = dict(drug_canonical_to_data.get(m, {})) | drug_variant_to_variant_data.get(
-                            fuzzy_matched_variant, {})
+                        match_data = dict(
+                            drug_canonical_to_data.get(m, {})
+                        ) | drug_variant_to_variant_data.get(fuzzy_matched_variant, {})
                         match_data["match_type"] = "fuzzy"
                         match_data["match_similarity"] = similarity
                         match_data["match_variant"] = fuzzy_matched_variant
                         match_data["matching_string"] = cand
+
+                        match_data = get_molecular_weight(
+                            match_data, lookup_name, use_pub_chem_api
+                        )
+
                         if is_use_omop_api:
                             lookup_name = match_data.get("name") or m
                             match_data["omop_id"] = cached_get_omop_id(lookup_name)
@@ -245,10 +281,17 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
         match = drug_variant_to_canonical.get(cand_norm, None)
         if match:
             for m in match:
-                match_data = dict(drug_canonical_to_data.get(m, {})) | drug_variant_to_variant_data.get(cand_norm, {})
+                match_data = dict(
+                    drug_canonical_to_data.get(m, {})
+                ) | drug_variant_to_variant_data.get(cand_norm, {})
                 match_data["match_type"] = "exact"
                 match_data["matching_string"] = token
                 lookup_name = match_data.get("name") or m
+
+                match_data = get_molecular_weight(
+                    match_data, lookup_name, use_pub_chem_api
+                )
+
                 if is_use_omop_api:
                     match_data["omop_id"] = cached_get_omop_id(lookup_name)
                 drug_matches.append((match_data, token_idx, token_idx + 1))
@@ -259,13 +302,19 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
                 if fuzzy_matched_variant is not None:
                     match = drug_variant_to_canonical[fuzzy_matched_variant]
                     for m in match:
-                        match_data = dict(drug_canonical_to_data.get(m, {})) | drug_variant_to_variant_data.get(
-                            fuzzy_matched_variant, {})
+                        match_data = dict(
+                            drug_canonical_to_data.get(m, {})
+                        ) | drug_variant_to_variant_data.get(fuzzy_matched_variant, {})
                         match_data["match_type"] = "fuzzy"
                         match_data["match_similarity"] = similarity
                         match_data["match_variant"] = fuzzy_matched_variant
                         match_data["matching_string"] = token
                         lookup_name = match_data.get("name") or m
+
+                        match_data = get_molecular_weight(
+                            match_data, lookup_name, use_pub_chem_api
+                        )
+
                         if is_use_omop_api:
                             match_data["omop_id"] = cached_get_omop_id(lookup_name)
                         drug_matches.append((match_data, token_idx, token_idx + 1))
 
@@ -0,0 +1,217 @@
+"""
+
+MIT License
+
+Copyright (c) 2023 Fast Data Science Ltd (https://fastdatascience.com)
+
+Maintainer: Thomas Wood
+
+Tutorial at https://fastdatascience.com/drug-named-entity-recognition-python-library/
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+"""
+
+import re
+from typing import Dict, Optional, Tuple, Union
+
+import requests
+
+# * Atomic weights keyed by element symbol (labelled "IUPAC 2023" by the original author).
+# * Read by calculate_molecular_weight, which multiplies each entry by the element count.
+# NOTE(review): several entries (e.g. H 1.00794, C 12.0107, O 15.9994) look like an
+#   older IUPAC table rather than a 2023 revision - confirm the source before relying
+#   on the last decimals.
+# NOTE(review): whole-number entries (e.g. Tc 98.0, Pu 244.0, Og 294.0) are presumably
+#   mass numbers of a representative isotope for elements that have no standard
+#   atomic weight - TODO confirm.
+ATOMIC_WEIGHTS = {
+    "H": 1.00794,
+    "He": 4.002602,
+    "Li": 6.941,
+    "Be": 9.012182,
+    "B": 10.811,
+    "C": 12.0107,
+    "N": 14.0067,
+    "O": 15.9994,
+    "F": 18.9984032,
+    "Ne": 20.1797,
+    "Na": 22.98976928,
+    "Mg": 24.3050,
+    "Al": 26.9815386,
+    "Si": 28.0855,
+    "P": 30.973762,
+    "S": 32.065,
+    "Cl": 35.453,
+    "Ar": 39.948,
+    "K": 39.0983,
+    "Ca": 40.078,
+    "Sc": 44.955912,
+    "Ti": 47.867,
+    "V": 50.9415,
+    "Cr": 51.9961,
+    "Mn": 54.938045,
+    "Fe": 55.845,
+    "Co": 58.933195,
+    "Ni": 58.6934,
+    "Cu": 63.546,
+    "Zn": 65.38,
+    "Ga": 69.723,
+    "Ge": 72.64,
+    "As": 74.92160,
+    "Se": 78.96,
+    "Br": 79.904,
+    "Kr": 83.798,
+    "Rb": 85.4678,
+    "Sr": 87.62,
+    "Y": 88.90585,
+    "Zr": 91.224,
+    "Nb": 92.90638,
+    "Mo": 95.96,
+    "Tc": 98.0,
+    "Ru": 101.07,
+    "Rh": 102.90550,
+    "Pd": 106.42,
+    "Ag": 107.8682,
+    "Cd": 112.411,
+    "In": 114.818,
+    "Sn": 118.710,
+    "Sb": 121.760,
+    "Te": 127.60,
+    "I": 126.90447,
+    "Xe": 131.293,
+    "Cs": 132.9054519,
+    "Ba": 137.327,
+    "La": 138.90547,
+    "Ce": 140.116,
+    "Pr": 140.90765,
+    "Nd": 144.24,
+    "Pm": 145.0,
+    "Sm": 150.36,
+    "Eu": 151.964,
+    "Gd": 157.25,
+    "Tb": 158.92534,
+    "Dy": 162.500,
+    "Ho": 164.93032,
+    "Er": 167.259,
+    "Tm": 168.93421,
+    "Yb": 173.04,
+    "Lu": 174.967,
+    "Hf": 178.49,
+    "Ta": 180.9479,
+    "W": 183.84,
+    "Re": 186.207,
+    "Os": 190.23,
+    "Ir": 192.217,
+    "Pt": 195.084,
+    "Au": 196.966569,
+    "Hg": 200.59,
+    "Tl": 204.3833,
+    "Pb": 207.2,
+    "Bi": 208.98040,
+    "Po": 209.0,
+    "At": 210.0,
+    "Rn": 222.0,
+    "Fr": 223.0,
+    "Ra": 226.0,
+    "Ac": 227.0,
+    "Th": 232.03806,
+    "Pa": 231.03588,
+    "U": 238.02891,
+    "Np": 237.0,
+    "Pu": 244.0,
+    "Am": 243.0,
+    "Cm": 247.0,
+    "Bk": 247.0,
+    "Cf": 251.0,
+    "Es": 252.0,
+    "Fm": 257.0,
+    "Md": 258.0,
+    "No": 259.0,
+    "Lr": 262.0,
+    "Rf": 267.0,
+    "Db": 270.0,
+    "Sg": 271.0,
+    "Bh": 270.0,
+    "Hs": 277.0,
+    "Mt": 278.0,
+    "Ds": 281.0,
+    "Rg": 282.0,
+    "Cn": 285.0,
+    "Fl": 289.0,
+    "Lv": 293.0,
+    "Ts": 294.0,
+    "Og": 294.0,
+}
+
+
+def fetch_pub_chem_properties(
+    drug_name: str,
+) -> Tuple[Optional[float], Optional[str]]:
+    """
+    Fetch MolecularWeight and CanonicalSMILES from the PubChem PUG REST API.
+
+    This is a best-effort lookup: any network, HTTP or payload problem yields
+    (None, None) instead of an exception.
+
+    Args:
+        drug_name: Compound name to look up. It is percent-encoded before being
+            placed in the URL path, so names containing spaces, '/', '?' or '#'
+            do not corrupt the request.
+
+    Returns:
+        A (molecular_weight, smiles) tuple. molecular_weight is a float (or None
+        if absent or not numeric) and smiles is a string (or None if absent).
+        (None, None) is returned when the name is empty, the request fails, the
+        response status is not OK, or the JSON lacks the expected structure.
+    """
+    # Function-scope import keeps this block self-contained.
+    from urllib.parse import quote
+
+    if not drug_name:
+        return None, None
+
+    url = (
+        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
+        f"{quote(str(drug_name), safe='')}"
+        "/property/MolecularWeight,CanonicalSMILES/JSON"
+    )
+    try:
+        response = requests.get(url, timeout=10)
+        if not response.ok:
+            return None, None
+        props = response.json()["PropertyTable"]["Properties"][0]
+        # The API may serialise MolecularWeight as a JSON string; convert it so
+        # the declared float return type holds and callers can do arithmetic.
+        raw_weight = props.get("MolecularWeight")
+        weight = float(raw_weight) if raw_weight is not None else None
+        # NOTE(review): PubChem is believed to now label this property
+        # "ConnectivitySMILES" in responses - accept either key; TODO confirm.
+        smiles = props.get("CanonicalSMILES") or props.get("ConnectivitySMILES")
+        return weight, smiles
+    except (
+        requests.RequestException,  # connection errors, timeouts, bad URLs
+        AttributeError,  # payload is not a dict where one is expected
+        IndexError,  # empty "Properties" list
+        KeyError,  # missing "PropertyTable" / "Properties"
+        TypeError,  # payload nodes of an unexpected type
+        ValueError,  # invalid JSON body or non-numeric weight
+    ):
+        return None, None
+
+
+def calculate_molecular_weight(formula: str) -> float:
+    """
+    Calculate the average molecular weight from a chemical formula string.
+
+    Supported syntax: element symbols with optional integer counts ("C9H8O4"),
+    and bracketed groups with optional multipliers ("Ca(OH)2", "K4[Fe(CN)6]").
+    Whitespace is ignored. Anything else (charges, hydrate dots, lower-case
+    symbols, stray digits) is rejected instead of being silently skipped, so a
+    partially parsed formula can never produce a plausible-looking wrong weight.
+
+    Args:
+        formula: The chemical formula to evaluate.
+
+    Returns:
+        The sum of ATOMIC_WEIGHTS for every atom, rounded to two decimals.
+
+    Raises:
+        TypeError: If formula is not a string.
+        ValueError: If the formula is empty, contains unsupported syntax,
+            has unbalanced brackets, or names an element not in ATOMIC_WEIGHTS.
+    """
+    if not isinstance(formula, str):
+        raise TypeError(f"formula must be a string, got {type(formula).__name__}")
+
+    compact = "".join(formula.split())
+    if not compact:
+        raise ValueError("Empty chemical formula")
+
+    # Groups: (element, count) | (opening bracket) | (closing bracket, multiplier)
+    token_pattern = re.compile(r"([A-Z][a-z]?)(\d*)|([(\[])|([)\]])(\d*)")
+
+    # One running subtotal per open bracket level; index 0 is the whole formula.
+    subtotals = [0.0]
+    position = 0
+    while position < len(compact):
+        token = token_pattern.match(compact, position)
+        if token is None:
+            raise ValueError(
+                f"Unsupported formula syntax at {compact[position:]!r} in {formula!r}"
+            )
+        element, count, opener, closer, multiplier = token.groups()
+        if element:
+            if element not in ATOMIC_WEIGHTS:
+                raise ValueError(f"Unknown element: {element}")
+            subtotals[-1] += ATOMIC_WEIGHTS[element] * (int(count) if count else 1)
+        elif opener:
+            subtotals.append(0.0)
+        else:
+            if len(subtotals) == 1:
+                raise ValueError(f"Unbalanced {closer!r} in formula {formula!r}")
+            group_weight = subtotals.pop()
+            subtotals[-1] += group_weight * (int(multiplier) if multiplier else 1)
+        position = token.end()
+
+    if len(subtotals) != 1:
+        raise ValueError(f"Unclosed bracket in formula {formula!r}")
+    return round(subtotals[0], 2)
+
+
+def get_molecular_weight(
+    match_data: dict, lookup_name: str, use_pub_chem_api: bool = False
+) -> Dict:
+    """
+    Ensure match_data carries a 'molecular_weight' entry when one can be obtained.
+
+    Resolution order:
+      1. An existing 'molecular_weight' value is left untouched.
+      2. If a 'formula' entry is present, the weight is computed from it.
+      3. If that is not possible and use_pub_chem_api is true, PubChem is
+         queried by lookup_name.
+
+    Only 'molecular_weight' is written; the SMILES string returned by PubChem
+    is discarded.
+
+    Args:
+        match_data: Drug match record; modified in place.
+        lookup_name: Drug name used for the PubChem query.
+        use_pub_chem_api: Allow a network lookup when the formula route fails.
+
+    Returns:
+        The same match_data object, for call-site convenience.
+    """
+    if "molecular_weight" in match_data:
+        return match_data
+
+    # * Try formula-based calculation first
+    if "formula" in match_data:
+        try:
+            match_data["molecular_weight"] = calculate_molecular_weight(
+                match_data["formula"]
+            )
+            return match_data
+        except (ValueError, TypeError, OverflowError):
+            # * Formula is missing, malformed or has unknown elements: fall back to API
+            pass
+
+    # * Fetch from PubChem if still missing molecular_weight
+    if use_pub_chem_api:
+        mw, _ = fetch_pub_chem_properties(lookup_name)
+        if mw:
+            try:
+                # The API value may arrive as a string; round() needs a number.
+                match_data["molecular_weight"] = round(float(mw), 2)
+            except (TypeError, ValueError):
+                # Non-numeric payload: leave the weight unset rather than crash.
+                pass
+
+    return match_data