1- '''
1+ """
22
33MIT License
44
2626OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2727SOFTWARE.
2828
29- '''
29+ """
3030
3131import bz2
3232import os
3333import pathlib
3434import pickle as pkl
3535from collections import Counter
3636
37+ from drug_named_entity_recognition .molecular_properties import (
38+ get_molecular_weight ,
39+ )
3740from drug_named_entity_recognition .omop_api import get_omop_id_from_drug
3841from drug_named_entity_recognition .structure_file_downloader import download_structures
3942from drug_named_entity_recognition .util import stopwords
@@ -79,7 +82,7 @@ def get_ngrams(text):
7982 n = 3
8083 ngrams = set ()
8184 for i in range (0 , len (text ) - n + 1 , 1 ):
82- ngrams .add (text [i : i + n ])
85+ ngrams .add (text [i : i + n ])
8386 return ngrams
8487
8588
@@ -110,7 +113,9 @@ def reset_drugs_data():
110113 ngram_to_variant [ngram ].append (drug_variant )
111114
112115
113- def add_custom_drug_synonym (drug_variant : str , canonical_name : str , optional_variant_data : dict = None ):
116+ def add_custom_drug_synonym (
117+ drug_variant : str , canonical_name : str , optional_variant_data : dict = None
118+ ):
114119 drug_variant = drug_variant .lower ()
115120 canonical_name = canonical_name .lower ()
116121 drug_variant_to_canonical [drug_variant ] = [canonical_name ]
@@ -168,19 +173,37 @@ def get_fuzzy_match(surface_form: str):
168173 if len (candidate_to_num_matching_ngrams ) > 0 :
169174 top_candidate = max (candidate_to_jaccard , key = candidate_to_jaccard .get )
170175 jaccard = candidate_to_jaccard [top_candidate ]
171- query_ngrams_missing_in_candidate = query_ngrams .difference (variant_to_ngrams [top_candidate ])
172- candidate_ngrams_missing_in_query = variant_to_ngrams [top_candidate ].difference (query_ngrams )
176+ query_ngrams_missing_in_candidate = query_ngrams .difference (
177+ variant_to_ngrams [top_candidate ]
178+ )
179+ candidate_ngrams_missing_in_query = variant_to_ngrams [top_candidate ].difference (
180+ query_ngrams
181+ )
173182
174183 candidate_length = len (top_candidate )
175184 length_diff = abs (query_length - candidate_length )
176- if max ([len (query_ngrams_missing_in_candidate ), len (candidate_ngrams_missing_in_query )]) <= 3 \
177- and length_diff <= 2 :
185+ if (
186+ max (
187+ [
188+ len (query_ngrams_missing_in_candidate ),
189+ len (candidate_ngrams_missing_in_query ),
190+ ]
191+ )
192+ <= 3
193+ and length_diff <= 2
194+ ):
178195 return top_candidate , jaccard
179196 return None , None
180197
181198
182- def find_drugs (tokens : list , is_fuzzy_match = False , is_ignore_case = None , is_include_structure = False ,
183- is_use_omop_api = False ):
199+ def find_drugs (
200+ tokens : list ,
201+ is_fuzzy_match = False ,
202+ is_ignore_case = None ,
203+ is_include_structure = False ,
204+ is_use_omop_api = False ,
205+ use_pub_chem_api = False ,
206+ ):
184207 if is_include_structure and len (dbid_to_mol_lookup ) == 0 :
185208 dbid_to_mol_lookup ["downloading" ] = True
186209 if not os .path .exists (structures_file ):
@@ -211,10 +234,17 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
211234 match = drug_variant_to_canonical .get (cand_norm , None )
212235 if match :
213236 for m in match :
214- match_data = dict (drug_canonical_to_data .get (m , {})) | drug_variant_to_variant_data .get (cand_norm , {})
237+ match_data = dict (
238+ drug_canonical_to_data .get (m , {})
239+ ) | drug_variant_to_variant_data .get (cand_norm , {})
215240 match_data ["match_type" ] = "exact"
216241 match_data ["matching_string" ] = cand
217242 lookup_name = match_data .get ("name" ) or m
243+
244+ match_data = get_molecular_weight (
245+ match_data , lookup_name , use_pub_chem_api
246+ )
247+
218248 if is_use_omop_api :
219249 match_data ["omop_id" ] = cached_get_omop_id (lookup_name )
220250 drug_matches .append ((match_data , token_idx , token_idx + 2 ))
@@ -226,12 +256,18 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
226256 if fuzzy_matched_variant is not None :
227257 match = drug_variant_to_canonical [fuzzy_matched_variant ]
228258 for m in match :
229- match_data = dict (drug_canonical_to_data .get (m , {})) | drug_variant_to_variant_data .get (
230- fuzzy_matched_variant , {})
259+ match_data = dict (
260+ drug_canonical_to_data .get (m , {})
261+ ) | drug_variant_to_variant_data .get (fuzzy_matched_variant , {})
231262 match_data ["match_type" ] = "fuzzy"
232263 match_data ["match_similarity" ] = similarity
233264 match_data ["match_variant" ] = fuzzy_matched_variant
234265 match_data ["matching_string" ] = cand
266+
267+ match_data = get_molecular_weight (
268+ match_data , lookup_name , use_pub_chem_api
269+ )
270+
235271 if is_use_omop_api :
236272 lookup_name = match_data .get ("name" ) or m
237273 match_data ["omop_id" ] = cached_get_omop_id (lookup_name )
@@ -245,10 +281,17 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
245281 match = drug_variant_to_canonical .get (cand_norm , None )
246282 if match :
247283 for m in match :
248- match_data = dict (drug_canonical_to_data .get (m , {})) | drug_variant_to_variant_data .get (cand_norm , {})
284+ match_data = dict (
285+ drug_canonical_to_data .get (m , {})
286+ ) | drug_variant_to_variant_data .get (cand_norm , {})
249287 match_data ["match_type" ] = "exact"
250288 match_data ["matching_string" ] = token
251289 lookup_name = match_data .get ("name" ) or m
290+
291+ match_data = get_molecular_weight (
292+ match_data , lookup_name , use_pub_chem_api
293+ )
294+
252295 if is_use_omop_api :
253296 match_data ["omop_id" ] = cached_get_omop_id (lookup_name )
254297 drug_matches .append ((match_data , token_idx , token_idx + 1 ))
@@ -259,13 +302,19 @@ def find_drugs(tokens: list, is_fuzzy_match=False, is_ignore_case=None, is_inclu
259302 if fuzzy_matched_variant is not None :
260303 match = drug_variant_to_canonical [fuzzy_matched_variant ]
261304 for m in match :
262- match_data = dict (drug_canonical_to_data .get (m , {})) | drug_variant_to_variant_data .get (
263- fuzzy_matched_variant , {})
305+ match_data = dict (
306+ drug_canonical_to_data .get (m , {})
307+ ) | drug_variant_to_variant_data .get (fuzzy_matched_variant , {})
264308 match_data ["match_type" ] = "fuzzy"
265309 match_data ["match_similarity" ] = similarity
266310 match_data ["match_variant" ] = fuzzy_matched_variant
267311 match_data ["matching_string" ] = token
268312 lookup_name = match_data .get ("name" ) or m
313+
314+ match_data = get_molecular_weight (
315+ match_data , lookup_name , use_pub_chem_api
316+ )
317+
269318 if is_use_omop_api :
270319 match_data ["omop_id" ] = cached_get_omop_id (lookup_name )
271320 drug_matches .append ((match_data , token_idx , token_idx + 1 ))
0 commit comments