@@ -394,6 +394,46 @@ def get_html(self, url, journal, mode='browser', headless=True):
394394 r .encoding = 'utf-8'
395395 return r .text
396396
def _get_pmcid_from_pmid(self, pmid):
    """Resolve the PMCID for a PubMed ID via an NCBI E-utilities ELink call.

    The lookup is best-effort: every failure is logged and reported as
    ``None`` so callers can fall back to another source.

    Args:
        pmid: PubMed identifier forwarded to ``self._client.elink``.

    Returns:
        The identifier as a string carrying a ``PMC`` prefix, or ``None``
        when the response holds no usable link or the lookup fails.
    """
    try:
        response = self._client.elink(
            pmid,
            retmode='json',
            access_db='pmc',
            return_content=False,
        )
        response.raise_for_status()
        json_content = response.json()

        linksets = json_content.get('linksets', [])
        if not linksets:
            return None

        linksetdbs = linksets[0].get('linksetdbs', [])
        if not linksetdbs:
            return None

        # A pubmed -> pmc ELink response can carry several link sets
        # (e.g. "pubmed_pmc" for the article itself and "pubmed_pmc_refs"
        # for PMC articles that cite it). Taking the first entry blindly
        # can therefore yield the PMCID of a different article, so prefer
        # the direct set and never fall back to a citation ("_refs") set.
        chosen = None
        for entry in linksetdbs:
            linkname = str(entry.get('linkname') or '')
            if linkname == 'pubmed_pmc':
                chosen = entry
                break
            if chosen is None and not linkname.endswith('_refs'):
                chosen = entry
        if chosen is None:
            return None

        pmc_links = chosen.get('links', [])
        if not pmc_links:
            return None

        pmc_id = str(pmc_links[0]).strip()
        if not pmc_id:
            return None

        # ELink returns bare numeric IDs; add the prefix only when missing.
        if not pmc_id.upper().startswith('PMC'):
            pmc_id = f"PMC{pmc_id}"

        return pmc_id
    except requests.RequestException as e:
        logger.error(f"Failed to resolve PMCID for PMID {pmid}: {e}")
    except (ValueError, KeyError, IndexError, TypeError) as e:
        logger.error(f"Unexpected PMCID response for PMID {pmid}: {e}")
    except Exception as e:
        # Deliberate catch-all: this helper must never raise into the
        # scraping flow; any other failure is logged and treated as a miss.
        logger.error(f"PMCID lookup failed for PMID {pmid}: {e}")
    return None
436+
397437
398438 def get_html_by_pmid (self , pmid , journal , mode = 'browser' , retmode = 'ref' , prefer_pmc_source = True , headless = True ):
399439 base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
@@ -405,18 +445,37 @@ def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_
405445 response .raise_for_status () # Raise an HTTPError for bad responses
406446 json_content = response .json ()
407447
408- providers = {obj ['provider' ]['nameabbr' ]: obj ["url" ]["value" ] for obj in json_content ['linksets' ][0 ]['idurllist' ][0 ]['objurls' ]}
448+ objurls = []
449+ linksets = json_content .get ('linksets' , [])
450+ if linksets :
451+ idurllist = linksets [0 ].get ('idurllist' , [])
452+ if idurllist :
453+ objurls = idurllist [0 ].get ('objurls' , [])
454+
455+ providers = {
456+ obj .get ('provider' , {}).get ('nameabbr' ): obj .get ('url' , {}).get ('value' )
457+ for obj in objurls
458+ if obj .get ('provider' , {}).get ('nameabbr' )
459+ }
409460 pmc_url = providers .get ('PMC' )
410461
411462 if pmc_url :
412- return self .get_html (pmc_url , journal , mode = 'requests' )
463+ pmc_id = self ._get_pmcid_from_pmid (pmid )
464+ if pmc_id :
465+ pmc_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{ pmc_id } "
466+ return self .get_html (pmc_url , journal , mode = 'requests' )
467+ if prefer_pmc_source == "only" :
468+ logger .info ("\t PMC source detected, but PMCID lookup failed. Skipping..." )
469+ return
413470 elif prefer_pmc_source == "only" :
414471 logger .info ("\t No PMC source found! Skipping..." )
415472 return
416473 except requests .RequestException as e :
417474 logger .error (f"Request failed: { e } " )
418- except KeyError as e :
419- logger .error (f"Key error: { e } - JSON content: { json_content } " )
475+ except (ValueError , KeyError , IndexError , TypeError ) as e :
476+ logger .error (f"Unexpected E-utilities response for PMID { pmid } : { e } " )
477+ except Exception as e :
478+ logger .error (f"E-utilities lookup failed for PMID { pmid } : { e } " )
420479 else :
421480 query = f"{ base_url } ?dbfrom=pubmed&id={ pmid } &cmd=prlinks&retmode={ retmode } "
422481 logger .info (query )
@@ -487,6 +546,7 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
487546 with filename .open ('w' ) as f :
488547 f .write (doc )
489548 if not valid :
549+ from pdb import set_trace ; set_trace ()
490550 logger .info ("\t Scrape failed! Skipping..." )
491551
492552 # Insert random delay until next request.
0 commit comments