Skip to content

Commit 092e6d9

Browse files
committed
Fix PMC downloading
1 parent afca8d8 commit 092e6d9

1 file changed

Lines changed: 64 additions & 4 deletions

File tree

ace/scrape.py

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,46 @@ def get_html(self, url, journal, mode='browser', headless=True):
394394
r.encoding = 'utf-8'
395395
return r.text
396396

397+
def _get_pmcid_from_pmid(self, pmid):
    """Resolve the PMCID for a PMID using NCBI E-utilities (ELink).

    Calls ``self._client.elink`` with ``access_db='pmc'`` and reads the
    first linked id out of the JSON ``linksets`` structure.

    Args:
        pmid: PubMed identifier handed to the ELink request.

    Returns:
        The linked id as a string carrying a ``PMC`` prefix (added when the
        response value lacks it), or ``None`` when no usable link is found
        or the lookup fails. Failures are logged, never raised.
    """
    try:
        response = self._client.elink(
            pmid,
            retmode='json',
            access_db='pmc',
            return_content=False,
        )
        response.raise_for_status()
        json_content = response.json()

        linksets = json_content.get('linksets', [])
        if not linksets:
            return None

        linksetdbs = linksets[0].get('linksetdbs', [])
        if not linksetdbs:
            return None

        # A PubMed -> PMC ELink answer may hold several link sets; per the
        # NCBI link descriptions, 'pubmed_pmc' is the article's own PMC
        # record while e.g. 'pubmed_pmc_refs' lists *other* PMC articles
        # citing it. Blindly taking the first set could therefore return
        # the PMCID of a different paper.
        linksetdb = next(
            (db for db in linksetdbs if db.get('linkname') == 'pubmed_pmc'),
            None,
        )
        if linksetdb is None:
            if any(db.get('linkname') for db in linksetdbs):
                # Link sets are labelled but none is the direct PMC link.
                return None
            # Unlabelled response: keep the previous first-entry behavior.
            linksetdb = linksetdbs[0]

        pmc_links = linksetdb.get('links', [])
        if not pmc_links:
            return None

        pmc_id = str(pmc_links[0]).strip()
        if not pmc_id:
            return None

        # ELink reports bare numeric ids; article URLs need the PMC prefix.
        if not pmc_id.upper().startswith('PMC'):
            pmc_id = f"PMC{pmc_id}"

        return pmc_id
    except requests.RequestException as e:
        logger.error(f"Failed to resolve PMCID for PMID {pmid}: {e}")
    except (ValueError, KeyError, IndexError, TypeError) as e:
        logger.error(f"Unexpected PMCID response for PMID {pmid}: {e}")
    except Exception as e:
        # Best-effort lookup: any other client error is logged and the
        # caller simply sees "no PMCID".
        logger.error(f"PMCID lookup failed for PMID {pmid}: {e}")
    return None
436+
397437

398438
def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True, headless=True):
399439
base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
@@ -405,18 +445,37 @@ def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_
405445
response.raise_for_status() # Raise an HTTPError for bad responses
406446
json_content = response.json()
407447

408-
providers = {obj['provider']['nameabbr']: obj["url"]["value"] for obj in json_content['linksets'][0]['idurllist'][0]['objurls']}
448+
objurls = []
449+
linksets = json_content.get('linksets', [])
450+
if linksets:
451+
idurllist = linksets[0].get('idurllist', [])
452+
if idurllist:
453+
objurls = idurllist[0].get('objurls', [])
454+
455+
providers = {
456+
obj.get('provider', {}).get('nameabbr'): obj.get('url', {}).get('value')
457+
for obj in objurls
458+
if obj.get('provider', {}).get('nameabbr')
459+
}
409460
pmc_url = providers.get('PMC')
410461

411462
if pmc_url:
412-
return self.get_html(pmc_url, journal, mode='requests')
463+
pmc_id = self._get_pmcid_from_pmid(pmid)
464+
if pmc_id:
465+
pmc_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmc_id}"
466+
return self.get_html(pmc_url, journal, mode='requests')
467+
if prefer_pmc_source == "only":
468+
logger.info("\tPMC source detected, but PMCID lookup failed. Skipping...")
469+
return
413470
elif prefer_pmc_source == "only":
414471
logger.info("\tNo PMC source found! Skipping...")
415472
return
416473
except requests.RequestException as e:
417474
logger.error(f"Request failed: {e}")
418-
except KeyError as e:
419-
logger.error(f"Key error: {e} - JSON content: {json_content}")
475+
except (ValueError, KeyError, IndexError, TypeError) as e:
476+
logger.error(f"Unexpected E-utilities response for PMID {pmid}: {e}")
477+
except Exception as e:
478+
logger.error(f"E-utilities lookup failed for PMID {pmid}: {e}")
420479
else:
421480
query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}"
422481
logger.info(query)
@@ -487,6 +546,7 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
487546
with filename.open('w') as f:
488547
f.write(doc)
489548
if not valid:
549+
from pdb import set_trace; set_trace()
490550
logger.info("\tScrape failed! Skipping...")
491551

492552
# Insert random delay until next request.

0 commit comments

Comments
 (0)