Skip to content

Commit fa0568c

Browse files
committed
Add options to run faster (skip challenge, content timeouts)
1 parent 9a68e81 commit fa0568c

1 file changed

Lines changed: 150 additions & 11 deletions

File tree

ace/scrape.py

Lines changed: 150 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,38 @@ def __init__(self, store, api_key=None):
241241
self.store = Path(store)
242242
self._client = PubMedAPI(api_key=api_key)
243243

244+
@staticmethod
def _normalize_skip_url_substrings(skip_url_substrings):
    ''' Turn the user-supplied skip patterns into a tuple of lower-cased strings.

    Accepts None (-> empty tuple), a single string (treated as one pattern), or any
    iterable of patterns. None entries are dropped; every other entry is passed through
    str(), stripped of surrounding whitespace and lower-cased; entries that end up empty
    are dropped. Order and duplicates are preserved. '''
    if skip_url_substrings is None:
        return ()

    # A bare string is one pattern, not an iterable of single characters.
    candidates = (
        (skip_url_substrings,)
        if isinstance(skip_url_substrings, str)
        else skip_url_substrings
    )
    cleaned = (
        str(entry).strip().lower()
        for entry in candidates
        if entry is not None
    )
    return tuple(pattern for pattern in cleaned if pattern)
260+
261+
def _should_skip_url(self, url, skip_url_substrings):
    ''' Report whether url contains any of the configured skip substrings.

    The comparison is case-insensitive: the patterns are normalized with
    _normalize_skip_url_substrings and matched against the lower-cased URL. On the first
    match, both self._skip_article_requested and self._skip_article_due_to_url are set to
    True, the match is logged, and True is returned. An empty/None url, an empty pattern
    set, or no match returns False and leaves the flags untouched. '''
    if url:
        needles = self._normalize_skip_url_substrings(skip_url_substrings)
        if needles:
            haystack = str(url).lower()
            # Normalized patterns are never empty, so None is a safe "no match" marker.
            matched = next((needle for needle in needles if needle in haystack), None)
            if matched is not None:
                self._skip_article_requested = True
                self._skip_article_due_to_url = True
                logger.info("Skipping URL because it contains %r: %s", matched, url)
                return True
    return False
275+
244276

245277
def search_pubmed(self, journal, search, retmax=10000, savelist=None,):
246278
journal = journal.replace(' ', '+')
@@ -257,11 +289,14 @@ def search_pubmed(self, journal, search, retmax=10000, savelist=None,):
257289
return doc
258290

259291

260-
def get_html(self, url, journal, mode='browser', headless=True):
292+
def get_html(self, url, journal, mode='browser', headless=True, skip_url_substrings=None):
261293

262294
''' Get HTML of full-text article. Uses either browser automation (if mode == 'browser')
263295
or just gets the URL directly. '''
264296

297+
if self._should_skip_url(url, skip_url_substrings):
298+
return None
299+
265300
if mode == 'browser':
266301
driver = Driver(
267302
uc=True,
@@ -273,6 +308,9 @@ def get_html(self, url, journal, mode='browser', headless=True):
273308
driver.set_page_load_timeout(10)
274309
driver.get(url)
275310
url = driver.current_url
311+
if self._should_skip_url(url, skip_url_substrings):
312+
driver.quit()
313+
return None
276314
except:
277315
driver.quit()
278316
logger.info(f"Timeout exception #{attempt}. Retrying...")
@@ -300,6 +338,9 @@ def get_html(self, url, journal, mode='browser', headless=True):
300338
break
301339

302340
new_url = self.check_for_substitute_url(url, html, journal)
341+
if self._should_skip_url(new_url, skip_url_substrings):
342+
driver.quit()
343+
return None
303344

304345
if url != new_url:
305346
driver = Driver(
@@ -383,11 +424,17 @@ def get_html(self, url, journal, mode='browser', headless=True):
383424
elif mode == 'requests':
384425
headers = {'User-Agent': random.choice(USER_AGENTS)}
385426
r = requests.get(url, headers=headers)
427+
if self._should_skip_url(r.url, skip_url_substrings):
428+
return None
386429
# For some journals, we can do better than the returned HTML, so get the final URL and
387430
# substitute a better one.
388431
url = self.check_for_substitute_url(r.url, r.text, journal)
389432
if url != r.url:
433+
if self._should_skip_url(url, skip_url_substrings):
434+
return None
390435
r = requests.get(url, headers=headers)
436+
if self._should_skip_url(r.url, skip_url_substrings):
437+
return None
391438
# XML content is usually misidentified as ISO-8859-1, so we need to manually set utf-8.
392439
# Unfortunately this can break other documents. Need to eventually change this to inspect the
393440
# encoding attribute of the document header.
@@ -434,8 +481,25 @@ def _get_pmcid_from_pmid(self, pmid):
434481
logger.error(f"PMCID lookup failed for PMID {pmid}: {e}")
435482
return None
436483

437-
438-
def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True, headless=True):
484+
@staticmethod
def _provider_urls_from_elink_json(json_content):
    ''' Collect the provider URLs listed in a parsed ELink JSON response.

    Reads json_content['linksets'][0]['idurllist'][0]['objurls'] and returns, in order,
    a list of every non-empty obj['url']['value'] found there. Only the first linkset
    and the first idurllist entry are inspected.

    Any level that is missing, empty or null contributes no URLs instead of raising, so
    a partially populated response simply yields an empty list. '''
    # dict.get(key, default) only covers a *missing* key; `or` also covers a key that
    # is present but null, which would otherwise break the indexing/iteration below.
    linksets = json_content.get('linksets') or []
    if not linksets:
        return []
    idurllist = linksets[0].get('idurllist') or []
    if not idurllist:
        return []
    objurls = idurllist[0].get('objurls') or []

    provider_urls = []
    for obj in objurls:
        provider_url = (obj.get('url') or {}).get('value')
        if provider_url:
            provider_urls.append(provider_url)
    return provider_urls
499+
500+
501+
def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True, headless=True,
502+
skip_url_substrings=None):
439503
base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
440504
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
441505

@@ -451,6 +515,7 @@ def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_
451515
idurllist = linksets[0].get('idurllist', [])
452516
if idurllist:
453517
objurls = idurllist[0].get('objurls', [])
518+
provider_urls = self._provider_urls_from_elink_json(json_content)
454519

455520
providers = {
456521
obj.get('provider', {}).get('nameabbr'): obj.get('url', {}).get('value')
@@ -463,31 +528,76 @@ def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_
463528
pmc_id = self._get_pmcid_from_pmid(pmid)
464529
if pmc_id:
465530
pmc_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmc_id}"
466-
return self.get_html(pmc_url, journal, mode=mode, headless=headless)
531+
return self.get_html(
532+
pmc_url,
533+
journal,
534+
mode=mode,
535+
headless=headless,
536+
skip_url_substrings=skip_url_substrings,
537+
)
467538
if prefer_pmc_source == "only":
468539
logger.info("\tPMC source detected, but PMCID lookup failed. Skipping...")
469540
return
470541
elif prefer_pmc_source == "only":
471542
logger.info("\tNo PMC source found! Skipping...")
472543
return
544+
else:
545+
for provider_url in provider_urls:
546+
if self._should_skip_url(provider_url, skip_url_substrings):
547+
logger.info(
548+
"\tSkipping PMID %s due to blocked provider URL: %s",
549+
pmid,
550+
provider_url,
551+
)
552+
return
473553
except requests.RequestException as e:
474554
logger.error(f"Request failed: {e}")
475555
except (ValueError, KeyError, IndexError, TypeError) as e:
476556
logger.error(f"Unexpected E-utilities response for PMID {pmid}: {e}")
477557
except Exception as e:
478558
logger.error(f"E-utilities lookup failed for PMID {pmid}: {e}")
479559
else:
560+
if skip_url_substrings:
561+
try:
562+
response = self._client.elink(pmid, retmode='json', return_content=False)
563+
response.raise_for_status()
564+
for provider_url in self._provider_urls_from_elink_json(response.json()):
565+
if self._should_skip_url(provider_url, skip_url_substrings):
566+
logger.info(
567+
"\tSkipping PMID %s due to blocked provider URL: %s",
568+
pmid,
569+
provider_url,
570+
)
571+
return
572+
except requests.RequestException as e:
573+
logger.error(f"Provider URL pre-check failed for PMID {pmid}: {e}")
574+
except (ValueError, KeyError, IndexError, TypeError) as e:
575+
logger.error(f"Unexpected provider pre-check response for PMID {pmid}: {e}")
576+
except Exception as e:
577+
logger.error(f"Provider URL pre-check lookup failed for PMID {pmid}: {e}")
480578
query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}"
481579
logger.info(query)
482-
return self.get_html(query, journal, mode=mode, headless=headless)
580+
return self.get_html(
581+
query,
582+
journal,
583+
mode=mode,
584+
headless=headless,
585+
skip_url_substrings=skip_url_substrings,
586+
)
483587

484588
if prefer_pmc_source == "only":
485589
logger.info("\tNo PMC source found!! Skipping...")
486590
return
487591

488592
# Fallback if no PMC link found
489593
query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}"
490-
return self.get_html(query, journal, mode=mode, headless=headless)
594+
return self.get_html(
595+
query,
596+
journal,
597+
mode=mode,
598+
headless=headless,
599+
skip_url_substrings=skip_url_substrings,
600+
)
491601

492602

493603
def check_for_substitute_url(self, url, html, journal):
@@ -525,9 +635,12 @@ def is_pmc_open_acess(self, pmcid):
525635

526636
return 'idIsNotOpenAccess' not in response
527637

528-
def process_article(self, id, journal, delay=None, mode='browser', overwrite=False, prefer_pmc_source=True, headless=True):
638+
def process_article(self, id, journal, delay=None, mode='browser', overwrite=False, prefer_pmc_source=True,
639+
headless=True, skip_url_substrings=None):
529640

530641
logger.info("Processing %s..." % id)
642+
self._skip_article_requested = False
643+
self._skip_article_due_to_url = False
531644
journal_path = (self.store / 'html' / journal)
532645
filename = journal_path / f"{id}.html"
533646

@@ -537,7 +650,21 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
537650
return None, None
538651

539652
# Save the HTML
540-
doc = self.get_html_by_pmid(id, journal, mode=mode, prefer_pmc_source=prefer_pmc_source, headless=headless)
653+
doc = self.get_html_by_pmid(
654+
id,
655+
journal,
656+
mode=mode,
657+
prefer_pmc_source=prefer_pmc_source,
658+
headless=headless,
659+
skip_url_substrings=skip_url_substrings,
660+
)
661+
if not doc and (
662+
getattr(self, "_skip_article_requested", False)
663+
or getattr(self, "_skip_article_due_to_url", False)
664+
or getattr(self, "_skip_article_due_to_challenge", False)
665+
):
666+
logger.info("\tSkipped by configured skip rules.")
667+
return None, None
541668
valid = None
542669
if doc:
543670
valid = _validate_scrape(doc)
@@ -546,7 +673,6 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
546673
with filename.open('w') as f:
547674
f.write(doc)
548675
if not valid:
549-
from pdb import set_trace; set_trace()
550676
logger.info("\tScrape failed! Skipping...")
551677

552678
# Insert random delay until next request.
@@ -559,7 +685,7 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
559685
def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mode='browser', search=None,
560686
limit=None, overwrite=False, min_pmid=None, max_pmid=None, shuffle=False,
561687
index_pmids=False, skip_pubmed_central=True, metadata_store=None, invalid_article_log_file=None,
562-
prefer_pmc_source=True, headless=True):
688+
prefer_pmc_source=True, headless=True, skip_url_substrings=None):
563689

564690
''' Try to retrieve all PubMed articles for a single journal that don't
565691
already exist in the storage directory.
@@ -598,6 +724,8 @@ def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mod
598724
but are not open-access. If set to "only", will only retrieve articles from PMC, and
599725
skip articles it cannot retrieve from PMC.
600726
headless: When True, runs the browser in headless mode (only relevant if mode=='browser', and not PMC)
727+
skip_url_substrings: Optional iterable of URL substrings. When any substring is present
728+
in a candidate or redirected article URL, that article is skipped.
601729
'''
602730
articles_found = 0
603731
if journal is None and dois is None and pmids is None:
@@ -675,7 +803,18 @@ def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mod
675803
f.write(f"{pmcid}\n")
676804
continue
677805

678-
filename, valid = self.process_article(pmid, journal, delay, mode, overwrite, prefer_pmc_source, headless)
806+
filename, valid = self.process_article(
807+
pmid,
808+
journal,
809+
delay,
810+
mode,
811+
overwrite,
812+
prefer_pmc_source,
813+
headless,
814+
skip_url_substrings=skip_url_substrings,
815+
)
816+
if filename is None and valid is None:
817+
continue
679818

680819
if not valid:
681820
invalid_articles.append(filename)

0 commit comments

Comments
 (0)