@@ -241,6 +241,38 @@ def __init__(self, store, api_key=None):
241241 self .store = Path (store )
242242 self ._client = PubMedAPI (api_key = api_key )
243243
244+ @staticmethod
245+ def _normalize_skip_url_substrings (skip_url_substrings ):
246+ if skip_url_substrings is None :
247+ return tuple ()
248+ if isinstance (skip_url_substrings , str ):
249+ values = [skip_url_substrings ]
250+ else :
251+ values = skip_url_substrings
252+ normalized = []
253+ for value in values :
254+ if value is None :
255+ continue
256+ text = str (value ).strip ().lower ()
257+ if text :
258+ normalized .append (text )
259+ return tuple (normalized )
260+
def _should_skip_url(self, url, skip_url_substrings):
    """Report whether ``url`` contains any configured skip substring.

    Matching is case-insensitive: the URL is lower-cased and compared
    against the patterns returned by ``_normalize_skip_url_substrings``.
    On the first match, ``_skip_article_requested`` and
    ``_skip_article_due_to_url`` are set to True on the instance, the match
    is logged, and True is returned. A falsy URL, an empty pattern list, or
    no match returns False and leaves the flags untouched.
    """
    if not url:
        return False
    patterns = self._normalize_skip_url_substrings(skip_url_substrings)
    if not patterns:
        return False
    haystack = str(url).lower()
    # Patterns are never empty strings, so None reliably means "no match".
    hit = next((pattern for pattern in patterns if pattern in haystack), None)
    if hit is None:
        return False
    self._skip_article_requested = True
    self._skip_article_due_to_url = True
    logger.info("Skipping URL because it contains %r: %s", hit, url)
    return True
275+
244276
245277 def search_pubmed (self , journal , search , retmax = 10000 , savelist = None ,):
246278 journal = journal .replace (' ' , '+' )
@@ -257,11 +289,14 @@ def search_pubmed(self, journal, search, retmax=10000, savelist=None,):
257289 return doc
258290
259291
260- def get_html (self , url , journal , mode = 'browser' , headless = True ):
292+ def get_html (self , url , journal , mode = 'browser' , headless = True , skip_url_substrings = None ):
261293
262294 ''' Get HTML of full-text article. Uses either browser automation (if mode == 'browser')
263295 or just gets the URL directly. '''
264296
297+ if self ._should_skip_url (url , skip_url_substrings ):
298+ return None
299+
265300 if mode == 'browser' :
266301 driver = Driver (
267302 uc = True ,
@@ -273,6 +308,9 @@ def get_html(self, url, journal, mode='browser', headless=True):
273308 driver .set_page_load_timeout (10 )
274309 driver .get (url )
275310 url = driver .current_url
311+ if self ._should_skip_url (url , skip_url_substrings ):
312+ driver .quit ()
313+ return None
276314 except :
277315 driver .quit ()
278316 logger .info (f"Timeout exception #{ attempt } . Retrying..." )
@@ -300,6 +338,9 @@ def get_html(self, url, journal, mode='browser', headless=True):
300338 break
301339
302340 new_url = self .check_for_substitute_url (url , html , journal )
341+ if self ._should_skip_url (new_url , skip_url_substrings ):
342+ driver .quit ()
343+ return None
303344
304345 if url != new_url :
305346 driver = Driver (
@@ -383,11 +424,17 @@ def get_html(self, url, journal, mode='browser', headless=True):
383424 elif mode == 'requests' :
384425 headers = {'User-Agent' : random .choice (USER_AGENTS )}
385426 r = requests .get (url , headers = headers )
427+ if self ._should_skip_url (r .url , skip_url_substrings ):
428+ return None
386429 # For some journals, we can do better than the returned HTML, so get the final URL and
387430 # substitute a better one.
388431 url = self .check_for_substitute_url (r .url , r .text , journal )
389432 if url != r .url :
433+ if self ._should_skip_url (url , skip_url_substrings ):
434+ return None
390435 r = requests .get (url , headers = headers )
436+ if self ._should_skip_url (r .url , skip_url_substrings ):
437+ return None
391438 # XML content is usually misidentified as ISO-8859-1, so we need to manually set utf-8.
392439 # Unfortunately this can break other documents. Need to eventually change this to inspect the
393440 # encoding attribute of the document header.
@@ -434,8 +481,25 @@ def _get_pmcid_from_pmid(self, pmid):
434481 logger .error (f"PMCID lookup failed for PMID { pmid } : { e } " )
435482 return None
436483
437-
438- def get_html_by_pmid (self , pmid , journal , mode = 'browser' , retmode = 'ref' , prefer_pmc_source = True , headless = True ):
484+ @staticmethod
485+ def _provider_urls_from_elink_json (json_content ):
486+ provider_urls = []
487+ linksets = json_content .get ('linksets' , [])
488+ if not linksets :
489+ return provider_urls
490+ idurllist = linksets [0 ].get ('idurllist' , [])
491+ if not idurllist :
492+ return provider_urls
493+ objurls = idurllist [0 ].get ('objurls' , [])
494+ for obj in objurls :
495+ provider_url = obj .get ('url' , {}).get ('value' )
496+ if provider_url :
497+ provider_urls .append (provider_url )
498+ return provider_urls
499+
500+
501+ def get_html_by_pmid (self , pmid , journal , mode = 'browser' , retmode = 'ref' , prefer_pmc_source = True , headless = True ,
502+ skip_url_substrings = None ):
439503 base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
440504 "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
441505
@@ -451,6 +515,7 @@ def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_
451515 idurllist = linksets [0 ].get ('idurllist' , [])
452516 if idurllist :
453517 objurls = idurllist [0 ].get ('objurls' , [])
518+ provider_urls = self ._provider_urls_from_elink_json (json_content )
454519
455520 providers = {
456521 obj .get ('provider' , {}).get ('nameabbr' ): obj .get ('url' , {}).get ('value' )
@@ -463,31 +528,76 @@ def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_
463528 pmc_id = self ._get_pmcid_from_pmid (pmid )
464529 if pmc_id :
465530 pmc_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{ pmc_id } "
466- return self .get_html (pmc_url , journal , mode = mode , headless = headless )
531+ return self .get_html (
532+ pmc_url ,
533+ journal ,
534+ mode = mode ,
535+ headless = headless ,
536+ skip_url_substrings = skip_url_substrings ,
537+ )
467538 if prefer_pmc_source == "only" :
468539 logger .info ("\t PMC source detected, but PMCID lookup failed. Skipping..." )
469540 return
470541 elif prefer_pmc_source == "only" :
471542 logger .info ("\t No PMC source found! Skipping..." )
472543 return
544+ else :
545+ for provider_url in provider_urls :
546+ if self ._should_skip_url (provider_url , skip_url_substrings ):
547+ logger .info (
548+ "\t Skipping PMID %s due to blocked provider URL: %s" ,
549+ pmid ,
550+ provider_url ,
551+ )
552+ return
473553 except requests .RequestException as e :
474554 logger .error (f"Request failed: { e } " )
475555 except (ValueError , KeyError , IndexError , TypeError ) as e :
476556 logger .error (f"Unexpected E-utilities response for PMID { pmid } : { e } " )
477557 except Exception as e :
478558 logger .error (f"E-utilities lookup failed for PMID { pmid } : { e } " )
479559 else :
560+ if skip_url_substrings :
561+ try :
562+ response = self ._client .elink (pmid , retmode = 'json' , return_content = False )
563+ response .raise_for_status ()
564+ for provider_url in self ._provider_urls_from_elink_json (response .json ()):
565+ if self ._should_skip_url (provider_url , skip_url_substrings ):
566+ logger .info (
567+ "\t Skipping PMID %s due to blocked provider URL: %s" ,
568+ pmid ,
569+ provider_url ,
570+ )
571+ return
572+ except requests .RequestException as e :
573+ logger .error (f"Provider URL pre-check failed for PMID { pmid } : { e } " )
574+ except (ValueError , KeyError , IndexError , TypeError ) as e :
575+ logger .error (f"Unexpected provider pre-check response for PMID { pmid } : { e } " )
576+ except Exception as e :
577+ logger .error (f"Provider URL pre-check lookup failed for PMID { pmid } : { e } " )
480578 query = f"{ base_url } ?dbfrom=pubmed&id={ pmid } &cmd=prlinks&retmode={ retmode } "
481579 logger .info (query )
482- return self .get_html (query , journal , mode = mode , headless = headless )
580+ return self .get_html (
581+ query ,
582+ journal ,
583+ mode = mode ,
584+ headless = headless ,
585+ skip_url_substrings = skip_url_substrings ,
586+ )
483587
484588 if prefer_pmc_source == "only" :
485589 logger .info ("\t No PMC source found!! Skipping..." )
486590 return
487591
488592 # Fallback if no PMC link found
489593 query = f"{ base_url } ?dbfrom=pubmed&id={ pmid } &cmd=prlinks&retmode={ retmode } "
490- return self .get_html (query , journal , mode = mode , headless = headless )
594+ return self .get_html (
595+ query ,
596+ journal ,
597+ mode = mode ,
598+ headless = headless ,
599+ skip_url_substrings = skip_url_substrings ,
600+ )
491601
492602
493603 def check_for_substitute_url (self , url , html , journal ):
@@ -525,9 +635,12 @@ def is_pmc_open_acess(self, pmcid):
525635
526636 return 'idIsNotOpenAccess' not in response
527637
528- def process_article (self , id , journal , delay = None , mode = 'browser' , overwrite = False , prefer_pmc_source = True , headless = True ):
638+ def process_article (self , id , journal , delay = None , mode = 'browser' , overwrite = False , prefer_pmc_source = True ,
639+ headless = True , skip_url_substrings = None ):
529640
530641 logger .info ("Processing %s..." % id )
642+ self ._skip_article_requested = False
643+ self ._skip_article_due_to_url = False
531644 journal_path = (self .store / 'html' / journal )
532645 filename = journal_path / f"{ id } .html"
533646
@@ -537,7 +650,21 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
537650 return None , None
538651
539652 # Save the HTML
540- doc = self .get_html_by_pmid (id , journal , mode = mode , prefer_pmc_source = prefer_pmc_source , headless = headless )
653+ doc = self .get_html_by_pmid (
654+ id ,
655+ journal ,
656+ mode = mode ,
657+ prefer_pmc_source = prefer_pmc_source ,
658+ headless = headless ,
659+ skip_url_substrings = skip_url_substrings ,
660+ )
661+ if not doc and (
662+ getattr (self , "_skip_article_requested" , False )
663+ or getattr (self , "_skip_article_due_to_url" , False )
664+ or getattr (self , "_skip_article_due_to_challenge" , False )
665+ ):
666+ logger .info ("\t Skipped by configured skip rules." )
667+ return None , None
541668 valid = None
542669 if doc :
543670 valid = _validate_scrape (doc )
@@ -546,7 +673,6 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
546673 with filename .open ('w' ) as f :
547674 f .write (doc )
548675 if not valid :
549- from pdb import set_trace ; set_trace ()
550676 logger .info ("\t Scrape failed! Skipping..." )
551677
552678 # Insert random delay until next request.
@@ -559,7 +685,7 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
559685 def retrieve_articles (self , journal = None , pmids = None , dois = None , delay = None , mode = 'browser' , search = None ,
560686 limit = None , overwrite = False , min_pmid = None , max_pmid = None , shuffle = False ,
561687 index_pmids = False , skip_pubmed_central = True , metadata_store = None , invalid_article_log_file = None ,
562- prefer_pmc_source = True , headless = True ):
688+ prefer_pmc_source = True , headless = True , skip_url_substrings = None ):
563689
564690 ''' Try to retrieve all PubMed articles for a single journal that don't
565691 already exist in the storage directory.
@@ -598,6 +724,8 @@ def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mod
598724 but are not open-access. If set to "only", will only retrieve articles from PMC, and
599725 skip articles it cannot retrieve from PMC.
600726 headless: When True, runs the browser in headless mode (only relevant if mode=='browser', and not PMC)
727+ skip_url_substrings: Optional iterable of URL substrings. When any substring is present
728+ in a candidate or redirected article URL, that article is skipped.
601729 '''
602730 articles_found = 0
603731 if journal is None and dois is None and pmids is None :
@@ -675,7 +803,18 @@ def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mod
675803 f .write (f"{ pmcid } \n " )
676804 continue
677805
678- filename , valid = self .process_article (pmid , journal , delay , mode , overwrite , prefer_pmc_source , headless )
806+ filename , valid = self .process_article (
807+ pmid ,
808+ journal ,
809+ delay ,
810+ mode ,
811+ overwrite ,
812+ prefer_pmc_source ,
813+ headless ,
814+ skip_url_substrings = skip_url_substrings ,
815+ )
816+ if filename is None and valid is None :
817+ continue
679818
680819 if not valid :
681820 invalid_articles .append (filename )
0 commit comments