@@ -476,3 +476,72 @@ def test_unknown_source_coordinate_table_with_force_ingest(test_weird_data_path,
476476 assert len (db_force .articles ) >= 1
477477 assert len (db_force .articles [0 ].tables ) >= 1
478478 assert _count_valid_activations (db_force .articles [0 ].tables ) >= 1
479+
480+
def test_ingest_prefers_best_duplicate_pmid_file(test_weird_data_path, tmp_path):
    """Ingest two files that share one PMID filename and expect one usable article.

    Two fixtures for PMID 17913474 (``17913474_recaptcha.html`` and
    ``17913474_pond.html``) are copied into separate folders under the same
    ``<pmid>.html`` name. The reCAPTCHA fixture is passed first, and the test
    then checks that a single article with at least one table and at least
    one valid activation ends up in the database.
    """
    pmid = "17913474"
    bad_src = join(test_weird_data_path, "17913474_recaptcha.html")
    good_src = join(test_weird_data_path, "17913474_pond.html")

    bad_dir = tmp_path / "a_bad"
    good_dir = tmp_path / "b_good"
    bad_dir.mkdir()
    good_dir.mkdir()

    # Same PMID filename in two different source folders. The name is built
    # with plain concatenation so the stem is exactly the PMID (no padding),
    # which is what ``pmid_filenames=True`` relies on below.
    html_name = pmid + ".html"
    bad_target = bad_dir / html_name
    good_target = good_dir / html_name
    shutil.copy(bad_src, bad_target)
    shutil.copy(good_src, good_target)

    # SQLite URL pointing at a throwaway file inside pytest's tmp_path;
    # no trailing whitespace after the path.
    db_path = "sqlite:///" + (tmp_path / "ace_dupe_pick_best.db").as_posix()
    db = database.Database(adapter='sqlite', db_name=db_path)

    # Intentionally place blocked/challenge page first.
    ingest.add_articles(
        db,
        [str(bad_target), str(good_target)],
        pmid_filenames=True,
        force_ingest=True,
        num_workers=1,
        skip_metadata=True,
    )

    # Both inputs carry the same PMID, so only one article row is expected,
    # and it must contain real table/activation data.
    assert len(db.articles) == 1
    assert len(db.articles[0].tables) >= 1
    assert _count_valid_activations(db.articles[0].tables) >= 1
513+
514+
def test_validate_scrape_flags_recaptcha_challenge_page(test_weird_data_path):
    """Check that ``scrape._validate_scrape`` returns False for the reCAPTCHA fixture."""
    fixture_path = join(test_weird_data_path, "17913474_recaptcha.html")
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(fixture_path) as fh:
        html = fh.read()
    assert scrape._validate_scrape(html) is False
518+
519+
@pytest.mark.parametrize(
    "pmid,expected_source",
    [
        ("17088334", "PMCSource"),
        ("26342221", "OUPSource"),
        ("27623361", "ScienceDirectSource"),
        ("27319001", "SpringerSource"),
        ("20350171", "JournalOfCognitiveNeuroscienceSource"),
        ("12860777", None),  # Unknown source -> DefaultSource fallback
    ],
)
def test_additional_missed_in_main_text_regressions(test_weird_data_path, source_manager, pmid, expected_source):
    """Parse ``<pmid>.html`` and require at least one table with a valid activation.

    ``expected_source`` is the class name that ``source_manager.identify_source``
    should return for the page. When it is ``None`` the page is expected to be
    unidentified, and ``source_manager.default_source`` is used as the parser.
    """
    # Context manager closes the fixture file as soon as it has been read.
    with open(join(test_weird_data_path, pmid + ".html")) as fh:
        html = fh.read()
    source = source_manager.identify_source(html)

    if expected_source is None:
        # No source should match; fall back to the manager's default parser.
        assert source is None
        parser = source_manager.default_source
        assert parser is not None
    else:
        assert source is not None
        assert type(source).__name__ == expected_source
        parser = source

    article = parser.parse_article(html, pmid=pmid, skip_metadata=True)
    assert article is not None
    assert len(article.tables) >= 1
    assert _count_valid_activations(article.tables) >= 1
0 commit comments