Skip to content

Commit aa0ebae

Browse files
committed
Add regression tests for reCAPTCHA
1 parent 5d31a10 commit aa0ebae

12 files changed

Lines changed: 15074 additions & 8 deletions

ace/ingest.py

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,29 @@
1010

1111
logger = logging.getLogger(__name__)
1212

13+
14+
def _article_quality_score(article):
    """Score article quality for same-PMID duplicate resolution.

    Higher is better. Prioritizes records that actually contain extracted
    activations and usable tables over placeholder/blocked pages.

    Args:
        article: Parsed article object, or ``None`` when parsing failed.
            Only the ``tables`` and ``missing_source`` attributes are read,
            and each table's ``activations`` attribute; all are fetched with
            ``getattr`` so a missing or ``None`` attribute counts as empty.

    Returns:
        A 4-tuple of ints ``(n_activations, n_tables, -missing_source,
        has_tables)`` meant to be compared lexicographically. ``None``
        scores ``(-1, -1, -1, -1)``, which sorts below any real article.
    """
    if article is None:
        return (-1, -1, -1, -1)

    # ``or []`` also covers attributes that exist but are set to None.
    tables = getattr(article, "tables", None) or []
    n_tables = len(tables)
    n_activations = sum(
        len(getattr(table, "activations", None) or []) for table in tables
    )
    missing_source = 1 if getattr(article, "missing_source", False) else 0

    # Sort by:
    #   1) Most activations
    #   2) Most tables
    #   3) Prefer non-missing-source parses
    #   4) Presence of any table at all (already implied by n_tables; kept
    #      so the tuple shape stays the same as the None sentinel above)
    return (n_activations, n_tables, -missing_source, int(n_tables > 0))
34+
35+
1336
def _process_file_with_source(args):
1437
"""Helper function to read, validate, and identify source for a file."""
1538
f, source_configs = args
@@ -167,23 +190,44 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
167190
for args in tqdm(parse_args, desc="Parsing articles"):
168191
parsed_articles.append(_parse_article(args))
169192

170-
# Add successfully parsed articles to database
193+
# Add successfully parsed articles to database.
194+
# When pmid_filenames=True we can see duplicate PMID files from different
195+
# source folders (e.g., one blocked/challenge page + one valid page). Keep
196+
# the best parsed candidate per PMID within this run.
171197
missing_sources = []
172-
for i, (f, article) in enumerate(parsed_articles):
173-
if article is None:
174-
missing_sources.append(f)
175-
continue
176-
198+
if pmid_filenames:
199+
best_by_pmid = {}
200+
for f, article in parsed_articles:
201+
pmid = path.splitext(path.basename(f))[0]
202+
if article is None:
203+
missing_sources.append(f)
204+
continue
205+
206+
score = _article_quality_score(article)
207+
existing = best_by_pmid.get(pmid)
208+
if existing is None or score > existing[2]:
209+
best_by_pmid[pmid] = (f, article, score)
210+
211+
selected_articles = [(f, article) for (f, article, _) in best_by_pmid.values()]
212+
else:
213+
selected_articles = []
214+
for f, article in parsed_articles:
215+
if article is None:
216+
missing_sources.append(f)
217+
continue
218+
selected_articles.append((f, article))
219+
220+
for i, (f, article) in enumerate(selected_articles):
177221
if get_config('SAVE_ARTICLES_WITHOUT_ACTIVATIONS') or article.tables:
178222
pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None
179223
if pmid and db.article_exists(pmid):
180224
if get_config('OVERWRITE_EXISTING_ROWS'):
181225
db.delete_article(pmid)
182226
else:
183227
continue
184-
228+
185229
db.add(article)
186-
if commit and (i % 100 == 0 or i == len(parsed_articles) - 1):
230+
if commit and (i % 100 == 0 or i == len(selected_articles) - 1):
187231
db.save()
188232

189233
db.save()

ace/scrape.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,10 @@ def _validate_scrape(html):
225225
'This site can’t be reached',
226226
'used Cloudflare to restrict access',
227227
'502 Bad Gateway',
228+
'Checking your browser before accessing',
229+
'Checking your browser - reCAPTCHA',
230+
'/recaptcha/challengepage/',
231+
'g-recaptcha',
228232
]
229233

230234
for pattern in patterns:

ace/tests/test_ace.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,3 +476,72 @@ def test_unknown_source_coordinate_table_with_force_ingest(test_weird_data_path,
476476
assert len(db_force.articles) >= 1
477477
assert len(db_force.articles[0].tables) >= 1
478478
assert _count_valid_activations(db_force.articles[0].tables) >= 1
479+
480+
481+
def test_ingest_prefers_best_duplicate_pmid_file(test_weird_data_path, tmp_path):
    """Two files sharing a PMID filename are ingested in one run.

    Expects exactly one article in the database afterwards, with at least
    one table and at least one valid activation.
    """
    pmid = "17913474"

    # Same PMID filename staged in two different source folders.
    staged_files = []
    for fixture_name, folder_name in (
        ("17913474_recaptcha.html", "a_bad"),
        ("17913474_pond.html", "b_good"),
    ):
        folder = tmp_path / folder_name
        folder.mkdir()
        target = folder / (pmid + ".html")
        shutil.copy(join(test_weird_data_path, fixture_name), target)
        staged_files.append(str(target))

    db_file = tmp_path / 'ace_dupe_pick_best.db'
    db = database.Database(
        adapter='sqlite',
        db_name="sqlite:///" + db_file.as_posix(),
    )

    # Intentionally place blocked/challenge page first.
    ingest.add_articles(
        db,
        staged_files,
        pmid_filenames=True,
        force_ingest=True,
        num_workers=1,
        skip_metadata=True,
    )

    assert len(db.articles) == 1
    assert len(db.articles[0].tables) >= 1
    assert _count_valid_activations(db.articles[0].tables) >= 1
513+
514+
515+
def test_validate_scrape_flags_recaptcha_challenge_page(test_weird_data_path):
    """The saved ``17913474_recaptcha.html`` fixture must fail scrape validation."""
    # Context manager closes the fixture file instead of leaking the handle.
    with open(join(test_weird_data_path, "17913474_recaptcha.html")) as fh:
        html = fh.read()
    assert scrape._validate_scrape(html) is False
518+
519+
520+
@pytest.mark.parametrize(
    "pmid,expected_source",
    [
        ("17088334", "PMCSource"),
        ("26342221", "OUPSource"),
        ("27623361", "ScienceDirectSource"),
        ("27319001", "SpringerSource"),
        ("20350171", "JournalOfCognitiveNeuroscienceSource"),
        ("12860777", None),  # Unknown source -> DefaultSource fallback
    ],
)
def test_additional_missed_in_main_text_regressions(test_weird_data_path, source_manager, pmid, expected_source):
    """Each fixture page is identified as the expected source and parses.

    ``expected_source`` is the class name of the source that
    ``identify_source`` should return, or ``None`` when no source should be
    identified and ``source_manager.default_source`` is used instead. In
    both cases the parsed article must have at least one table and at least
    one valid activation.
    """
    # Context manager closes the fixture file instead of leaking the handle.
    with open(join(test_weird_data_path, pmid + ".html")) as fh:
        html = fh.read()
    source = source_manager.identify_source(html)

    if expected_source is None:
        assert source is None
        parser = source_manager.default_source
        assert parser is not None
    else:
        assert source is not None
        assert source.__class__.__name__ == expected_source
        parser = source

    article = parser.parse_article(html, pmid=pmid, skip_metadata=True)
    assert article is not None
    assert len(article.tables) >= 1
    assert _count_valid_activations(article.tables) >= 1

0 commit comments

Comments
 (0)