Skip to content

Commit 068c99f

Browse files
authored
Merge pull request #64 from neurosynth/no_miss
ENH: Expand heuristics for coordinate detecting
2 parents fa0568c + aa0ebae commit 068c99f

21 files changed

Lines changed: 30437 additions & 40 deletions

ace/ingest.py

Lines changed: 58 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,29 @@
1010

1111
logger = logging.getLogger(__name__)
1212

13+
14+
def _article_quality_score(article):
15+
"""Score article quality for same-PMID duplicate resolution.
16+
17+
Higher is better. Prioritizes records that actually contain extracted
18+
activations and usable tables over placeholder/blocked pages.
19+
"""
20+
if article is None:
21+
return (-1, -1, -1, -1)
22+
23+
tables = getattr(article, "tables", []) or []
24+
n_tables = len(tables)
25+
n_activations = sum(len(getattr(t, "activations", []) or []) for t in tables)
26+
missing_source = 1 if getattr(article, "missing_source", False) else 0
27+
28+
# Sort by:
29+
# 1) Most activations
30+
# 2) Most tables
31+
# 3) Prefer non-missing-source parses
32+
# 4) Presence of any table at all
33+
return (n_activations, n_tables, -missing_source, int(n_tables > 0))
34+
35+
1336
def _process_file_with_source(args):
1437
"""Helper function to read, validate, and identify source for a file."""
1538
f, source_configs = args
@@ -50,8 +73,12 @@ def _parse_article(args):
5073
# Fallback to original source identification
5174
source = manager.identify_source(html)
5275
if source is None:
53-
logger.info("Could not identify source for %s", f)
54-
return f, None
76+
if force_ingest and getattr(manager, "default_source", None) is not None:
77+
logger.info("Could not identify source for %s; using DefaultSource fallback", f)
78+
source = manager.default_source
79+
else:
80+
logger.info("Could not identify source for %s", f)
81+
return f, None
5582

5683
article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs)
5784
if not article:
@@ -163,23 +190,44 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
163190
for args in tqdm(parse_args, desc="Parsing articles"):
164191
parsed_articles.append(_parse_article(args))
165192

166-
# Add successfully parsed articles to database
193+
# Add successfully parsed articles to database.
194+
# When pmid_filenames=True we can see duplicate PMID files from different
195+
# source folders (e.g., one blocked/challenge page + one valid page). Keep
196+
# the best parsed candidate per PMID within this run.
167197
missing_sources = []
168-
for i, (f, article) in enumerate(parsed_articles):
169-
if article is None:
170-
missing_sources.append(f)
171-
continue
172-
198+
if pmid_filenames:
199+
best_by_pmid = {}
200+
for f, article in parsed_articles:
201+
pmid = path.splitext(path.basename(f))[0]
202+
if article is None:
203+
missing_sources.append(f)
204+
continue
205+
206+
score = _article_quality_score(article)
207+
existing = best_by_pmid.get(pmid)
208+
if existing is None or score > existing[2]:
209+
best_by_pmid[pmid] = (f, article, score)
210+
211+
selected_articles = [(f, article) for (f, article, _) in best_by_pmid.values()]
212+
else:
213+
selected_articles = []
214+
for f, article in parsed_articles:
215+
if article is None:
216+
missing_sources.append(f)
217+
continue
218+
selected_articles.append((f, article))
219+
220+
for i, (f, article) in enumerate(selected_articles):
173221
if get_config('SAVE_ARTICLES_WITHOUT_ACTIVATIONS') or article.tables:
174222
pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None
175223
if pmid and db.article_exists(pmid):
176224
if get_config('OVERWRITE_EXISTING_ROWS'):
177225
db.delete_article(pmid)
178226
else:
179227
continue
180-
228+
181229
db.add(article)
182-
if commit and (i % 100 == 0 or i == len(parsed_articles) - 1):
230+
if commit and (i % 100 == 0 or i == len(selected_articles) - 1):
183231
db.save()
184232

185233
db.save()

ace/scrape.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,10 @@ def _validate_scrape(html):
225225
'This site can’t be reached',
226226
'used Cloudflare to restrict access',
227227
'502 Bad Gateway',
228+
'Checking your browser before accessing',
229+
'Checking your browser - reCAPTCHA',
230+
'/recaptcha/challengepage/',
231+
'g-recaptcha',
228232
]
229233

230234
for pattern in patterns:

0 commit comments

Comments
 (0)