Skip to content

Commit 5d31a10

Browse files
committed
Expand heuristics for coordinate detection, to minimize missed coordinates
1 parent fa0568c commit 5d31a10

11 files changed

Lines changed: 15363 additions & 32 deletions

File tree

ace/ingest.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -50,8 +50,12 @@ def _parse_article(args):
5050
# Fallback to original source identification
5151
source = manager.identify_source(html)
5252
if source is None:
53-
logger.info("Could not identify source for %s", f)
54-
return f, None
53+
if force_ingest and getattr(manager, "default_source", None) is not None:
54+
logger.info("Could not identify source for %s; using DefaultSource fallback", f)
55+
source = manager.default_source
56+
else:
57+
logger.info("Could not identify source for %s", f)
58+
return f, None
5559

5660
article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs)
5761
if not article:

ace/sources.py

Lines changed: 304 additions & 21 deletions
Large diffs are not rendered by default.

ace/tableparser.py

Lines changed: 38 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,14 @@ def identify_standard_columns(labels):
2929
s = 'hemisphere'
3030
elif regex.search('(^k$)|(mm.*?3)|volume|voxels|size|extent', lab):
3131
s = 'size'
32+
elif (
33+
regex.search(r'\bx\b.*\by\b.*\bz\b', lab)
34+
or regex.search(r'(peak\s*voxel\s*coordinate|talairach\s*coordinates?|mni\s*coordinates?)', lab)
35+
or (regex.search(r'coordinates?', lab) and not regex.search(r'cluster|score|value', lab))
36+
):
37+
# Some tables store x/y/z in one combined coordinate column.
38+
s = 'coord_triplet'
39+
found_coords = True
3240

3341
# --- START OF FIX ---
3442
# OLD: elif regex.match('\s*[xy]\s*$', lab):
@@ -59,7 +67,8 @@ def identify_standard_columns(labels):
5967
# --- END OF FIX ---
6068

6169
elif regex.search('rdinate', lab):
62-
continue
70+
s = 'coord_triplet'
71+
found_coords = True
6372
elif lab == 't' or regex.search('^(max.*(z|t).*|.*(z|t).*(score|value|max))$', lab):
6473
s = 'statistic'
6574
elif regex.search('p[\-\s]+.*val', lab):
@@ -158,6 +167,15 @@ def identify_repeating_groups(labels):
158167
def create_activation(data, labels, standard_cols, group_labels=[]):
159168

160169
activation = Activation()
170+
coords_from_triplet = False
171+
172+
def _extract_triplet(value):
173+
clean_val = regex.sub(r'(?<!\d)\.(?!\d)', '', str(value))
174+
cs = '([-]?\d{1,3}\.?\d{0,2})'
175+
match = regex.search('\n*%s[,;\s]+%s[,;\s]+%s' % (cs, cs, cs), clean_val)
176+
if not match:
177+
return None
178+
return [regex.sub('-\s+', '-', c.strip()) for c in [match.group(1), match.group(2), match.group(3)]]
161179

162180
for i, col in enumerate(data):
163181

@@ -181,10 +199,26 @@ def create_activation(data, labels, standard_cols, group_labels=[]):
181199

182200
sc = standard_cols[i]
183201

202+
if sc in ['coord_triplet', 'x', 'y', 'z']:
203+
triplet = _extract_triplet(col)
204+
if triplet is not None:
205+
x, y, z = triplet
206+
logger.info("Found coordinate triplet in %s column: %s -> %s, %s, %s" % (sc, col, x, y, z))
207+
activation.set_coords(x, y, z)
208+
coords_from_triplet = True
209+
activation.add_col(labels[i], col)
210+
continue
211+
if sc == 'coord_triplet':
212+
activation.add_col(labels[i], col)
213+
continue
214+
184215
# Validate XYZ columns: Should only be integers (and possible trailing decimals).
185216
# If they're not, keep only leading numbers. The exception is that ScienceDirect
186217
# journals often follow the minus sign with a space (e.g., - 35), which we strip.
187218
if regex.match('[xyz]$', sc):
219+
if coords_from_triplet and str(col).strip() == '':
220+
activation.add_col(labels[i], col)
221+
continue
188222
m = regex.match('([-])\s?(\d+\.*\d*)$', col)
189223
if m:
190224
col = "%s%s" % (m.group(1), m.group(2))
@@ -210,11 +244,9 @@ def create_activation(data, labels, standard_cols, group_labels=[]):
210244
# Also need to remove space between minus sign and numbers; some ScienceDirect
211245
# journals leave a gap.
212246
if not i in standard_cols:
213-
cs = '([-]?\d{1,3}\.?\d{0,2})'
214-
clean_col = regex.sub(r'(?<!\d)\.(?!\d)', '', str(col)) # Remove dots not part of numbers
215-
m = regex.search('\n*%s[,;\s]+%s[,;\s]+%s' % (cs, cs, cs), clean_col)
216-
if m:
217-
x, y, z = [regex.sub('-\s+', '-', c.strip()) for c in [m.group(1), m.group(2), m.group(3)]]
247+
triplet = _extract_triplet(col)
248+
if triplet is not None:
249+
x, y, z = triplet
218250
logger.info("Found multi-coordinate column: %s\n...and extracted: %s, %s, %s" % (col, x, y, z))
219251
activation.set_coords(x, y, z)
220252

@@ -390,6 +422,3 @@ def parse_table(data, html=None):
390422

391423
table.finalize()
392424
return table if len(table.activations) else None
393-
394-
395-

ace/tests/test_ace.py

Lines changed: 102 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -374,3 +374,105 @@ def test_springer_nature_source(test_data_path, source_manager):
374374
assert t.number == '2'
375375
assert "fMRI results across all participants" in t.caption
376376
assert t.n_activations == 9
377+
378+
379+
def _count_valid_activations(tables):
380+
return sum(
381+
1 for t in tables for a in t.activations
382+
if a.x is not None and a.y is not None and a.z is not None
383+
)
384+
385+
386+
def test_pmc_modern_table_wrapper_source(test_weird_data_path, source_manager):
387+
pmid = '16085533'
388+
html = open(join(test_weird_data_path, pmid + '.html')).read()
389+
source = source_manager.identify_source(html)
390+
assert source is not None
391+
assert source.__class__.__name__ == 'PMCSource'
392+
article = source.parse_article(html, pmid=pmid, skip_metadata=True)
393+
assert article is not None
394+
assert len(article.tables) >= 1
395+
assert _count_valid_activations(article.tables) >= 1
396+
397+
398+
def test_oup_table_wrap_fallback_source(test_weird_data_path, source_manager):
399+
pmid = '24700584'
400+
html = open(join(test_weird_data_path, pmid + '.html')).read()
401+
source = source_manager.identify_source(html)
402+
assert source is not None
403+
assert source.__class__.__name__ == 'OUPSource'
404+
article = source.parse_article(html, pmid=pmid, skip_metadata=True)
405+
assert article is not None
406+
assert len(article.tables) >= 1
407+
assert _count_valid_activations(article.tables) >= 1
408+
409+
410+
def test_jcn_embedded_table_fallback_source(test_weird_data_path, source_manager):
411+
pmid = '24666131'
412+
html = open(join(test_weird_data_path, pmid + '.html')).read()
413+
source = source_manager.identify_source(html)
414+
assert source is not None
415+
assert source.__class__.__name__ == 'JournalOfCognitiveNeuroscienceSource'
416+
article = source.parse_article(html, pmid=pmid, skip_metadata=True)
417+
assert article is not None
418+
assert len(article.tables) >= 1
419+
assert _count_valid_activations(article.tables) >= 1
420+
421+
422+
def test_sciencedirect_combined_coordinate_column_source(test_weird_data_path, source_manager):
423+
pmid = '15327927'
424+
html = open(join(test_weird_data_path, pmid + '.html')).read()
425+
source = source_manager.identify_source(html)
426+
assert source is not None
427+
assert source.__class__.__name__ == 'ScienceDirectSource'
428+
article = source.parse_article(html, pmid=pmid, skip_metadata=True)
429+
assert article is not None
430+
assert len(article.tables) >= 1
431+
assert _count_valid_activations(article.tables) >= 1
432+
433+
434+
def test_springer_inline_table_fallback_source(test_weird_data_path, source_manager):
435+
pmid = '27007121'
436+
html = open(join(test_weird_data_path, pmid + '.html')).read()
437+
source = source_manager.identify_source(html)
438+
assert source is not None
439+
assert source.__class__.__name__ == 'SpringerSource'
440+
article = source.parse_article(html, pmid=pmid, skip_metadata=True)
441+
assert article is not None
442+
assert len(article.tables) >= 1
443+
assert _count_valid_activations(article.tables) >= 1
444+
445+
446+
def test_unknown_source_coordinate_table_with_force_ingest(test_weird_data_path, tmp_path):
447+
pmid = '11296095'
448+
src_file = join(test_weird_data_path, pmid + '.html')
449+
target_file = tmp_path / f"{pmid}.html"
450+
shutil.copy(src_file, target_file)
451+
452+
db_path_no_force = f"sqlite:///{(tmp_path / 'ace_no_force.db').as_posix()}"
453+
db_no_force = database.Database(adapter='sqlite', db_name=db_path_no_force)
454+
missing_sources = ingest.add_articles(
455+
db_no_force,
456+
[str(target_file)],
457+
pmid_filenames=True,
458+
force_ingest=False,
459+
num_workers=1,
460+
skip_metadata=True,
461+
)
462+
assert str(target_file) in missing_sources
463+
assert len(db_no_force.articles) == 0
464+
465+
db_path_force = f"sqlite:///{(tmp_path / 'ace_force.db').as_posix()}"
466+
db_force = database.Database(adapter='sqlite', db_name=db_path_force)
467+
missing_sources_force = ingest.add_articles(
468+
db_force,
469+
[str(target_file)],
470+
pmid_filenames=True,
471+
force_ingest=True,
472+
num_workers=1,
473+
skip_metadata=True,
474+
)
475+
assert str(target_file) not in missing_sources_force
476+
assert len(db_force.articles) >= 1
477+
assert len(db_force.articles[0].tables) >= 1
478+
assert _count_valid_activations(db_force.articles[0].tables) >= 1

0 commit comments

Comments
 (0)