Skip to content

Commit 02aa72d

Browse files
authored
Merge pull request #60 from neurosynth/Nature
Add new sources for: Nature, BMC, AmerPsych, MDPI and Sage2
2 parents fa026e7 + 13af717 commit 02aa72d

19 files changed

Lines changed: 16601 additions & 335 deletions

ace/.vscode/settings.json

Lines changed: 0 additions & 7 deletions
This file was deleted.

ace/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@
5454
# anyway, so this should be left off unless problems arise.
5555
EXCLUDE_TABLES_WITH_MISSING_LABELS = False
5656

57+
# Whether to use readability.py for HTML cleaning when available.
58+
# When False, the fallback HTML processing is used instead.
59+
USE_READABILITY = True
60+
61+
# Whether to save the original HTML of the table in the Table object
62+
SAVE_ORIGINAL_HTML = False
63+
5764

5865

5966

ace/database.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ class Table(Base):
169169
notes = Column(Text)
170170
n_activations = Column(Integer)
171171
n_columns = Column(Integer)
172+
input_html = Column(LongText)
172173

173174
def finalize(self):
174175
''' Any cleanup and updating operations we need to do before saving. '''

ace/datatable.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,17 @@ def n_rows(self):
3232
def add_val(self, val, rows=1, cols=1):
3333
''' Find next open position and add values to grid '''
3434

35-
# Flatten list and find next open position
36-
flat = [item for l in self.data for item in l]
37-
flat_set = set(flat)
35+
flat = []
36+
for row in self.data:
37+
# If row is not a list for some reason, treat as single-item row
38+
if isinstance(row, list):
39+
for item in row:
40+
flat.append(item)
41+
else:
42+
flat.append(row)
43+
44+
# Only include non-list items in the set (lists are unhashable and cannot be set members)
45+
flat_set = set(x for x in flat if not isinstance(x, list))
3846

3947
if not None in flat_set:
4048
open_pos = self.n_rows * self.n_cols

ace/export.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,4 +98,18 @@ def export_database(db, foldername, skip_empty=True):
9898
}
9999

100100
with (foldername / 'export.json').open('w') as f:
101-
json.dump(export_md, f)
101+
json.dump(export_md, f)
102+
103+
# Save table HTML files if available
104+
tables_dir = foldername / 'tables'
105+
tables_dir.mkdir(parents=True, exist_ok=True)
106+
107+
for art in articles:
108+
art_dir = tables_dir / str(art.id)
109+
art_dir.mkdir(parents=True, exist_ok=True)
110+
111+
for t in art.tables:
112+
if t.input_html:
113+
table_file = art_dir / f"{t.id}.html"
114+
with table_file.open('w', encoding='utf-8') as f:
115+
f.write(t.input_html)

ace/ingest.py

Lines changed: 14 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,29 +5,13 @@
55
from .scrape import _validate_scrape
66
import multiprocessing as mp
77
from functools import partial
8+
from tqdm import tqdm
89

910
logger = logging.getLogger(__name__)
1011

11-
def _process_file(f):
12-
"""Helper function to read and validate a single file."""
13-
logger.info("Processing article %s..." % f)
14-
try:
15-
html = open(f).read()
16-
except Exception as e:
17-
logger.warning("Failed to read file %s: %s" % (f, str(e)))
18-
return f, None
19-
20-
if not _validate_scrape(html):
21-
logger.warning("Invalid HTML for %s" % f)
22-
return f, None
23-
24-
return f, html
25-
26-
2712
def _process_file_with_source(args):
2813
"""Helper function to read, validate, and identify source for a single file."""
2914
f, source_configs = args
30-
logger.info("Processing article %s..." % f)
3115
try:
3216
html = open(f).read()
3317
except Exception as e:
@@ -65,18 +49,18 @@ def _parse_article(args):
6549
# Fallback to original source identification
6650
source = manager.identify_source(html)
6751
if source is None:
68-
logger.warning("Could not identify source for %s" % f)
52+
logger.info("Could not identify source for %s" % f)
6953
return f, None
7054

7155
article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs)
7256
return f, article
7357
except Exception as e:
74-
logger.warning("Error parsing article %s: %s" % (f, str(e)))
58+
logger.info("Error parsing article %s: %s" % (f, str(e)))
7559
return f, None
7660

7761

7862
def add_articles(db, files, commit=True, table_dir=None, limit=None,
79-
pmid_filenames=False, metadata_dir=None, force_ingest=True, num_workers=None, **kwargs):
63+
pmid_filenames=False, metadata_dir=None, force_ingest=True, num_workers=None, use_readability=None, **kwargs):
8064
''' Process articles and add their data to the DB.
8165
Args:
8266
files: The path to the article(s) to process. Can be a single
@@ -100,10 +84,13 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
10084
force_ingest: Ingest even if no source is identified.
10185
num_workers: Number of worker processes to use when processing in parallel.
10286
If None (default), uses the number of CPUs available on the system.
87+
use_readability: When True, use readability.py for HTML cleaning if available.
88+
When False, use the fallback HTML processing instead. If None (default),
89+
uses the value from config.USE_READABILITY.
10390
kwargs: Additional keyword arguments to pass to parse_article.
10491
'''
10592

106-
manager = sources.SourceManager(table_dir)
93+
manager = sources.SourceManager(table_dir, use_readability=use_readability if use_readability is not None else config.USE_READABILITY)
10794

10895
# Prepare source configurations for parallel processing
10996
source_configs = {name: source.identifiers for name, source in manager.sources.items()}
@@ -123,11 +110,11 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
123110
# Process files in parallel to extract HTML content and identify sources
124111
process_args = [(f, source_configs) for f in files]
125112
with mp.Pool(processes=num_workers) as pool:
126-
file_html_source_tuples = pool.map(_process_file_with_source, process_args)
113+
file_html_source_tuples = list(tqdm(pool.imap_unordered(_process_file_with_source, process_args), total=len(process_args), desc="Processing files"))
127114
else:
128115
# Process files sequentially
129116
file_html_source_tuples = []
130-
for f in files:
117+
for f in tqdm(files, desc="Processing files"):
131118
result = _process_file_with_source((f, source_configs))
132119
file_html_source_tuples.append(result)
133120

@@ -142,7 +129,7 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
142129
# Filter out articles that already exist in the database
143130
files_to_process = []
144131
missing_sources = []
145-
132+
146133
for f, html, source_name in valid_files:
147134
pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None
148135

@@ -160,16 +147,16 @@ def add_articles(db, files, commit=True, table_dir=None, limit=None,
160147
if num_workers is not None and num_workers != 1 and parse_args:
161148
# Parse articles in parallel
162149
with mp.Pool(processes=num_workers) as pool:
163-
parsed_articles = pool.map(_parse_article, parse_args)
150+
parsed_articles = list(tqdm(pool.imap_unordered(_parse_article, parse_args), total=len(parse_args), desc="Parsing articles"))
164151
else:
165152
# Parse articles sequentially
166153
parsed_articles = []
167-
for args in parse_args:
154+
for args in tqdm(parse_args, desc="Parsing articles"):
168155
parsed_articles.append(_parse_article(args))
169156

170157
# Add successfully parsed articles to database
171158
for i, (f, article) in enumerate(parsed_articles):
172-
if article is None:
159+
if article in [None, False]:
173160
missing_sources.append(f)
174161
continue
175162

0 commit comments

Comments
 (0)