Skip to content

Commit c06de67

Browse files
committed
Save HTML in database
1 parent e422305 commit c06de67

4 files changed

Lines changed: 9 additions & 2 deletions

File tree

ace/config.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,9 @@
5858
# When False, will use fallback HTML processing by default.
5959
USE_READABILITY = True
6060

61+
# Whether to save the original HTML of the table in the Table object
62+
SAVE_ORIGINAL_HTML = False
63+
6164

6265

6366

ace/database.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -169,6 +169,7 @@ class Table(Base):
169169
notes = Column(Text)
170170
n_activations = Column(Integer)
171171
n_columns = Column(Integer)
172+
input_html = Column(LongText)
172173

173174
def finalize(self):
174175
''' Any cleanup and updating operations we need to do before saving. '''

ace/sources.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -337,7 +337,7 @@ def n_cols_in_row(row):
337337
if data.data[data.n_rows- 1].count(None) == data.n_cols:
338338
data.data.pop()
339339
logger.debug("\t\tTrying to parse table...")
340-
return tableparser.parse_table(data)
340+
return tableparser.parse_table(data, html=str(table))
341341

342342
def extract_doi(self, soup):
343343
''' Every Source subclass must be able to extract its doi. '''

ace/tableparser.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -222,10 +222,13 @@ def create_activation(data, labels, standard_cols, group_labels=[]):
222222
return activation
223223

224224

225-
def parse_table(data):
225+
def parse_table(data, html=None):
226226
''' Takes a DataTable as input and returns a Table instance. '''
227227

228228
table = Table()
229+
# Only store the original HTML if the global config allows it
230+
if html is not None and config.SAVE_ORIGINAL_HTML:
231+
table.input_html = html
229232
n_cols = data.n_cols
230233

231234
# Identify column names: first occurrence of unique (i.e. colspan=1) label.

0 commit comments

Comments
 (0)