Skip to content

Commit ed2fbbf

Browse files
committed
Enable on the fly updating of config
1 parent 13af717 commit ed2fbbf

3 files changed

Lines changed: 138 additions & 40 deletions

File tree

ace/config.py

Lines changed: 123 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -62,33 +62,129 @@
6262
SAVE_ORIGINAL_HTML = False
6363

6464

65-
66-
6765
''' SCRAPING/PARSING SETTINGS '''
6866
USER_AGENTS = [
69-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',
70-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',
71-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',
72-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',
73-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',
74-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',
75-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',
76-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',
77-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',
78-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',
79-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',
80-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',
81-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',
82-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',
83-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',
84-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',
85-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',
86-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',
87-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',
88-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',
89-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',
90-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',
91-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36',
92-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',
93-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.60 Safari/537.36'
67+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36', # noqa: E501
68+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36', # noqa: E501
69+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36', # noqa: E501
70+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36', # noqa: E501
71+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36', # noqa: E501
72+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36', # noqa: E501
73+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36', # noqa: E501
74+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36', # noqa: E501
75+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36', # noqa: E501
76+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36', # noqa: E501
77+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36', # noqa: E501
78+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36', # noqa: E501
79+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36', # noqa: E501
80+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36', # noqa: E501
81+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36', # noqa: E501
82+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36', # noqa: E501
83+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36', # noqa: E501
84+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36', # noqa: E501
85+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36', # noqa: E501
86+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36', # noqa: E501
87+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36', # noqa: E501
88+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36', # noqa: E501
89+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36', # noqa: E501
90+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36', # noqa: E501
91+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.60 Safari/537.36' # noqa: E501
9492
]
93+
94+
95+
class ConfigManager:
    """Manages runtime configuration settings for ACE.

    Settings are resolved in two layers: per-instance overrides set through
    :meth:`update` take precedence over the defaults captured when the
    manager was constructed. Settings are read as attributes, e.g.
    ``manager.SAVE_ORIGINAL_HTML``.
    """

    def __init__(self, defaults=None):
        """Capture the default settings and start with no overrides.

        Args:
            defaults (dict): Optional mapping of setting name to default
                value. When omitted, the defaults are taken from the
                module-level settings defined in this file.
        """
        if defaults is None:
            # Capture initial defaults from the module-level settings
            defaults = {
                'SILENT_ERRORS': SILENT_ERRORS,
                'SQL_ADAPTER': SQL_ADAPTER,
                'SQLITE_URI': SQLITE_URI,
                'MYSQL_USER': MYSQL_USER,
                'MYSQL_PASSWORD': MYSQL_PASSWORD,
                'MYSQL_DB': MYSQL_DB,
                'SAVE_ARTICLES_WITHOUT_ACTIVATIONS':
                    SAVE_ARTICLES_WITHOUT_ACTIVATIONS,
                'OVERWRITE_EXISTING_ROWS': OVERWRITE_EXISTING_ROWS,
                'CAREFUL_PARSING': CAREFUL_PARSING,
                'IGNORE_BAD_ROWS': IGNORE_BAD_ROWS,
                'EXCLUDE_TABLES_WITH_MISSING_LABELS':
                    EXCLUDE_TABLES_WITH_MISSING_LABELS,
                'USE_READABILITY': USE_READABILITY,
                'SAVE_ORIGINAL_HTML': SAVE_ORIGINAL_HTML,
            }
        # Both dicts are created per instance; a class-level dict would be
        # shared (and mutated) by every ConfigManager.
        self._defaults = dict(defaults)
        self._overrides = {}

    def update(self, **kwargs):
        """Update configuration settings at runtime.

        All keys are validated before anything is applied, so a call that
        contains an invalid key leaves the configuration untouched.

        Args:
            **kwargs: Key-value pairs of configuration settings to update

        Raises:
            ValueError: If any key is not a known configuration setting.
        """
        for key in kwargs:
            if key not in self._defaults:
                raise ValueError(f"Invalid config key: {key}")
        self._overrides.update(kwargs)

    def reset(self, key=None):
        """Reset configuration to default values.

        Args:
            key (str): Specific key to reset (reset all if None). A key
                that has no override is ignored.
        """
        if key:
            self._overrides.pop(key, None)
        else:
            self._overrides.clear()

    def __getattr__(self, name):
        """Resolve ``name`` as a setting: override first, then default.

        Only called when normal attribute lookup fails.

        Raises:
            AttributeError: If ``name`` is not a known setting.
        """
        # If the storage dicts themselves are missing (e.g. on an instance
        # created without __init__), looking them up below would re-enter
        # __getattr__ forever; fail fast instead.
        if name in ('_defaults', '_overrides'):
            raise AttributeError(name)
        for layer in (self._overrides, self._defaults):
            if name in layer:
                return layer[name]
        raise AttributeError(f"Config setting {name} does not exist")
153+
154+
155+
# Create global config manager instance
156+
# Instantiated when this module is imported; ConfigManager.__init__ reads the
# module-level settings defined above, so those values become the defaults.
config_manager = ConfigManager()
157+
158+
159+
# Proxy functions for easier access
160+
def update_config(**kwargs):
    """Update configuration settings at runtime.

    The new values are stored on the global ``config_manager`` and also
    written to this module's attributes of the same name, so code that
    reads ``config.SETTING`` at call time sees the update.

    Note: names bound with ``from ... import SETTING`` are copies made at
    import time and are not affected.

    Args:
        **kwargs: Key-value pairs of configuration settings to update

    Raises:
        ValueError: If a key is not a known configuration setting.
    """
    config_manager.update(**kwargs)
    # The module-level settings are plain values assigned once at import, so
    # they must be rebound explicitly to reflect the override.
    module_namespace = globals()
    for key in kwargs:
        module_namespace[key] = getattr(config_manager, key)
163+
164+
165+
def reset_config(key=None):
    """Reset configuration to default values.

    Overrides are dropped from the global ``config_manager`` and the
    matching module-level attributes are rebound to the restored values.

    Args:
        key (str): Specific key to reset (reset all if None). Unknown keys
            are ignored.
    """
    config_manager.reset(key)
    # Rebind the module-level settings; they are plain values and would
    # otherwise keep whatever was last assigned to them.
    known = config_manager._defaults
    names = [key] if key else list(known)
    module_namespace = globals()
    for name in names:
        if name in known:
            module_namespace[name] = getattr(config_manager, name)
168+
169+
170+
def get_config(key):
    """Return the value currently in effect for the setting ``key``.

    The lookup is delegated to the global ``config_manager`` via attribute
    access.
    """
    current_value = getattr(config_manager, key)
    return current_value
173+
174+
175+
# Expose config settings through the manager
176+
# Bind each setting as a module-level name, reading its value through the
# manager. The values are read once, when this statement executes.
globals().update({
    setting: getattr(config_manager, setting)
    for setting in (
        'SILENT_ERRORS',
        'SQL_ADAPTER',
        'SQLITE_URI',
        'MYSQL_USER',
        'MYSQL_PASSWORD',
        'MYSQL_DB',
        'SAVE_ARTICLES_WITHOUT_ACTIVATIONS',
        'OVERWRITE_EXISTING_ROWS',
        'CAREFUL_PARSING',
        'IGNORE_BAD_ROWS',
        'EXCLUDE_TABLES_WITH_MISSING_LABELS',
        'USE_READABILITY',
        'SAVE_ORIGINAL_HTML',
    )
})

ace/export.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
logger = logging.getLogger(__name__)
1111

12-
def export_database(db, foldername, skip_empty=True):
12+
def export_database(db, foldername, skip_empty=True, table_html=False):
1313
# Create folder if it doesn't exist
1414
foldername = Path(foldername)
1515
foldername.mkdir(parents=True, exist_ok=True)
@@ -100,16 +100,17 @@ def export_database(db, foldername, skip_empty=True):
100100
with (foldername / 'export.json').open('w') as f:
101101
json.dump(export_md, f)
102102

103-
# Save table HTML files if available
104-
tables_dir = foldername / 'tables'
105-
tables_dir.mkdir(parents=True, exist_ok=True)
106-
107-
for art in articles:
108-
art_dir = tables_dir / str(art.id)
109-
art_dir.mkdir(parents=True, exist_ok=True)
103+
if table_html:
104+
# Save table HTML files if available
105+
tables_dir = foldername / 'tables'
106+
tables_dir.mkdir(parents=True, exist_ok=True)
110107

111-
for t in art.tables:
112-
if t.input_html:
113-
table_file = art_dir / f"{t.id}.html"
114-
with table_file.open('w', encoding='utf-8') as f:
115-
f.write(t.input_html)
108+
for art in articles:
109+
art_dir = tables_dir / str(art.id)
110+
art_dir.mkdir(parents=True, exist_ok=True)
111+
112+
for t in art.tables:
113+
if t.input_html:
114+
table_file = art_dir / f"{t.id}.html"
115+
with table_file.open('w', encoding='utf-8') as f:
116+
f.write(t.input_html)

ace/tableparser.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ def parse_table(data, html=None):
227227

228228
table = Table()
229229
# Only store the original HTML if the global config allows it
230+
from pdb import set_trace; set_trace()
230231
if html is not None and config.SAVE_ORIGINAL_HTML:
231232
table.input_html = html
232233
n_cols = data.n_cols

0 commit comments

Comments
 (0)