Skip to content

Commit d46fc20

Browse files
committed
Merge resolve
2 parents 5706df4 + 58ae05b commit d46fc20

42 files changed

Lines changed: 96277 additions & 351 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

ace/.vscode/settings.json

Lines changed: 0 additions & 7 deletions
This file was deleted.

ace/__init__.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,43 @@
1-
# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
2-
# ex: set sts=4 ts=4 sw=4 et:
1+
# emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
2+
# ex: set sts=4 sw=4 et:
33
"""ACE -- Automated Coordinate Extraction.
44
"""
5-
__all__ = ["config", "ingest", "database", "datatable", "set_logging_level", "scrape", "sources", "tableparser", "tests", "__version__"]
5+
__all__ = [
6+
"config", "ingest", "database", "datatable", "set_logging_level",
7+
"scrape", "sources", "tableparser", "tests", "__version__"
8+
]
69

710
import logging
811
import sys
912
import os
1013

1114
from .version import __version__
1215

16+
1317
def set_logging_level(level=None):
    """Set package-wide logging level

    Args:
        level: Logging level, given either as a level name
            ('warning', 'error', 'info', etc.; case-insensitive) or as a
            numeric constant from the logging module (e.g. logging.DEBUG).
            When None, the ACE_LOGLEVEL environment variable is used,
            falling back to 'warn' if it is not set.

    Returns:
        The effective level of the package logger after the change.

    Raises:
        AttributeError: If a level name does not match an attribute of the
            logging module.
    """
    if level is None:
        level = os.environ.get('ACE_LOGLEVEL', 'warn')
    if isinstance(level, str):
        # Names are resolved against the logging module's attributes
        # (e.g. 'warn' -> logging.WARN); numeric levels are passed through
        # untouched, so logging.DEBUG and friends work as well.
        level = getattr(logging, level.upper())
    logger.setLevel(level)
    return logger.getEffectiveLevel()
2328

29+
2430
def _setup_logger(logger):
    """Attach a stdout stream handler to *logger* and apply the level.

    Args:
        logger: The logging.Logger instance that receives the handler.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(
        logging.Formatter("%(levelname)-6s %(module)-7s %(message)s")
    )
    logger.addHandler(stdout_handler)
    # Called without arguments: the level is taken from the environment
    # (or the built-in default) by set_logging_level itself.
    set_logging_level()
3039

40+
3141
# Set up logger
3242
logger = logging.getLogger("ace")
3343
_setup_logger(logger)

ace/config.py

Lines changed: 128 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -54,34 +54,137 @@
5454
# anyway, so this should be left off unless problems arise.
5555
EXCLUDE_TABLES_WITH_MISSING_LABELS = False
5656

57+
# Whether to use readability.py for HTML cleaning when available.
58+
# When False, will use fallback HTML processing by default.
59+
USE_READABILITY = True
5760

61+
# Whether to save the original HTML of the table in the Table object
62+
SAVE_ORIGINAL_HTML = False
5863

5964

6065
''' SCRAPING/PARSING SETTINGS '''
6166
# Pool of desktop Chrome User-Agent strings. Every entry is unique: exact
# duplicates were removed so that no string is over-represented in the pool.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',  # noqa: E501
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.60 Safari/537.36',  # noqa: E501
]
93+
94+
95+
class ConfigManager:
    """Manages runtime configuration settings for ACE

    Default values are captured from the module-level constants when an
    instance is created. Runtime overrides are stored separately, per
    instance, and take precedence over the defaults when a setting is read
    as an attribute (e.g. ``manager.SILENT_ERRORS``).
    """

    # Class-level fallbacks. They let ``__getattr__`` resolve ``_defaults``
    # and ``_overrides`` through normal attribute lookup (so it can never
    # recurse into itself), even on an instance whose ``__init__`` has not
    # run. They are never mutated: every write below rebinds a new dict on
    # the instance, so overrides cannot leak between instances.
    _defaults = {}
    _overrides = {}

    def __init__(self):
        # Each instance owns its override store
        self._overrides = {}
        # Capture initial defaults
        self._defaults = {
            'SILENT_ERRORS': SILENT_ERRORS,
            'SQL_ADAPTER': SQL_ADAPTER,
            'SQLITE_URI': SQLITE_URI,
            'MYSQL_USER': MYSQL_USER,
            'MYSQL_PASSWORD': MYSQL_PASSWORD,
            'MYSQL_DB': MYSQL_DB,
            'SAVE_ARTICLES_WITHOUT_ACTIVATIONS':
                SAVE_ARTICLES_WITHOUT_ACTIVATIONS,
            'OVERWRITE_EXISTING_ROWS': OVERWRITE_EXISTING_ROWS,
            'CAREFUL_PARSING': CAREFUL_PARSING,
            'IGNORE_BAD_ROWS': IGNORE_BAD_ROWS,
            'EXCLUDE_TABLES_WITH_MISSING_LABELS':
                EXCLUDE_TABLES_WITH_MISSING_LABELS,
            'USE_READABILITY': USE_READABILITY,
            'SAVE_ORIGINAL_HTML': SAVE_ORIGINAL_HTML,
        }

    def update(self, **kwargs):
        """Update configuration settings at runtime

        All keys are validated before anything is stored, so a call that
        contains an invalid key leaves the configuration unchanged.

        Args:
            **kwargs: Key-value pairs of configuration settings to update

        Raises:
            ValueError: If any key is not a known configuration setting.
        """
        for key in kwargs:
            if key not in self._defaults:
                raise ValueError(f"Invalid config key: {key}")
        # Rebind instead of mutating in place: the dict currently visible as
        # ``self._overrides`` may be the shared class-level fallback.
        self._overrides = {**self._overrides, **kwargs}

    def reset(self, key=None):
        """Reset configuration to default values

        Args:
            key (str): Specific key to reset (reset all if None). Resetting
                a key that has no override is a no-op.
        """
        if key is None:
            self._overrides = {}
        else:
            self._overrides = {
                name: value
                for name, value in self._overrides.items()
                if name != key
            }

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails, i.e. for setting
        # names; overrides win over captured defaults.
        if name in self._overrides:
            return self._overrides[name]
        if name in self._defaults:
            return self._defaults[name]
        raise AttributeError(f"Config setting {name} does not exist")
153+
154+
155+
# Create global config manager instance
156+
config_manager = ConfigManager()
157+
158+
159+
# Proxy functions for easier access
160+
def update_config(**kwargs):
    """Apply runtime overrides through the global ``config_manager``.

    Args:
        **kwargs: Setting names and their new values, forwarded unchanged
            to ``config_manager.update``.
    """
    config_manager.update(**kwargs)
163+
164+
165+
def reset_config(key=None):
    """Restore defaults through the global ``config_manager``.

    Args:
        key: Name of the setting to reset, forwarded unchanged to
            ``config_manager.reset``; the default of None is forwarded too.
    """
    config_manager.reset(key)
168+
169+
170+
def get_config(key):
    """Look up the current value of a setting on ``config_manager``.

    Args:
        key: Name of the setting to read.

    Returns:
        Whatever attribute lookup of ``key`` on ``config_manager`` yields.
    """
    current_value = getattr(config_manager, key)
    return current_value
173+
174+
175+
# Expose config settings through the manager
176+
SILENT_ERRORS = config_manager.SILENT_ERRORS
177+
SQL_ADAPTER = config_manager.SQL_ADAPTER
178+
SQLITE_URI = config_manager.SQLITE_URI
179+
MYSQL_USER = config_manager.MYSQL_USER
180+
MYSQL_PASSWORD = config_manager.MYSQL_PASSWORD
181+
MYSQL_DB = config_manager.MYSQL_DB
182+
SAVE_ARTICLES_WITHOUT_ACTIVATIONS = \
183+
config_manager.SAVE_ARTICLES_WITHOUT_ACTIVATIONS
184+
OVERWRITE_EXISTING_ROWS = config_manager.OVERWRITE_EXISTING_ROWS
185+
CAREFUL_PARSING = config_manager.CAREFUL_PARSING
186+
IGNORE_BAD_ROWS = config_manager.IGNORE_BAD_ROWS
187+
EXCLUDE_TABLES_WITH_MISSING_LABELS = \
188+
config_manager.EXCLUDE_TABLES_WITH_MISSING_LABELS
189+
USE_READABILITY = config_manager.USE_READABILITY
190+
SAVE_ORIGINAL_HTML = config_manager.SAVE_ORIGINAL_HTML

0 commit comments

Comments
 (0)