|
54 | 54 | # anyway, so this should be left off unless problems arise. |
55 | 55 | EXCLUDE_TABLES_WITH_MISSING_LABELS = False |
56 | 56 |
|
# Whether to use readability.py for HTML cleaning when available.
# When False, the fallback HTML processing is used by default.
USE_READABILITY = True

# Whether to save the original HTML of the table in the Table object.
SAVE_ORIGINAL_HTML = False
58 | 63 |
|
59 | 64 |
|
60 | 65 | ''' SCRAPING/PARSING SETTINGS ''' |
# Platform tokens that appear inside the parentheses of each User-Agent.
_UA_WINDOWS = 'Windows NT 10.0; Win64; x64'
_UA_MAC = 'Macintosh; Intel Mac OS X 10_15_7'
_UA_LINUX = 'X11; Linux x86_64'

# (platform token, Chrome version) pairs, in list order. A few pairs occur
# more than once; the repeats are kept so the resulting list is unchanged.
_UA_PLATFORM_VERSIONS = (
    (_UA_WINDOWS, '124.0.6367.155'),
    (_UA_MAC, '124.0.6367.155'),
    (_UA_WINDOWS, '124.0.6367.155'),
    (_UA_MAC, '123.0.6312.122'),
    (_UA_LINUX, '124.0.6367.201'),
    (_UA_WINDOWS, '124.0.6367.78'),
    (_UA_MAC, '124.0.6367.78'),
    (_UA_LINUX, '124.0.6367.118'),
    (_UA_WINDOWS, '124.0.6367.201'),
    (_UA_MAC, '124.0.6367.78'),
    (_UA_LINUX, '124.0.6367.91'),
    (_UA_WINDOWS, '124.0.6367.118'),
    (_UA_MAC, '124.0.6367.91'),
    (_UA_LINUX, '124.0.6367.207'),
    (_UA_WINDOWS, '124.0.6367.60'),
    (_UA_MAC, '123.0.6312.122'),
    (_UA_LINUX, '124.0.6367.60'),
    (_UA_WINDOWS, '123.0.6312.122'),
    (_UA_MAC, '123.0.6312.86'),
    (_UA_LINUX, '123.0.6312.86'),
    (_UA_WINDOWS, '123.0.6312.58'),
    (_UA_MAC, '123.0.6312.58'),
    (_UA_LINUX, '122.0.6261.128'),
    (_UA_WINDOWS, '124.0.6367.207'),
    (_UA_MAC, '125.0.6422.60'),
)

# Desktop Chrome User-Agent strings, assembled from the pairs above.
USER_AGENTS = [
    f'Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) '
    f'Chrome/{version} Safari/537.36'
    for platform, version in _UA_PLATFORM_VERSIONS
]
| 93 | + |
| 94 | + |
class ConfigManager:
    """Manage runtime configuration settings for ACE.

    Default values are captured from the module-level settings when an
    instance is created. Runtime overrides are stored separately and take
    precedence over the defaults when a setting is read through attribute
    access (``manager.SETTING_NAME``).
    """

    # Class-level fallbacks. They guarantee that ``_defaults`` and
    # ``_overrides`` always resolve through normal attribute lookup, so
    # ``__getattr__`` cannot recurse on an instance whose ``__init__`` never
    # ran. The methods below rebind rather than mutate, so these shared
    # dicts are never modified.
    _defaults = {}
    _overrides = {}

    def __init__(self):
        # Capture initial defaults
        self._defaults = {
            'SILENT_ERRORS': SILENT_ERRORS,
            'SQL_ADAPTER': SQL_ADAPTER,
            'SQLITE_URI': SQLITE_URI,
            'MYSQL_USER': MYSQL_USER,
            'MYSQL_PASSWORD': MYSQL_PASSWORD,
            'MYSQL_DB': MYSQL_DB,
            'SAVE_ARTICLES_WITHOUT_ACTIVATIONS':
                SAVE_ARTICLES_WITHOUT_ACTIVATIONS,
            'OVERWRITE_EXISTING_ROWS': OVERWRITE_EXISTING_ROWS,
            'CAREFUL_PARSING': CAREFUL_PARSING,
            'IGNORE_BAD_ROWS': IGNORE_BAD_ROWS,
            'EXCLUDE_TABLES_WITH_MISSING_LABELS':
                EXCLUDE_TABLES_WITH_MISSING_LABELS,
            'USE_READABILITY': USE_READABILITY,
            'SAVE_ORIGINAL_HTML': SAVE_ORIGINAL_HTML,
        }
        # Per-instance override store so instances never share state.
        self._overrides = {}

    def update(self, **kwargs):
        """Update configuration settings at runtime.

        The update is all-or-nothing: if any key is invalid, no setting
        is changed.

        Args:
            **kwargs: Key-value pairs of configuration settings to update.

        Raises:
            ValueError: If a key is not a known configuration setting.
        """
        # Validate every key before applying anything, so a bad key cannot
        # leave the configuration half-updated.
        for key in kwargs:
            if key not in self._defaults:
                raise ValueError(f"Invalid config key: {key}")
        # Rebind instead of mutating in place: this always yields an
        # instance-level dict and never touches the class-level fallback.
        self._overrides = {**self._overrides, **kwargs}

    def reset(self, key=None):
        """Reset configuration to default values.

        Args:
            key (str): Specific key to reset (reset all if None). A key
                that has no override is ignored.
        """
        if key is None:
            self._overrides = {}
        else:
            self._overrides = {
                name: value
                for name, value in self._overrides.items()
                if name != key
            }

    def __getattr__(self, name):
        """Return the current value of setting ``name``.

        Only invoked when normal attribute lookup fails. An override wins
        over the default.

        Raises:
            AttributeError: If ``name`` is not a known configuration setting.
        """
        if name in self._overrides:
            return self._overrides[name]
        if name in self._defaults:
            return self._defaults[name]
        raise AttributeError(f"Config setting {name} does not exist")
| 153 | + |
| 154 | + |
# Create global config manager instance.
# It is instantiated when this module is imported; update_config(),
# reset_config() and get_config() all operate on this one instance.
config_manager = ConfigManager()
| 157 | + |
| 158 | + |
| 159 | +# Proxy functions for easier access |
def update_config(**kwargs):
    """Apply runtime overrides via the global ``config_manager``.

    Args:
        **kwargs: Setting names and new values, forwarded unchanged to
            ``config_manager.update``.
    """
    manager = config_manager
    manager.update(**kwargs)
| 163 | + |
| 164 | + |
def reset_config(key=None):
    """Restore defaults via the global ``config_manager``.

    Args:
        key: Setting name forwarded to ``config_manager.reset``; the
            default ``None`` is forwarded as-is.
    """
    manager = config_manager
    manager.reset(key)
| 168 | + |
| 169 | + |
def get_config(key):
    """Look up the current value of setting ``key``.

    The value is read from the global ``config_manager`` by attribute name.
    """
    current_value = getattr(config_manager, key)
    return current_value
| 173 | + |
| 174 | + |
# Expose config settings through the manager.
# NOTE: each assignment reads the manager once, when this module is
# imported, and binds the resulting value to a plain module-level name.
# Later update_config()/reset_config() calls change what get_config() and
# config_manager.<NAME> return, but they do not rebind these names; use
# get_config() where the current runtime value is needed.
SILENT_ERRORS = config_manager.SILENT_ERRORS
SQL_ADAPTER = config_manager.SQL_ADAPTER
SQLITE_URI = config_manager.SQLITE_URI
MYSQL_USER = config_manager.MYSQL_USER
MYSQL_PASSWORD = config_manager.MYSQL_PASSWORD
MYSQL_DB = config_manager.MYSQL_DB
SAVE_ARTICLES_WITHOUT_ACTIVATIONS = \
    config_manager.SAVE_ARTICLES_WITHOUT_ACTIVATIONS
OVERWRITE_EXISTING_ROWS = config_manager.OVERWRITE_EXISTING_ROWS
CAREFUL_PARSING = config_manager.CAREFUL_PARSING
IGNORE_BAD_ROWS = config_manager.IGNORE_BAD_ROWS
EXCLUDE_TABLES_WITH_MISSING_LABELS = \
    config_manager.EXCLUDE_TABLES_WITH_MISSING_LABELS
USE_READABILITY = config_manager.USE_READABILITY
SAVE_ORIGINAL_HTML = config_manager.SAVE_ORIGINAL_HTML
0 commit comments