|
3 | 3 |
|
4 | 4 | import datetime |
5 | 5 | import platform |
| 6 | +import traceback |
6 | 7 | from dataclasses import dataclass |
| 8 | +from pathlib import Path |
7 | 9 | from typing import TYPE_CHECKING, Any, TypeVar |
8 | 10 |
|
9 | 11 | import click |
|
25 | 27 | import sqlite3 |
26 | 28 | from collections.abc import Callable, Iterable |
27 | 29 | from click._termui_impl import ProgressBar |
| 30 | + from typing import TextIO |
28 | 31 |
|
29 | 32 | V = TypeVar("V") |
30 | 33 |
|
@@ -220,6 +223,67 @@ def _run_images(conn: sqlite3.Connection, _data_store: dict[str, Any], enabled_c |
220 | 223 | PostTask("images", _run_images), |
221 | 224 | ) |
222 | 225 |
|
# Horizontal rule written before every entry in the parsing errors log.
PARSING_ERROR_SEPARATOR = "-" * 80


def write_parsing_error(
    file: TextIO,
    *,
    category: str,
    article: Article,
    error: ArticleParsingError,
) -> None:
    """Append one parsing error entry to an open text log.

    The entry consists of the separator line, the category, the article's
    title and URL, a ``Traceback:`` heading, the formatted traceback of
    ``error``, and a trailing blank line.

    Args:
        file: Writable text stream that receives the entry.
        category: Value written on the ``Category:`` line.
        article: Article whose ``title`` and ``url`` attributes are written.
        error: Exception whose traceback is rendered into the entry.
    """
    header_lines = [
        f"{PARSING_ERROR_SEPARATOR}\n",
        f"Category: {category}\n",
        f"Article: {article.title}\n",
        f"URL: {article.url}\n",
        "Traceback:\n",
    ]
    traceback_lines = traceback.format_exception(type(error), error, error.__traceback__)
    # The final "\n" leaves an empty line between consecutive entries.
    file.writelines([*header_lines, *traceback_lines, "\n"])
| 244 | + |
| 245 | + |
def parse_articles(
    conn: sqlite3.Connection,
    data_store: dict[str, Any],
    enabled_categories: set[str],
    parsing_errors_log: TextIO | None = None,
) -> int:
    """Parse the articles of every enabled category into the database.

    For each key of ``CATEGORIES`` that is present in ``enabled_categories``,
    the titles of the entries stored in ``data_store[key]`` are requested from
    ``wiki_client.get_articles``, parsed with the category's parser and
    inserted through ``conn``. When the category has ``generate_map`` set, a
    mapping of lower-cased entry title to ``article_id`` is stored in
    ``data_store`` under ``"<key>_map"``.

    Articles that raise ``ArticleParsingError`` are skipped; their titles are
    echoed after the category finishes and, when a log is supplied, a detailed
    entry is written for each of them.

    Args:
        conn: Database connection. It is entered as a context manager once
            per category.
        data_store: Shared store holding the fetched entries per category key;
            updated in place with the generated maps.
        enabled_categories: Keys of the categories to process.
        parsing_errors_log: Optional open text stream that receives one entry
            per article that could not be parsed.

    Returns:
        The total number of articles that could not be parsed, across all
        processed categories.
    """
    click.echo("Parsing articles...")
    parsing_errors_count = 0
    for key, category in CATEGORIES.items():
        if key not in enabled_categories:
            continue

        titles = [entry.title for entry in data_store[key]]
        parser = category.parser
        # Resolve the optional title map once per category instead of
        # rebuilding the key and looking it up again for every article.
        title_map: dict[str, Any] | None = None
        if category.generate_map:
            title_map = data_store[f"{key}_map"] = {}
        unparsed = []
        generator = wiki_client.get_articles(titles)
        with (
            timed() as t,
            conn,
            progress_bar(generator, len(titles), f"Parsing {key}", item_show_func=article_label) as bar,
        ):
            for article in bar:
                try:
                    entry = parser.from_article(article)
                    entry.insert(conn)
                    if title_map is not None:
                        title_map[entry.title.lower()] = entry.article_id
                except ArticleParsingError as e:
                    unparsed.append(article.title)
                    # Explicit None check: the log is optional, and its
                    # presence must not depend on the stream's truthiness.
                    if parsing_errors_log is not None:
                        write_parsing_error(parsing_errors_log, category=key, article=article, error=e)
        parsing_errors_count += len(unparsed)
        if unparsed:
            click.echo(f"{Fore.RED}Could not parse {len(unparsed):,} articles.{Style.RESET_ALL}")
            click.echo(f"\t-> {Fore.RED}{f'{Style.RESET_ALL},{Fore.RED}'.join(unparsed)}{Style.RESET_ALL}")
        click.echo(f"\t{Fore.GREEN}Parsed articles in {t.elapsed:.2f} seconds.{Style.RESET_ALL}")
    return parsing_errors_count
| 286 | + |
223 | 287 |
|
224 | 288 | def resolve_enabled_categories(skip_categories: set[str]) -> tuple[set[str], dict[str, set[str]]]: |
225 | 289 | """Resolve enabled categories including dependency-based auto-skips.""" |
@@ -278,6 +342,7 @@ def generate( |
278 | 342 | skip_images: bool = False, |
279 | 343 | skip_deprecated: bool = False, |
280 | 344 | skip_categories: tuple[str, ...] = (), |
| 345 | + parsing_errors_file: str | None = None, |
281 | 346 | ) -> None: |
282 | 347 | """Generate a complete TibiaWiki SQLite database.""" |
283 | 348 | normalized_skip_categories = {category.casefold() for category in skip_categories} |
@@ -306,34 +371,18 @@ def generate( |
306 | 371 | excluded_titles = deprecated if not category.include_deprecated else None |
307 | 372 | data_store[key] = fetch_category_entries(category.name, excluded_titles) |
308 | 373 |
|
309 | | - click.echo("Parsing articles...") |
310 | | - for key, category in CATEGORIES.items(): |
311 | | - if key not in enabled_categories: |
312 | | - continue |
313 | | - |
314 | | - titles = [entry.title for entry in data_store[key]] |
315 | | - parser = category.parser |
316 | | - if category.generate_map: |
317 | | - data_store[f"{key}_map"] = {} |
318 | | - unparsed = [] |
319 | | - generator = wiki_client.get_articles(titles) |
320 | | - with ( |
321 | | - timed() as t, |
322 | | - conn, |
323 | | - progress_bar(generator, len(titles), f"Parsing {key}", item_show_func=article_label) as bar, |
324 | | - ): |
325 | | - for article in bar: |
326 | | - try: |
327 | | - entry = parser.from_article(article) |
328 | | - entry.insert(conn) |
329 | | - if category.generate_map: |
330 | | - data_store[f"{key}_map"][entry.title.lower()] = entry.article_id |
331 | | - except ArticleParsingError: |
332 | | - unparsed.append(article.title) |
333 | | - if unparsed: |
334 | | - click.echo(f"{Fore.RED}Could not parse {len(unparsed):,} articles.{Style.RESET_ALL}") |
335 | | - click.echo(f"\t-> {Fore.RED}{f'{Style.RESET_ALL},{Fore.RED}'.join(unparsed)}{Style.RESET_ALL}") |
336 | | - click.echo(f"\t{Fore.GREEN}Parsed articles in {t.elapsed:.2f} seconds.{Style.RESET_ALL}") |
| 374 | + parsing_errors_path = Path(parsing_errors_file) if parsing_errors_file else None |
| 375 | + if parsing_errors_path: |
| 376 | + with parsing_errors_path.open("w", encoding="utf-8") as parsing_errors_log: |
| 377 | + gen_time = datetime.datetime.now(datetime.timezone.utc) |
| 378 | + parsing_errors_log.write(f"TibiaWikiSQL parsing errors - {gen_time.isoformat()}\n\n") |
| 379 | + parsing_errors_count = parse_articles(conn, data_store, enabled_categories, parsing_errors_log) |
| 380 | + click.echo( |
| 381 | + f"{Fore.YELLOW}Wrote {parsing_errors_count:,} parsing errors to " |
| 382 | + f"{parsing_errors_path}.{Style.RESET_ALL}", |
| 383 | + ) |
| 384 | + else: |
| 385 | + parse_articles(conn, data_store, enabled_categories) |
337 | 386 |
|
338 | 387 | for position in rashid_positions: |
339 | 388 | RashidPositionTable.insert(conn, **position.model_dump()) |
|
0 commit comments