Skip to content

Commit 324482e

Browse files
committed
Add parameter to print parsing errors to file
1 parent b0f379c commit 324482e

3 files changed

Lines changed: 113 additions & 46 deletions

File tree

tibiawikisql/__main__.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Command line interface for tibiawiki-sql."""
2+
13
import sqlite3
24

35
import click
@@ -7,21 +9,26 @@
79
from tibiawikisql.utils import timed
810

911
DATABASE_FILE = "tibiawiki.db"
12+
PARSING_ERRORS_FILE = "parsing-errors.log"
1013

1114
colorama.init()
1215

1316

14-
@click.group(context_settings={'help_option_names': ['-h', '--help']})
15-
@click.version_option(__version__, '-V', '--version')
16-
def cli():
17-
# Empty command group to disable default command.
18-
pass
17+
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
18+
@click.version_option(__version__, "-V", "--version")
19+
def cli() -> None:
20+
"""Run tibiawiki-sql commands."""
1921

2022

2123
@cli.command(name="generate")
22-
@click.option('-i', '--skip-images', help="Skip fetching and loading images to the database.", is_flag=True)
23-
@click.option('-o', '--db-name', help="Name for the database file.", default=DATABASE_FILE)
24-
@click.option('-d', '--skip-deprecated', help="Skips fetching deprecated articles and their images.", is_flag=True)
24+
@click.option("-i", "--skip-images", help="Skip fetching and loading images to the database.", is_flag=True)
25+
@click.option("-o", "--db-name", help="Name for the database file.", default=DATABASE_FILE)
26+
@click.option("-d", "--skip-deprecated", help="Skips fetching deprecated articles and their images.", is_flag=True)
27+
@click.option(
28+
"--log-parsing-errors",
29+
help=f"Write every parsing error to {PARSING_ERRORS_FILE}.",
30+
is_flag=True,
31+
)
2532
@click.option(
2633
"-c",
2734
"--skip-category",
@@ -32,16 +39,22 @@ def cli():
3239
"Skip specific categories. Can be repeated."
3340
),
3441
)
35-
def generate(skip_images, db_name, skip_deprecated, skip_categories):
42+
def generate(
43+
skip_images: bool,
44+
db_name: str,
45+
skip_deprecated: bool,
46+
log_parsing_errors: bool,
47+
skip_categories: tuple[str, ...],
48+
) -> None:
3649
"""Generates a database file."""
37-
with timed() as t:
38-
with sqlite3.connect(db_name) as conn:
39-
generation.generate(
40-
conn,
41-
skip_images=skip_images,
42-
skip_deprecated=skip_deprecated,
43-
skip_categories=skip_categories,
44-
)
50+
with timed() as t, sqlite3.connect(db_name) as conn:
51+
generation.generate(
52+
conn,
53+
skip_images=skip_images,
54+
skip_deprecated=skip_deprecated,
55+
skip_categories=skip_categories,
56+
parsing_errors_file=PARSING_ERRORS_FILE if log_parsing_errors else None,
57+
)
4558
click.echo(f"Command finished in {t.elapsed:.2f} seconds.")
4659

4760

tibiawikisql/generation.py

Lines changed: 77 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33

44
import datetime
55
import platform
6+
import traceback
67
from dataclasses import dataclass
8+
from pathlib import Path
79
from typing import TYPE_CHECKING, Any, TypeVar
810

911
import click
@@ -25,6 +27,7 @@
2527
import sqlite3
2628
from collections.abc import Callable, Iterable
2729
from click._termui_impl import ProgressBar
30+
from typing import TextIO
2831

2932
V = TypeVar("V")
3033

@@ -220,6 +223,67 @@ def _run_images(conn: sqlite3.Connection, _data_store: dict[str, Any], enabled_c
220223
PostTask("images", _run_images),
221224
)
222225

226+
PARSING_ERROR_SEPARATOR = "-" * 80


def write_parsing_error(
    file: TextIO,
    *,
    category: str,
    article: Article,
    error: ArticleParsingError,
) -> None:
    """Append a single parsing failure report to an open text stream.

    The entry starts with a separator line, followed by the category key, the
    article's title and URL, and the formatted traceback of ``error``. A blank
    line terminates the entry.

    Args:
        file: Writable text stream that receives the entry.
        category: Key of the category the article belongs to.
        article: Article that failed to parse; its ``title`` and ``url`` are written.
        error: Exception raised while parsing; its traceback is rendered in full.
    """
    header_lines = [
        f"{PARSING_ERROR_SEPARATOR}\n",
        f"Category: {category}\n",
        f"Article: {article.title}\n",
        f"URL: {article.url}\n",
        "Traceback:\n",
    ]
    stack_lines = traceback.format_exception(type(error), error, error.__traceback__)
    # Emit the whole entry in one call; the written text is unchanged.
    file.writelines([*header_lines, *stack_lines, "\n"])
244+
245+
246+
def parse_articles(
    conn: sqlite3.Connection,
    data_store: dict[str, Any],
    enabled_categories: set[str],
    parsing_errors_log: TextIO | None = None,
) -> int:
    """Parse the articles of every enabled category and insert them into the database.

    For each entry of ``CATEGORIES`` whose key is enabled, the titles stored in
    ``data_store[key]`` are fetched through ``wiki_client`` and converted with
    the category's parser. Articles raising ``ArticleParsingError`` are skipped,
    listed on the console and, when a log stream is supplied, written to it
    together with their traceback.

    Args:
        conn: Open SQLite connection. It is entered as a context manager around
            each category's inserts.
        data_store: Shared mapping holding the fetched entries per category. For
            categories with ``generate_map`` set, a ``"<key>_map"`` dict mapping
            lower-cased titles to article IDs is added.
        enabled_categories: Keys of the categories to process.
        parsing_errors_log: Optional text stream receiving one entry per failed
            article. ``None`` disables logging.

    Returns:
        The total number of articles that could not be parsed.
    """
    click.echo("Parsing articles...")
    parsing_errors_count = 0
    for key, category in CATEGORIES.items():
        if key not in enabled_categories:
            continue

        titles = [entry.title for entry in data_store[key]]
        parser = category.parser
        # Hold a direct reference to the map so the loop does not rebuild the
        # key and look it up again for every article.
        title_map: dict[str, Any] | None = None
        if category.generate_map:
            title_map = data_store[f"{key}_map"] = {}
        unparsed = []
        generator = wiki_client.get_articles(titles)
        with (
            timed() as t,
            conn,
            progress_bar(generator, len(titles), f"Parsing {key}", item_show_func=article_label) as bar,
        ):
            for article in bar:
                try:
                    entry = parser.from_article(article)
                    entry.insert(conn)
                    if title_map is not None:
                        title_map[entry.title.lower()] = entry.article_id
                except ArticleParsingError as e:
                    unparsed.append(article.title)
                    parsing_errors_count += 1
                    # Compare against None explicitly: a stream object that
                    # happens to be falsy must still receive the entry.
                    if parsing_errors_log is not None:
                        write_parsing_error(parsing_errors_log, category=key, article=article, error=e)
        if unparsed:
            click.echo(f"{Fore.RED}Could not parse {len(unparsed):,} articles.{Style.RESET_ALL}")
            click.echo(f"\t-> {Fore.RED}{f'{Style.RESET_ALL},{Fore.RED}'.join(unparsed)}{Style.RESET_ALL}")
        click.echo(f"\t{Fore.GREEN}Parsed articles in {t.elapsed:.2f} seconds.{Style.RESET_ALL}")
    return parsing_errors_count
286+
223287

224288
def resolve_enabled_categories(skip_categories: set[str]) -> tuple[set[str], dict[str, set[str]]]:
225289
"""Resolve enabled categories including dependency-based auto-skips."""
@@ -278,6 +342,7 @@ def generate(
278342
skip_images: bool = False,
279343
skip_deprecated: bool = False,
280344
skip_categories: tuple[str, ...] = (),
345+
parsing_errors_file: str | None = None,
281346
) -> None:
282347
"""Generate a complete TibiaWiki SQLite database."""
283348
normalized_skip_categories = {category.casefold() for category in skip_categories}
@@ -306,34 +371,18 @@ def generate(
306371
excluded_titles = deprecated if not category.include_deprecated else None
307372
data_store[key] = fetch_category_entries(category.name, excluded_titles)
308373

309-
click.echo("Parsing articles...")
310-
for key, category in CATEGORIES.items():
311-
if key not in enabled_categories:
312-
continue
313-
314-
titles = [entry.title for entry in data_store[key]]
315-
parser = category.parser
316-
if category.generate_map:
317-
data_store[f"{key}_map"] = {}
318-
unparsed = []
319-
generator = wiki_client.get_articles(titles)
320-
with (
321-
timed() as t,
322-
conn,
323-
progress_bar(generator, len(titles), f"Parsing {key}", item_show_func=article_label) as bar,
324-
):
325-
for article in bar:
326-
try:
327-
entry = parser.from_article(article)
328-
entry.insert(conn)
329-
if category.generate_map:
330-
data_store[f"{key}_map"][entry.title.lower()] = entry.article_id
331-
except ArticleParsingError:
332-
unparsed.append(article.title)
333-
if unparsed:
334-
click.echo(f"{Fore.RED}Could not parse {len(unparsed):,} articles.{Style.RESET_ALL}")
335-
click.echo(f"\t-> {Fore.RED}{f'{Style.RESET_ALL},{Fore.RED}'.join(unparsed)}{Style.RESET_ALL}")
336-
click.echo(f"\t{Fore.GREEN}Parsed articles in {t.elapsed:.2f} seconds.{Style.RESET_ALL}")
374+
parsing_errors_path = Path(parsing_errors_file) if parsing_errors_file else None
375+
if parsing_errors_path:
376+
with parsing_errors_path.open("w", encoding="utf-8") as parsing_errors_log:
377+
gen_time = datetime.datetime.now(datetime.timezone.utc)
378+
parsing_errors_log.write(f"TibiaWikiSQL parsing errors - {gen_time.isoformat()}\n\n")
379+
parsing_errors_count = parse_articles(conn, data_store, enabled_categories, parsing_errors_log)
380+
click.echo(
381+
f"{Fore.YELLOW}Wrote {parsing_errors_count:,} parsing errors to "
382+
f"{parsing_errors_path}.{Style.RESET_ALL}",
383+
)
384+
else:
385+
parse_articles(conn, data_store, enabled_categories)
337386

338387
for position in rashid_positions:
339388
RashidPositionTable.insert(conn, **position.model_dump())

tibiawikisql/parsers/base.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,12 @@ def from_article(cls, article: Article) -> M:
209209
An inherited model object for the current article.
210210
211211
"""
212-
row = cls.parse_attributes(article)
212+
try:
213+
row = cls.parse_attributes(article)
214+
except ArticleParsingError:
215+
raise
216+
except Exception as e:
217+
raise ArticleParsingError(article, cause=e) from e
213218
try:
214219
return cls.model.model_validate(row)
215220
except ValidationError as e:

0 commit comments

Comments
 (0)