From a88d12f35f4624d7f6d1f3f79d8b29e51bc71b93 Mon Sep 17 00:00:00 2001 From: Harlan Lieberman-Berg Date: Sun, 8 Mar 2026 21:09:27 -0400 Subject: [PATCH] OD-2139: Fix skipping tags on damaged imports Previously, the system would skip importing the entire class of tags when a malformed tag was found in any story. This patch changes it to error on malformed tags, but allow the non-standard (mostly) all string tag format. In addition, we now warn when importing stories that have tags not in the tag table. --- efiction/metadata.py | 16 ++++++++--- efiction/tag_converter.py | 47 ++++++++++++++++++++++++--------- efiction/tests/test_metadata.py | 2 +- opendoors/mysql.py | 1 + 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/efiction/metadata.py b/efiction/metadata.py index 98435a5..c648dbc 100644 --- a/efiction/metadata.py +++ b/efiction/metadata.py @@ -153,10 +153,20 @@ def _convert_story_tag_table(self, table_name, old_tags): if self.tag_table_is_nonstandard[table_name]: # Tag table identified by name rather than id. original_tagid = "original_tag" + + valid_tag_ids = { + str(c[original_tagid]): c["id"] for c in self.tag_tables[table_name] + } + + dropped_tags = [ + tag for tag in old_tags[table_name] if tag and tag not in valid_tag_ids + ] + if dropped_tags: + self.logger.warning( + f"Found tags in stories but not in tag table: {dropped_tags}" + ) return [ - c["id"] - for c in self.tag_tables[table_name] - if str(c[original_tagid]) in old_tags[table_name] + valid_tag_ids[tag] for tag in old_tags[table_name] if tag in valid_tag_ids ] def _convert_story_tags(self, old_story): diff --git a/efiction/tag_converter.py b/efiction/tag_converter.py index fc078a9..7b7b3e9 100644 --- a/efiction/tag_converter.py +++ b/efiction/tag_converter.py @@ -3,7 +3,7 @@ from typing import Dict, List -from opendoors.mysql import SqlDb +from opendoors.mysql import OperationalError, ProgrammingError, SqlDb from opendoors.utils import print_progress @@ -56,20 +56,43 @@ def check_for_nonstandard_tag_tables(self) -> bool: try: query = f"SELECT {id_name} FROM stories;" - tags = self.sql.execute_and_fetchall(self.working_original, query) - try: - tags = list( - map( - lambda story_tags: story_tags[id_name].replace(",", ""), - tags, - ) + tag_rows = self.sql.execute_and_fetchall( + self.working_original, query + ) + story_tags = [tag_row[id_name] for tag_row in tag_rows] + # Get the number of story tags which contain something other + # than a comma-separated list of digits + is_numeric = [ + not tag or tag.isdigit() + for tags in story_tags + for tag in tags.split(",") + ] + + # There are three possibilities for the way tags have been + # put into stories. The most common, by far, is that they're + # all comma-separated lists of integers. If this is not the + # case, then they should be almost all -- though not + # necessarily all -- comma-separated strings. (This is + # because a tag could potentially be all-numeric, and have + # at least one fic only tagged with that all-numeric tag). + if sum(is_numeric) == len(is_numeric): + self.logger.debug( + f"Standard story tag syntax in {tag_table_name}" ) - int("".join(tags)) tag_tables[tag_table_name] = False - except Exception: - # Non-integer in identifier + + elif (sum(is_numeric) / len(is_numeric)) < 0.1: + self.logger.info( + f"Non-standard story tag syntax in {tag_table_name}" + ) tag_tables[tag_table_name] = True - except Exception as e: + + # Finally, the fields could be completely corrupt -- in which case we want to break. + else: + raise Exception( + f"Broken story tag syntax in {tag_table_name}; mix of standard (numeric) and non-standard (string) tags." + ) + except (OperationalError, ProgrammingError) as e: self.logger.info(e) self.logger.info("No such table?") tag_tables[tag_table_name] = None diff --git a/efiction/tests/test_metadata.py b/efiction/tests/test_metadata.py index 13a2dab..11fc63e 100644 --- a/efiction/tests/test_metadata.py +++ b/efiction/tests/test_metadata.py @@ -215,7 +215,7 @@ def test_convert_story_tags_normal_ratings(self): } ] result = self.efiction_converter._convert_story_tags(old_stories[0]) - self.assertEqual( + self.assertCountEqual( { "categories": [6], "characters": [106, 107], diff --git a/opendoors/mysql.py b/opendoors/mysql.py index c5ce604..43eefe5 100644 --- a/opendoors/mysql.py +++ b/opendoors/mysql.py @@ -6,6 +6,7 @@ import pymysql import sqlparse from pymysql.cursors import DictCursor +from pymysql.err import OperationalError, ProgrammingError # noqa: F401 from opendoors.utils import get_full_path