Rework import scripts

XanderVertegaal · XanderVertegaal · commit d32798dd8dc8 · 2025-05-22T15:20:26.000+02:00
diff --git a/backend/problem/management/commands/import_fracas.py b/backend/problem/management/commands/import_fracas.py
@@ -1,10 +1,13 @@
+import json
 import xml.etree.ElementTree as ET
 
 from django.core.management.base import BaseCommand
 from django.db import transaction
+from tqdm import tqdm
 
-from problem.models import FracasPremise, FracasProblem
-from problem.utils import progress
+from langpro_annotator.logger import logger
+from problem.services import get_fracas_problems
+from problem.models import Problem
 
 
 class Command(BaseCommand):
@@ -22,7 +25,18 @@ def handle(self, *args, **options):
         fracas_path = options["fracas_path"]
         self.import_fracas_problems(fracas_path)
 
-    def annotate_section_subsections(self, tree: ET.ElementTree) -> None:
+    @staticmethod
+    def _text_from_element(element: ET.Element) -> str:
+        """
+        Extracts stripped text from an XML element, returning an empty string if the element is None or has no text.
+        """
+        return element.text.strip() if element is not None and element.text else ""
+
+    @staticmethod
+    def _annotate_section_subsections(tree: ET.ElementTree) -> None:
+        """
+        Annotates each problem in the XML tree with its corresponding section, subsection, and subsubsection.
+        """
         current_section = None
         current_subsection = None
         current_subsubsection = None
@@ -50,76 +64,63 @@ def annotate_section_subsections(self, tree: ET.ElementTree) -> None:
                     element.set("subsubsection", current_subsubsection)
 
     def import_fracas_problems(self, fracas_path: str) -> None:
-        # Parse the XML file
         tree = ET.parse(fracas_path)
-        self.annotate_section_subsections(tree)
+        self._annotate_section_subsections(tree)
         root = tree.getroot()
-
         all_problems = root.findall("problem")
-        total = len(all_problems)
-        n = 1
 
+        created = 0
         skipped = 0
 
-        def text_from_element(element: ET.Element) -> str:
-            """
-            Extracts stripped text from an XML element, returning an empty string if the element is None or has no text.
-            """
-            return element.text.strip() if element is not None and element.text else ""
+        existing_fracas_problems = get_fracas_problems()
+        existing_fracas_ids = {p.fracas_id for p in existing_fracas_problems}
 
-        for problem in root.findall("problem"):
+        for problem in tqdm(all_problems, desc="Importing FraCaS problems"):
             problem_id = problem.get("id")
-
             if problem_id is None:
                 raise ValueError(
                     "Problem ID is missing in the XML file for problem: {}".format(
                         problem
                     )
                 )
 
-            progress(n, total)
-            n += 1
-
-            if FracasProblem.objects.filter(fracas_id=problem_id).exists():
+            if int(problem_id) in existing_fracas_ids:
                 skipped += 1
                 continue
 
-            question = text_from_element(problem.find("q"))
-            hypothesis = text_from_element(problem.find("h"))
-            answer = text_from_element(problem.find("a"))
-            note = text_from_element(problem.find("note"))
+            question = self._text_from_element(problem.find("q"))
+            hypothesis = self._text_from_element(problem.find("h"))
+            answer = self._text_from_element(problem.find("a"))
+            note = self._text_from_element(problem.find("note"))
 
             section = problem.get("section")
             subsection = problem.get("subsection")
             fracas_answer = problem.get("fracas_answer")
             fracas_nonstandard = problem.get("fracas_nonstandard", False) == "true"
 
+            premise_nodes = problem.findall("p")
+            premises = [node.text.strip() for node in premise_nodes if node.text]
+
             with transaction.atomic():
-                fracas_problem = FracasProblem.objects.create(
-                    fracas_id=int(problem_id),
-                    question=question,
-                    hypothesis=hypothesis,
-                    answer=answer,
-                    fracas_answer=fracas_answer,
-                    fracas_non_standard=fracas_nonstandard,
-                    note=note,
-                    section_name=section,
-                    subsection_name=subsection,
+                Problem.objects.create(
+                    type=Problem.ProblemType.FRACAS,
+                    content=json.dumps(
+                        {
+                            "fracas_id": int(problem_id),
+                            "question": question,
+                            "hypothesis": hypothesis,
+                            "answer": answer,
+                            "fracas_answer": fracas_answer,
+                            "fracas_non_standard": fracas_nonstandard,
+                            "note": note,
+                            "section_name": section,
+                            "subsection_name": subsection,
+                            "premises": premises,
+                        }
+                    ),
                 )
+                created += 1
 
-                premises = problem.findall("p")
-                for premise in premises:
-                    premise_index = premise.get("idx", None)
-                    if premise_index is None:
-                        raise ValueError(
-                            "Premise index is missing in the XML file for problem: {}".format(
-                                problem
-                            )
-                        )
-                    FracasPremise.objects.create(
-                        fracas_problem=fracas_problem,
-                        premise_index=int(premise_index),
-                        premise=premise.text.strip() if premise.text else "",
-                    )
-
-        print(f"FraCaS problems import complete! Total: {total} | Skipped: {skipped}")
+        logger.info(
+            f"FraCaS problems import complete! Created: {created} | Skipped: {skipped}"
+        )
diff --git a/backend/problem/management/commands/import_sick.py b/backend/problem/management/commands/import_sick.py
@@ -1,8 +1,12 @@
 import csv
+import json
 
 from django.core.management.base import BaseCommand
-from problem.utils import progress
-from problem.models import SickProblem
+from tqdm import tqdm
+
+from langpro_annotator.logger import logger
+from problem.models import Problem
+from problem.services import get_sick_problems
 
 
 class Command(BaseCommand):
@@ -25,30 +29,27 @@ def import_sick_problems(self, sick_path: str) -> None:
         Import SICK problems from SICK.txt (a TSV file) and enter them into the database.
         """
 
-        print("Importing SICK problems...")
-
         skipped = 0
+        created = 0
+
+        existing_sick_problems = get_sick_problems()
+        existing_pair_ids = {p.pair_id for p in existing_sick_problems}
 
         with open(sick_path, "r", encoding="utf-8") as file:
             reader = csv.DictReader(file, delimiter="\t")
             problem_list = list(reader)
 
-            total = len(problem_list)
-            n = 1
-
-            for row in problem_list:
-                progress(n, total)
-                n += 1
-                if SickProblem.objects.filter(pair_id=row["pair_ID"]).exists():
+            for problem in tqdm(problem_list, desc="Importing SICK problems"):
+                if problem["pair_ID"] in existing_pair_ids:
                     skipped += 1
                     continue
 
-                SickProblem.objects.create(
-                    pair_id=row["pair_ID"],
-                    sentence_one=row["sentence_A"],
-                    sentence_two=row["sentence_B"],
-                    entailment_label=row["entailment_label"],
-                    relatedness_score=row["relatedness_score"],
+                created += 1
+                Problem.objects.create(
+                    type=Problem.ProblemType.SICK,
+                    content=json.dumps(problem),
                 )
 
-            print(f"SICK problems import complete! Total: {total} | Skipped: {skipped}")
+            logger.info(
+                f"SICK problems import complete! Created: {created} | Skipped: {skipped}"
+            )