Skip to content

Commit 78ebdc5

Browse files
Import FraCaS data
1 parent ce0410a commit 78ebdc5

3 files changed

Lines changed: 233 additions & 0 deletions

File tree

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import xml.etree.ElementTree as ET
2+
3+
from django.core.management.base import BaseCommand
4+
from django.db import transaction
5+
6+
from problem.models import FracasPremise, FracasProblem
7+
from problem.utils import progress
8+
9+
10+
class Command(BaseCommand):
    """Import FraCaS problems and their premises from an XML file.

    Problems whose ``fracas_id`` is already present in the database are
    skipped, so the command can be re-run on the same file.
    """

    help = "Import FraCaS problems from fracas.xml."

    def add_arguments(self, parser):
        """Register the ``--fracas_path`` command-line option."""
        parser.add_argument(
            "--fracas_path",
            type=str,
            default="problem/data/fracas.xml",
            help="Path to the fracas.xml file.",
        )

    def handle(self, *args, **options):
        """Run the import for the file given by ``--fracas_path``."""
        fracas_path = options["fracas_path"]
        self.import_fracas_problems(fracas_path)

    def annotate_section_subsections(self, tree: ET.ElementTree) -> None:
        """Copy the enclosing heading names onto every ``<problem>`` element.

        Headings are ``<comment>`` elements that are direct children of the
        root and whose ``class`` attribute is ``section``, ``subsection`` or
        ``subsubsection``.  Each following ``<problem>`` receives the headings
        currently in effect as attributes of the same names.  The tree is
        modified in place.
        """
        levels = ("section", "subsection", "subsubsection")
        current = dict.fromkeys(levels)

        for element in tree.getroot():
            if element.tag == "comment":
                level = element.attrib.get("class")
                if level not in levels:
                    continue
                # An empty <comment/> has text None; treat it as no heading.
                current[level] = (element.text or "").strip()
                # A new heading ends every deeper heading; otherwise problems
                # in the new section would inherit a stale subsection.
                for deeper in levels[levels.index(level) + 1:]:
                    current[deeper] = None
            elif element.tag == "problem":
                for level, heading in current.items():
                    if heading:
                        element.set(level, heading)

    def import_fracas_problems(self, fracas_path: str) -> None:
        """Create a ``FracasProblem`` (plus premises) for each new problem.

        Args:
            fracas_path: Path of the FraCaS XML file to parse.

        Raises:
            ValueError: If a ``<problem>`` has no ``id`` attribute or one of
                its ``<p>`` premises has no ``idx`` attribute.
        """
        tree = ET.parse(fracas_path)
        self.annotate_section_subsections(tree)

        problems = tree.getroot().findall("problem")
        total = len(problems)
        skipped = 0

        def text_from_element(element) -> str:
            """Return the element's stripped text, or "" if it is None/empty."""
            return element.text.strip() if element is not None and element.text else ""

        for n, problem in enumerate(problems, start=1):
            problem_id = problem.get("id")

            if problem_id is None:
                raise ValueError(
                    "Problem ID is missing in the XML file for problem: {}".format(
                        problem
                    )
                )

            progress(n, total)

            if FracasProblem.objects.filter(fracas_id=problem_id).exists():
                skipped += 1
                continue

            # Problem and premises are written together so that a malformed
            # premise does not leave a half-imported problem behind.
            with transaction.atomic():
                fracas_problem = FracasProblem.objects.create(
                    fracas_id=int(problem_id),
                    question=text_from_element(problem.find("q")),
                    hypothesis=text_from_element(problem.find("h")),
                    answer=text_from_element(problem.find("a")),
                    fracas_answer=problem.get("fracas_answer"),
                    fracas_non_standard=problem.get("fracas_nonstandard") == "true",
                    note=text_from_element(problem.find("note")),
                    # The name columns are non-null; store "" when a problem
                    # is not preceded by the corresponding heading.
                    section_name=problem.get("section", ""),
                    subsection_name=problem.get("subsection", ""),
                )

                for premise in problem.findall("p"):
                    premise_index = premise.get("idx")
                    if premise_index is None:
                        raise ValueError(
                            "Premise index is missing in the XML file for problem: {}".format(
                                problem_id
                            )
                        )
                    FracasPremise.objects.create(
                        fracas_problem=fracas_problem,
                        premise_index=int(premise_index),
                        premise=text_from_element(premise),
                    )

        # Use the command's own stream so output can be captured/redirected.
        self.stdout.write(
            f"FraCaS problems import complete! Total: {total} | Skipped: {skipped}"
        )
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Generated by Django 4.2.20 on 2025-05-02 12:22
2+
3+
from django.db import migrations, models
4+
import django.db.models.deletion
5+
6+
7+
class Migration(migrations.Migration):
    """Create the FracasProblem and FracasPremise tables."""

    # Must run after the app's initial migration.
    dependencies = [
        ('problem', '0001_initial'),
    ]

    operations = [
        # One row per FraCaS problem; fracas_id is unique.
        migrations.CreateModel(
            name='FracasProblem',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('fracas_id', models.IntegerField(unique=True)),
                ('question', models.CharField(help_text='The question from the original FraCaS problem. 4 problems do not have a question.', max_length=255)),
                ('hypothesis', models.CharField(help_text='The answer formulated as a hypothesis by McCartney.', max_length=255)),
                ('answer', models.CharField(help_text='The answer from the original FraCaS problem. Most are "Yes", "No", or "Don\'t know", but not always.', max_length=255)),
                ('fracas_answer', models.CharField(choices=[('yes', 'Yes'), ('no', 'No'), ('unknown', 'Unknown'), ('undef', 'Undef')], help_text='The answer constrained to one of a fixed set of values by McCartney.', max_length=20)),
                ('fracas_non_standard', models.BooleanField(help_text='Indicates whether the answer in the origianl FraCaS problem is non-standard (i.e. not "Yes", "No", or "Don\'t know")')),
                ('note', models.TextField(help_text='Note given by McCartney to explain issues arising during translation to XML.')),
                ('section_name', models.CharField(help_text='The section name from the original FraCaS problem.', max_length=255)),
                ('subsection_name', models.CharField(help_text='The subsection name from the original FraCaS problem.', max_length=255)),
            ],
        ),
        # One row per premise; rows are deleted together with their problem
        # (CASCADE) and (fracas_problem, premise_index) pairs are unique.
        migrations.CreateModel(
            name='FracasPremise',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('premise_index', models.IntegerField(help_text='The index of the premise in the original FraCaS problem.')),
                ('premise', models.CharField(help_text='The premise from the original FraCaS problem.', max_length=255)),
                ('fracas_problem', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='premises', to='problem.fracasproblem')),
            ],
            options={
                'unique_together': {('fracas_problem', 'premise_index')},
            },
        ),
    ]

backend/problem/models.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,73 @@ class Dataset(models.TextChoices):
2525
decimal_places=3,
2626
validators=[MinValueValidator(1), MinValueValidator(5)],
2727
)
28+
29+
30+
class FracasProblem(models.Model):
    """A single FraCaS problem: question, hypothesis, answer and location.

    The premises belonging to a problem are stored in ``FracasPremise`` and
    are reachable through the ``premises`` reverse relation.
    """

    class FracasAnswer(models.TextChoices):
        """Allowed values for ``fracas_answer``."""

        YES = "yes", "Yes"
        NO = "no", "No"
        UNKNOWN = "unknown", "Unknown"
        UNDEF = "undef", "Undef"

    # Identifier of the problem in the source data; unique per problem.
    fracas_id = models.IntegerField(unique=True)

    # Four problems do not have a question.
    question = models.CharField(
        max_length=255,
        help_text="The question from the original FraCaS problem. 4 problems do not have a question.",
    )

    hypothesis = models.CharField(
        max_length=255,
        help_text="The answer formulated as a hypothesis by McCartney.",
    )

    answer = models.CharField(
        max_length=255,
        help_text='The answer from the original FraCaS problem. Most are "Yes", "No", or "Don\'t know", but not always.',
    )

    fracas_answer = models.CharField(
        max_length=20,
        choices=FracasAnswer.choices,
        help_text="The answer constrained to one of a fixed set of values by McCartney.",
    )

    # NOTE(review): the help text below contains the typo "origianl". It is
    # left untouched here because help_text is recorded in the migration that
    # created this model; correcting it should come with a new migration.
    fracas_non_standard = models.BooleanField(
        help_text='Indicates whether the answer in the origianl FraCaS problem is non-standard (i.e. not "Yes", "No", or "Don\'t know")'
    )

    note = models.TextField(
        help_text="Note given by McCartney to explain issues arising during translation to XML."
    )

    section_name = models.CharField(
        max_length=255,
        help_text="The section name from the original FraCaS problem.",
    )

    subsection_name = models.CharField(
        max_length=255,
        help_text="The subsection name from the original FraCaS problem.",
    )

    def __str__(self) -> str:
        """Return a short label that identifies the problem by its FraCaS id."""
        return f"FraCaS problem {self.fracas_id}"
78+
79+
80+
class FracasPremise(models.Model):
    """One premise of a ``FracasProblem``.

    A premise is identified by its problem together with ``premise_index``;
    that pair is unique.
    """

    class Meta:
        unique_together = ("fracas_problem", "premise_index")

    # Deleting a problem deletes its premises as well.
    fracas_problem = models.ForeignKey(
        FracasProblem,
        on_delete=models.CASCADE,
        related_name="premises",
    )

    premise_index = models.IntegerField(
        help_text="The index of the premise in the original FraCaS problem.",
    )

    premise = models.CharField(
        max_length=255,
        help_text="The premise from the original FraCaS problem.",
    )

    def __str__(self) -> str:
        """Return the premise index followed by the premise text."""
        return f"Premise {self.premise_index}: {self.premise}"

0 commit comments

Comments
 (0)