diff --git a/backend/.gitignore b/backend/.gitignore index 8655ff2..9cf99d0 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -31,3 +31,6 @@ venv/ ENV/ env.bak/ venv.bak/ + +# Data files +problem/data/* diff --git a/backend/example/apps.py b/backend/example/apps.py deleted file mode 100644 index dbb0083..0000000 --- a/backend/example/apps.py +++ /dev/null @@ -1,5 +0,0 @@ -from django.apps import AppConfig - - -class ExampleConfig(AppConfig): - name = 'example' diff --git a/backend/example/models.py b/backend/example/models.py deleted file mode 100644 index 71a8362..0000000 --- a/backend/example/models.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.db import models - -# Create your models here. diff --git a/backend/example/views.py b/backend/example/views.py deleted file mode 100644 index 7965457..0000000 --- a/backend/example/views.py +++ /dev/null @@ -1,11 +0,0 @@ -from rest_framework.decorators import api_view -from rest_framework.response import Response - -# This a very basic View that servers as an example. -# Note that, when utilizing models and a database, etc, you would not have -# views like this, but rather Viewsets. See the Django Rest Framework (DRF) documentation. - -@api_view() -def hooray(request): - response = [{ 'message': 'https://media.giphy.com/media/yoJC2GnSClbPOkV0eA/source.gif' }] - return Response(response) diff --git a/backend/langpro_annotator/common_settings.py b/backend/langpro_annotator/common_settings.py index 226c5b0..48081fc 100644 --- a/backend/langpro_annotator/common_settings.py +++ b/backend/langpro_annotator/common_settings.py @@ -16,9 +16,8 @@ # cf. https://github.com/iMerica/dj-rest-auth/pull/110. 
'allauth.socialaccount', 'user', - 'revproxy', - 'example' + 'problem', ] MIDDLEWARE = [ @@ -67,4 +66,38 @@ REST_AUTH = { "USER_DETAILS_SERIALIZER": "user.serializers.CustomUserDetailsSerializer", -} \ No newline at end of file +} + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "verbose": { + "format": "{levelname} {asctime} {module} {message}", + "style": "{", + }, + "simple": { + "format": "{levelname} {message}", + "style": "{", + }, + }, + "handlers": { + "console": { + "level": "INFO", + "class": "logging.StreamHandler", + "formatter": "simple", + }, + }, + "loggers": { + "django": { + "handlers": ["console"], + "level": "INFO", + "propagate": False, + }, + "LangProAnnotator": { + "handlers": ["console"], + "level": "INFO", + "propagate": False, + }, + }, +} diff --git a/backend/langpro_annotator/logger.py b/backend/langpro_annotator/logger.py new file mode 100644 index 0000000..24ab895 --- /dev/null +++ b/backend/langpro_annotator/logger.py @@ -0,0 +1,3 @@ +import logging + +logger = logging.getLogger('LangProAnnotator') diff --git a/backend/langpro_annotator/proxy_frontend.py b/backend/langpro_annotator/proxy_frontend.py index 923f9ba..ce5ecf8 100644 --- a/backend/langpro_annotator/proxy_frontend.py +++ b/backend/langpro_annotator/proxy_frontend.py @@ -2,10 +2,12 @@ from django.views.decorators.csrf import ensure_csrf_cookie from revproxy.views import ProxyView + view = ProxyView.as_view(upstream=settings.PROXY_FRONTEND) + @ensure_csrf_cookie def proxy_frontend(*args, **kwargs): - """ Wrapper for calls to the SPA ensuring the precense of a CSRF cookie.""" + """Wrapper for calls to the SPA ensuring the presence of a CSRF cookie.""" global view return view(*args, **kwargs) diff --git a/backend/langpro_annotator/urls.py b/backend/langpro_annotator/urls.py index 0179520..5b61427 100644 --- a/backend/langpro_annotator/urls.py +++ b/backend/langpro_annotator/urls.py @@ -13,6 +13,7 @@ 1. 
Import the include() function: from django.conf.urls import include, path 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) """ + from django.conf import settings from django.urls import path, re_path, include from django.contrib import admin @@ -24,28 +25,28 @@ from .proxy_frontend import proxy_frontend from .i18n import i18n -from example.views import hooray as ExampleView # DELETEME, see below - api_router = routers.DefaultRouter() # register viewsets with this router if settings.PROXY_FRONTEND: - spa_url = re_path(r'^(?P.*)$', proxy_frontend) + spa_url = re_path(r"^(?P.*)$", proxy_frontend) else: - spa_url = re_path(r'', index) + spa_url = re_path(r"", index) urlpatterns = [ - path('api/example/', ExampleView), # this is just an example, please delete and utilize router above. - path('admin', RedirectView.as_view(url='/admin/', permanent=True)), - path('api', RedirectView.as_view(url='/api/', permanent=True)), - path('api-auth', RedirectView.as_view(url='/api-auth/', permanent=True)), - path('admin/', admin.site.urls), - path('api/', include(api_router.urls)), - path('api-auth/', include( - 'rest_framework.urls', - namespace='rest_framework', - )), - path('api/i18n/', i18n),path("users/", include("user.urls")), - + path("admin", RedirectView.as_view(url="/admin/", permanent=True)), + path("api", RedirectView.as_view(url="/api/", permanent=True)), + path("api-auth", RedirectView.as_view(url="/api-auth/", permanent=True)), + path("admin/", admin.site.urls), + path("api/", include(api_router.urls)), + path( + "api-auth/", + include( + "rest_framework.urls", + namespace="rest_framework", + ), + ), + path("api/i18n/", i18n), + path("users/", include("user.urls")), spa_url, # catch-all; unknown paths to be handled by a SPA ] diff --git a/backend/example/__init__.py b/backend/problem/__init__.py similarity index 100% rename from backend/example/__init__.py rename to backend/problem/__init__.py diff --git a/backend/example/admin.py 
b/backend/problem/admin.py similarity index 100% rename from backend/example/admin.py rename to backend/problem/admin.py diff --git a/backend/problem/apps.py b/backend/problem/apps.py new file mode 100644 index 0000000..ab8c85e --- /dev/null +++ b/backend/problem/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class ProblemConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "problem" diff --git a/backend/example/migrations/__init__.py b/backend/problem/management/commands/__init__.py similarity index 100% rename from backend/example/migrations/__init__.py rename to backend/problem/management/commands/__init__.py diff --git a/backend/problem/management/commands/import_fracas.py b/backend/problem/management/commands/import_fracas.py new file mode 100644 index 0000000..254fb1c --- /dev/null +++ b/backend/problem/management/commands/import_fracas.py @@ -0,0 +1,125 @@ +import json +import xml.etree.ElementTree as ET + +from django.core.management.base import BaseCommand +from django.db import transaction +from tqdm import tqdm + +from langpro_annotator.logger import logger +from problem.services import get_fracas_problems +from problem.models import Problem + + +class Command(BaseCommand): + help = "Import FraCaS problems from fracas.xml." + + def add_arguments(self, parser): + parser.add_argument( + "--fracas_path", + type=str, + default="problem/data/fracas.xml", + help="Path to the fracas.xml file.", + ) + + def handle(self, *args, **options): + fracas_path = options["fracas_path"] + self.import_fracas_problems(fracas_path) + + @staticmethod + def _text_from_element(element: ET.Element) -> str: + """ + Extracts stripped text from an XML element, returning an empty string if the element is None or has no text. 
+ """ + return element.text.strip() if element is not None and element.text else "" + + @staticmethod + def _annotate_section_subsections(tree: ET.ElementTree) -> None: + """ + Annotates each problem in the XML tree with its corresponding section, subsection, and subsubsection. + """ + current_section = None + current_subsection = None + current_subsubsection = None + + root = tree.getroot() + + for element in root: + if element.tag == "comment" and element.attrib.get("class") == "section": + current_section = element.text.strip() + elif ( + element.tag == "comment" and element.attrib.get("class") == "subsection" + ): + current_subsection = element.text.strip() + elif ( + element.tag == "comment" + and element.attrib.get("class") == "subsubsection" + ): + current_subsubsection = element.text.strip() + elif element.tag == "problem": + if current_section: + element.set("section", current_section) + if current_subsection: + element.set("subsection", current_subsection) + if current_subsubsection: + element.set("subsubsection", current_subsubsection) + + def import_fracas_problems(self, fracas_path: str) -> None: + tree = ET.parse(fracas_path) + self._annotate_section_subsections(tree) + root = tree.getroot() + all_problems = root.findall("problem") + + created = 0 + skipped = 0 + + existing_fracas_problems = get_fracas_problems() + existing_fracas_ids = {p.fracas_id for p in existing_fracas_problems} + + for problem in tqdm(all_problems, desc="Importing FraCaS problems"): + problem_id = problem.get("id") + if problem_id is None: + raise ValueError( + "Problem ID is missing in the XML file for problem: {}".format( + problem + ) + ) + + if int(problem_id) in existing_fracas_ids: + skipped += 1 + continue + + question = self._text_from_element(problem.find("q")) + hypothesis = self._text_from_element(problem.find("h")) + answer = self._text_from_element(problem.find("a")) + note = self._text_from_element(problem.find("note")) + + section = problem.get("section") + 
subsection = problem.get("subsection") + fracas_answer = problem.get("fracas_answer") + fracas_nonstandard = problem.get("fracas_nonstandard", False) == "true" + + premise_nodes = problem.findall("p") + premises = [node.text.strip() for node in premise_nodes if node.text] + + Problem.objects.create( + type=Problem.ProblemType.FRACAS, + content=json.dumps( + { + "fracas_id": int(problem_id), + "question": question, + "hypothesis": hypothesis, + "answer": answer, + "fracas_answer": fracas_answer, + "fracas_non_standard": fracas_nonstandard, + "note": note, + "section_name": section, + "subsection_name": subsection, + "premises": premises, + } + ), + ) + created += 1 + + logger.info( + f"FraCaS problems import complete! Total: {created} | Skipped: {skipped}" + ) diff --git a/backend/problem/management/commands/import_sick.py b/backend/problem/management/commands/import_sick.py new file mode 100644 index 0000000..fda2fdf --- /dev/null +++ b/backend/problem/management/commands/import_sick.py @@ -0,0 +1,55 @@ +import csv +import json + +from django.core.management.base import BaseCommand +from tqdm import tqdm + +from langpro_annotator.logger import logger +from problem.models import Problem +from problem.services import get_sick_problems + + +class Command(BaseCommand): + help = "Import SICK problems from SICK.txt (a TSV file)." + + def add_arguments(self, parser): + parser.add_argument( + "--sick_path", + type=str, + default="problem/data/SICK.txt", + help="Path to the SICK.txt file.", + ) + + def handle(self, *args, **options): + sick_path = options["sick_path"] + self.import_sick_problems(sick_path) + + def import_sick_problems(self, sick_path: str) -> None: + """ + Import SICK problems from SICK.txt (a TSV file) and enter them into the database. 
+ """ + + skipped = 0 + created = 0 + + existing_sick_problems = get_sick_problems() + existing_pair_ids = {p.pair_id for p in existing_sick_problems} + + with open(sick_path, "r", encoding="utf-8") as file: + reader = csv.DictReader(file, delimiter="\t") + problem_list = list(reader) + + for problem in tqdm(problem_list, desc="Importing SICK problems"): + if problem["pair_ID"] in existing_pair_ids: + skipped += 1 + continue + + created += 1 + Problem.objects.create( + type=Problem.ProblemType.SICK, + content=json.dumps(problem), + ) + + logger.info( + f"SICK problems import complete! Created: {created} | Skipped: {skipped}" + ) diff --git a/backend/problem/migrations/0001_initial.py b/backend/problem/migrations/0001_initial.py new file mode 100644 index 0000000..76f45de --- /dev/null +++ b/backend/problem/migrations/0001_initial.py @@ -0,0 +1,34 @@ +# Generated by Django 4.2.20 on 2025-05-22 13:40 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="Problem", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "type", + models.CharField( + choices=[("sick", "Sick"), ("fracas", "FraCaS")], max_length=255 + ), + ), + ("content", models.JSONField()), + ], + ), + ] diff --git a/backend/problem/migrations/__init__.py b/backend/problem/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/problem/models.py b/backend/problem/models.py new file mode 100644 index 0000000..9c3a272 --- /dev/null +++ b/backend/problem/models.py @@ -0,0 +1,14 @@ +from django.db import models + + +class Problem(models.Model): + class ProblemType(models.TextChoices): + SICK = "sick", "Sick" + FRACAS = "fracas", "FraCaS" + + type = models.CharField( + max_length=255, + choices=ProblemType.choices, + ) + + content = models.JSONField() diff --git 
a/backend/problem/services.py b/backend/problem/services.py new file mode 100644 index 0000000..16a48ac --- /dev/null +++ b/backend/problem/services.py @@ -0,0 +1,76 @@ +import json + +from langpro_annotator.logger import logger +from .models import Problem +from .types import FracasProblem, SickProblem + + +def get_sick_problems() -> list[SickProblem]: + """ + Retrieves all Problem objects of type 'SICK' from the database + and converts them into SickProblem instances. + """ + sick_problems: list[SickProblem] = [] + sick_objects = Problem.objects.filter(type=Problem.ProblemType.SICK) + + for sick_obj in sick_objects: + try: + problem_data = json.loads(sick_obj.content) + problem = SickProblem( + pair_id=problem_data["pair_ID"], + sentence_one=problem_data["sentence_A"], + sentence_two=problem_data["sentence_B"], + entailment_label=problem_data["entailment_label"], + relatedness_score=float(problem_data["relatedness_score"]), + ) + sick_problems.append(problem) + except json.JSONDecodeError: + logger.warning( + f"Warning: Could not parse JSON content for Problem ID {sick_obj.id}" + ) + continue + except TypeError as e: + logger.warning( + f"Warning: Could not create SickProblem for Problem ID {sick_obj.id}: {e}" + ) + continue + + return sick_problems + + +def get_fracas_problems() -> list[FracasProblem]: + """ + Retrieves all Problem objects of type 'Fracas' from the database + and converts them into FracasProblem instances. 
+ """ + fracas_problems: list[FracasProblem] = [] + problem_objects = Problem.objects.filter(type=Problem.ProblemType.FRACAS) + + for problem_obj in problem_objects: + try: + problem_data = json.loads(problem_obj.content) + problem = FracasProblem( + fracas_id=problem_data["fracas_id"], + question=problem_data["question"], + hypothesis=problem_data["hypothesis"], + answer=problem_data["answer"], + fracas_answer=problem_data["fracas_answer"], + fracas_non_standard=problem_data["fracas_non_standard"], + note=problem_data["note"], + section_name=problem_data["section_name"], + subsection_name=problem_data["subsection_name"], + premises=problem_data.get("premises", []), + ) + fracas_problems.append(problem) + except json.JSONDecodeError: + logger.warning( + f"Warning: Could not parse JSON content for Problem ID {problem_obj.id}" + ) + continue + except TypeError as e: + logger.warning( + f"Warning: Could not create FracasProblem for Problem ID {problem_obj.id}: {e}" + ) + continue + + return fracas_problems diff --git a/backend/example/tests.py b/backend/problem/tests.py similarity index 100% rename from backend/example/tests.py rename to backend/problem/tests.py diff --git a/backend/problem/types.py b/backend/problem/types.py new file mode 100644 index 0000000..9b8ae93 --- /dev/null +++ b/backend/problem/types.py @@ -0,0 +1,29 @@ +from typing import Literal +from dataclasses import dataclass, field + + +@dataclass(frozen=True) +class SickProblem: + pair_id: int + sentence_one: str + sentence_two: str + entailment_label: Literal["neutral", "contradiction", "entailment"] + relatedness_score: float + + +@dataclass(frozen=True) +class FracasProblem: + fracas_id: int + question: str + hypothesis: str + answer: str + fracas_answer: Literal["yes", "no", "unknown", "undefined"] + fracas_non_standard: bool + note: str + section_name: str + subsection_name: str + premises: list[str] = field(default_factory=list) + + + + diff --git a/backend/problem/views.py 
b/backend/problem/views.py new file mode 100644 index 0000000..91ea44a --- /dev/null +++ b/backend/problem/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. diff --git a/backend/requirements.in b/backend/requirements.in index b746066..3637029 100644 --- a/backend/requirements.in +++ b/backend/requirements.in @@ -6,7 +6,7 @@ psycopg2 pytest pytest-django pytest-xdist - dj_rest_auth django-allauth -requests-oauthlib \ No newline at end of file +requests-oauthlib +tqdm diff --git a/backend/requirements.txt b/backend/requirements.txt index 5c9d2ff..48e55dc 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -13,7 +13,9 @@ certifi==2025.1.31 charset-normalizer==3.4.1 # via requests colorama==0.4.6 - # via pytest + # via + # pytest + # tqdm dj-rest-auth==7.0.1 # via -r requirements.in django==4.2.20 @@ -65,6 +67,8 @@ sqlparse==0.5.3 # via django tornado==6.4.2 # via django-livereload-server +tqdm==4.67.1 + # via -r requirements.in tzdata==2025.1 # via django urllib3==2.3.0 diff --git a/frontend/README.md b/frontend/README.md index 1d738d0..dae4f5c 100644 --- a/frontend/README.md +++ b/frontend/README.md @@ -1,5 +1,25 @@ # LangPro Annotator frontend +## Import ProofBank data + +LangPro Annotator can be used to browse and annotate problems from the ProofBank dataset, which is a collection of problems from both the [SICK (Sentences Involving Compositional Knowledge)][1] and [FraCaS (Framework for Computational Semantics)][2] datasets. For more information about these datasets, see the [References](#references) section below. + +To load these problems in, you need to follow the steps below: +- Obtain the SICK data [here][1] and put it in the project folder as `backend/problem/data/SICK.txt`. +- Obtain the FraCaS data [here][2] and put it in the project folder as `backend/problem/data/fracas.xml`. 
+- From the `backend` directory, run `python manage.py import_sick`. +- From the `backend` directory, run `python manage.py import_fracas`. + +[1]: http://clic.cimec.unitn.it/composes/sick.html +[2]: https://www-nlp.stanford.edu/~wcmac/downloads/ + + +## References + +- [SICK dataset](http://clic.cimec.unitn.it/composes/sick.html) (Project page and download link) +- Marelli, M., Menini, S., Baroni, M., Bentivogli, L., Bernardi, R., and Zamparelli, R. (2014b). A SICK cure for the evaluation of compositional distributional semantic models. LREC 2014, pages 216–223. [PDF](http://www.lrec-conf.org/proceedings/lrec2014/pdf/363_Paper.pdf) +- [FraCaS dataset](https://www-nlp.stanford.edu/~wcmac/downloads/) (represented in XML by [Bill MacCartney](https://www-nlp.stanford.edu/~wcmac/)). + ## Development server Run `yarn start` for a dev server. Navigate to `http://localhost:4200/`. This will not start the backend, to developing with a functioning backend use `yarn start` from the project root instead. Navigate to `http://localhost:8000/`, which will forward to the frontend. diff --git a/frontend/src/app/home/home.component.ts b/frontend/src/app/home/home.component.ts index 7988017..8809be1 100644 --- a/frontend/src/app/home/home.component.ts +++ b/frontend/src/app/home/home.component.ts @@ -1,6 +1,4 @@ -import { HttpClient } from "@angular/common/http"; -import { Component, OnInit } from "@angular/core"; -import { map } from "rxjs"; +import { Component } from "@angular/core"; @Component({ selector: "la-home", @@ -8,20 +6,4 @@ import { map } from "rxjs"; styleUrls: ["./home.component.scss"], standalone: true, }) -export class HomeComponent implements OnInit { - public hooray?: string; - - constructor(private http: HttpClient) {} - - ngOnInit(): void { - // This call is executed on the server and in the browser. 
- this.http - .get<{ message: string }[]>(`/api/example/`) - .pipe(map((hoorays) => hoorays[0].message)) - .subscribe((hooray) => { - if (!this.hooray) { - this.hooray = hooray; - } - }); - } -} +export class HomeComponent {}