Merge branch 'main' of https://github.com/MEITREX/docprocai_service

myluki2000 · myluki2000 · commit b2245c21ffd1 · 2024-10-31T11:09:17.000+01:00
diff --git a/controller/dapr_controller.py b/controller/dapr_controller.py
@@ -1,9 +1,12 @@
+from enum import Enum, auto
+
 import dapr
 from dapr.ext.fastapi.app import DaprApp
 from fastapi import FastAPI
 import uuid
 
 from dto import TaskInformationDto
+from controller.events import ContentChangeEvent, CrudOperation
 from service.DocProcAiService import DocProcAiService
 
 
@@ -34,4 +37,13 @@ def assessment_content_mutated_handler(data: dict):
             assessment_id = uuid.UUID(data["data"]["assessmentId"])
             task_information: list[TaskInformationDto] = data["data"]["taskInformationList"]
 
-            ai_service.enqueue_generate_assessment_segments(assessment_id, task_information)
+            ai_service.enqueue_generate_assessment_segments(assessment_id, task_information)
+
+        @dapr_app.subscribe(pubsub="meitrex", topic="content-changed")
+        def assessment_content_deleted_handler(data: dict):  # Handles "content-changed" events; only DELETE operations trigger work.
+            content_change_event = ContentChangeEvent(data["data"]["contentIds"], data["data"]["operation"])
+
+            if content_change_event.crudOperation == "DELETE":  # NOTE(review): crudOperation is the raw payload value here and is compared as a string.
+                ai_service.delete_entries_of_assessments(content_change_event)
+
+
diff --git a/controller/events.py b/controller/events.py
@@ -0,0 +1,23 @@
+import uuid
+from dataclasses import dataclass
+from enum import Enum, auto
+
+
+class CrudOperation(Enum):
+    """Kind of change that a content-change event reports."""
+
+    CREATE = auto()
+    UPDATE = auto()
+    DELETE = auto()
+
+
+@dataclass
+class ContentChangeEvent:
+    """Event data: the affected content IDs and the operation applied to them."""
+
+    # IDs of the contents the event refers to.
+    contentIds: list[uuid.UUID]
+    # Operation applied to those contents.
+    # NOTE(review): dataclasses do not convert or validate field values, so this holds
+    # whatever the caller passes in — confirm callers supply a CrudOperation member.
+    crudOperation: CrudOperation
diff --git a/controller/graphql_controller.py b/controller/graphql_controller.py
@@ -85,6 +85,10 @@ def get_media_record_summary(parent, info, mediaRecordId: UUID) -> list[str]:
         def get_media_record_suggested_tags(parent, info, mediaRecordId: UUID) -> list[str]:
             return ai_service.get_media_record_tags(mediaRecordId)
 
+        @query.field("_internal_noauth_getAssessmentSuggestedTags")
+        def get_assessment_suggested_tags(parent, info, assessmentId: UUID) -> list[str]:  # Resolver: suggested tags of one assessment.
+            return ai_service.get_assessment_tags(assessmentId)
+
         @query.field("_internal_noauth_getMediaRecordsAiProcessingProgress")
         def get_media_records_ai_processing_state(parent, info, mediaRecordIds: list[UUID])\
                 -> list[AiEntityProcessingProgressDto]:
diff --git a/fileextractlib/TopicModel.py b/fileextractlib/TopicModel.py
@@ -8,22 +8,21 @@
 from bertopic.vectorizers import ClassTfidfTransformer
 from sklearn.feature_extraction.text import CountVectorizer
 
+from persistence.AssesmentInfoDbConnector import AssessmentInfoDbConnector
 from persistence.MediaRecordInfoDbConnector import MediaRecordInfoDbConnector
 from persistence.SegmentDbConnector import SegmentDbConnector
-from persistence.entities import DocumentSegmentEntity, VideoSegmentEntity
+from persistence.entities import DocumentSegmentEntity, VideoSegmentEntity, AssessmentSegmentEntity
 
 _logger = logging.getLogger(__name__)
 
 
 class TopicModel:
     model = BERTopic()
 
-    def __init__(self, record_segments: list[DocumentSegmentEntity | VideoSegmentEntity], media_records):
+    def __init__(self, record_segments: list[VideoSegmentEntity | DocumentSegmentEntity | AssessmentSegmentEntity]):
         self.record_segments = []
-        self.media_records = {}
         self.docs = []
         self.record_segments = record_segments
-        self.media_records = media_records
         self.docs = []
 
     def create_topic_model(self):
@@ -36,6 +35,9 @@ def create_topic_model(self):
             if isinstance(entity, VideoSegmentEntity):
                 self.docs.append(entity.transcript)
                 embeddings.append(entity.embedding)
+            if isinstance(entity, AssessmentSegmentEntity):
+                self.docs.append(entity.textual_representation)
+                embeddings.append(entity.embedding)
 
         if len(self.docs) < 11:
             _logger.info("More documents needed to create topic model.")
@@ -46,7 +48,6 @@ def create_topic_model(self):
         ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)
         mmr = MaximalMarginalRelevance(diversity=0.3)
 
-
         representation_models = mmr
 
         self.model = BERTopic(
@@ -58,41 +59,74 @@ def create_topic_model(self):
 
         self.model.fit_transform(self.docs, embeddings)
 
-    def add_tags_to_media_records(self, record_segments, media_records):
+    def add_tags_to_media_records(self, segments):
         if len(self.docs) < 11:
             _logger.info("Topic model wasn't created. More documents needed.")
             return
         document_info = self.model.get_document_info(self.docs)
         mediarecords_with_tags = {}
 
         i = 0
-        for record in media_records:
-            mediarecords_with_tags.update({record.get(id): set()})
+        while i < len(segments):
+            if isinstance(segments[i], AssessmentSegmentEntity):
+                i += 1
+                continue
 
-        while i < len(record_segments):
-            mediarecord_id = record_segments[i].media_record_id
+            mediarecord_id = segments[i].media_record_id
 
-            if isinstance(record_segments[i], DocumentSegmentEntity):
-                if record_segments[i].text != document_info['Document'].iat[i]:
+            if isinstance(segments[i], DocumentSegmentEntity):
+                if segments[i].text != document_info['Document'].iat[i]:
+                    i += 1
                     continue
-
-            elif isinstance(record_segments[i], VideoSegmentEntity):
-                if record_segments[i].transcript != document_info['Document'].iat[i]:
+            elif isinstance(segments[i], VideoSegmentEntity):
+                if segments[i].transcript != document_info['Document'].iat[i]:
+                    i += 1
                     continue
 
             tags = set()
             if mediarecords_with_tags.get(mediarecord_id) is not None:
-               tags = mediarecords_with_tags.get(mediarecord_id)
+                tags = mediarecords_with_tags.get(mediarecord_id)
             tags.update(set(document_info['Representation'].iat[i]))
 
             mediarecords_with_tags.update({mediarecord_id: tags})
             i += 1
 
         return mediarecords_with_tags
 
-if __name__ == "__main__":
+    def add_tags_to_assessments(self, segments):
+        if len(self.docs) < 11:
+            _logger.info("Topic model wasn't created. More documents needed.")
+            return
+        document_info = self.model.get_document_info(self.docs)
+        assesments_with_tags = {}
+
+        i = 0
+
+        while i < len(segments):
+            if isinstance(segments[i], DocumentSegmentEntity) or isinstance(segments[i], VideoSegmentEntity):
+                i += 1
+                continue
 
-    star = time.time()
+            assessment_id = segments[i].assessment_id
+
+            if isinstance(segments[i], AssessmentSegmentEntity):
+                if segments[i].textual_representation != document_info['Document'].iat[i]:
+                    i += 1
+                    continue
+
+            tags = set()
+            if assesments_with_tags.get(assessment_id) is not None:
+                tags = assesments_with_tags.get(assessment_id)
+            tags.update(set(document_info['Representation'].iat[i]))
+
+            assesments_with_tags.update({assessment_id: tags})
+            i += 1
+
+        return assesments_with_tags
+
+
+if __name__ == "__main__":
+    start = time.time()
 
     print("Connecting to DB")
     database_connection = psycopg.connect(
@@ -103,24 +137,29 @@ def add_tags_to_media_records(self, record_segments, media_records):
 
     segment_database = SegmentDbConnector(database_connection)
     media_record_info_database = MediaRecordInfoDbConnector(database_connection)
+    assessment_database = AssessmentInfoDbConnector(database_connection)
 
     print("Loading segments and media records")
 
-    record_segments = segment_database.get_all_media_record_segments()
+    segments = segment_database.get_all_entity_segments()
     media_records = media_record_info_database.get_all_media_records()
+    assessments = assessment_database.get_all_assessments()
 
-    topic_model = TopicModel(record_segments, media_records)
+    topic_model = TopicModel(segments)
 
     print("Running Topic model")
     topic_model.create_topic_model()
+    print("Topic model created")
 
-    media_records_with_tags = topic_model.add_tags_to_media_records(record_segments, media_records)
+    print("Adding tags")
+    media_records_with_tags = topic_model.add_tags_to_media_records(segments)
+    assessments_with_tags = topic_model.add_tags_to_assessments(segments)
     if media_records_with_tags is not None:
         for mrid, tags in media_records_with_tags.items():
             media_record_info_database.update_media_record_tags(mrid, list(tags))
-    end = time.time()
-    print("Done in " + str(end - star) + " seconds")
-
-
-
 
+    if assessments_with_tags is not None:
+        for aid, tags in assessments_with_tags.items():
+            assessment_database.update_assessment_tags(aid, list(tags))
+    end = time.time()
+    print("Done in " + str(end - start) + " seconds")
diff --git a/persistence/AssesmentInfoDbConnector.py b/persistence/AssesmentInfoDbConnector.py
@@ -0,0 +1,83 @@
+from uuid import UUID
+
+import psycopg
+from pgvector.psycopg import register_vector
+
+
+class AssessmentInfoDbConnector:
+    """Database access for the ``assessments`` table (assessment id plus tags)."""
+
+    def __init__(self, db_connection: psycopg.Connection):
+        """Keep the connection and create the ``assessments`` table if it is missing."""
+        self.db_connection = db_connection
+
+        # ensure pgvector extension is installed, we need it to store text embeddings
+        # NOTE(review): the assessments table created below has no vector column.
+        self.db_connection.execute("CREATE EXTENSION IF NOT EXISTS vector")
+        register_vector(self.db_connection)
+
+        self.db_connection.execute(
+            """
+            CREATE TABLE IF NOT EXISTS assessments (
+              id uuid PRIMARY KEY,
+              tags text[]
+            );
+            """)
+
+    def upsert_assessment_info(self, id: UUID):
+        """Insert the assessment with an empty tag list.
+
+        If a row with this id already exists, its tags are reset to the empty
+        list as well (``tags = EXCLUDED.tags``).
+        """
+        self.db_connection.execute(
+            query="""
+                  INSERT INTO assessments (id, tags)
+                  VALUES (%s, %s)
+                  ON CONFLICT (id)
+                  DO UPDATE SET 
+                    tags = EXCLUDED.tags
+                  """,
+            params=(id, [])
+        )
+
+    def get_assessment_tags_by_id(self, assesment_id) -> list[str]:
+        """Return the tags stored for the assessment.
+
+        Returns an empty list when no row exists for the id or when the stored
+        tags value is NULL. Rows are read by column name, so the connection is
+        presumably configured with a dict-style row factory — confirm at the call site.
+        """
+        query_result = self.db_connection.execute(
+            "SELECT tags FROM assessments WHERE id = %s",
+            (assesment_id,)).fetchone()
+
+        if query_result is None:
+            return []
+
+        # tags is a nullable column; normalise NULL to an empty list for callers.
+        return query_result["tags"] or []
+
+    def get_all_assessments(self):
+        """Return all rows of the ``assessments`` table."""
+        # Connection.execute() creates the cursor itself, like the other methods here.
+        return self.db_connection.execute("SELECT * FROM assessments").fetchall()
+
+    def update_assessment_tags(self, id: UUID, tags: list[str]):
+        """Replace the stored tags of the assessment with the given id."""
+        self.db_connection.execute(
+            """
+                UPDATE assessments
+                SET tags = (%(tags)s)
+                WHERE id = (%(id)s)
+            """,
+            {'tags': tags, 'id': id})
+
+    def delete_assessment_by_id(self, id: UUID):
+        """Delete the row of the assessment with the given id."""
+        self.db_connection.execute(
+            """
+            DELETE FROM assessments WHERE id = (%(id)s)
+            """,
+            {'id': id}
+        )
diff --git a/persistence/IngestionStateDbConnector.py b/persistence/IngestionStateDbConnector.py
@@ -101,4 +101,12 @@ def get_enqueued_or_processing_ingestion_entities(self) \
             FROM media_record_ingestion_states
             WHERE state IN ('ENQUEUED', 'PROCESSING');
             """).fetchall()
-        return [(x["id"], x["entity_type"], x["state"]) for x in query_results]
+        return [(x["id"], x["entity_type"], x["state"]) for x in query_results]
+
+    def delete_ingestion_state(self, id: UUID) -> None:  # Delete the media_record_ingestion_states row with the given id.
+        self.db_connection.execute(
+            """
+            DELETE FROM media_record_ingestion_states WHERE id = (%(id)s);
+            """,
+            {'id': id}  # bound to the named placeholder %(id)s
+        )
diff --git a/persistence/MediaRecordInfoDbConnector.py b/persistence/MediaRecordInfoDbConnector.py
@@ -81,3 +81,11 @@ def update_media_record_tags(self, id: UUID, tags: list[str]):
                 WHERE id = (%(id)s)
             """,
             {'tags': tags, 'id': id})
+
+    def delete_media_record_by_id(self, id: UUID):  # Delete the media_records row with the given id.
+        self.db_connection.execute(
+            """
+            DELETE FROM media_records WHERE id = (%(id)s)
+            """,
+            {'id': id}  # bound to the named placeholder %(id)s
+        )
diff --git a/persistence/SegmentDbConnector.py b/persistence/SegmentDbConnector.py
@@ -290,6 +290,19 @@ def get_all_media_record_segments(self) -> list[EntitySegmentEntity]:
                 """
         return self.__get_record_segments_with_query(query, {})
 
+    def get_all_entity_segments(self) -> list[EntitySegmentEntity]:  # Load the segments of all three tables: documents, videos and assessments.
+        query = """
+                SELECT * FROM (
+                    (SELECT *, 'document' AS source FROM document_segments) AS t1
+                    NATURAL FULL JOIN
+                    (SELECT *, 'video' AS source FROM video_segments) AS t2
+                    NATURAL FULL JOIN 
+                    (SELECT *, 'assessment' AS source FROM assessment_segments) AS t3
+                );
+                """
+        return self.__get_record_segments_with_query(query, {})  # the query takes no parameters
+
+
     def get_entity_segments_by_ids(self, segment_ids: list[UUID]) -> list[EntitySegmentEntity]:
         query = """
                 WITH document_results AS (
diff --git a/persistence/entities.py b/persistence/entities.py
@@ -7,6 +7,11 @@ def __init__(self, id: UUID, summary: list[str], tags: set):
         self.summary = summary
         self.tags = tags
 
+class AssessmentEntity:  # Entity holding an assessment's id and its tags.
+    def __init__(self, id: UUID, tags: set):
+        self.id = id  # previously accepted but never stored
+        self.tags = tags
+
 
 class DocumentSegmentEntity:
     def __init__(self, id: UUID, media_record_id: UUID, page_index: int, text: str, thumbnail: bytes, title: str,
diff --git a/schema/query.graphqls b/schema/query.graphqls
@@ -82,6 +82,15 @@ type Query {
     """
     _internal_noauth_getMediaRecordSuggestedTags(mediaRecordId: UUID!): [String!]!
 
+    """
+    Gets the suggested tags of the specified assessment. Returns a list of strings
+    where each string is a tag.
+
+    ⚠️ This query is only accessible internally in the system and allows the caller to fetch contents without
+    any permissions check and should not be called without any validation of the caller's permissions. ⚠️
+    """
+    _internal_noauth_getAssessmentSuggestedTags(assessmentId: UUID!): [String!]!
+
     """
     Gets the DocProcAI ingestion processing state of the specified media records. "UNKNOWN" is returned if the specified
     ID is unknown to the service (either because a media record with the given ID does not exist or because the media
diff --git a/service/DocProcAiService.py b/service/DocProcAiService.py