Merge pull request #3 from MEITREX/video_embedding

myluki2000 · web-flow · commit c81c0dc27d7a · 2024-07-01T16:43:06.000+02:00
Video embedding
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 .DS_Store
 venv
 .vscode
+.idea
 *.mp3
 *.mp4
 *.wav
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,18 @@
+ffmpeg-python
+pytesseract
+python-pptx
+transformers
+bitsandbytes
+pydantic
+lm-format-enforcer
+webvtt-py
+numpy
+openai-whisper
+Pillow
+levenshtein
+pdf2image
+sentence-transformers
+fastapi
+psycopg
+uvicorn
+pgvector
diff --git a/src/fileextractlib/LecturePdfEmbeddingGenerator.py b/src/fileextractlib/LecturePdfEmbeddingGenerator.py
@@ -0,0 +1,16 @@
+from typing import Any, Generator
+from torch._tensor import Tensor
+import PdfProcessor
+import SentenceEmbeddingRunner
+
+class LecturePdfEmbeddingGenerator:
+    """Generates one embedding per page of a lecture PDF, skipping pages without text."""
+
+    @staticmethod
+    def generate_embedding(file_url: str) -> list[tuple[Tensor, str, int]]:
+        """
+        Extracts the pages of the PDF at the given URL and computes an embedding for each page that has text.
+
+        Declared as a static method because it takes no ``self``: calling it on the class works as before, and
+        calling it on an instance no longer passes the instance as ``file_url``.
+
+        :param file_url: URL of the PDF file, passed on to PdfProcessor.process_from_url.
+        :return: A list of (embedding, page text, page number) tuples, one for each page with non-blank text.
+        """
+        pdf_processor = PdfProcessor.PdfProcessor()
+        pages = pdf_processor.process_from_url(file_url)
+
+        # remove pages whose text is null or consists only of whitespace
+        filtered_pages = [page for page in pages if page["text"] is not None and page["text"].strip()]
+        if not filtered_pages:
+            # nothing to embed, so don't invoke the embedding model at all
+            return []
+
+        texts = [page["text"] for page in filtered_pages]
+        page_numbers = [page["page_number"] for page in filtered_pages]
+
+        # NOTE(review): assumes generate_embeddings returns one embedding per input text, in input order
+        embeddings = SentenceEmbeddingRunner.generate_embeddings(texts)
+
+        return list(zip(embeddings, texts, page_numbers))
diff --git a/src/fileextractlib/LectureVideoEmbeddingGenerator.py b/src/fileextractlib/LectureVideoEmbeddingGenerator.py
@@ -0,0 +1,109 @@
+from webvtt import WebVTT
+from TranscriptGenerator import TranscriptGenerator
+import ffmpeg
+import pytesseract
+import PIL
+import io
+import time
+import Levenshtein
+import SentenceEmbeddingRunner
+from torch import Tensor
+
+class LectureVideoEmbeddingGenerator:
+    """
+    Splits a lecture video into sections based on changes of the on-screen text and generates an embedding
+    for each section from its transcript and its screen text.
+    """
+
+    # Levenshtein ratio above which the OCR texts of two frames are treated as the same screen content
+    screen_text_similarity_threshold: float = 0.8
+
+    class Section:
+        """A run of consecutive captions during which the on-screen text stayed similar."""
+
+        def __init__(self, start_time: int, transcript: str, screen_text: str, embedding: Tensor):
+            self.start_time: int = start_time
+            self.transcript: str = transcript
+            self.screen_text: str = screen_text
+            self.embedding: Tensor = embedding
+
+    def generate_embeddings(self, file_url: str) -> list[Section]:
+        """
+        Transcribes the video, extracts a frame at the start of each caption, groups the captions into
+        sections by comparing the OCR text of those frames, and computes one embedding per section.
+
+        :param file_url: Path/URL of the video; passed to TranscriptGenerator.process_to_vtt and to ffmpeg.
+        :raises ValueError: Raised when the image data returned by ffmpeg is not a sequence of valid BMP files.
+        :return: The sections of the video, each with its embedding set. Empty if the transcript has no
+                 captions or ffmpeg produced no frames.
+        """
+        transcript_generator: TranscriptGenerator = TranscriptGenerator()
+        vtt: WebVTT = transcript_generator.process_to_vtt(file_url)
+
+        if not vtt.captions:
+            # without captions there is nothing to section, and the select filter below would be empty
+            return []
+
+        # construct ffmpeg select filter to extract a frame at each transcript caption start time
+        # NOTE(review): eq(t,...) only selects a frame whose timestamp equals the caption start exactly. If a
+        # caption start falls between two frames no image is produced for it, and the images then no longer
+        # line up with the caption indices used below - confirm against the videos being processed.
+        select_filter: str = "+".join(f"eq(t,{caption.start_in_seconds})" for caption in vtt.captions)
+
+        out, _ = (ffmpeg.input(file_url)
+                  .filter_("select", select_filter)
+                  .output("-", vsync=0, format="image2pipe", vcodec="bmp")
+                  .run(capture_stdout=True)
+                  )
+
+        # one BMP file per extracted frame; the position in the list is the index of the matching caption
+        bmp_files: list[bytes] = self._split_bmp_stream(out)
+
+        # delete ffmpeg output, we don't need it anymore
+        del out
+
+        # we will now create longer sections from our captions. Captions usually have a length of a sentence or a part of a sentence.
+        # We extracted images at the start of each caption, now we will check when the video changes significantly and create a new section,
+        # merging the captions within the timespan of that section
+        sections: list[LectureVideoEmbeddingGenerator.Section] = []
+        current_section = None
+        for image_index, bmp_file in enumerate(bmp_files):
+            image = PIL.Image.open(io.BytesIO(bmp_file))
+            screen_text = pytesseract.image_to_string(image)
+
+            caption = vtt.captions[image_index]
+            # captions always have a leading "- ", so we remove it
+            caption_text: str = caption.text[2:]
+
+            if current_section is not None:
+                similarity = Levenshtein.ratio(current_section.screen_text, screen_text)
+                if similarity > self.screen_text_similarity_threshold:
+                    # the screen text is similar, so this caption still belongs to the current section
+                    current_section.transcript += " " + caption_text
+                    continue
+
+                # the screen text changed, so the current section is complete
+                sections.append(current_section)
+
+            # first image, or the screen text changed: start a new section with this caption
+            current_section = LectureVideoEmbeddingGenerator.Section(
+                start_time=caption.start_in_seconds,
+                transcript=caption_text,
+                screen_text=screen_text,
+                embedding=None)
+
+        # the section that was still being built when the images ran out has to be kept as well
+        if current_section is not None:
+            sections.append(current_section)
+
+        for section in sections:
+            section.embedding = SentenceEmbeddingRunner.generate_embeddings([section.transcript + "\n\n" + section.screen_text])[0]
+
+        return sections
+
+    @staticmethod
+    def _split_bmp_stream(data: bytes) -> list[bytes]:
+        """Splits the image2pipe output of ffmpeg, a run of concatenated BMP files, into the individual files."""
+        bmp_files: list[bytes] = []
+        byte_offset = 0
+        while byte_offset < len(data):
+            # ensure BMP magic number is present
+            if data[byte_offset:byte_offset + 2] != b'BM':
+                raise ValueError("Invalid BMP file")
+
+            # bytes 2-5 of the BMP file header hold the size of the whole file, little-endian
+            size_in_bytes: int = int.from_bytes(data[byte_offset + 2:byte_offset + 6], byteorder='little')
+            if size_in_bytes < 14:
+                # smaller than the 14-byte BMP file header, so the data is corrupt; a size of 0 would also
+                # keep this loop from ever advancing
+                raise ValueError("Invalid BMP file")
+
+            bmp_files.append(data[byte_offset:byte_offset + size_in_bytes])
+            byte_offset += size_in_bytes
+
+        return bmp_files
+
+        
+if __name__ == "__main__":
+    import sys
+
+    # manual test run: embeds the video given as first command line argument, falling back to the
+    # developer's local sample file when no argument is passed
+    video_path = sys.argv[1] if len(sys.argv) > 1 else r"E:\Lukas\Downloads\LLM-Lehrmaterial\LLM-Lehrmaterial\Marco Aiello\ Lecture videos\DS\20 Distributed Transactions.mp4"
+
+    start_time = time.time()
+    generator = LectureVideoEmbeddingGenerator()
+    # the method is called generate_embeddings (plural); generate_embedding does not exist on this class
+    generator.generate_embeddings(video_path)
+    end_time = time.time()
+    print("Embedding generated successfully in " + str(end_time - start_time) + " seconds.")
diff --git a/src/fileextractlib/TranscriptGenerator.py b/src/fileextractlib/TranscriptGenerator.py
@@ -11,7 +11,7 @@
 from webvtt import WebVTT, Caption
 
 
-class LectureVideoProcessor:
+class TranscriptGenerator:
     """
      Can be used to convert lecture video/audio to text transcripts in WebVTT format.
     """
@@ -24,16 +24,15 @@ def __init__(self, whisper_model: str = "base"):
         """
         self.model: whisper.Whisper = whisper.load_model(name=whisper_model)
 
-    def process(self, file_name: str) -> str:
+    def process_to_vtt(self, file_name: str) -> WebVTT:
         """
         Processes the file with the specified name to a transcript. Uses ffmpeg internally to extract the audio, so any video/audio format readable by 
         ffmpeg works by default. Additionally, networked resources supported by ffmpeg also work (e.g. specifying an HTTP URL to a video file as file_name)
 
         :param file_name: Name/path of the input video/audio file.
         :raises RuntimeError: Raised when the ffmpeg process encounters an error during audio extraction.
-        :return: Returnsa transcript as a string, in WebVTT caption format.
+        :return: Returns a WebVTT object containing the transcript.
         """
-
         # load audio data from file
         try:
             sample_rate = 16000
@@ -78,6 +77,19 @@ def process(self, file_name: str) -> str:
             vtt.captions.append(caption)
 
         print("Processed text in " + str(end_time - start_time) + " seconds.")
+        return vtt
+        
+
+    def process_to_file(self, file_name: str) -> str:
+        """
+        Processes the file with the specified name to a transcript. Uses ffmpeg internally to extract the audio, so any video/audio format readable by 
+        ffmpeg works by default. Additionally, networked resources supported by ffmpeg also work (e.g. specifying an HTTP URL to a video file as file_name)
+
+        :param file_name: Name/path of the input video/audio file.
+        :raises RuntimeError: Raised when the ffmpeg process encounters an error during audio extraction.
+        :return: Returns a transcript as a string, in WebVTT caption format.
+        """
+        vtt = self.process_to_vtt(file_name)
         
         with io.StringIO() as f:
             vtt.write(f)
@@ -92,7 +104,7 @@ def process(self, file_name: str) -> str:
     parser.add_argument("--indir")
     parser.add_argument("--outdir")
     args = parser.parse_args()
-    processor = LectureVideoProcessor()
+    processor = TranscriptGenerator()
 
     if args.infile is not None and args.indir is not None:
         raise ValueError("Cannot specify both infile and indir. Either process a single file or batch process a folder")
diff --git a/src/fileextractlib/server.py b/src/fileextractlib/server.py
@@ -2,11 +2,13 @@
 import SentenceEmbeddingRunner
 from pydantic import BaseModel
 import LlamaRunner
-import LectureVideoProcessor
+import TranscriptGenerator as TranscriptGenerator
 import PdfProcessor as PdfProcessor
 import uvicorn
 from pgvector.psycopg import register_vector
 import psycopg
+from LecturePdfEmbeddingGenerator import LecturePdfEmbeddingGenerator
+from LectureVideoEmbeddingGenerator import LectureVideoEmbeddingGenerator
 
 app = fastapi.FastAPI()
 
@@ -16,23 +18,32 @@
 register_vector(db_conn)
 
 #db_conn.execute("DROP TABLE IF EXISTS documents")
+#db_conn.execute("DROP TABLE IF EXISTS videos")
+
 db_conn.execute("CREATE TABLE IF NOT EXISTS documents (PRIMARY KEY(origin_file, page), text text, origin_file text, page int, embedding vector(1024))")
+db_conn.execute("CREATE TABLE IF NOT EXISTS videos (PRIMARY KEY(origin_file, start_time), screen_text text, transcript text, origin_file text, start_time int, embedding vector(1024))")
 
 llamaRunner: LlamaRunner.LlamaRunner | None = None
 
 def ingest_document_task(file_url: str):
+    """
+    Generates embeddings for the pages of the PDF at the given URL and inserts them into the documents table.
+
+    :param file_url: URL of the PDF file; also stored as origin_file of every inserted row.
+    """
-    pdf_processsor = PdfProcessor.PdfProcessor()
-    pages: list[str] = pdf_processsor.process_from_url(file_url)
+    embeddings = LecturePdfEmbeddingGenerator.generate_embedding(file_url)
 
-    # remove null and empty strings
-    filtered_pages_text = [x["text"] for x in pages if x["text"] is not None and x["text"].strip()]
+    # each entry is an (embedding, page text, page number) tuple; one row is inserted per entry
+    for embedding, text, page_no in embeddings:
+        db_conn.execute(query="INSERT INTO documents (text, origin_file, page, embedding) VALUES (%s, %s, %s, %s)",
+                        params=(text, file_url, page_no, embedding))
 
-    embeddings = SentenceEmbeddingRunner.generate_embeddings(filtered_pages_text)
+    print(f"File {file_url} has been ingested into the database.")
 
-    for (page_no, embedding) in enumerate(embeddings, 1):
-        db_conn.execute("INSERT INTO documents (text, origin_file, page, embedding) VALUES (%s, %s, %s, %s)", (filtered_pages_text.pop(0), file_url, page_no, embedding))
+def ingest_video_task(video_url: str):
+    """
+    Generates section embeddings for the video at the given URL and inserts them into the videos table.
+
+    :param video_url: URL of the video file; also stored as origin_file of every inserted row.
+    """
+    lecture_video_embedding_generator = LectureVideoEmbeddingGenerator()
+    embeddings: list[LectureVideoEmbeddingGenerator.Section] = lecture_video_embedding_generator.generate_embeddings(video_url)
 
-    print(f"File {file_url} has been ingested into the database.")
+    # despite the name, each "embedding" here is a whole Section (screen text, transcript, start time, embedding)
+    for embedding in embeddings:
+        db_conn.execute(query="INSERT INTO videos (screen_text, transcript, origin_file, start_time, embedding) VALUES (%s, %s, %s, %s, %s)",
+                        params=(embedding.screen_text, embedding.transcript, video_url, embedding.start_time, embedding.embedding))
+        
+    print(f"Video {video_url} has been ingested into the database.")
+    
 
 @app.get("/generate-embedding/")
 def generate_embedding(input_text: str):
@@ -63,7 +74,7 @@ class TranscriptAnswerSchema(BaseModel):
         tag4: str
         tag5: str
 
-    lecture_video_processor = LectureVideoProcessor.LectureVideoProcessor()
+    lecture_video_processor = TranscriptGenerator.LectureVideoProcessor()
     transcript_text = lecture_video_processor.process(request.video_url)
 
     input_text = "# Video Transcript:\n" + transcript_text + "\n\n# Json Schema:\n" + TranscriptAnswerSchema.schema_json() + "\n\n# Json Result:\n"
@@ -91,21 +102,67 @@ def ingest_documents_into_db(request: IngestDocumentsIntoDbRequest, background_t
 
     return {"message": "Files have been added to ingest queue."}
 
+# Request body of the /db-ingest-video/ endpoint.
+class IngestVideoIntoDbRequest(BaseModel):
+    # URL of the video to ingest; handed to ingest_video_task by the endpoint
+    video_url: str
+
+@app.post("/db-ingest-video/", status_code=202)
+def ingest_video_into_db(request: IngestVideoIntoDbRequest, background_tasks: fastapi.BackgroundTasks):
+    # Only schedules the ingestion as a background task; the 202 response is returned without waiting
+    # for ingest_video_task to run.
+    video_url = request.video_url
+    background_tasks.add_task(ingest_video_task, video_url)
+
+    response = {"message": "Video has been added to ingest queue."}
+    return response
+
+# Searches both the documents and the videos table for the entries whose embeddings are closest to the
+# embedding of the query text, and returns at most `count` of them ordered by distance.
 @app.get("/search/")
 def db_find_neighbor(query: str, count: int = 5):
     if count < 1 or count > 100:
         raise fastapi.HTTPException(status_code=400, detail="Count must be between 1 and 100.")
 
     query_embedding = SentenceEmbeddingRunner.generate_embeddings([query])[0]
 
-    result = db_conn.execute("SELECT * FROM documents ORDER BY embedding <=> %s LIMIT %s", (query_embedding, count))
-    return {
-        "results": [{
-                        "file": row["origin_file"],
-                        "page": row["page"],
-                        "text": row["text"],
-                    } for row in result]
-    }
+    # From here on `query` no longer holds the search text but the SQL statement. Both CTEs select the
+    # same column list, filling the columns the other table lacks with typed NULLs, so that their rows
+    # can be combined with UNION ALL and ordered by one shared distance column.
+    query = """
+    WITH document_results AS (
+        SELECT
+            origin_file,
+            'document' AS source,
+            page,
+            NULL::integer AS start_time,
+            text,
+            NULL::text AS screen_text,
+            NULL::text AS transcript,
+            embedding <=> %s AS distance
+        FROM documents
+    ),
+    video_results AS (
+        SELECT origin_file,
+            'video' AS source,
+            NULL::integer AS page,
+            start_time,
+            NULL::text AS text,
+            screen_text,
+            transcript,
+            embedding <=> %s AS distance
+        FROM videos
+    ),
+    results AS (
+        SELECT * FROM document_results
+        UNION ALL
+        SELECT * FROM video_results
+    )
+    SELECT * FROM results ORDER BY distance LIMIT %s
+    """
+
+    # the embedding is passed twice, once for each of the two distance computations
+    query_result = db_conn.execute(query=query, params=(query_embedding, query_embedding, count)).fetchall()
+
+    # drop the NULL filler columns that do not apply to the row's source
+    # NOTE(review): assumes rows are mutable mappings keyed by column name (e.g. psycopg's dict_row row
+    # factory); the connection setup is not visible here - confirm
+    for result in query_result:
+        if result["source"] == "document":
+            del result["start_time"]
+            del result["screen_text"]
+            del result["transcript"]
+        elif result["source"] == "video":
+            del result["page"]
+            del result["text"]
+
+    return query_result
+
 
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=8000)

-Original file line number
+Diff line change
@@ @@ -1,6 +1,7 @@ @@
 .DS_Store
 venv
 .vscode
 +.idea
 *.mp3
 *.mp4
 *.wav