Skip to content

Commit c81c0dc

Browse files
authored
Merge pull request #3 from MEITREX/video_embedding
Video embedding
2 parents e6ed7ed + a20bcb8 commit c81c0dc

6 files changed

Lines changed: 236 additions & 23 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
.DS_Store
22
venv
33
.vscode
4+
.idea
45
*.mp3
56
*.mp4
67
*.wav

requirements.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
ffmpeg-python
2+
pytesseract
3+
python-pptx
4+
transformers
5+
bitsandbytes
6+
pydantic
7+
lm-format-enforcer
8+
webvtt-py
9+
numpy
10+
openai-whisper
11+
Pillow
12+
levenshtein
13+
pdf2image
14+
sentence-transformers
15+
fastapi
16+
psycopg
17+
uvicorn
18+
pgvector
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from typing import Any, Generator
2+
from torch._tensor import Tensor
3+
import PdfProcessor
4+
import SentenceEmbeddingRunner
5+
6+
class LecturePdfEmbeddingGenerator:
    """
    Generates one sentence embedding per text-bearing page of a lecture PDF.
    """

    @staticmethod
    def generate_embedding(file_url: str) -> list[tuple[Tensor, str, int]]:
        """
        Downloads and processes the PDF at the given URL and embeds the text of each page.

        Declared as a static method so that it can be called both on the class and on an instance.

        :param file_url: URL of the PDF file, passed on to PdfProcessor.process_from_url.
        :return: One tuple (embedding, page text, page number) per page that contains text.
                 Empty list if no page contains any text.
        """
        pages = PdfProcessor.PdfProcessor().process_from_url(file_url)

        # remove pages whose text is missing or consists only of whitespace
        filtered_pages = [page for page in pages if page["text"] is not None and page["text"].strip()]

        # nothing to embed, so do not invoke the embedding model at all
        if not filtered_pages:
            return []

        embeddings = SentenceEmbeddingRunner.generate_embeddings([page["text"] for page in filtered_pages])

        # embeddings are returned in input order, so they can be paired with their pages again
        return [(embedding, page["text"], page["page_number"])
                for embedding, page in zip(embeddings, filtered_pages)]
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
from webvtt import WebVTT
2+
from TranscriptGenerator import TranscriptGenerator
3+
import ffmpeg
4+
import pytesseract
5+
import PIL
6+
import io
7+
import time
8+
import Levenshtein
9+
import SentenceEmbeddingRunner
10+
from torch import Tensor
11+
12+
class LectureVideoEmbeddingGenerator:
    """
    Splits a lecture video into sections and generates a sentence embedding for each section.

    A frame is extracted at the start time of every transcript caption and run through OCR. Consecutive captions
    whose frames show similar text are merged into one section.
    """

    # minimum Levenshtein ratio between the OCR text of a frame and the screen text of the current section
    # for the frame's caption to be merged into that section
    screen_text_similarity_threshold: float = 0.8

    class Section:
        """
        A part of the video: its start time, the merged caption text, the OCR text of its first frame and
        the embedding computed from transcript and screen text.
        """

        def __init__(self, start_time: int, transcript: str, screen_text: str, embedding: Tensor):
            self.start_time: int = start_time
            self.transcript: str = transcript
            self.screen_text: str = screen_text
            self.embedding: Tensor = embedding

    def generate_embeddings(self, file_url: str) -> list[Section]:
        """
        Transcribes the video, groups its captions into sections and embeds each section.

        :param file_url: Name/path/URL of the video file. Passed to TranscriptGenerator.process_to_vtt and to ffmpeg.
        :raises ValueError: Raised when the frame data returned by ffmpeg is not a sequence of valid BMP files.
        :return: The sections of the video in order of appearance. Empty list if the transcript has no captions.
        """
        # a bare `import PIL` does not load the Image submodule, so import it explicitly before using PIL.Image
        import PIL.Image

        vtt: WebVTT = TranscriptGenerator().process_to_vtt(file_url)
        captions = vtt.captions

        # without captions the select expression below would be empty, which is not a valid ffmpeg filter
        if not captions:
            return []

        # construct ffmpeg select filter to extract a frame at each transcript caption start time
        select_expression: str = "+".join(f"eq(t,{caption.start_in_seconds})" for caption in captions)

        out, _ = (
            ffmpeg
            .input(file_url)
            .filter_("select", select_expression)
            .output("-", vsync=0, format="image2pipe", vcodec="bmp")
            .run(capture_stdout=True)
        )

        bmp_files: list[bytes] = self._split_bmp_stream(out)

        # delete ffmpeg output, we don't need it anymore
        del out

        # We will now create longer sections from our captions. Captions usually have the length of a sentence or a
        # part of a sentence. We extracted images at the start of each caption, now we check when the video changes
        # significantly and create a new section, merging the captions within the timespan of that section.
        # NOTE(review): frames are matched to captions purely by position, which assumes ffmpeg produced exactly
        # one frame per caption - confirm that eq(t, ...) matches a frame for every caption start time.
        sections: list[LectureVideoEmbeddingGenerator.Section] = []
        current_section = None
        for bmp_file, caption in zip(bmp_files, captions):
            screen_text = pytesseract.image_to_string(PIL.Image.open(io.BytesIO(bmp_file)))

            # captions always have a leading "- ", so we remove it
            caption_text = caption.text[2:]

            if current_section is not None:
                similarity = Levenshtein.ratio(current_section.screen_text, screen_text)
                if similarity > self.screen_text_similarity_threshold:
                    # the screen text is similar, so this caption still belongs to the current section
                    current_section.transcript += " " + caption_text
                    continue

                # the screen text changed, so the current section is complete
                sections.append(current_section)

            # first image or changed screen text: start a new section at this caption
            current_section = LectureVideoEmbeddingGenerator.Section(
                start_time=caption.start_in_seconds,
                transcript=caption_text,
                screen_text=screen_text,
                embedding=None)

        # the section that was still being built when the frames ran out has to be stored as well
        if current_section is not None:
            sections.append(current_section)

        for section in sections:
            section.embedding = SentenceEmbeddingRunner.generate_embeddings(
                [section.transcript + "\n\n" + section.screen_text])[0]

        return sections

    @staticmethod
    def _split_bmp_stream(data: bytes) -> list[bytes]:
        """
        Splits a byte string consisting of concatenated BMP files into the individual files.

        :param data: Raw bytes of zero or more BMP files directly following each other.
        :raises ValueError: Raised when a file does not start with the BMP magic number or declares a size of 0.
        :return: The raw bytes of each BMP file, in order.
        """
        bmp_files: list[bytes] = []
        byte_offset = 0
        while byte_offset < len(data):
            # ensure BMP magic number is present
            if data[byte_offset:byte_offset + 2] != b'BM':
                raise ValueError("Invalid BMP file")

            # the file size in bytes is stored as a little-endian integer in bytes 2 to 5 of the header
            size_in_bytes: int = int.from_bytes(data[byte_offset + 2:byte_offset + 6], byteorder='little')

            # a size of 0 would never advance the offset and loop forever
            if size_in_bytes == 0:
                raise ValueError("Invalid BMP file")

            bmp_files.append(data[byte_offset:byte_offset + size_in_bytes])
            byte_offset += size_in_bytes

        return bmp_files
102+
103+
104+
if __name__ == "__main__":
    # Manual run: generates the embeddings for a single local video and prints how long it took.
    start_time = time.time()
    generator = LectureVideoEmbeddingGenerator()
    # the method of LectureVideoEmbeddingGenerator is called generate_embeddings (plural)
    generator.generate_embeddings(r"E:\Lukas\Downloads\LLM-Lehrmaterial\LLM-Lehrmaterial\Marco Aiello\ Lecture videos\DS\20 Distributed Transactions.mp4")
    end_time = time.time()
    print("Embedding generated successfully in " + str(end_time - start_time) + " seconds.")

src/fileextractlib/LectureVideoProcessor.py renamed to src/fileextractlib/TranscriptGenerator.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from webvtt import WebVTT, Caption
1212

1313

14-
class LectureVideoProcessor:
14+
class TranscriptGenerator:
1515
"""
1616
Can be used to convert lecture video/audio to text transcripts in WebVTT format.
1717
"""
@@ -24,16 +24,15 @@ def __init__(self, whisper_model: str = "base"):
2424
"""
2525
self.model: whisper.Whisper = whisper.load_model(name=whisper_model)
2626

27-
def process(self, file_name: str) -> str:
27+
def process_to_vtt(self, file_name: str) -> WebVTT:
2828
"""
2929
Processes the file with the specified name to a transcript. Uses ffmpeg internally to extract the audio, so any video/audio format readable by
3030
ffmpeg works by default. Additionally, networked resources supported by ffmpeg also work (e.g. specifying an HTTP URL to a video file as file_name)
3131
3232
:param file_name: Name/path of the input video/audio file.
3333
:raises RuntimeError: Raised when the ffmpeg process encounters an error during audio extraction.
34-
:return: Returnsa transcript as a string, in WebVTT caption format.
34+
:return: Returns a WebVTT object containing the transcript.
3535
"""
36-
3736
# load audio data from file
3837
try:
3938
sample_rate = 16000
@@ -78,6 +77,19 @@ def process(self, file_name: str) -> str:
7877
vtt.captions.append(caption)
7978

8079
print("Processed text in " + str(end_time - start_time) + " seconds.")
80+
return vtt
81+
82+
83+
def process_to_file(self, file_name: str) -> str:
84+
"""
85+
Processes the file with the specified name to a transcript. Uses ffmpeg internally to extract the audio, so any video/audio format readable by
86+
ffmpeg works by default. Additionally, networked resources supported by ffmpeg also work (e.g. specifying an HTTP URL to a video file as file_name)
87+
88+
:param file_name: Name/path of the input video/audio file.
89+
:raises RuntimeError: Raised when the ffmpeg process encounters an error during audio extraction.
90+
:return: Returns a transcript as a string, in WebVTT caption format.
91+
"""
92+
vtt = self.process_to_vtt(file_name)
8193

8294
with io.StringIO() as f:
8395
vtt.write(f)
@@ -92,7 +104,7 @@ def process(self, file_name: str) -> str:
92104
parser.add_argument("--indir")
93105
parser.add_argument("--outdir")
94106
args = parser.parse_args()
95-
processor = LectureVideoProcessor()
107+
processor = TranscriptGenerator()
96108

97109
if args.infile is not None and args.indir is not None:
98110
raise ValueError("Cannot specify both infile and indir. Either process a single file or batch process a folder")

src/fileextractlib/server.py

Lines changed: 75 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22
import SentenceEmbeddingRunner
33
from pydantic import BaseModel
44
import LlamaRunner
5-
import LectureVideoProcessor
5+
import TranscriptGenerator as TranscriptGenerator
66
import PdfProcessor as PdfProcessor
77
import uvicorn
88
from pgvector.psycopg import register_vector
99
import psycopg
10+
from LecturePdfEmbeddingGenerator import LecturePdfEmbeddingGenerator
11+
from LectureVideoEmbeddingGenerator import LectureVideoEmbeddingGenerator
1012

1113
app = fastapi.FastAPI()
1214

@@ -16,23 +18,32 @@
1618
register_vector(db_conn)
1719

1820
#db_conn.execute("DROP TABLE IF EXISTS documents")
21+
#db_conn.execute("DROP TABLE IF EXISTS videos")
22+
1923
db_conn.execute("CREATE TABLE IF NOT EXISTS documents (PRIMARY KEY(origin_file, page), text text, origin_file text, page int, embedding vector(1024))")
24+
db_conn.execute("CREATE TABLE IF NOT EXISTS videos (PRIMARY KEY(origin_file, start_time), screen_text text, transcript text, origin_file text, start_time int, embedding vector(1024))")
2025

2126
llamaRunner: LlamaRunner.LlamaRunner | None = None
2227

2328
def ingest_document_task(file_url: str):
    """
    Generates the page embeddings of the PDF at the given URL and stores them in the documents table.

    :param file_url: URL of the PDF file. Also stored as origin_file of every inserted row.
    """
    insert_statement = "INSERT INTO documents (text, origin_file, page, embedding) VALUES (%s, %s, %s, %s)"

    # each entry is a tuple of (embedding, page text, page number)
    page_entries = LecturePdfEmbeddingGenerator.generate_embedding(file_url)
    for page_embedding, page_text, page_number in page_entries:
        row = (page_text, file_url, page_number, page_embedding)
        db_conn.execute(query=insert_statement, params=row)

    print(f"File {file_url} has been ingested into the database.")
37+
def ingest_video_task(video_url: str):
    """
    Generates the section embeddings of the video at the given URL and stores them in the videos table.

    :param video_url: URL of the video file. Also stored as origin_file of every inserted row.
    """
    insert_statement = "INSERT INTO videos (screen_text, transcript, origin_file, start_time, embedding) VALUES (%s, %s, %s, %s, %s)"

    sections: list[LectureVideoEmbeddingGenerator.Section] = LectureVideoEmbeddingGenerator().generate_embeddings(video_url)

    # one row per section of the video
    for section in sections:
        row = (section.screen_text, section.transcript, video_url, section.start_time, section.embedding)
        db_conn.execute(query=insert_statement, params=row)

    print(f"Video {video_url} has been ingested into the database.")
46+
3647

3748
@app.get("/generate-embedding/")
3849
def generate_embedding(input_text: str):
@@ -63,7 +74,7 @@ class TranscriptAnswerSchema(BaseModel):
6374
tag4: str
6475
tag5: str
6576

66-
lecture_video_processor = LectureVideoProcessor.LectureVideoProcessor()
77+
lecture_video_processor = TranscriptGenerator.LectureVideoProcessor()
6778
transcript_text = lecture_video_processor.process(request.video_url)
6879

6980
input_text = "# Video Transcript:\n" + transcript_text + "\n\n# Json Schema:\n" + TranscriptAnswerSchema.schema_json() + "\n\n# Json Result:\n"
@@ -91,21 +102,67 @@ def ingest_documents_into_db(request: IngestDocumentsIntoDbRequest, background_t
91102

92103
return {"message": "Files have been added to ingest queue."}
93104

105+
# Request body of the /db-ingest-video/ endpoint.
class IngestVideoIntoDbRequest(BaseModel):
    # URL of the video which is handed to the video ingest task
    video_url: str
107+
108+
# Queues the ingestion of a single video as a background task and responds immediately with status 202.
# The actual work (transcription, OCR, embedding, database insert) is done by ingest_video_task.
@app.post("/db-ingest-video/", status_code=202)
def ingest_video_into_db(request: IngestVideoIntoDbRequest, background_tasks: fastapi.BackgroundTasks):
    video_url = request.video_url
    background_tasks.add_task(ingest_video_task, video_url)

    response = {"message": "Video has been added to ingest queue."}
    return response
113+
94114
# Semantic search over both ingested documents and ingested videos.
# Embeds the query text, ranks the rows of both tables by their distance to the query embedding and returns the
# `count` closest rows. Columns which only exist for the other source type are removed from each row.
@app.get("/search/")
def db_find_neighbor(query: str, count: int = 5):
    if not (1 <= count <= 100):
        raise fastapi.HTTPException(status_code=400, detail="Count must be between 1 and 100.")

    query_embedding = SentenceEmbeddingRunner.generate_embeddings([query])[0]

    # both tables are brought to the same column layout so that they can be combined and ordered together
    search_sql = """
        WITH document_results AS (
        SELECT
        origin_file,
        'document' AS source,
        page,
        NULL::integer AS start_time,
        text,
        NULL::text AS screen_text,
        NULL::text AS transcript,
        embedding <=> %s AS distance
        FROM documents
        ),
        video_results AS (
        SELECT origin_file,
        'video' AS source,
        NULL::integer AS page,
        start_time,
        NULL::text AS text,
        screen_text,
        transcript,
        embedding <=> %s AS distance
        FROM videos
        ),
        results AS (
        SELECT * FROM document_results
        UNION ALL
        SELECT * FROM video_results
        )
        SELECT * FROM results ORDER BY distance LIMIT %s
        """

    rows = db_conn.execute(query=search_sql, params=(query_embedding, query_embedding, count)).fetchall()

    # columns that are always NULL for a given source and are therefore dropped from its rows
    unused_columns = {
        "document": ("start_time", "screen_text", "transcript"),
        "video": ("page", "text"),
    }
    for row in rows:
        for column in unused_columns.get(row["source"], ()):
            del row[column]

    return rows
165+
109166

110167
if __name__ == "__main__":
111168
uvicorn.run(app, host="0.0.0.0", port=8000)

0 commit comments

Comments
 (0)