22import SentenceEmbeddingRunner
33from pydantic import BaseModel
44import LlamaRunner
5- import LectureVideoProcessor
5+ import TranscriptGenerator as TranscriptGenerator
66import PdfProcessor as PdfProcessor
77import uvicorn
88from pgvector .psycopg import register_vector
99import psycopg
10+ from LecturePdfEmbeddingGenerator import LecturePdfEmbeddingGenerator
11+ from LectureVideoEmbeddingGenerator import LectureVideoEmbeddingGenerator
1012
# FastAPI application object that the route decorators in this module attach to.
app = fastapi.FastAPI()

# NOTE(review): the statement that creates `db_conn` is not visible in this
# view of the file; it has to run before the calls below.

# Register pgvector's adapters on the connection so `vector` values can be
# exchanged with the database.
register_vector(db_conn)

#db_conn.execute("DROP TABLE IF EXISTS documents")
#db_conn.execute("DROP TABLE IF EXISTS videos")

# One row per ingested PDF page, keyed by (origin_file, page).
db_conn.execute("CREATE TABLE IF NOT EXISTS documents (PRIMARY KEY(origin_file, page), text text, origin_file text, page int, embedding vector(1024))")
# One row per ingested video section, keyed by (origin_file, start_time).
db_conn.execute("CREATE TABLE IF NOT EXISTS videos (PRIMARY KEY(origin_file, start_time), screen_text text, transcript text, origin_file text, start_time int, embedding vector(1024))")

# Module-level LlamaRunner handle; starts out unset.
llamaRunner: LlamaRunner.LlamaRunner | None = None
2227
def ingest_document_task(file_url: str):
    """Embed a PDF page by page and store the result in the `documents` table.

    Args:
        file_url: Location of the PDF; also stored as `origin_file` on every
            inserted row.

    NOTE(review): `documents` is keyed by (origin_file, page), so ingesting
    the same `file_url` a second time will hit a primary-key violation on
    the first insert. Decide whether re-ingestion should replace or skip.
    """
    # NOTE(review): called on the class, not on an instance - confirm that
    # `generate_embedding` is a static/class method.
    page_embeddings = LecturePdfEmbeddingGenerator.generate_embedding(file_url)

    insert_sql = "INSERT INTO documents (text, origin_file, page, embedding) VALUES (%s, %s, %s, %s)"
    for embedding, text, page_no in page_embeddings:
        db_conn.execute(query=insert_sql, params=(text, file_url, page_no, embedding))

    print(f"File {file_url} has been ingested into the database.")
def ingest_video_task(video_url: str):
    """Embed a lecture video section by section and store it in `videos`.

    Args:
        video_url: Location of the video; also stored as `origin_file` on
            every inserted row.

    NOTE(review): `videos` is keyed by (origin_file, start_time), so
    ingesting the same `video_url` twice will hit a primary-key violation.
    """
    generator = LectureVideoEmbeddingGenerator()
    sections: list[LectureVideoEmbeddingGenerator.Section] = generator.generate_embeddings(video_url)

    insert_sql = "INSERT INTO videos (screen_text, transcript, origin_file, start_time, embedding) VALUES (%s, %s, %s, %s, %s)"
    for section in sections:
        db_conn.execute(
            query=insert_sql,
            params=(section.screen_text, section.transcript, video_url, section.start_time, section.embedding),
        )

    print(f"Video {video_url} has been ingested into the database.")
46+
3647
3748@app .get ("/generate-embedding/" )
3849def generate_embedding (input_text : str ):
@@ -63,7 +74,7 @@ class TranscriptAnswerSchema(BaseModel):
6374 tag4 : str
6475 tag5 : str
6576
66- lecture_video_processor = LectureVideoProcessor .LectureVideoProcessor ()
77+ lecture_video_processor = TranscriptGenerator .LectureVideoProcessor ()
6778 transcript_text = lecture_video_processor .process (request .video_url )
6879
6980 input_text = "# Video Transcript:\n " + transcript_text + "\n \n # Json Schema:\n " + TranscriptAnswerSchema .schema_json () + "\n \n # Json Result:\n "
@@ -91,21 +102,67 @@ def ingest_documents_into_db(request: IngestDocumentsIntoDbRequest, background_t
91102
92103 return {"message" : "Files have been added to ingest queue." }
93104
class IngestVideoIntoDbRequest(BaseModel):
    """Request body accepted by the `/db-ingest-video/` endpoint."""

    # URL of the lecture video to ingest.
    video_url: str
107+
@app.post("/db-ingest-video/", status_code=202)
def ingest_video_into_db(request: IngestVideoIntoDbRequest, background_tasks: fastapi.BackgroundTasks):
    """Queue a lecture video for ingestion and answer immediately with 202.

    Args:
        request: Body carrying the `video_url` to ingest.
        background_tasks: FastAPI task queue; `ingest_video_task` is added to
            it rather than being run inside the request handler.

    Returns:
        A dict with a single `message` key confirming the video was queued.
    """
    background_tasks.add_task(ingest_video_task, request.video_url)

    return {"message": "Video has been added to ingest queue."}
113+
@app.get("/search/")
def db_find_neighbor(query: str, count: int = 5):
    """Return the stored document pages and video sections closest to `query`.

    The query text is embedded with `SentenceEmbeddingRunner` and compared
    against the `embedding` column of both `documents` and `videos` using
    pgvector's `<=>` (cosine distance) operator. The two result sets are
    merged and ordered by distance.

    Args:
        query: Free-text search string.
        count: Maximum number of rows to return; must be between 1 and 100.

    Returns:
        The fetched rows, closest first. Document rows keep `origin_file`,
        `source`, `page`, `text` and `distance`; video rows keep
        `origin_file`, `source`, `start_time`, `screen_text`, `transcript`
        and `distance`.

    Raises:
        fastapi.HTTPException: Status 400 when `count` is outside 1..100.
    """
    if count < 1 or count > 100:
        raise fastapi.HTTPException(status_code=400, detail="Count must be between 1 and 100.")

    query_embedding = SentenceEmbeddingRunner.generate_embeddings([query])[0]

    # Held under its own name so the `query` parameter is not overwritten.
    sql = """
        WITH document_results AS (
            SELECT
                origin_file,
                'document' AS source,
                page,
                NULL::integer AS start_time,
                text,
                NULL::text AS screen_text,
                NULL::text AS transcript,
                embedding <=> %s AS distance
            FROM documents
        ),
        video_results AS (
            SELECT origin_file,
                'video' AS source,
                NULL::integer AS page,
                start_time,
                NULL::text AS text,
                screen_text,
                transcript,
                embedding <=> %s AS distance
            FROM videos
        ),
        results AS (
            SELECT * FROM document_results
            UNION ALL
            SELECT * FROM video_results
        )
        SELECT * FROM results ORDER BY distance LIMIT %s
    """

    rows = db_conn.execute(query=sql, params=(query_embedding, query_embedding, count)).fetchall()

    # The UNION pads every row with NULL columns from the other table; drop
    # the padding so each row only exposes the fields of its own source.
    # NOTE(review): rows are indexed and mutated by column name, which
    # presumes a dict-style row factory on `db_conn` - confirm.
    padding_columns = {
        "document": ("start_time", "screen_text", "transcript"),
        "video": ("page", "text"),
    }
    for row in rows:
        for column in padding_columns.get(row["source"], ()):
            del row[column]

    return rows
165+
109166
if __name__ == "__main__":
    # Serve the app on all interfaces, port 8000, when run as a script.
    uvicorn.run(app, host="0.0.0.0", port=8000)
0 commit comments