Skip to content

Commit a34b7bb

Browse files
Merge pull request #21 from MEITREX/comments
Add comments and logging info
2 parents b2245c2 + fff4704 commit a34b7bb

4 files changed

Lines changed: 49 additions & 1 deletion

File tree

fileextractlib/DocumentProcessor.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import io
2+
import logging
23

34
import requests
45
from fileextractlib.DocumentData import DocumentData
56
from fileextractlib.PdfProcessor import PdfProcessor
67
from fileextractlib.PowerPointProcessor import PowerPointProcessor
78

89

10+
_logger = logging.getLogger(__name__)
911
class DocumentProcessor:
1012
def __init__(self):
1113
self.pdf_processor = PdfProcessor()
@@ -21,6 +23,8 @@ def process(self, file_url: str) -> DocumentData:
2123
raise ValueError("Content type header not found")
2224

2325
if content_type_header == "application/pdf":
26+
_logger.info("Processing PDF")
2427
return self.pdf_processor.process_from_io(file_bytes)
2528
elif content_type_header == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
29+
_logger.info("Processing Powerpoint")
2630
return self.powerpoint_processor.process_from_io(file_bytes)

fileextractlib/PdfProcessor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import logging
12
import io
23
import pdf2image
34
import argparse
@@ -7,6 +8,7 @@
78
from pypdf import PdfWriter, PdfReader
89
from fileextractlib.DocumentData import DocumentData, PageData
910

11+
_logger = logging.getLogger(__name__)
1012

1113
class PdfProcessor:
1214
"""
@@ -18,9 +20,11 @@ def __init__(self):
1820

1921
def process_from_io(self, file: typing.BinaryIO) -> DocumentData:
2022
# create thumbnail images for each page
23+
_logger.info("Creating thumbnails")
2124
page_images = pdf2image.convert_from_bytes(file.read())
2225

2326
# split the pdf into pages, so we can extract text for each page separately
27+
_logger.info("Splitting document into pages")
2428
file.seek(0)
2529
pdf_reader = PdfReader(file)
2630

@@ -40,6 +44,8 @@ def process_from_io(self, file: typing.BinaryIO) -> DocumentData:
4044

4145
pages.append(PageData(page_index, page_text, page_images[page_index], None))
4246

47+
_logger.info("Finished processing file.")
48+
4349
return DocumentData(pages, [])
4450

4551

fileextractlib/TopicModel.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,13 @@ def __init__(self, record_segments: list[VideoSegmentEntity | DocumentSegmentEnt
2626
self.docs = []
2727

2828
def create_topic_model(self):
29+
"""
30+
This method creates the topic model from which the suggested tags are generated.
31+
32+
"""
2933
embeddings = []
3034

35+
# find the appropriate fields to use depending on the segmentEntity
3136
for entity in self.record_segments:
3237
if isinstance(entity, DocumentSegmentEntity):
3338
self.docs.append(entity.text)
@@ -39,19 +44,25 @@ def create_topic_model(self):
3944
self.docs.append(entity.textual_representation)
4045
embeddings.append(entity.embedding)
4146

47+
# check to ensure enough segments are available to run the topic model
4248
if len(self.docs) < 11:
4349
_logger.info("More documents needed to create topic model.")
4450
return
4551

4652
embeddings = np.array(embeddings)
53+
# set stop_words to remove stop words; ngram_range defines how many words a term can contain
4754
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3))
55+
# reduce_frequent_words to further reduce common words,
56+
# bm25_weighting changes the weighting to a more robust one for small datasets
57+
# more info: https://maartengr.github.io/BERTopic/getting_started/ctfidf/ctfidf.html
4858
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)
59+
# change diversity to further improve the resulting keywords; lower values mean less diverse keywords
4960
mmr = MaximalMarginalRelevance(diversity=0.3)
5061

5162
representation_models = mmr
5263

5364
self.model = BERTopic(
54-
min_topic_size=7,
65+
min_topic_size=7, # set the min topic size lower to work better with small datasets
5566
vectorizer_model=vectorizer_model,
5667
ctfidf_model=ctfidf_model,
5768
representation_model=representation_models
@@ -60,6 +71,10 @@ def create_topic_model(self):
6071
self.model.fit_transform(self.docs, embeddings)
6172

6273
def add_tags_to_media_records(self, segments):
74+
"""
75+
This method adds tags to all media records. Replaces old tags when run.
76+
77+
"""
6378
if len(self.docs) < 11:
6479
_logger.info("Topic model wasn't created. More documents needed.")
6580
return
@@ -94,6 +109,10 @@ def add_tags_to_media_records(self, segments):
94109
return mediarecords_with_tags
95110

96111
def add_tags_to_assessments(self, segments):
112+
"""
113+
This method adds tags to all assessments. Replaces old tags when run.
114+
115+
"""
97116
if len(self.docs) < 11:
98117
_logger.info("Topic model wasn't created. More documents needed.")
99118
return

service/DocProcAiService.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,12 @@ async def ingest_media_record_task():
9999
self.segment_database.delete_document_segments_by_media_record_id([media_record_id])
100100
self.segment_database.delete_video_segments_by_media_record_id([media_record_id])
101101

102+
_logger.info("Processing file of Type: " + record_type)
102103
if record_type == "PRESENTATION" or record_type == "DOCUMENT":
104+
_logger.info("Starting document processor for " + str(media_record_id))
103105
document_processor = DocumentProcessor()
104106
document_data = document_processor.process(download_url)
107+
_logger.info("Generating embeddings for " + str(media_record_id))
105108
self.__lecture_pdf_embedding_generator.generate_embeddings(document_data.pages)
106109
for segment in document_data.pages:
107110
thumbnail_bytes = io.BytesIO()
@@ -116,10 +119,12 @@ async def ingest_media_record_task():
116119

117120
if config.current["lecture_llm_generator"]["document_summary_generator"]["enabled"]:
118121
# generate and store a summary of this media record
122+
_logger.info("Generating summary for " + str(media_record_id))
119123
self.__lecture_llm_generator.generate_summary_for_document(document_data)
120124

121125
self.media_record_info_database.upsert_media_record_info(media_record_id, document_data.summary, None)
122126
elif record_type == "VIDEO":
127+
_logger.info("Starting video processor for " + str(media_record_id))
123128
video_processor = VideoProcessor(
124129
segment_image_similarity_threshold=
125130
config.current["video_segmentation"]["segment_image_similarity_threshold"],
@@ -128,13 +133,16 @@ async def ingest_media_record_task():
128133
del video_processor
129134

130135
# generate text embeddings for the segments of the video
136+
_logger.info("Generating embeddings for " + str(media_record_id))
131137
self.__lecture_video_embedding_generator.generate_embeddings(video_data.segments)
132138

133139
# generate titles for the video's segments if llm features enabled
134140
if config.current["lecture_llm_generator"]["segment_title_generator"]["enabled"]:
141+
_logger.info("Generating title for " + str(media_record_id))
135142
self.__lecture_llm_generator.generate_titles_for_video(video_data)
136143
else:
137144
# otherwise set empty data/placeholders
145+
_logger.info("LLM generator disabled. Setting placeholders.")
138146
video_data.summary = []
139147
for i, segment in enumerate(video_data.segments, start=1):
140148
segment.title = "Section " + str(i)
@@ -177,6 +185,9 @@ async def ingest_media_record_task():
177185
priority))
178186

179187
def __generate_tags(self):
188+
"""
189+
Generates the suggested tags for all media records and assessments. This will recreate all suggested tags.
190+
"""
180191
segments = self.segment_database.get_all_entity_segments()
181192

182193
topic_model = TopicModel(segments)
@@ -188,6 +199,10 @@ def __generate_tags(self):
188199
self.__generate_tags_for_assessments(segments, topic_model)
189200

190201
def __generate_tags_for_media_records(self, segments, topic_model):
202+
"""
203+
Generates the suggested tags for all media records. This will recreate all suggested tags.
204+
This step will be skipped if no media records are found.
205+
"""
191206
_logger.info("Generating tags for media records.")
192207
media_records = self.media_record_info_database.get_all_media_records()
193208
if not media_records: # check if media_records is empty
@@ -201,6 +216,10 @@ def __generate_tags_for_media_records(self, segments, topic_model):
201216
_logger.info("Generated tags for media records.")
202217

203218
def __generate_tags_for_assessments(self, segments, topic_model):
219+
"""
220+
Generates the suggested tags for all assessments. This will recreate all suggested tags.
221+
This step will be skipped if no assessments are found.
222+
"""
204223
_logger.info("Generating tags for assesments.")
205224
assesments = self.assesment_database.get_all_assessments()
206225
if not assesments: # check if assessments is empty

0 commit comments

Comments
 (0)