Skip to content

Commit fff4704

Browse files
Merge branch 'refs/heads/main' into comments
2 parents 4196283 + b2245c2 commit fff4704

2 files changed

Lines changed: 34 additions & 42 deletions

File tree

config.yaml

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -53,16 +53,16 @@ lecture_llm_generator:
5353
# for inference
5454
base_model_path: "./llm_data/models/Meta-Llama-3.1-8B-Instruct"
5555
# path to the folder containing the llm lora adapter which should be used for inference
56-
lora_model_path: "./llm_data/loras/llama-3-1-8B-instruct-titles"
56+
lora_model_path: "./llm_data/loras/llama-3-1-8B-instr-titles-full-sliding-20k"
5757
# Hyperparameters passed to the model at inference-time
5858
hyperparameters:
5959
max_new_tokens: 2000
60-
temperature: 0.7
60+
temperature: 0.6
6161
top_p: 0.1
6262
top_k: 40
6363
typical_p: 1
6464
min_p: 0
65-
repetition_penalty: 1.2
65+
repetition_penalty: 0.85
6666
# The prompt given to the model for this task. {json_input} is automatically replaced by a stringified JSON array
6767
# containing an entry for each segment extracted from the video. {json_schema} is automatically replaced by a
6868
# JSON schema definition of the expected output the LLM should give.
@@ -71,14 +71,8 @@ lecture_llm_generator:
7171
7272
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
7373
74-
I have an input JSON file I need to process. It contains an array, where each element is a snippet of a lecture video. Each element contains the keys "start_time", which denotes the start time of the snippet in seconds after video start, a "transcript" of the spoken text, and "screen_text", the text on screen as detected by OCR. The transcript and screen_text might contain inaccuracies due to the nature of STT and OCR. The video was split into snippets by detecting when the screen changes by a significant amount. Please create a JSON file containing just an array of strings, the list of strings should consist of 1 to 5 bullet points to summarize the contents of the video. Each bullet point should at most be 2 sentences long. Remember to answer only with a JSON file.
75-
76-
Your answer should adhere to the following JSON schema:
77-
```
78-
{json_schema}
79-
```
74+
I have an input JSON file I need to process. It contains an array, where each element is a snippet of a lecture video. Each element contains the keys "start_time", which denotes the start time of the snippet in seconds after video start, a "transcript" of the spoken text, and "screen_text", the text on screen as detected by OCR. The transcript and screen_text might contain inaccuracies due to the nature of STT and OCR. The video was split into snippets by detecting when the screen changes by a significant amount. Please create a JSON object which contains a property for each segment in the input, where the property key is the start_time of the segment, and the value is a string containing your suggested title for that segment. Choose high-quality and concise titles. If you want two back-to-back snippets to be considered as the same chapter, give them the same title in your JSON object. Remember to answer only with a JSON file. This is the input JSON:
8075
81-
The input data:
8276
```
8377
{json_input}
8478
```<|eot_id|><|start_header_id|>assistant<|end_header_id|>
@@ -97,7 +91,7 @@ lecture_llm_generator:
9791
# Hyperparameters passed to the model at inference-time
9892
hyperparameters:
9993
max_new_tokens: 300
100-
temperature: 0.6
94+
temperature: 0.4
10195
top_p: 0.1
10296
top_k: 40
10397
typical_p: 1

fileextractlib/LectureLlmGenerator.py

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import gc
22
import json
3+
from collections import OrderedDict
34
from typing import Annotated, Optional
45

56
import pydantic
@@ -40,58 +41,46 @@ def generate_titles_for_video(self, video_data: VideoData) -> None:
4041
:param video_data: The video data of the video to generate segment titles for.
4142
"""
4243

43-
class PromptJsonOutputElement(pydantic.BaseModel):
44-
start_time: int
45-
title: str
44+
if config.current["lecture_llm_generator"]["keep_models_loaded"]:
45+
llama_runner = self.__title_llama_runner
46+
else:
47+
llama_runner = LectureLlmGenerator.__load_title_llama_runner()
4648

4749
current_segment_index = 0
48-
4950
while current_segment_index < len(video_data.segments):
5051
step_segment_count = min(len(video_data.segments) - current_segment_index, 10)
52+
step_video_segments = video_data.segments[current_segment_index:current_segment_index + step_segment_count]
5153

5254
prompt_input = [{
5355
"start_time": x.start_time,
5456
"transcript": x.transcript,
5557
"screen_text": x.screen_text
56-
} for x in video_data.segments[current_segment_index:current_segment_index + step_segment_count]]
58+
} for x in step_video_segments]
5759

58-
answer_schema = (pydantic.RootModel[
59-
Annotated[list[PromptJsonOutputElement], Len(min_length=step_segment_count, max_length=step_segment_count)]]
60-
.model_json_schema())
60+
# construct the answer schema
61+
model_properties = OrderedDict()
6162

62-
prompt = """
63-
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
63+
for segment in step_video_segments:
64+
model_properties[str(segment.start_time)] = (str, ...)
6465

65-
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
66+
answer_model = pydantic.create_model("SegmentTitle", **model_properties)
6667

67-
I have an input JSON file I need to process. It contains an array, where each element is a snippet of a lecture video. Each element contains the keys "start_time", which denotes the start time of the snippet in seconds after video start, a "transcript" of the spoken text, and "screen_text", the text on screen as detected by OCR. The transcript and screen_text might contain inaccuracies due to the nature of STT and OCR. The video was split into snippets by detecting when the screen changes by a significant amount. Please create a JSON file containing an array of elements, where each element represents the respective snippet from the input JSON. Each element should contain a title you'd give this snippet. Choose high-quality and concise titles. If you want two back-to-back snippet to be considered as the same chapter, give them the same title in your JSON array. Remember to answer only with a JSON file. This is the input JSON:
68+
answer_schema = answer_model.model_json_schema()
6869

69-
Your response should be following this JSON schema: {json_schema}
70-
```
71-
{json_input}
72-
```<|eot_id|><|start_header_id|>assistant<|end_header_id|>
73-
""".format(json_input=json.dumps(prompt_input, indent=4, ensure_ascii=False),
74-
json_schema=answer_schema)
70+
_logger.info(str(answer_schema))
7571

76-
if config.current["lecture_llm_generator"]["keep_models_loaded"]:
77-
llama_runner = self.__title_llama_runner
78-
else:
79-
llama_runner = LectureLlmGenerator.__load_title_llama_runner()
72+
prompt = (config.current["lecture_llm_generator"]["segment_title_generator"]["prompt"]
73+
.format(json_input=json.dumps(prompt_input, indent=4, ensure_ascii=False),
74+
json_schema=answer_schema))
8075

8176
# get the answer json, force the LLM to conform to our json schema
8277
answer_json = LectureLlmGenerator.__generate_answer_json(
8378
llama_runner,
8479
prompt,
8580
answer_schema,
86-
pipeline_args=config.current["lecture_llm_generator"]["document_title_generator"]["hyperparameters"])
87-
for i, segment_json in enumerate(answer_json):
88-
video_data.segments[current_segment_index + i].title = segment_json["title"]
89-
90-
# if we don't want to keep the model loaded, get rid of it ASAP
91-
if not config.current["lecture_llm_generator"]["keep_models_loaded"]:
92-
del llama_runner
93-
gc.collect()
94-
torch.cuda.empty_cache()
81+
pipeline_args=config.current["lecture_llm_generator"]["segment_title_generator"]["hyperparameters"])
82+
for (key, value) in answer_json.items():
83+
next(x for x in video_data.segments if x.start_time == int(key)).title = value
9584

9685
# if we haven't yet reached the end of the video, step through the segments' titles we've generated and
9786
# search for the last "switch" from one title to another. We will continue generating more titles for
@@ -117,10 +106,19 @@ class PromptJsonOutputElement(pydantic.BaseModel):
117106
else:
118107
return
119108

109+
# if we don't want to keep the model loaded, get rid of it ASAP
110+
if not config.current["lecture_llm_generator"]["keep_models_loaded"]:
111+
del llama_runner
112+
gc.collect()
113+
torch.cuda.empty_cache()
114+
120115

121116

122117
def generate_summary_for_video(self, video_data: VideoData) -> None:
123118

119+
# TODO: Summary generation is disabled for now; the early return below skips the rest of this method.
120+
return
121+
124122
json_input = [{
125123
"start_time": x.start_time,
126124
"transcript": x.transcript,

0 commit comments

Comments
 (0)