forked from alexeygrigorev/loom-transcript-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_llm_integration.py
More file actions
240 lines (202 loc) · 9.79 KB
/
Copy pathprocess_llm_integration.py
File metadata and controls
240 lines (202 loc) · 9.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#!/usr/bin/env python3
"""
process_llm_integration.py
This script updates the Loom transcript scraper to integrate LLM transcript processing.
It adds the necessary functionality to process transcripts for LLM use right after
they are downloaded, while preserving folder structure and ensuring proper handling
for future runs without overlap.
"""
import os
import re
import time
import string
import shutil
import argparse
from pathlib import Path
# Function to modify process.py to include LLM transcript preparation
def update_process_script():
"""Updates the process.py script to include LLM transcript preparation functionality."""
# Path to the original process.py script
process_script_path = "process.py"
# Backup the original script before making changes
backup_path = "process.py.backup"
if not os.path.exists(backup_path):
shutil.copy2(process_script_path, backup_path)
print(f"Backed up original process.py to {backup_path}")
# Read the original script
with open(process_script_path, 'r', encoding='utf-8') as f:
original_content = f.read()
# Check if the script has already been modified
if "def clean_transcript(" in original_content:
print("The process.py script has already been modified to include LLM transcript processing.")
return False
# The new imports to add
new_imports = """import re
import string
"""
# The LLM processing functions to add
llm_functions = """
# LLM transcript processing functions
def clean_transcript(text):
'''
Process transcript text to make it LLM-friendly.
Args:
text (str): Raw transcript text
Returns:
str: Cleaned and formatted transcript text
'''
# Step 1: Preserve timestamps by standardizing their format to [HH:MM:SS]
# Step 1: Preserve timestamps by standardizing their format to [HH:MM:SS]
# This regex matches common timestamp formats and standardizes them
text = re.sub(r'(\[?\(?\s*)(\d{1,2}:\d{2}(?:\d{2})?)\s*(?:\]|\))?', r'[\2]', text)
# Step 2: Normalize line breaks and ensure speaker names are properly formatted
# Step 2: Normalize line breaks and ensure speaker names are properly formatted
# This helps maintain the conversation structure
text = re.sub(r'\n{3,}', '\n\n', text) # Replace excessive newlines with double newlines
# Step 3: Fix common punctuation issues
# Step 3: Fix common punctuation issues
# Remove duplicate punctuation and ensure proper spacing
text = re.sub(r'([.!?])\s*([.!?])+', r'\1', text) # Remove duplicate punctuation
text = re.sub(r'\s+([.,;:!?])', r'\1', text) # Remove space before punctuation
text = re.sub(r'([.,;:!?])([^\s\d])', r'\1 \2', text) # Add space after punctuation if missing
# Step 4: Normalize whitespace
# Remove trailing/leading whitespace from each line and collapse multiple spaces
lines = [line.strip() for line in text.split('\n')]
text = '\n'.join(lines)
text = re.sub(r' +', ' ', text) # Replace multiple spaces with a single space
# Step 5: Remove empty lines while preserving paragraph structure
lines = text.split('\n')
non_empty_lines = []
for i, line in enumerate(lines):
# Keep the line if it's not empty or if it's a deliberate paragraph break
if line.strip() or (i > 0 and i < len(lines) - 1 and lines[i-1].strip() and lines[i+1].strip()):
non_empty_lines.append(line)
# Step 6: Final cleanup - remove any remaining problematic characters
# (but carefully preserve important special characters)
text = '\n'.join(non_empty_lines)
# Filter out any non-printable characters except for common line breaks
printable_chars = set(string.printable)
text = ''.join(c for c in text if c in printable_chars)
return text
def process_for_llm(transcript_filepath, llm_dir):
'''Process a transcript file for LLM and save to the LLM directory.
Args:
transcript_filepath (str): Path to the transcript file
llm_dir (str): Directory to save LLM-ready transcript
Returns:
bool: True if successful, False otherwise
'''
try:
# Get the base filename
base_name = os.path.basename(transcript_filepath)
name_without_ext = os.path.splitext(base_name)[0]
llm_filepath = os.path.join(llm_dir, f"{name_without_ext}_llm.txt")
# Skip if already processed
if os.path.exists(llm_filepath):
print(f"LLM version already exists: {llm_filepath}")
return False
# Read the transcript
with open(transcript_filepath, "r", encoding="utf-8") as f:
transcript_text = f.read()
# Clean and format for LLM
processed_text = clean_transcript(transcript_text)
# Save to LLM directory
with open(llm_filepath, "w", encoding="utf-8") as f:
f.write(processed_text)
print(f"Created LLM-ready transcript: {llm_filepath}")
return True
except Exception as e:
print(f"Error processing transcript for LLM: {str(e)}")
return False
"""
# Find the right spot to insert the new import statements
import_insertion_point = original_content.find("import os")
if import_insertion_point == -1:
print("Could not find import statements in process.py")
return False
# Find the right spot to insert the LLM functions (before the main try block)
function_insertion_point = original_content.find("try:")
if function_insertion_point == -1:
print("Could not find the main try block in process.py")
return False
# Add command-line argument for LLM processing
arg_content = """parser.add_argument('--process-llm', action='store_true',
help='Process transcripts for LLM after downloading')
parser.add_argument('--llm-dir', type=str, default="llm_ready_transcripts",
help='Directory to store LLM-ready transcripts (default: llm_ready_transcripts)')"""
# Find the right spot to insert the new command-line arguments
arg_insertion_point = original_content.find("args = parser.parse_args(")
if arg_insertion_point == -1:
print("Could not find the argument parsing section in process.py")
return False
# Find the line before args = parser.parse_args()
last_arg_line = original_content.rfind("\n", 0, arg_insertion_point)
if last_arg_line == -1:
print("Could not find the end of argument definitions in process.py")
return False
# Add LLM directory creation
dir_creation_code = """# Ensure LLM directory exists if processing for LLM
if args.process_llm:
llm_dir = args.llm_dir
if not os.path.exists(llm_dir):
os.makedirs(llm_dir)
print(f"Created directory for LLM-ready transcripts: {llm_dir}")
else:
print(f"Using existing directory for LLM-ready transcripts: {llm_dir}")
"""
# Find the spot to insert directory creation code (after download_dir creation)
dir_insertion_point = original_content.find("if not os.path.exists(download_dir):")
dir_code_end = original_content.find("else:", dir_insertion_point)
dir_code_end = original_content.find("\n", dir_code_end) + 1
# Add LLM processing call after saving transcript to file
llm_processing_code = """ # Process for LLM if requested
if args.process_llm:
process_for_llm(transcript_filepath, args.llm_dir)
"""
# Find the spot to insert LLM processing code (after saving transcript)
processing_insertion_point = original_content.find('print(f"Transcript saved to: {transcript_filepath}")')
if processing_insertion_point == -1:
print("Could not find the transcript saving section in process.py")
return False
processing_insertion_point = original_content.find("\n", processing_insertion_point) + 1
# Construct the modified content
modified_content = (
original_content[:import_insertion_point] +
new_imports +
original_content[import_insertion_point:last_arg_line + 1] +
arg_content +
original_content[last_arg_line + 1:dir_code_end] +
dir_creation_code +
original_content[dir_code_end:function_insertion_point] +
llm_functions +
original_content[function_insertion_point:processing_insertion_point] +
llm_processing_code +
original_content[processing_insertion_point:]
)
# Write the modified content back to the file
with open(process_script_path, 'w', encoding='utf-8') as f:
f.write(modified_content)
print("Successfully updated process.py with LLM transcript processing functionality")
return True
def main():
parser = argparse.ArgumentParser(description='Integrate LLM transcript processing into the Loom scraper.')
parser.add_argument('--restore', action='store_true',
help='Restore the original process.py from backup')
args = parser.parse_args()
if args.restore:
backup_path = "process.py.backup"
if os.path.exists(backup_path):
shutil.copy2(backup_path, "process.py")
print("Restored original process.py from backup")
else:
print("No backup file found (process.py.backup)")
else:
update_process_script()
print("\nTo use the LLM processing functionality, run process.py with the --process-llm flag:")
print("python process.py --process-llm")
print("\nYou can specify a custom directory for LLM-ready transcripts:")
print("python process.py --process-llm --llm-dir custom_llm_dir")
print("\nTo restore the original process.py script:")
print("python process_llm_integration.py --restore")
if __name__ == "__main__":
main()