-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_final_text.py
More file actions
94 lines (75 loc) · 3.17 KB
/
get_final_text.py
File metadata and controls
94 lines (75 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
import json
import os
import argparse
import logging
def get_final_text(input_file: str, output_file: str) -> None:
    """
    Extract the text of every transcript segment and write it to a text file.

    The input JSON is loaded and iterated; the first item of each entry
    (``segment[0]``) is taken as that segment's text. The texts are joined
    with a single space, in input order, and written to ``output_file``,
    replacing any existing content. Both files are opened as UTF-8 so the
    result does not depend on the platform's locale encoding.

    NOTE(review): the segment layout (text at index 0, presumably followed by
    timing/speaker fields) is inferred from the indexing only -- confirm
    against the producer of censored.json.

    Args:
        input_file (str): Path to the censored.json file
        output_file (str): Path to the output text file

    Raises:
        json.JSONDecodeError: If the input file is not valid JSON.
        Exception: Any other failure (I/O error, unexpected segment shape,
            non-string text) is logged and re-raised unchanged.
    """
    try:
        # Read the JSON file
        logging.info("Reading input file: %s", input_file)
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Extract the text element of every segment, preserving order
        text_elements = [segment[0] for segment in data]

        # Combine text elements with a single space between segments
        final_text = ' '.join(text_elements)

        # Write to output file
        logging.info("Writing output to: %s", output_file)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(final_text)

        logging.info("Successfully extracted text")
    except json.JSONDecodeError as e:
        # Logged separately so a malformed input is distinguishable from
        # other failures; the caller still receives the exception.
        logging.error("Failed to parse JSON file: %s", e)
        raise
    except Exception as e:
        logging.error("An error occurred: %s", e)
        raise
def main():
    """
    Command-line entry point: build final.txt from censored.json.

    Both files live in the directory given by ``-o/--outputdir`` (default
    ``output``). If final.txt already exists the function returns without
    doing anything unless ``-r/--regenerate`` is given. A missing input file
    or a failure during extraction is logged and ends the process with exit
    status 1.

    Raises:
        SystemExit: With code 1 when the input file is missing or extraction
            fails (argparse may also raise it for bad arguments or --help).
    """
    # Local import: only needed to dedent the help text below.
    import textwrap

    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description=textwrap.dedent(
            '''\
            Extract plain text from the censored transcript.

            This script reads the censored transcript JSON file and extracts all text
            segments into a single text file, with proper spacing between segments.
            This is useful for getting a clean, readable version of the transcript
            without timing information or speaker labels.

            Input:
            - censored.json: JSON file containing the censored transcript

            Output:
            - final.txt: Plain text file containing all text segments combined

            The script preserves the order of text segments and ensures proper spacing
            between words from different segments.
            '''
        ),
        # Keep the line breaks above; the default formatter would re-wrap the
        # Input/Output lists into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug logging')
    parser.add_argument('-o', '--outputdir', default='output', help='Directory for the input and output')
    parser.add_argument('-r', '--regenerate', action='store_true', help='Force regeneration of output text')
    args = parser.parse_args()

    # Setup logging
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level, format='%(levelname)s: %(message)s')

    # Define input and output paths
    input_file = os.path.join(args.outputdir, 'censored.json')
    output_file = os.path.join(args.outputdir, 'final.txt')

    # Skip the work when the output is already present, unless forced
    if os.path.exists(output_file) and not args.regenerate:
        logging.info("Output file already exists. Use --regenerate to force recreation.")
        return

    # Check if input file exists
    if not os.path.exists(input_file):
        logging.error(f"Input file {input_file} not found")
        # SystemExit instead of exit(): the exit() helper is injected by the
        # site module and is not guaranteed to exist (e.g. python -S).
        raise SystemExit(1)

    try:
        get_final_text(input_file, output_file)
    except Exception as e:
        logging.error(f"Failed to extract text: {str(e)}")
        raise SystemExit(1)
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()