brainrot-generator/get_final_text.py at master · harryf/brainrot-generator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3

import argparse
import json
import logging
import os
import sys
import textwrap

def get_final_text(input_file: str, output_file: str) -> None:
    """
    Extract text from censored.json and write it to final.txt
    with appropriate spacing between words.

    The input is parsed as JSON and iterated in order; element 0 of each
    entry is taken as that entry's text. The texts are joined with a single
    space and written to the output file, replacing any existing content.

    Both files are opened explicitly as UTF-8 so the result does not depend
    on the platform's default locale encoding.

    Args:
        input_file (str): Path to the censored.json file
        output_file (str): Path to the output text file

    Raises:
        json.JSONDecodeError: If the input file is not valid JSON.
        Exception: Any other failure (for example an unreadable file or an
            entry whose first element is not a string) is logged and
            re-raised unchanged.
    """
    try:
        # Read the JSON file (JSON text is UTF-8, so do not rely on the locale)
        logging.info(f"Reading input file: {input_file}")
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Extract all text elements (first item of every entry)
        text_elements = [segment[0] for segment in data]

        # Combine text elements with proper spacing
        final_text = ' '.join(text_elements)

        # Write to output file using the same encoding as the input
        logging.info(f"Writing output to: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(final_text)

        logging.info("Successfully extracted text")

    except json.JSONDecodeError as e:
        logging.error(f"Failed to parse JSON file: {str(e)}")
        raise
    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise

def main():
    """
    Command-line entry point: build final.txt from censored.json.

    Both files live in the directory given by ``--outputdir`` (default
    ``output``). If final.txt already exists the function returns without
    doing anything unless ``--regenerate`` is passed. The process exits with
    status 1 when censored.json is missing or when extraction fails.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        # The description contains paragraphs and bullet lists; the default
        # formatter would re-wrap them into a single paragraph in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent('''
        Extract plain text from the censored transcript.

        This script reads the censored transcript JSON file and extracts all text
        segments into a single text file, with proper spacing between segments.
        This is useful for getting a clean, readable version of the transcript
        without timing information or speaker labels.

        Input:
        - censored.json: JSON file containing the censored transcript

        Output:
        - final.txt: Plain text file containing all text segments combined

        The script preserves the order of text segments and ensures proper spacing
        between words from different segments.
        ''').strip(),
    )
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug logging')
    parser.add_argument('-o', '--outputdir', default='output', help='Directory for the input and output')
    parser.add_argument('-r', '--regenerate', action='store_true', help='Force regeneration of output text')
    args = parser.parse_args()

    # Setup logging
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=log_level, format='%(levelname)s: %(message)s')

    # Define input and output paths
    input_file = os.path.join(args.outputdir, 'censored.json')
    output_file = os.path.join(args.outputdir, 'final.txt')

    # Check if output file exists
    if os.path.exists(output_file) and not args.regenerate:
        logging.info("Output file already exists. Use --regenerate to force recreation.")
        return

    # Check if input file exists
    if not os.path.exists(input_file):
        logging.error(f"Input file {input_file} not found")
        # sys.exit rather than the site-provided exit(), which is intended
        # for interactive use and is not guaranteed to be defined.
        sys.exit(1)

    try:
        get_final_text(input_file, output_file)
    except Exception as e:
        logging.error(f"Failed to extract text: {str(e)}")
        sys.exit(1)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()