Skip to content

Commit 0ad43e0

Browse files
committed
reviewed subtitles to srt creation
1 parent ef2e845 commit 0ad43e0

7 files changed

Lines changed: 152 additions & 5 deletions

File tree

tts_n_stt/README.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ scripts use uv package manager for dependencies.
44
You can refer to this youtube video for more
55
details: https://youtu.be/LZXps8KE4XM
66

7-
Text to Speech with Kokoro Model:
7+
## Text to Speech with Kokoro Model:
88

99
The Kokoro TTS model files have to be downloaded for the
1010
app.py to work.
@@ -23,9 +23,24 @@ uv run app.py
2323
The onnx and bin files are not committed to the
2424
repo, so you have to download them.
2525

26-
Speech to Text with Whisper Model:
26+
## Speech to Text with Whisper Model:
2727

2828
The stt_app.py is the Speech to Text Flask Server.
2929
Run it with below command
3030

3131
uv run stt_app.py
32+
33+
text to
34+
35+
## Text to Speech with pyttsx3 and pydub (simpler)
36+
37+
- text_to_mp3.py gives the speech file
38+
39+
- Use the above to get synthetic speech, which stt_app.py then transcribes (speech to text) and returns in JSON format
40+
41+
- subtitles.srt is received in JSON format, so it was renamed.
42+
43+
## Use the JSON to SRT converter:
44+
45+
- uv run json_to_srt.py subtitles.json apple_description.mp3 output_text.srt
46+

tts_n_stt/apple_description.mp3

31.3 KB
Binary file not shown.

tts_n_stt/json_to_srt.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# /// script
2+
# requires-python = ">=3.11"
3+
# dependencies = [
4+
# "pydub",
5+
# ]
6+
# ///
7+
import json
8+
import sys
9+
from pathlib import Path
10+
from pydub import AudioSegment
11+
12+
def format_timestamp(seconds: float) -> str:
    """Convert an offset in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero because SRT timestamps cannot be
            negative.

    Returns:
        The zero-padded timestamp string, e.g. ``"00:01:02,345"``.
    """
    # Round instead of truncating: 1.001 * 1000 evaluates to
    # 1000.9999999999999 in binary floating point, so int() would silently
    # drop a millisecond.
    total_ms = max(0, round(seconds * 1000))
    total_secs, ms = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{ms:03}"
20+
21+
def generate_srt(transcript, audio_file, output_file):
    """Write *transcript* to *output_file* as an SRT subtitle file.

    Args:
        transcript: Iterable of dicts, each with ``"start"`` and ``"end"``
            offsets in seconds and a ``"text"`` string.
        audio_file: Path to the MP3 the transcript belongs to. Only its
            duration is used, to cap subtitle timings.
        output_file: Destination path of the SRT file (written as UTF-8).
    """
    # Load the audio only to learn its length; pydub reports len() in
    # milliseconds.
    audio = AudioSegment.from_mp3(audio_file)
    duration_sec = len(audio) / 1000.0

    lines = []
    for i, entry in enumerate(transcript, start=1):
        start = entry["start"]
        end = entry["end"]
        # Segment text may carry leading/trailing whitespace (e.g.
        # " Apples are rich ..."); strip it so cues do not start with a blank.
        text = entry["text"].strip()

        # Validate against audio length.
        if end > duration_sec:
            print(f"Warning: Subtitle {i} ends after audio length, trimming to {duration_sec:.2f}s")
            end = duration_sec

        # After trimming, a cue that begins past the end of the audio would
        # otherwise have start > end; keep the cue well-formed.
        start = min(start, end)

        start_ts = format_timestamp(start)
        end_ts = format_timestamp(end)

        lines.append(f"{i}\n{start_ts} --> {end_ts}\n{text}\n")

    # Each cue already ends with a newline, so joining with "\n" leaves the
    # blank separator line SRT expects between cues.
    Path(output_file).write_text("\n".join(lines), encoding="utf-8")
    print(f"SRT file created: {output_file}")
44+
45+
if __name__ == "__main__":
    # Exactly three arguments are expected after the script name.
    if len(sys.argv) != 4:
        # The usage text must name this script (json_to_srt.py).
        print("Usage: uv run json_to_srt.py transcript.json input.mp3 output.srt")
        sys.exit(1)

    transcript_file = sys.argv[1]
    audio_file = sys.argv[2]
    output_file = sys.argv[3]

    with open(transcript_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The list of segments sits under the "transcription" key of the JSON
    # document.
    transcript = data["transcription"]

    generate_srt(transcript, audio_file, output_file)

tts_n_stt/output_text.srt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
1
2+
00:00:00,000 --> 00:00:01,000
3+
An apple is a sweet fruit
4+
5+
2
6+
00:00:01,000 --> 00:00:02,000
7+
that comes in many
8+
9+
3
10+
00:00:02,000 --> 00:00:04,000
11+
colors like red, green, and yellow.
12+
13+
4
14+
00:00:04,000 --> 00:00:05,000
15+
Apples are rich in fiber and vitamins
16+
17+
5
18+
00:00:05,000 --> 00:00:07,891
19+
and are enjoyed worldwide.

tts_n_stt/stt_app.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ def transcribe():
2828
return jsonify({"error": "Empty filename"}), 400
2929

3030
# Save file temporarily
31-
filepath = os.path.join("/tmp", file.filename)
32-
file.save(filepath)
31+
filepath = os.path.join(".", file.filename)
32+
# file.save(filepath)
3333

3434
segments, _ = model.transcribe(filepath)
3535

@@ -41,7 +41,7 @@ def transcribe():
4141
"text": segment.text
4242
})
4343

44-
os.remove(filepath) # clean up
44+
# os.remove(filepath) # clean up
4545

4646
return jsonify({"transcription": result})
4747

tts_n_stt/subtitles.json

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"transcription": [
3+
{
4+
"end": 1.0,
5+
"start": 0.0,
6+
"text": "An apple is a sweet fruit"
7+
},
8+
{
9+
"end": 2.0,
10+
"start": 1.0,
11+
"text": "that comes in many"
12+
},
13+
{
14+
"end": 4.0,
15+
"start": 2.0,
16+
"text": "colors like red, green, and yellow."
17+
},
18+
{
19+
"end": 5.0,
20+
"start": 4.0,
21+
"text": " Apples are rich in fiber and vitamins"
22+
},
23+
{
24+
"end": 8.0,
25+
"start": 5.0,
26+
"text": " and are enjoyed worldwide."
27+
}
28+
]
29+
}

tts_n_stt/text_to_mp3.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# /// script
2+
# requires-python = ">=3.11"
3+
# dependencies = [
4+
# "pydub",
5+
# "pyttsx3",
6+
# ]
7+
# ///
8+
import pyttsx3
9+
from pydub import AudioSegment
10+
11+
# File names for the intermediate WAV and the final MP3.
WAV_FILE = "apple_description.wav"
MP3_FILE = "apple_description.mp3"

# The passage to synthesize (two sentences).
text = (
    "An apple is a sweet fruit that comes in many colors like red, green, "
    "and yellow. Apples are rich in fiber and vitamins, and are enjoyed worldwide."
)

# Create the speech engine and render the passage to a WAV file first
# (pyttsx3 works best with wav); runAndWait() processes the queued request.
engine = pyttsx3.init()
engine.save_to_file(text, WAV_FILE)
engine.runAndWait()

# Re-encode the WAV as MP3 using pydub.
sound = AudioSegment.from_wav(WAV_FILE)
sound.export(MP3_FILE, format="mp3")

print(f"Generated {MP3_FILE}")

0 commit comments

Comments
 (0)