reviewed subtitles to srt creation

Kamalabot · Kamalabot · commit 0ad43e0fe590 · 2025-08-20T20:35:26.000+05:30
diff --git a/tts_n_stt/README.md b/tts_n_stt/README.md
@@ -4,7 +4,7 @@ scripts use uv package manager for dependencies.
 You can refer to this youtube video for more
 details: https://youtu.be/LZXps8KE4XM
 
-Text to Speech with Kokoro Model:
+## Text to Speech with Kokoro Model:
 
 The kokorotts models has to be downloaded for the
 app.py to work.
@@ -23,9 +23,24 @@ uv run app.py
 The onnx and bin files are not commited to the
 repo. So you have to download it.
 
-Speech to Text with Whisper Model:
+## Speech to Text with Whisper Model:
 
 The stt_app.py is the Speech to Text Flask Server.
 Run it with below command
 
 uv run stt_app.py
+
+
+
+## Text to Speech with pyttsx3 and pydub (simpler alternative):
+
+- text_to_mp3.py generates the speech file (apple_description.mp3).
+
+- Use the script above to generate synthetic speech, then send that file to stt_app.py to get the speech-to-text transcript in JSON format.
+
+- subtitles.srt is received in JSON format, so it was renamed to subtitles.json.
+
+## Use the JSON to SRT converter:
+
+- uv run json_to_srt.py subtitles.json apple_description.mp3 output_text.srt
+
diff --git a/tts_n_stt/apple_description.mp3 b/tts_n_stt/apple_description.mp3
diff --git a/tts_n_stt/json_to_srt.py b/tts_n_stt/json_to_srt.py
@@ -0,0 +1,59 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#     "pydub",
+# ]
+# ///
+import json
+import sys
+from pathlib import Path
+from pydub import AudioSegment
+
+def format_timestamp(seconds: float) -> str:
+    """Convert a time offset in seconds to an SRT timestamp (HH:MM:SS,mmm).
+
+    The value is rounded to the nearest millisecond instead of truncated:
+    binary floats such as 1.001 sit slightly below their decimal value, so
+    truncating ``seconds * 1000`` would silently drop a millisecond.
+    Negative offsets are clamped to zero because an SRT timestamp cannot
+    express them.
+
+    Args:
+        seconds: Offset from the start of the audio, in seconds.
+
+    Returns:
+        The timestamp string, e.g. ``"00:00:07,891"``.
+    """
+    millis = max(0, round(seconds * 1000))
+    hours, remainder = divmod(millis, 3_600_000)
+    minutes, remainder = divmod(remainder, 60_000)
+    secs, ms = divmod(remainder, 1000)
+    return f"{hours:02}:{minutes:02}:{secs:02},{ms:03}"
+
+def generate_srt(transcript, audio_file, output_file):
+    """Write an SRT subtitle file from transcript segments.
+
+    Cues that end after the audio are trimmed to the audio length. Cues that
+    start at or after the end of the audio are skipped, since trimming them
+    would leave a start time later than the end time. Cue text is stripped
+    of surrounding whitespace (segments may carry a leading space, e.g.
+    " Apples are rich in fiber and vitamins").
+
+    Args:
+        transcript: Iterable of dicts with "start" and "end" (in seconds)
+            and "text" keys.
+        audio_file: Path to the MP3 file the transcript belongs to; its
+            length is used to validate cue times.
+        output_file: Destination path of the UTF-8 encoded SRT file.
+    """
+    # len() of a pydub AudioSegment is its duration in milliseconds
+    audio = AudioSegment.from_mp3(audio_file)
+    duration_sec = len(audio) / 1000.0
+
+    cues = []
+    for i, entry in enumerate(transcript, start=1):
+        start = entry["start"]
+        end = entry["end"]
+        text = entry["text"].strip()
+
+        # A cue beginning past the audio can never be shown; drop it
+        if start >= duration_sec:
+            print(f"Warning: Subtitle {i} starts after audio length ({duration_sec:.2f}s), skipping")
+            continue
+
+        # validate against audio length
+        if end > duration_sec:
+            print(f"Warning: Subtitle {i} ends after audio length, trimming to {duration_sec:.2f}s")
+            end = duration_sec
+
+        # Number by what is actually written so the sequence stays gap-free
+        index = len(cues) + 1
+        cues.append(f"{index}\n{format_timestamp(start)} --> {format_timestamp(end)}\n{text}\n")
+
+    # Each cue already ends with a newline; joining with "\n" adds the blank
+    # separator line between consecutive cues
+    Path(output_file).write_text("\n".join(cues), encoding="utf-8")
+    print(f"SRT file created: {output_file}")
+
+if __name__ == "__main__":
+    # Expect exactly three arguments: transcript JSON, audio file, output path
+    if len(sys.argv) != 4:
+        # This script is json_to_srt.py; the usage line must name it correctly
+        print("Usage: uv run json_to_srt.py transcript.json input.mp3 output.srt")
+        sys.exit(1)
+
+    transcript_file, audio_file, output_file = sys.argv[1:]
+
+    # The segment list is stored under the "transcription" key of the JSON file
+    with open(transcript_file, "r", encoding="utf-8") as f:
+        transcript = json.load(f)["transcription"]
+
+    generate_srt(transcript, audio_file, output_file)
diff --git a/tts_n_stt/output_text.srt b/tts_n_stt/output_text.srt
@@ -0,0 +1,19 @@
+1
+00:00:00,000 --> 00:00:01,000
+An apple is a sweet fruit
+
+2
+00:00:01,000 --> 00:00:02,000
+that comes in many
+
+3
+00:00:02,000 --> 00:00:04,000
+colors like red, green, and yellow.
+
+4
+00:00:04,000 --> 00:00:05,000
+ Apples are rich in fiber and vitamins
+
+5
+00:00:05,000 --> 00:00:07,891
+ and are enjoyed worldwide.
diff --git a/tts_n_stt/stt_app.py b/tts_n_stt/stt_app.py
@@ -28,8 +28,8 @@ def transcribe():
         return jsonify({"error": "Empty filename"}), 400
 
     # Save file temporarily
-    filepath = os.path.join("/tmp", file.filename)
-    file.save(filepath)
+    filepath = os.path.join(".", file.filename)
+    # file.save(filepath)
 
     segments, _ = model.transcribe(filepath)
 
@@ -41,7 +41,7 @@ def transcribe():
             "text": segment.text
         })
 
-    os.remove(filepath)  # clean up
+    # os.remove(filepath)  # clean up
 
     return jsonify({"transcription": result})
 
diff --git a/tts_n_stt/subtitles.json b/tts_n_stt/subtitles.json
@@ -0,0 +1,29 @@
+{
+  "transcription": [
+    {
+      "end": 1.0,
+      "start": 0.0,
+      "text": "An apple is a sweet fruit"
+    },
+    {
+      "end": 2.0,
+      "start": 1.0,
+      "text": "that comes in many"
+    },
+    {
+      "end": 4.0,
+      "start": 2.0,
+      "text": "colors like red, green, and yellow."
+    },
+    {
+      "end": 5.0,
+      "start": 4.0,
+      "text": " Apples are rich in fiber and vitamins"
+    },
+    {
+      "end": 8.0,
+      "start": 5.0,
+      "text": " and are enjoyed worldwide."
+    }
+  ]
+}
diff --git a/tts_n_stt/text_to_mp3.py b/tts_n_stt/text_to_mp3.py
@@ -0,0 +1,25 @@
+# /// script
+# requires-python = ">=3.11"
+# dependencies = [
+#     "pydub",
+#     "pyttsx3",
+# ]
+# ///
+import pyttsx3
+from pydub import AudioSegment
+
+# Intermediate WAV written by pyttsx3 and the final MP3 exported by pydub
+wav_path = "apple_description.wav"
+mp3_path = "apple_description.mp3"
+
+# The two sentences to be spoken
+text = "An apple is a sweet fruit that comes in many colors like red, green, and yellow. Apples are rich in fiber and vitamins, and are enjoyed worldwide."
+
+# Create the TTS engine and synthesize to WAV first (pyttsx3 works best with wav)
+engine = pyttsx3.init()
+engine.save_to_file(text, wav_path)
+engine.runAndWait()
+
+# Re-encode the WAV as MP3 with pydub
+AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3")
+
+print(f"Generated {mp3_path}")