Skip to content

Commit 70e109e

Browse files
maxitoon and claude committed
Record as raw PCM to fix live transcription chunks
Previous approaches tried to read a WAV file mid-recording, but rec does not finalize the WAV header until it stops. Reading the file as raw (-t raw) still produced garbled audio because the WAV header bytes were misinterpreted as PCM samples.

Fix: record directly as raw PCM (rec -t raw), eliminating the header entirely. The raw file has zero ambiguity: file size = audio bytes, and sox can trim time ranges exactly. On Ctrl+C, convert raw to WAV for permanent storage and final transcription.

Also fix hallucination filter regex for macOS grep (POSIX classes).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 16702a8 commit 70e109e

1 file changed

Lines changed: 38 additions & 34 deletions

File tree

whisper-transcribe-with-download.sh

Lines changed: 38 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -131,42 +131,49 @@ cleanup_old_audio() {
131131
original_live_transcription() {
132132
local model_file=$1
133133
local language=$2
134-
local recording_file="$AUDIO_DOWNLOAD_DIR/${TIMESTAMP}_live_recording.wav"
134+
local recording_wav="$AUDIO_DOWNLOAD_DIR/${TIMESTAMP}_live_recording.wav"
135+
local recording_raw="/tmp/live_recording_${TIMESTAMP}.raw"
135136
local transcript_file="$TRANSCRIPT_DIR/${TIMESTAMP}_live_transcript"
136-
137+
137138
print_color "$CYAN" "🎙️ ORIGINAL Live Transcription Setup"
138139
echo "" >&2
139-
print_color "$YELLOW" "Recording will be saved to: $recording_file" >&2
140+
print_color "$YELLOW" "Recording will be saved to: $recording_wav" >&2
140141
print_color "$YELLOW" "Audio file will be kept for 7 days" >&2
141142
print_color "$YELLOW" "Live transcript will appear below every ~10 seconds:" >&2
142-
print_color "$YELLOW" "Final transcript will be saved to: $transcript_file" >&2
143+
print_color "$YELLOW" "Final transcript will be saved to: ${transcript_file}.txt" >&2
143144
echo "" >&2
144145
print_color "$BLUE" "Press Ctrl+C to stop recording and save transcript" >&2
145146
echo "" >&2
146-
147-
# Start recording in background
147+
148+
# Record as raw PCM (no WAV header) so we can reliably read during recording.
149+
# Format: 16kHz, mono, 16-bit signed integer, little-endian = 32000 bytes/sec
148150
print_color "$GREEN" "🔴 Recording started... (Press Ctrl+C to stop)" >&2
149-
rec -r 16000 -c 1 "$recording_file" >/dev/null 2>&1 &
151+
rec -t raw -r 16000 -c 1 -b 16 -e signed-integer "$recording_raw" >/dev/null 2>&1 &
150152
local rec_pid=$!
151-
153+
152154
# Show live transcription area
153-
print_color "$CYAN" "📝 LIVE TRANSCRIPTION (appears in real-time):" >&2
155+
print_color "$CYAN" "📝 LIVE TRANSCRIPTION:" >&2
154156
print_color "$CYAN" "════════════════════════════════════════════════════" >&2
155-
157+
156158
# Function to handle cleanup on exit
157159
cleanup() {
158160
print_color "$YELLOW" "\n🛑 Stopping recording..." >&2
159161
kill $rec_pid 2>/dev/null
160162
wait $rec_pid 2>/dev/null
161-
162-
if [ -f "$recording_file" ] && [ -s "$recording_file" ]; then
163-
print_color "$GREEN" "✅ Recording saved: $recording_file" >&2
163+
164+
if [ -f "$recording_raw" ] && [ -s "$recording_raw" ]; then
165+
# Convert raw PCM to WAV for permanent storage and final transcription
166+
sox -t raw -r 16000 -c 1 -b 16 -e signed-integer -L \
167+
"$recording_raw" "$recording_wav" 2>/dev/null
168+
rm -f "$recording_raw"
169+
170+
print_color "$GREEN" "✅ Recording saved: $recording_wav" >&2
164171
print_color "$YELLOW" "📁 Audio file will be kept for 7 days" >&2
165-
172+
166173
# Final transcription of the complete recording
167174
print_color "$BLUE" "🎯 Performing final transcription..." >&2
168-
whisper-cli -m "$model_file" -f "$recording_file" -l "$language" -otxt -of "$transcript_file" -pp -nt >&2
169-
175+
whisper-cli -m "$model_file" -f "$recording_wav" -l "$language" -otxt -of "$transcript_file" -pp -nt >&2
176+
170177
if [ $? -eq 0 ] && [ -f "${transcript_file}.txt" ]; then
171178
print_color "$GREEN" "✅ Final transcript saved: ${transcript_file}.txt" >&2
172179
print_color "$YELLOW" "📝 Final transcript preview:" >&2
@@ -179,22 +186,21 @@ original_live_transcription() {
179186
else
180187
print_color "$RED" "❌ No recording was made!" >&2
181188
fi
182-
183-
# Clean up old audio files
189+
190+
# Clean up
191+
rm -f "$recording_raw" 2>/dev/null
184192
cleanup_old_audio
185-
193+
186194
exit 0
187195
}
188-
196+
189197
# Set up signal handler for cleanup
190198
trap cleanup SIGINT SIGTERM
191-
199+
192200
# Incremental chunk-based live transcription
193-
# rec writes 16kHz mono 16-bit PCM = 32000 bytes/sec.
194-
# The WAV header is NOT finalized until rec stops, so we:
195-
# 1. Estimate duration from file size
196-
# 2. Tell sox to read the file as raw PCM (-t raw), bypassing the broken header
197-
# 3. Use sox trim to extract time ranges directly
201+
# Recording is raw PCM: 16kHz, mono, 16-bit = 32000 bytes/sec, no header.
202+
# File size directly equals audio bytes, so duration = size / 32000.
203+
# sox reads the raw file and trims out time ranges into proper WAV chunks.
198204
local BYTES_PER_SEC=32000
199205
local MIN_CHUNK_SECS=10
200206
local OVERLAP_SECS=2
@@ -204,9 +210,9 @@ original_live_transcription() {
204210
touch "$running_transcript"
205211

206212
while kill -0 $rec_pid 2>/dev/null; do
207-
if [ -f "$recording_file" ]; then
213+
if [ -f "$recording_raw" ]; then
208214
local current_size
209-
current_size=$(stat -f%z "$recording_file" 2>/dev/null || echo "0")
215+
current_size=$(stat -f%z "$recording_raw" 2>/dev/null || echo "0")
210216
local current_secs=$((current_size / BYTES_PER_SEC))
211217
local new_secs=$((current_secs - last_secs))
212218

@@ -222,24 +228,22 @@ original_live_transcription() {
222228
fi
223229
local trim_duration=$((current_secs - trim_start))
224230

225-
# Read recording as raw PCM (bypasses unfinalised WAV header)
226-
# and extract only the time range we need
231+
# Extract time range from raw PCM → proper WAV for whisper-cli
227232
sox -t raw -r 16000 -c 1 -b 16 -e signed-integer -L \
228-
"$recording_file" "$chunk_file" \
233+
"$recording_raw" "$chunk_file" \
229234
trim "$trim_start" "$trim_duration" 2>/dev/null
230235

231236
if [ -f "$chunk_file" ] && [ -s "$chunk_file" ]; then
232237
local chunk_transcript="/tmp/chunk_transcript_${TIMESTAMP}_${chunk_count}"
233238

234-
# Transcribe the chunk
235239
whisper-cli -m "$model_file" -f "$chunk_file" -l "$language" -otxt -of "$chunk_transcript" -pp -nt >/dev/null 2>&1
236240

237241
if [ -f "${chunk_transcript}.txt" ] && [ -s "${chunk_transcript}.txt" ]; then
238242
local new_text
239243
new_text=$(cat "${chunk_transcript}.txt" | sed '/^$/d')
240244
# Filter out Whisper hallucinations on silence/noise
241245
local filtered_text
242-
filtered_text=$(echo "$new_text" | grep -viE '^\s*[\[\(\*].*[\]\)\*]\s*$|^[[:space:]]*$' || true)
246+
filtered_text=$(echo "$new_text" | grep -viE '^[[:space:]]*[][(*].*[])*][[:space:]]*$|^[[:space:]]*$' || true)
243247
if [ -n "$filtered_text" ]; then
244248
echo "$filtered_text" >> "$running_transcript"
245249
print_color "$GREEN" "$filtered_text" >&2
@@ -257,7 +261,7 @@ original_live_transcription() {
257261

258262
# Clean up running transcript temp file
259263
rm -f "$running_transcript" 2>/dev/null
260-
264+
261265
# Wait for recording to complete
262266
wait $rec_pid
263267
}

0 commit comments

Comments
 (0)