Skip to content

Commit 4e96f21

Browse files
maxitoon and claude committed
Improve live transcription accuracy
- Increase chunk size from 5s to 10s (short chunks cause Whisper hallucinations)
- Add 2s overlap between chunks for sentence continuity context
- Filter out hallucinated non-speech like [Bruits de la vache], *Bruits de vent*

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1ad9c4e commit 4e96f21

1 file changed

Lines changed: 21 additions & 10 deletions

File tree

whisper-transcribe-with-download.sh

Lines changed: 21 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -138,7 +138,7 @@ original_live_transcription() {
138138
echo "" >&2
139139
print_color "$YELLOW" "Recording will be saved to: $recording_file" >&2
140140
print_color "$YELLOW" "Audio file will be kept for 7 days" >&2
141-
print_color "$YELLOW" "Live transcript will appear below every ~5 seconds:" >&2
141+
print_color "$YELLOW" "Live transcript will appear below every ~10 seconds:" >&2
142142
print_color "$YELLOW" "Final transcript will be saved to: $transcript_file" >&2
143143
echo "" >&2
144144
print_color "$BLUE" "Press Ctrl+C to stop recording and save transcript" >&2
@@ -195,7 +195,8 @@ original_live_transcription() {
195195
# duration from file size and extract raw PCM bytes directly.
196196
local BYTES_PER_SEC=32000
197197
local WAV_HEADER_SIZE=44
198-
local MIN_CHUNK_SECS=5
198+
local MIN_CHUNK_SECS=10
199+
local OVERLAP_SECS=2
199200
local last_byte_offset=$WAV_HEADER_SIZE
200201
local chunk_count=0
201202
local running_transcript="/tmp/running_transcript_${TIMESTAMP}.txt"
@@ -211,13 +212,19 @@ original_live_transcription() {
211212
if [ "$new_secs" -ge "$MIN_CHUNK_SECS" ]; then
212213
chunk_count=$((chunk_count + 1))
213214

214-
# Align to full seconds of audio
215-
local chunk_bytes=$((new_secs * BYTES_PER_SEC))
215+
# Include overlap from previous chunk for context (except first chunk)
216+
local overlap_bytes=0
217+
if [ "$chunk_count" -gt 1 ]; then
218+
overlap_bytes=$((OVERLAP_SECS * BYTES_PER_SEC))
219+
fi
220+
local extract_offset=$((last_byte_offset - overlap_bytes))
221+
local chunk_bytes=$((new_secs * BYTES_PER_SEC + overlap_bytes))
222+
216223
local raw_file="/tmp/chunk_raw_${TIMESTAMP}_${chunk_count}.pcm"
217224
local chunk_file="/tmp/chunk_${TIMESTAMP}_${chunk_count}.wav"
218225

219226
# Extract raw PCM bytes (bypass incomplete WAV header)
220-
dd if="$recording_file" of="$raw_file" bs=1 skip="$last_byte_offset" count="$chunk_bytes" 2>/dev/null
227+
dd if="$recording_file" of="$raw_file" bs=1 skip="$extract_offset" count="$chunk_bytes" 2>/dev/null
221228

222229
# Convert raw PCM to valid WAV for whisper-cli
223230
sox -t raw -r 16000 -c 1 -b 16 -e signed-integer -L "$raw_file" "$chunk_file" 2>/dev/null
@@ -226,22 +233,26 @@ original_live_transcription() {
226233
if [ -f "$chunk_file" ] && [ -s "$chunk_file" ]; then
227234
local chunk_transcript="/tmp/chunk_transcript_${TIMESTAMP}_${chunk_count}"
228235

229-
# Transcribe only the new chunk
236+
# Transcribe the chunk
230237
whisper-cli -m "$model_file" -f "$chunk_file" -l "$language" -otxt -of "$chunk_transcript" -pp -nt >/dev/null 2>&1
231238

232239
if [ -f "${chunk_transcript}.txt" ] && [ -s "${chunk_transcript}.txt" ]; then
233240
local new_text
234241
new_text=$(cat "${chunk_transcript}.txt" | sed '/^$/d')
235-
if [ -n "$new_text" ] && [ "$new_text" != " " ]; then
236-
echo "$new_text" >> "$running_transcript"
237-
print_color "$GREEN" "$new_text" >&2
242+
# Filter out Whisper hallucinations on silence/noise
243+
local filtered_text
244+
filtered_text=$(echo "$new_text" | grep -viE '^\[.*\]$|^\*.*\*$|^[[:space:]]*$' || true)
245+
if [ -n "$filtered_text" ]; then
246+
echo "$filtered_text" >> "$running_transcript"
247+
print_color "$GREEN" "$filtered_text" >&2
238248
fi
239249
fi
240250

241251
rm -f "$chunk_file" "${chunk_transcript}.txt" 2>/dev/null
242252
fi
243253

244-
last_byte_offset=$((last_byte_offset + chunk_bytes))
254+
# Advance position (without overlap — overlap is re-read next time)
255+
last_byte_offset=$((last_byte_offset + new_secs * BYTES_PER_SEC))
245256
fi
246257
fi
247258
sleep 2

0 commit comments

Comments
 (0)