Skip to content

Commit 0ccd896

Browse files
common : fix server /inference fails to decode in-memory audio (regression) (ggml-org#3818)
* common: add memory buffer overload of read_audio_data

whisper-server /inference without --convert passed the uploaded file bytes to read_audio_data as a filename, so ma_decoder_init_file tried to open a path starting with "RIFF" and failed. Every request returned HTTP 400 "Invalid request" on builds without WHISPER_FFMPEG, which is the default.

Factor the PCM extraction into a shared helper and add an overload that decodes straight from a memory buffer via ma_decoder_init_memory, which the function already used for the stdin path. The server now calls it with the upload content. The behavior of the filename overload is unchanged.
1 parent 8443cf0 commit 0ccd896

3 files changed

Lines changed: 57 additions & 33 deletions

File tree

examples/common-whisper.cpp

Lines changed: 48 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,42 @@
3939
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
4040
#endif
4141

42+
// extract f32 PCM frames from an initialized decoder, downmix to mono and keep the stereo split
43+
static bool read_audio_from_decoder(ma_decoder & decoder, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
44+
ma_result result;
45+
ma_uint64 frame_count;
46+
ma_uint64 frames_read;
47+
48+
if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
49+
fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
50+
return false;
51+
}
52+
53+
pcmf32.resize(stereo ? frame_count*2 : frame_count);
54+
55+
if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
56+
fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
57+
return false;
58+
}
59+
60+
if (stereo) {
61+
std::vector<float> stereo_data = pcmf32;
62+
pcmf32.resize(frame_count);
63+
for (uint64_t i = 0; i < frame_count; i++) {
64+
pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
65+
}
66+
pcmf32s.resize(2);
67+
pcmf32s[0].resize(frame_count);
68+
pcmf32s[1].resize(frame_count);
69+
for (uint64_t i = 0; i < frame_count; i++) {
70+
pcmf32s[0][i] = stereo_data[2*i];
71+
pcmf32s[1][i] = stereo_data[2*i + 1];
72+
}
73+
}
74+
75+
return true;
76+
}
77+
4278
bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
4379
std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output
4480

@@ -109,41 +145,22 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
109145
#endif
110146
}
111147

112-
ma_uint64 frame_count;
113-
ma_uint64 frames_read;
114-
115-
if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
116-
fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
117-
118-
return false;
119-
}
120-
121-
pcmf32.resize(stereo ? frame_count*2 : frame_count);
122-
123-
if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
124-
fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
125-
126-
return false;
127-
}
128-
129-
if (stereo) {
130-
std::vector<float> stereo_data = pcmf32;
131-
pcmf32.resize(frame_count);
148+
return read_audio_from_decoder(decoder.decoder, pcmf32, pcmf32s, stereo);
149+
}
132150

133-
for (uint64_t i = 0; i < frame_count; i++) {
134-
pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
135-
}
151+
// decode audio bytes already held in memory
152+
bool read_audio_data(const char * buffer, size_t buffer_size, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
153+
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE);
154+
ma_decoder decoder;
136155

137-
pcmf32s.resize(2);
138-
pcmf32s[0].resize(frame_count);
139-
pcmf32s[1].resize(frame_count);
140-
for (uint64_t i = 0; i < frame_count; i++) {
141-
pcmf32s[0][i] = stereo_data[2*i];
142-
pcmf32s[1][i] = stereo_data[2*i + 1];
143-
}
156+
if (ma_decoder_init_memory(buffer, buffer_size, &decoder_config, &decoder) != MA_SUCCESS) {
157+
fprintf(stderr, "error: failed to decode audio data from memory buffer\n");
158+
return false;
144159
}
145160

146-
return true;
161+
bool ok = read_audio_from_decoder(decoder, pcmf32, pcmf32s, stereo);
162+
ma_decoder_uninit(&decoder);
163+
return ok;
147164
}
148165

149166
// 500 -> 00:05.000

examples/common-whisper.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ bool read_audio_data(
1414
std::vector<std::vector<float>> & pcmf32s,
1515
bool stereo);
1616

17+
// decode audio bytes already held in memory (uploaded file, network buffer)
18+
bool read_audio_data(
19+
const char * buffer,
20+
size_t buffer_size,
21+
std::vector<float> & pcmf32,
22+
std::vector<std::vector<float>> & pcmf32s,
23+
bool stereo);
24+
1725
// convert timestamp to string, 6000 -> 01:00.000
1826
std::string to_timestamp(int64_t t, bool comma = false);
1927

examples/server/server.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -868,8 +868,7 @@ int main(int argc, char ** argv) {
868868
// remove temp file
869869
std::remove(temp_filename.c_str());
870870
} else {
871-
if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize))
872-
{
871+
if (!::read_audio_data(audio_file.content.data(), audio_file.content.size(), pcmf32, pcmf32s, params.diarize)) {
873872
fprintf(stderr, "error: failed to read audio data\n");
874873
const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
875874
res.status = 400;

0 commit comments

Comments
 (0)