common : fix server /inference fails to decode in-memory audio (regression) (ggml-org#3818)

ServeurpersoCom · web-flow · commit 0ccd896f5b88 · 2026-05-22T08:27:35.000+02:00
* common: add memory buffer overload of read_audio_data

whisper-server /inference without --convert passed the uploaded file
bytes to read_audio_data as a filename, so ma_decoder_init_file tried
to open a path starting with "RIFF" and failed. As a result, every
request returned HTTP 400 "Invalid request" on builds without
WHISPER_FFMPEG, which is the default.

Factor the PCM extraction into a shared helper and add an overload that
decodes straight from a memory buffer via ma_decoder_init_memory, which
read_audio_data already used for the stdin path. The server now calls
the new overload with the upload content. The behavior of the filename
overload is unchanged.
diff --git a/examples/common-whisper.cpp b/examples/common-whisper.cpp
@@ -39,6 +39,42 @@
 extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
 #endif
 
+// read the decoder's PCM frames into pcmf32 (decoder is expected to output f32, 2 channels if stereo else 1) and trim to the frames actually read;
+// if stereo, also split the interleaved L/R samples into pcmf32s[0]/pcmf32s[1] and replace pcmf32 with their mono mix; returns false on decoder errors
+static bool read_audio_from_decoder(ma_decoder & decoder, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
+    ma_result result;
+    ma_uint64 frame_count = 0;
+    ma_uint64 frames_read = 0;
+
+    if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
+        fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
+        return false;
+    }
+
+    pcmf32.resize(stereo ? frame_count*2 : frame_count);
+    if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
+        fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
+        return false;
+    }
+
+    // the decoder may deliver fewer frames than the reported length; drop the unread tail instead of keeping zero padding or stale caller data
+    frame_count = frames_read;
+    pcmf32.resize(stereo ? frame_count*2 : frame_count);
+
+    if (stereo) {
+        const std::vector<float> stereo_data = pcmf32; // interleaved copy: L at 2*i, R at 2*i + 1
+        pcmf32.resize(frame_count);
+        pcmf32s.assign(2, std::vector<float>(frame_count));
+        for (uint64_t i = 0; i < frame_count; i++) {
+            pcmf32s[0][i] = stereo_data[2*i];
+            pcmf32s[1][i] = stereo_data[2*i + 1];
+            pcmf32[i]     = stereo_data[2*i] + stereo_data[2*i + 1]; // NOTE(review): L+R is summed, not averaged - confirm intended
+        }
+    }
+
+    return true;
+}
+
 bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
     std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output
 
@@ -109,41 +145,22 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
 #endif
     }
 
-    ma_uint64 frame_count;
-    ma_uint64 frames_read;
-
-    if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
-		fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
-
-		return false;
-    }
-
-    pcmf32.resize(stereo ? frame_count*2 : frame_count);
-
-    if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
-		fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
-
-		return false;
-    }
-
-    if (stereo) {
-        std::vector<float> stereo_data = pcmf32;
-        pcmf32.resize(frame_count);
+    return read_audio_from_decoder(decoder.decoder, pcmf32, pcmf32s, stereo);
+}
 
-        for (uint64_t i = 0; i < frame_count; i++) {
-            pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
-        }
+// decode audio bytes already held in memory (no file access); returns false if the decoder cannot be initialized or the PCM frames cannot be read
+bool read_audio_data(const char * buffer, size_t buffer_size, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
+    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE); // f32 output, 2 channels when stereo else 1, at WHISPER_SAMPLE_RATE
+    ma_decoder decoder;
 
-        pcmf32s.resize(2);
-        pcmf32s[0].resize(frame_count);
-        pcmf32s[1].resize(frame_count);
-        for (uint64_t i = 0; i < frame_count; i++) {
-            pcmf32s[0][i] = stereo_data[2*i];
-            pcmf32s[1][i] = stereo_data[2*i + 1];
-        }
+    if (ma_decoder_init_memory(buffer, buffer_size, &decoder_config, &decoder) != MA_SUCCESS) { // decode directly from the caller's buffer
+        fprintf(stderr, "error: failed to decode audio data from memory buffer\n");
+        return false; // decoder was never initialized, so there is nothing to uninit here
     }
 
-    return true;
+    bool ok = read_audio_from_decoder(decoder, pcmf32, pcmf32s, stereo); // shared PCM extraction, also used by the filename overload
+    ma_decoder_uninit(&decoder); // release the decoder whether or not reading succeeded
+    return ok;
 }
 
 //  500 -> 00:05.000
diff --git a/examples/common-whisper.h b/examples/common-whisper.h
@@ -14,6 +14,14 @@ bool read_audio_data(
         std::vector<std::vector<float>> & pcmf32s,
         bool stereo);
 
+// decode audio bytes already held in memory (uploaded file, network buffer) instead of reading from a file; returns false on failure
+bool read_audio_data(
+        const char * buffer,                       // audio bytes to decode
+        size_t buffer_size,                        // number of bytes in buffer
+        std::vector<float> & pcmf32,               // out: decoded f32 samples (the mono mix when stereo is set)
+        std::vector<std::vector<float>> & pcmf32s, // out: per-channel samples, filled only when stereo is set
+        bool stereo);
+
 // convert timestamp to string, 6000 -> 01:00.000
 std::string to_timestamp(int64_t t, bool comma = false);
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -868,8 +868,7 @@ int main(int argc, char ** argv) {
             // remove temp file
             std::remove(temp_filename.c_str());
         } else {
-            if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize))
-            {
+            if (!::read_audio_data(audio_file.content.data(), audio_file.content.size(), pcmf32, pcmf32s, params.diarize)) {
                 fprintf(stderr, "error: failed to read audio data\n");
                 const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
                 res.status = 400;