Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 48 additions & 31 deletions examples/common-whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,42 @@
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
#endif

// extract f32 PCM frames from an initialized decoder, downmix to mono and keep the stereo split
static bool read_audio_from_decoder(ma_decoder & decoder, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
ma_result result;
ma_uint64 frame_count;
ma_uint64 frames_read;

if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
return false;
}

pcmf32.resize(stereo ? frame_count*2 : frame_count);

if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
return false;
}

if (stereo) {
std::vector<float> stereo_data = pcmf32;
pcmf32.resize(frame_count);
for (uint64_t i = 0; i < frame_count; i++) {
pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
}
pcmf32s.resize(2);
pcmf32s[0].resize(frame_count);
pcmf32s[1].resize(frame_count);
for (uint64_t i = 0; i < frame_count; i++) {
pcmf32s[0][i] = stereo_data[2*i];
pcmf32s[1][i] = stereo_data[2*i + 1];
}
}

return true;
}

bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output

Expand Down Expand Up @@ -109,41 +145,22 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
#endif
}

ma_uint64 frame_count;
ma_uint64 frames_read;

if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));

return false;
}

pcmf32.resize(stereo ? frame_count*2 : frame_count);

if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));

return false;
}

if (stereo) {
std::vector<float> stereo_data = pcmf32;
pcmf32.resize(frame_count);
return read_audio_from_decoder(decoder.decoder, pcmf32, pcmf32s, stereo);
}

for (uint64_t i = 0; i < frame_count; i++) {
pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
}
// decode audio bytes already held in memory
bool read_audio_data(const char * buffer, size_t buffer_size, std::vector<float> & pcmf32, std::vector<std::vector<float>> & pcmf32s, bool stereo) {
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE);
ma_decoder decoder;

pcmf32s.resize(2);
pcmf32s[0].resize(frame_count);
pcmf32s[1].resize(frame_count);
for (uint64_t i = 0; i < frame_count; i++) {
pcmf32s[0][i] = stereo_data[2*i];
pcmf32s[1][i] = stereo_data[2*i + 1];
}
if (ma_decoder_init_memory(buffer, buffer_size, &decoder_config, &decoder) != MA_SUCCESS) {
fprintf(stderr, "error: failed to decode audio data from memory buffer\n");
return false;
}

return true;
bool ok = read_audio_from_decoder(decoder, pcmf32, pcmf32s, stereo);
ma_decoder_uninit(&decoder);
return ok;
}

// 500 -> 00:05.000
Expand Down
8 changes: 8 additions & 0 deletions examples/common-whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ bool read_audio_data(
std::vector<std::vector<float>> & pcmf32s,
bool stereo);

// decode audio bytes already held in memory (uploaded file, network buffer)
bool read_audio_data(
const char * buffer,
size_t buffer_size,
std::vector<float> & pcmf32,
std::vector<std::vector<float>> & pcmf32s,
bool stereo);

// convert timestamp to string, 6000 -> 01:00.000
std::string to_timestamp(int64_t t, bool comma = false);

Expand Down
3 changes: 1 addition & 2 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -868,8 +868,7 @@ int main(int argc, char ** argv) {
// remove temp file
std::remove(temp_filename.c_str());
} else {
if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize))
{
if (!::read_audio_data(audio_file.content.data(), audio_file.content.size(), pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read audio data\n");
const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
res.status = 400;
Expand Down
Loading