// qvac-ext-lib-whisper.cpp/examples/ffmpeg-transcode.cpp (master branch, tetherto/qvac-ext-lib-whisper.cpp on GitHub)

#ifdef WHISPER_COMMON_FFMPEG

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <string>
#include <vector>

extern "C" {
#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>
#include <libswresample/swresample.h>
}

// Write a canonical 44-byte PCM WAV (RIFF) header into `buf`.
//
// Every multi-byte field is emitted explicitly in little-endian byte order, as
// the RIFF container requires, so the result does not depend on the host's
// endianness or on the width of `int`.
//
//   buf             - destination; must have room for at least 44 bytes
//   num_channels    - channel count (stored in a 16-bit field)
//   sample_rate     - sample rate in Hz (stored in a 32-bit field)
//   bits_per_sample - bit depth of a single sample, e.g. 16 (16-bit field)
//   data_size       - size in bytes of the PCM payload following the header
//
// Returns the number of bytes written (always 44).
static size_t wav_header_write(uint8_t * buf, int num_channels, int sample_rate, int bits_per_sample, uint32_t data_size) {
    // Store the low 16 bits of `value` at `offset`, least significant byte first.
    const auto put_u16 = [buf](size_t offset, uint32_t value) {
        buf[offset + 0] = static_cast<uint8_t>(value & 0xFFu);
        buf[offset + 1] = static_cast<uint8_t>((value >> 8) & 0xFFu);
    };
    // Store all 32 bits of `value` at `offset`, least significant byte first.
    const auto put_u32 = [buf](size_t offset, uint32_t value) {
        buf[offset + 0] = static_cast<uint8_t>(value & 0xFFu);
        buf[offset + 1] = static_cast<uint8_t>((value >> 8) & 0xFFu);
        buf[offset + 2] = static_cast<uint8_t>((value >> 16) & 0xFFu);
        buf[offset + 3] = static_cast<uint8_t>((value >> 24) & 0xFFu);
    };

    // Derived fields are computed in unsigned arithmetic so that oversized
    // inputs wrap instead of triggering signed-overflow undefined behaviour.
    const uint32_t channels    = static_cast<uint32_t>(num_channels);
    const uint32_t rate        = static_cast<uint32_t>(sample_rate);
    const uint32_t block_align = static_cast<uint32_t>(bits_per_sample / 8) * channels; // bytes per sample frame
    const uint32_t byte_rate   = rate * block_align;                                    // bytes per second

    // RIFF header
    memcpy(buf, "RIFF", 4);
    put_u32(4, 36u + data_size); // size of everything after this field
    memcpy(buf + 8, "WAVE", 4);

    // fmt subchunk
    memcpy(buf + 12, "fmt ", 4);
    put_u32(16, 16u); // fmt subchunk payload size for plain PCM
    put_u16(20, 1u);  // audio format: 1 = PCM
    put_u16(22, channels);
    put_u32(24, rate);
    put_u32(28, byte_rate);
    put_u16(32, block_align);
    put_u16(34, static_cast<uint32_t>(bits_per_sample));

    // data subchunk
    memcpy(buf + 36, "data", 4);
    put_u32(40, data_size);

    return 44;
}

bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data, int out_sample_rate) {
    {
        const char * verbose = getenv("WHISPER_COMMON_FFMPEG_VERBOSE");
        if (verbose && strcmp(verbose, "2") == 0) {
            av_log_set_level(AV_LOG_DEBUG);
        } else if (verbose && strcmp(verbose, "1") == 0) {
            av_log_set_level(AV_LOG_VERBOSE);
        } else {
            av_log_set_level(AV_LOG_WARNING);
        }
    }

    AVFormatContext * fmt_ctx = nullptr;
    if (avformat_open_input(&fmt_ctx, ifname.c_str(), nullptr, nullptr) != 0) {
        fprintf(stderr, "error: failed to open input file '%s'\n", ifname.c_str());
        return true;
    }

    if (avformat_find_stream_info(fmt_ctx, nullptr) < 0) {
        fprintf(stderr, "error: failed to find stream information\n");
        avformat_close_input(&fmt_ctx);
        return true;
    }

    // Find the first audio stream
    int audio_stream_idx = -1;
    for (unsigned int i = 0; i < fmt_ctx->nb_streams; i++) {
        if (fmt_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
            audio_stream_idx = i;
            break;
        }
    }

    if (audio_stream_idx == -1) {
        fprintf(stderr, "error: failed to find an audio stream in '%s'\n", ifname.c_str());
        avformat_close_input(&fmt_ctx);
        return true;
    }

    AVStream * audio_stream = fmt_ctx->streams[audio_stream_idx];

    // Open the decoder
    const AVCodec * codec = avcodec_find_decoder(audio_stream->codecpar->codec_id);
    if (!codec) {
        fprintf(stderr, "error: failed to find decoder for codec id %d\n", audio_stream->codecpar->codec_id);
        avformat_close_input(&fmt_ctx);
        return true;
    }

    AVCodecContext * codec_ctx = avcodec_alloc_context3(codec);
    if (!codec_ctx) {
        fprintf(stderr, "error: failed to allocate codec context\n");
        avformat_close_input(&fmt_ctx);
        return true;
    }

    if (avcodec_parameters_to_context(codec_ctx, audio_stream->codecpar) < 0) {
        fprintf(stderr, "error: failed to copy codec parameters to context\n");
        avcodec_free_context(&codec_ctx);
        avformat_close_input(&fmt_ctx);
        return true;
    }

    if (avcodec_open2(codec_ctx, codec, nullptr) < 0) {
        fprintf(stderr, "error: failed to open codec\n");
        avcodec_free_context(&codec_ctx);
        avformat_close_input(&fmt_ctx);
        return true;
    }

    // Setup resampler: convert to 16-bit signed PCM, mono, 16000 Hz
    const enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_S16;

    AVChannelLayout out_ch_layout = AV_CHANNEL_LAYOUT_MONO;

    SwrContext * swr_ctx = nullptr;
    if (swr_alloc_set_opts2(&swr_ctx, &out_ch_layout, out_sample_fmt, out_sample_rate,
                            &codec_ctx->ch_layout, codec_ctx->sample_fmt, codec_ctx->sample_rate,
                            0, nullptr) < 0) {
        fprintf(stderr, "error: failed to allocate swr context\n");
        avcodec_free_context(&codec_ctx);
        avformat_close_input(&fmt_ctx);
        return true;
    }

    if (swr_init(swr_ctx) < 0) {
        fprintf(stderr, "error: failed to initialize swr context\n");
        swr_free(&swr_ctx);
        avcodec_free_context(&codec_ctx);
        avformat_close_input(&fmt_ctx);
        return true;
    }

    // Decode and resample
    AVPacket * packet = av_packet_alloc();
    AVFrame * frame = av_frame_alloc();

    // Buffer to collect resampled output
    std::vector<int16_t> pcm_data;

    // Max output samples per swr_convert call
    const int max_out_samples = 16 * 1024;
    std::vector<int16_t> out_buffer(max_out_samples);

    while (av_read_frame(fmt_ctx, packet) >= 0) {
        if (packet->stream_index != audio_stream_idx) {
            av_packet_unref(packet);
            continue;
        }

        int ret = avcodec_send_packet(codec_ctx, packet);
        av_packet_unref(packet);

        if (ret < 0) {
            continue;
        }

        while (ret >= 0) {
            ret = avcodec_receive_frame(codec_ctx, frame);
            if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
                break;
            }
            if (ret < 0) {
                break;
            }

            // Resample
            int out_samples = av_rescale_rnd(swr_get_delay(swr_ctx, out_sample_rate) + frame->nb_samples,
                                              out_sample_rate, out_sample_rate, AV_ROUND_UP);
            if (out_samples > (int)out_buffer.size()) {
                out_buffer.resize(out_samples);
            }

            const uint8_t * in_data[16] = {0};
            for (int p = 0; p < (int)codec_ctx->ch_layout.nb_channels && p < 16; p++) {
                in_data[p] = frame->data[p];
            }
            uint8_t * out_data[16] = {0};
            out_data[0] = (uint8_t *)out_buffer.data();

            int got_samples = swr_convert(swr_ctx, out_data, out_samples, in_data, frame->nb_samples);
            if (got_samples > 0) {
                pcm_data.insert(pcm_data.end(), out_buffer.begin(), out_buffer.begin() + got_samples);
            }
        }
    }

    // Flush the decoder
    avcodec_send_packet(codec_ctx, nullptr);
    while (avcodec_receive_frame(codec_ctx, frame) >= 0) {
        int out_samples = av_rescale_rnd(swr_get_delay(swr_ctx, out_sample_rate) + frame->nb_samples,
                                          out_sample_rate, out_sample_rate, AV_ROUND_UP);
        if (out_samples > (int)out_buffer.size()) {
            out_buffer.resize(out_samples);
        }
        const uint8_t * in_data[16] = {0};
        for (int p = 0; p < (int)codec_ctx->ch_layout.nb_channels && p < 16; p++) {
            in_data[p] = frame->data[p];
        }
        uint8_t * out_data[16] = {0};
        out_data[0] = (uint8_t *)out_buffer.data();

        int got_samples = swr_convert(swr_ctx, out_data, out_samples, in_data, frame->nb_samples);
        if (got_samples > 0) {
            pcm_data.insert(pcm_data.end(), out_buffer.begin(), out_buffer.begin() + got_samples);
        }
    }

    // Flush the resampler
    uint8_t * out_data[16] = {0};
    out_data[0] = (uint8_t *)out_buffer.data();
    int flush_samples = swr_convert(swr_ctx, out_data, max_out_samples, nullptr, 0);
    if (flush_samples > 0) {
        pcm_data.insert(pcm_data.end(), out_buffer.begin(), out_buffer.begin() + flush_samples);
    }

    // Build WAV output
    uint32_t data_size = pcm_data.size() * sizeof(int16_t);
    wav_data.resize(44 + data_size);

    wav_header_write(wav_data.data(), 1, out_sample_rate, 16, data_size);
    memcpy(wav_data.data() + 44, pcm_data.data(), data_size);

    // Cleanup
    av_frame_free(&frame);
    av_packet_free(&packet);
    swr_free(&swr_ctx);
    avcodec_free_context(&codec_ctx);
    avformat_close_input(&fmt_ctx);

    return false; // success
}

#endif // WHISPER_COMMON_FFMPEG