Skip to content

Commit de2ce74

Browse files
localai-bot and mudler authored
fix(stablediffusion-ggml): mux LTX-2 audio into output MP4 (#9990)
feat(stablediffusion-ggml): mux LTX-2 audio into output MP4

sd.cpp's generate_video now returns a sd_audio_t* alongside the video frames for models with an audio VAE (LTX-2.3). Our gosd wrapper was already collecting that pointer but immediately freed it without ever muxing it into the output, so LTX-2 generations landed as silent MP4s even though the audio VAE decode succeeded.

Stage the planar float32 waveform to a temp WAV (IEEE float, header hand-built; samples interleaved on the fly), then add it as a second ffmpeg input with -c:a aac -map 0:v:0 -map 1:a:0 -shortest. The temp WAV is cleaned up unconditionally after ffmpeg exits, including on the write/waitpid error paths.

Non-LTX models (Wan i2v / FLF2V) keep their current behaviour: audio arg is nullptr, the audio-related ffmpeg flags are not added, and no temp file is created.

Assisted-by: Claude:claude-opus-4-7
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
1 parent 1c6c3ad commit de2ce74

1 file changed

Lines changed: 145 additions & 27 deletions

File tree

  • backend/go/stablediffusion-ggml/cpp

backend/go/stablediffusion-ggml/cpp/gosd.cpp

Lines changed: 145 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include <stdlib.h>
2828
#include <regex>
2929
#include <errno.h>
30+
#include <inttypes.h>
3031
#include <signal.h>
3132
#include <unistd.h>
3233
#include <sys/wait.h>
@@ -1075,9 +1076,71 @@ static uint8_t* load_and_resize_image(const char* path, int target_width, int ta
10751076
return buf;
10761077
}
10771078

1079+
// Write sd.cpp's audio buffer to a temp WAV file (IEEE float, interleaved).
1080+
// sd_audio_t.data is planar (all channel 0 samples, then channel 1, etc.) — we
1081+
// interleave on the fly so ffmpeg's standard wav demuxer can read it directly.
1082+
// Returns 0 on success and fills wav_path (must be at least 64 bytes).
1083+
static int write_planar_float_wav(const sd_audio_t* a, char* wav_path, size_t wav_path_sz) {
1084+
if (!a || !a->data || a->sample_count == 0 || a->channels == 0 || a->sample_rate == 0) {
1085+
return -1;
1086+
}
1087+
1088+
snprintf(wav_path, wav_path_sz, "/tmp/gosd-audio-XXXXXX.wav");
1089+
int fd = mkstemps(wav_path, 4);
1090+
if (fd < 0) { perror("mkstemps wav"); return -1; }
1091+
FILE* f = fdopen(fd, "wb");
1092+
if (!f) { perror("fdopen wav"); close(fd); return -1; }
1093+
1094+
uint64_t frames = a->sample_count;
1095+
uint32_t channels = a->channels;
1096+
uint32_t sample_rate = a->sample_rate;
1097+
uint64_t total_samples64 = frames * (uint64_t)channels;
1098+
uint64_t data_bytes64 = total_samples64 * sizeof(float);
1099+
if (data_bytes64 > 0xFFFFFFFFull - 44) {
1100+
fprintf(stderr, "audio too large for 32-bit WAV (%" PRIu64 " bytes)\n", data_bytes64);
1101+
fclose(f);
1102+
unlink(wav_path);
1103+
return -1;
1104+
}
1105+
uint32_t data_bytes = (uint32_t)data_bytes64;
1106+
uint32_t riff_size = 36 + data_bytes;
1107+
uint16_t fmt_code = 3; // WAVE_FORMAT_IEEE_FLOAT
1108+
uint16_t bits_per_sample = 32;
1109+
uint16_t block_align = (uint16_t)(channels * sizeof(float));
1110+
uint32_t byte_rate = sample_rate * block_align;
1111+
uint16_t ch16 = (uint16_t)channels;
1112+
uint32_t fmt_size = 16;
1113+
1114+
fwrite("RIFF", 1, 4, f);
1115+
fwrite(&riff_size, 4, 1, f);
1116+
fwrite("WAVEfmt ", 1, 8, f);
1117+
fwrite(&fmt_size, 4, 1, f);
1118+
fwrite(&fmt_code, 2, 1, f);
1119+
fwrite(&ch16, 2, 1, f);
1120+
fwrite(&sample_rate, 4, 1, f);
1121+
fwrite(&byte_rate, 4, 1, f);
1122+
fwrite(&block_align, 2, 1, f);
1123+
fwrite(&bits_per_sample, 2, 1, f);
1124+
fwrite("data", 1, 4, f);
1125+
fwrite(&data_bytes, 4, 1, f);
1126+
1127+
// Interleave planar [ch0_samples..., ch1_samples...] → [ch0_s0, ch1_s0, ...]
1128+
for (uint64_t s = 0; s < frames; s++) {
1129+
for (uint32_t c = 0; c < channels; c++) {
1130+
float v = a->data[(size_t)c * frames + s];
1131+
fwrite(&v, sizeof(float), 1, f);
1132+
}
1133+
}
1134+
fclose(f);
1135+
return 0;
1136+
}
1137+
10781138
// Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst.
1079-
// Uses fork+execvp to avoid shell interpretation of dst.
1080-
static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) {
1139+
// Uses fork+execvp to avoid shell interpretation of dst. When `audio` is
1140+
// non-null, the audio waveform is staged to a temp WAV and added as a second
1141+
// ffmpeg input so the final MP4 contains both video and AAC audio.
1142+
static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps,
1143+
const sd_audio_t* audio, const char* dst) {
10811144
if (num_frames <= 0 || !frames || !frames[0].data) {
10821145
fprintf(stderr, "ffmpeg_mux: empty frames\n");
10831146
return 1;
@@ -1092,38 +1155,87 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
10921155
snprintf(size_str, sizeof(size_str), "%dx%d", width, height);
10931156
snprintf(fps_str, sizeof(fps_str), "%d", fps);
10941157

1158+
// Optional audio: write a temp WAV file if the model produced audio.
1159+
char wav_path[64] = {0};
1160+
bool have_audio = false;
1161+
if (audio && audio->data && audio->sample_count > 0 && audio->channels > 0 && audio->sample_rate > 0) {
1162+
if (write_planar_float_wav(audio, wav_path, sizeof(wav_path)) == 0) {
1163+
have_audio = true;
1164+
fprintf(stderr, "ffmpeg_mux: audio %u Hz × %u ch × %" PRIu64 " frames → %s\n",
1165+
audio->sample_rate, audio->channels, audio->sample_count, wav_path);
1166+
} else {
1167+
fprintf(stderr, "ffmpeg_mux: failed to stage audio; producing silent video\n");
1168+
}
1169+
}
1170+
10951171
int pipefd[2];
1096-
if (pipe(pipefd) != 0) { perror("pipe"); return 1; }
1172+
if (pipe(pipefd) != 0) {
1173+
perror("pipe");
1174+
if (have_audio) unlink(wav_path);
1175+
return 1;
1176+
}
10971177

10981178
pid_t pid = fork();
1099-
if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; }
1179+
if (pid < 0) {
1180+
perror("fork");
1181+
close(pipefd[0]); close(pipefd[1]);
1182+
if (have_audio) unlink(wav_path);
1183+
return 1;
1184+
}
11001185

11011186
if (pid == 0) {
11021187
// child
11031188
close(pipefd[1]);
11041189
if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); }
11051190
close(pipefd[0]);
1106-
std::vector<char*> argv = {
1107-
const_cast<char*>("ffmpeg"),
1108-
const_cast<char*>("-y"),
1109-
const_cast<char*>("-hide_banner"),
1110-
const_cast<char*>("-loglevel"), const_cast<char*>("warning"),
1111-
const_cast<char*>("-f"), const_cast<char*>("rawvideo"),
1112-
const_cast<char*>("-pix_fmt"), const_cast<char*>(pix_fmt_in),
1113-
const_cast<char*>("-s"), size_str,
1114-
const_cast<char*>("-framerate"), fps_str,
1115-
const_cast<char*>("-i"), const_cast<char*>("-"),
1116-
const_cast<char*>("-c:v"), const_cast<char*>("libx264"),
1117-
const_cast<char*>("-pix_fmt"), const_cast<char*>("yuv420p"),
1118-
const_cast<char*>("-movflags"), const_cast<char*>("+faststart"),
1119-
// Force MP4 container. Distributed LocalAI hands us a staging
1120-
// path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
1121-
// extension; relying on filename suffix makes ffmpeg bail with
1122-
// "Unable to choose an output format".
1123-
const_cast<char*>("-f"), const_cast<char*>("mp4"),
1124-
const_cast<char*>(dst),
1125-
nullptr
1126-
};
1191+
std::vector<char*> argv;
1192+
argv.push_back(const_cast<char*>("ffmpeg"));
1193+
argv.push_back(const_cast<char*>("-y"));
1194+
argv.push_back(const_cast<char*>("-hide_banner"));
1195+
argv.push_back(const_cast<char*>("-loglevel"));
1196+
argv.push_back(const_cast<char*>("warning"));
1197+
// Input 0: raw video from stdin
1198+
argv.push_back(const_cast<char*>("-f"));
1199+
argv.push_back(const_cast<char*>("rawvideo"));
1200+
argv.push_back(const_cast<char*>("-pix_fmt"));
1201+
argv.push_back(const_cast<char*>(pix_fmt_in));
1202+
argv.push_back(const_cast<char*>("-s"));
1203+
argv.push_back(size_str);
1204+
argv.push_back(const_cast<char*>("-framerate"));
1205+
argv.push_back(fps_str);
1206+
argv.push_back(const_cast<char*>("-i"));
1207+
argv.push_back(const_cast<char*>("-"));
1208+
// Input 1: optional audio WAV
1209+
if (have_audio) {
1210+
argv.push_back(const_cast<char*>("-i"));
1211+
argv.push_back(wav_path);
1212+
argv.push_back(const_cast<char*>("-map"));
1213+
argv.push_back(const_cast<char*>("0:v:0"));
1214+
argv.push_back(const_cast<char*>("-map"));
1215+
argv.push_back(const_cast<char*>("1:a:0"));
1216+
argv.push_back(const_cast<char*>("-c:a"));
1217+
argv.push_back(const_cast<char*>("aac"));
1218+
argv.push_back(const_cast<char*>("-b:a"));
1219+
argv.push_back(const_cast<char*>("192k"));
1220+
// -shortest so the final clip ends with the shorter of the two
1221+
// streams — guards against an audio buffer that overshoots the
1222+
// video duration (or vice versa) on certain LTX variants.
1223+
argv.push_back(const_cast<char*>("-shortest"));
1224+
}
1225+
argv.push_back(const_cast<char*>("-c:v"));
1226+
argv.push_back(const_cast<char*>("libx264"));
1227+
argv.push_back(const_cast<char*>("-pix_fmt"));
1228+
argv.push_back(const_cast<char*>("yuv420p"));
1229+
argv.push_back(const_cast<char*>("-movflags"));
1230+
argv.push_back(const_cast<char*>("+faststart"));
1231+
// Force MP4 container. Distributed LocalAI hands us a staging
1232+
// path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
1233+
// extension; relying on filename suffix makes ffmpeg bail with
1234+
// "Unable to choose an output format".
1235+
argv.push_back(const_cast<char*>("-f"));
1236+
argv.push_back(const_cast<char*>("mp4"));
1237+
argv.push_back(const_cast<char*>(dst));
1238+
argv.push_back(nullptr);
11271239
execvp(argv[0], argv.data());
11281240
perror("execvp ffmpeg");
11291241
_exit(127);
@@ -1148,6 +1260,7 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
11481260
close(pipefd[1]);
11491261
int status;
11501262
waitpid(pid, &status, 0);
1263+
if (have_audio) unlink(wav_path);
11511264
return 1;
11521265
}
11531266
p += n;
@@ -1158,8 +1271,13 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
11581271

11591272
int status = 0;
11601273
while (waitpid(pid, &status, 0) < 0) {
1161-
if (errno != EINTR) { perror("waitpid"); return 1; }
1274+
if (errno != EINTR) {
1275+
perror("waitpid");
1276+
if (have_audio) unlink(wav_path);
1277+
return 1;
1278+
}
11621279
}
1280+
if (have_audio) unlink(wav_path);
11631281
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
11641282
fprintf(stderr, "ffmpeg exited with status %d\n", status);
11651283
return 1;
@@ -1234,7 +1352,7 @@ int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int
12341352

12351353
fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst);
12361354

1237-
int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst);
1355+
int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, audio, dst);
12381356

12391357
for (int i = 0; i < num_frames_out; i++) {
12401358
if (frames[i].data) free(frames[i].data);

0 commit comments

Comments
 (0)