2727#include < stdlib.h>
2828#include < regex>
2929#include < errno.h>
30+ #include < inttypes.h>
3031#include < signal.h>
3132#include < unistd.h>
3233#include < sys/wait.h>
@@ -1075,9 +1076,71 @@ static uint8_t* load_and_resize_image(const char* path, int target_width, int ta
10751076 return buf;
10761077}
10771078
1079+ // Write sd.cpp's audio buffer to a temp WAV file (IEEE float, interleaved).
1080+ // sd_audio_t.data is planar (all channel 0 samples, then channel 1, etc.) — we
1081+ // interleave on the fly so ffmpeg's standard wav demuxer can read it directly.
1082+ // Returns 0 on success and fills wav_path (must be at least 64 bytes).
1083+ static int write_planar_float_wav (const sd_audio_t * a, char * wav_path, size_t wav_path_sz) {
1084+ if (!a || !a->data || a->sample_count == 0 || a->channels == 0 || a->sample_rate == 0 ) {
1085+ return -1 ;
1086+ }
1087+
1088+ snprintf (wav_path, wav_path_sz, " /tmp/gosd-audio-XXXXXX.wav" );
1089+ int fd = mkstemps (wav_path, 4 );
1090+ if (fd < 0 ) { perror (" mkstemps wav" ); return -1 ; }
1091+ FILE* f = fdopen (fd, " wb" );
1092+ if (!f) { perror (" fdopen wav" ); close (fd); return -1 ; }
1093+
1094+ uint64_t frames = a->sample_count ;
1095+ uint32_t channels = a->channels ;
1096+ uint32_t sample_rate = a->sample_rate ;
1097+ uint64_t total_samples64 = frames * (uint64_t )channels;
1098+ uint64_t data_bytes64 = total_samples64 * sizeof (float );
1099+ if (data_bytes64 > 0xFFFFFFFFull - 44 ) {
1100+ fprintf (stderr, " audio too large for 32-bit WAV (%" PRIu64 " bytes)\n " , data_bytes64);
1101+ fclose (f);
1102+ unlink (wav_path);
1103+ return -1 ;
1104+ }
1105+ uint32_t data_bytes = (uint32_t )data_bytes64;
1106+ uint32_t riff_size = 36 + data_bytes;
1107+ uint16_t fmt_code = 3 ; // WAVE_FORMAT_IEEE_FLOAT
1108+ uint16_t bits_per_sample = 32 ;
1109+ uint16_t block_align = (uint16_t )(channels * sizeof (float ));
1110+ uint32_t byte_rate = sample_rate * block_align;
1111+ uint16_t ch16 = (uint16_t )channels;
1112+ uint32_t fmt_size = 16 ;
1113+
1114+ fwrite (" RIFF" , 1 , 4 , f);
1115+ fwrite (&riff_size, 4 , 1 , f);
1116+ fwrite (" WAVEfmt " , 1 , 8 , f);
1117+ fwrite (&fmt_size, 4 , 1 , f);
1118+ fwrite (&fmt_code, 2 , 1 , f);
1119+ fwrite (&ch16, 2 , 1 , f);
1120+ fwrite (&sample_rate, 4 , 1 , f);
1121+ fwrite (&byte_rate, 4 , 1 , f);
1122+ fwrite (&block_align, 2 , 1 , f);
1123+ fwrite (&bits_per_sample, 2 , 1 , f);
1124+ fwrite (" data" , 1 , 4 , f);
1125+ fwrite (&data_bytes, 4 , 1 , f);
1126+
1127+ // Interleave planar [ch0_samples..., ch1_samples...] → [ch0_s0, ch1_s0, ...]
1128+ for (uint64_t s = 0 ; s < frames; s++) {
1129+ for (uint32_t c = 0 ; c < channels; c++) {
1130+ float v = a->data [(size_t )c * frames + s];
1131+ fwrite (&v, sizeof (float ), 1 , f);
1132+ }
1133+ }
1134+ fclose (f);
1135+ return 0 ;
1136+ }
1137+
10781138// Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst.
1079- // Uses fork+execvp to avoid shell interpretation of dst.
1080- static int ffmpeg_mux_raw_to_mp4 (sd_image_t * frames, int num_frames, int fps, const char * dst) {
1139+ // Uses fork+execvp to avoid shell interpretation of dst. When `audio` is
1140+ // non-null, the audio waveform is staged to a temp WAV and added as a second
1141+ // ffmpeg input so the final MP4 contains both video and AAC audio.
1142+ static int ffmpeg_mux_raw_to_mp4 (sd_image_t * frames, int num_frames, int fps,
1143+ const sd_audio_t * audio, const char * dst) {
10811144 if (num_frames <= 0 || !frames || !frames[0 ].data ) {
10821145 fprintf (stderr, " ffmpeg_mux: empty frames\n " );
10831146 return 1 ;
@@ -1092,38 +1155,87 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
10921155 snprintf (size_str, sizeof (size_str), " %dx%d" , width, height);
10931156 snprintf (fps_str, sizeof (fps_str), " %d" , fps);
10941157
1158+ // Optional audio: write a temp WAV file if the model produced audio.
1159+ char wav_path[64 ] = {0 };
1160+ bool have_audio = false ;
1161+ if (audio && audio->data && audio->sample_count > 0 && audio->channels > 0 && audio->sample_rate > 0 ) {
1162+ if (write_planar_float_wav (audio, wav_path, sizeof (wav_path)) == 0 ) {
1163+ have_audio = true ;
1164+ fprintf (stderr, " ffmpeg_mux: audio %u Hz × %u ch × %" PRIu64 " frames → %s\n " ,
1165+ audio->sample_rate , audio->channels , audio->sample_count , wav_path);
1166+ } else {
1167+ fprintf (stderr, " ffmpeg_mux: failed to stage audio; producing silent video\n " );
1168+ }
1169+ }
1170+
10951171 int pipefd[2 ];
1096- if (pipe (pipefd) != 0 ) { perror (" pipe" ); return 1 ; }
1172+ if (pipe (pipefd) != 0 ) {
1173+ perror (" pipe" );
1174+ if (have_audio) unlink (wav_path);
1175+ return 1 ;
1176+ }
10971177
10981178 pid_t pid = fork ();
1099- if (pid < 0 ) { perror (" fork" ); close (pipefd[0 ]); close (pipefd[1 ]); return 1 ; }
1179+ if (pid < 0 ) {
1180+ perror (" fork" );
1181+ close (pipefd[0 ]); close (pipefd[1 ]);
1182+ if (have_audio) unlink (wav_path);
1183+ return 1 ;
1184+ }
11001185
11011186 if (pid == 0 ) {
11021187 // child
11031188 close (pipefd[1 ]);
11041189 if (dup2 (pipefd[0 ], STDIN_FILENO) < 0 ) { perror (" dup2" ); _exit (127 ); }
11051190 close (pipefd[0 ]);
1106- std::vector<char *> argv = {
1107- const_cast <char *>(" ffmpeg" ),
1108- const_cast <char *>(" -y" ),
1109- const_cast <char *>(" -hide_banner" ),
1110- const_cast <char *>(" -loglevel" ), const_cast <char *>(" warning" ),
1111- const_cast <char *>(" -f" ), const_cast <char *>(" rawvideo" ),
1112- const_cast <char *>(" -pix_fmt" ), const_cast <char *>(pix_fmt_in),
1113- const_cast <char *>(" -s" ), size_str,
1114- const_cast <char *>(" -framerate" ), fps_str,
1115- const_cast <char *>(" -i" ), const_cast <char *>(" -" ),
1116- const_cast <char *>(" -c:v" ), const_cast <char *>(" libx264" ),
1117- const_cast <char *>(" -pix_fmt" ), const_cast <char *>(" yuv420p" ),
1118- const_cast <char *>(" -movflags" ), const_cast <char *>(" +faststart" ),
1119- // Force MP4 container. Distributed LocalAI hands us a staging
1120- // path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
1121- // extension; relying on filename suffix makes ffmpeg bail with
1122- // "Unable to choose an output format".
1123- const_cast <char *>(" -f" ), const_cast <char *>(" mp4" ),
1124- const_cast <char *>(dst),
1125- nullptr
1126- };
1191+ std::vector<char *> argv;
1192+ argv.push_back (const_cast <char *>(" ffmpeg" ));
1193+ argv.push_back (const_cast <char *>(" -y" ));
1194+ argv.push_back (const_cast <char *>(" -hide_banner" ));
1195+ argv.push_back (const_cast <char *>(" -loglevel" ));
1196+ argv.push_back (const_cast <char *>(" warning" ));
1197+ // Input 0: raw video from stdin
1198+ argv.push_back (const_cast <char *>(" -f" ));
1199+ argv.push_back (const_cast <char *>(" rawvideo" ));
1200+ argv.push_back (const_cast <char *>(" -pix_fmt" ));
1201+ argv.push_back (const_cast <char *>(pix_fmt_in));
1202+ argv.push_back (const_cast <char *>(" -s" ));
1203+ argv.push_back (size_str);
1204+ argv.push_back (const_cast <char *>(" -framerate" ));
1205+ argv.push_back (fps_str);
1206+ argv.push_back (const_cast <char *>(" -i" ));
1207+ argv.push_back (const_cast <char *>(" -" ));
1208+ // Input 1: optional audio WAV
1209+ if (have_audio) {
1210+ argv.push_back (const_cast <char *>(" -i" ));
1211+ argv.push_back (wav_path);
1212+ argv.push_back (const_cast <char *>(" -map" ));
1213+ argv.push_back (const_cast <char *>(" 0:v:0" ));
1214+ argv.push_back (const_cast <char *>(" -map" ));
1215+ argv.push_back (const_cast <char *>(" 1:a:0" ));
1216+ argv.push_back (const_cast <char *>(" -c:a" ));
1217+ argv.push_back (const_cast <char *>(" aac" ));
1218+ argv.push_back (const_cast <char *>(" -b:a" ));
1219+ argv.push_back (const_cast <char *>(" 192k" ));
1220+ // -shortest so the final clip ends with the shorter of the two
1221+ // streams — guards against an audio buffer that overshoots the
1222+ // video duration (or vice versa) on certain LTX variants.
1223+ argv.push_back (const_cast <char *>(" -shortest" ));
1224+ }
1225+ argv.push_back (const_cast <char *>(" -c:v" ));
1226+ argv.push_back (const_cast <char *>(" libx264" ));
1227+ argv.push_back (const_cast <char *>(" -pix_fmt" ));
1228+ argv.push_back (const_cast <char *>(" yuv420p" ));
1229+ argv.push_back (const_cast <char *>(" -movflags" ));
1230+ argv.push_back (const_cast <char *>(" +faststart" ));
1231+ // Force MP4 container. Distributed LocalAI hands us a staging
1232+ // path (e.g. /staging/localai-output-NNN.tmp) with a non-standard
1233+ // extension; relying on filename suffix makes ffmpeg bail with
1234+ // "Unable to choose an output format".
1235+ argv.push_back (const_cast <char *>(" -f" ));
1236+ argv.push_back (const_cast <char *>(" mp4" ));
1237+ argv.push_back (const_cast <char *>(dst));
1238+ argv.push_back (nullptr );
11271239 execvp (argv[0 ], argv.data ());
11281240 perror (" execvp ffmpeg" );
11291241 _exit (127 );
@@ -1148,6 +1260,7 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
11481260 close (pipefd[1 ]);
11491261 int status;
11501262 waitpid (pid, &status, 0 );
1263+ if (have_audio) unlink (wav_path);
11511264 return 1 ;
11521265 }
11531266 p += n;
@@ -1158,8 +1271,13 @@ static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, co
11581271
11591272 int status = 0 ;
11601273 while (waitpid (pid, &status, 0 ) < 0 ) {
1161- if (errno != EINTR) { perror (" waitpid" ); return 1 ; }
1274+ if (errno != EINTR) {
1275+ perror (" waitpid" );
1276+ if (have_audio) unlink (wav_path);
1277+ return 1 ;
1278+ }
11621279 }
1280+ if (have_audio) unlink (wav_path);
11631281 if (!WIFEXITED (status) || WEXITSTATUS (status) != 0 ) {
11641282 fprintf (stderr, " ffmpeg exited with status %d\n " , status);
11651283 return 1 ;
@@ -1234,7 +1352,7 @@ int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int
12341352
12351353 fprintf (stderr, " Generated %d frames, muxing to %s via ffmpeg\n " , num_frames_out, dst);
12361354
1237- int rc = ffmpeg_mux_raw_to_mp4 (frames, num_frames_out, fps, dst);
1355+ int rc = ffmpeg_mux_raw_to_mp4 (frames, num_frames_out, fps, audio, dst);
12381356
12391357 for (int i = 0 ; i < num_frames_out; i++) {
12401358 if (frames[i].data ) free (frames[i].data );
0 commit comments