Skip to content

Commit 054c4b4

Browse files
authored
feat(stable-diffusion.ggml): add support for video generation (#9420)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent 6e49dba commit 054c4b4

File tree

7 files changed

+439
-25
lines changed

7 files changed

+439
-25
lines changed

backend/go/stablediffusion-ggml/gosd.cpp

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
#include "stb_image_resize.h"
2727
#include <stdlib.h>
2828
#include <regex>
29+
#include <errno.h>
30+
#include <signal.h>
31+
#include <unistd.h>
32+
#include <sys/wait.h>
2933

3034

3135

@@ -980,6 +984,251 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
980984
return !ret;
981985
}
982986

987+
// ---------------- Video generation ----------------
988+
989+
/**
 * Allocate a video-generation parameter block and fill it with defaults.
 *
 * The block comes from std::malloc and is initialised with
 * sd_vid_gen_params_init(), after which both sample-parameter sets and the
 * cache parameters are reset through their own init functions.
 * gen_video() releases the block with std::free().
 *
 * @return the initialised block, or nullptr when the allocation fails.
 */
sd_vid_gen_params_t* sd_vid_gen_params_new(void) {
    sd_vid_gen_params_t *params =
        static_cast<sd_vid_gen_params_t *>(std::malloc(sizeof(sd_vid_gen_params_t)));
    if (params == nullptr) {
        // Do not hand a null pointer to the init functions below.
        fprintf(stderr, "sd_vid_gen_params_new: out of memory\n");
        return nullptr;
    }
    sd_vid_gen_params_init(params);
    sd_sample_params_init(&params->sample_params);
    sd_sample_params_init(&params->high_noise_sample_params);
    sd_cache_params_init(&params->cache);
    return params;
}
997+
998+
// Persistent storage for cleaned video prompts (kept alive for the duration of generation)
999+
static std::string cleaned_vid_prompt_storage;
1000+
static std::string cleaned_vid_negative_prompt_storage;
1001+
1002+
void sd_vid_gen_params_set_prompts(sd_vid_gen_params_t *params, const char *prompt, const char *negative_prompt) {
1003+
lora_vec.clear();
1004+
lora_strings.clear();
1005+
1006+
std::string prompt_str = prompt ? prompt : "";
1007+
std::string negative_prompt_str = negative_prompt ? negative_prompt : "";
1008+
1009+
const char* lora_dir_to_use = lora_dir_path.empty() ? nullptr : lora_dir_path.c_str();
1010+
1011+
auto [loras, cleaned_prompt] = parse_loras_from_prompt(prompt_str, lora_dir_to_use);
1012+
lora_vec = loras;
1013+
cleaned_vid_prompt_storage = cleaned_prompt;
1014+
1015+
auto [neg_loras, cleaned_negative] = parse_loras_from_prompt(negative_prompt_str, lora_dir_to_use);
1016+
cleaned_vid_negative_prompt_storage = cleaned_negative;
1017+
1018+
params->prompt = cleaned_vid_prompt_storage.c_str();
1019+
params->negative_prompt = cleaned_vid_negative_prompt_storage.c_str();
1020+
params->loras = lora_vec.empty() ? nullptr : lora_vec.data();
1021+
params->lora_count = static_cast<uint32_t>(lora_vec.size());
1022+
}
1023+
1024+
/**
 * Set the output frame size on a video parameter block.
 *
 * @param params parameter block to update; a null pointer is ignored.
 * @param width  value stored in params->width.
 * @param height value stored in params->height.
 */
void sd_vid_gen_params_set_dimensions(sd_vid_gen_params_t *params, int width, int height) {
    if (params == nullptr) {
        return;
    }
    params->width = width;
    params->height = height;
}
1028+
1029+
/**
 * Set the seed on a video parameter block.
 *
 * @param params parameter block to update; a null pointer is ignored.
 * @param seed   value stored in params->seed.
 */
void sd_vid_gen_params_set_seed(sd_vid_gen_params_t *params, int64_t seed) {
    if (params == nullptr) {
        return;
    }
    params->seed = seed;
}
1032+
1033+
/**
 * Set the requested number of video frames on a video parameter block.
 *
 * @param params parameter block to update; a null pointer is ignored.
 * @param n      value stored in params->video_frames.
 */
void sd_vid_gen_params_set_video_frames(sd_vid_gen_params_t *params, int n) {
    if (params == nullptr) {
        return;
    }
    params->video_frames = n;
}
1036+
1037+
// Load an image file into an sd_image_t, resizing to target dims if needed.
1038+
// Returns a heap-allocated buffer the caller must free (or nullptr on failure).
1039+
static uint8_t* load_and_resize_image(const char* path, int target_width, int target_height, sd_image_t* out) {
1040+
if (!path || strlen(path) == 0) {
1041+
*out = {0, 0, 0, nullptr};
1042+
return nullptr;
1043+
}
1044+
int c = 0, img_w = 0, img_h = 0;
1045+
uint8_t* buf = stbi_load(path, &img_w, &img_h, &c, 3);
1046+
if (!buf) {
1047+
fprintf(stderr, "Failed to load image from '%s'\n", path);
1048+
*out = {0, 0, 0, nullptr};
1049+
return nullptr;
1050+
}
1051+
if (img_w != target_width || img_h != target_height) {
1052+
fprintf(stderr, "Resizing image from %dx%d to %dx%d\n", img_w, img_h, target_width, target_height);
1053+
uint8_t* resized = (uint8_t*)malloc((size_t)target_width * target_height * 3);
1054+
if (!resized) { free(buf); *out = {0, 0, 0, nullptr}; return nullptr; }
1055+
stbir_resize(buf, img_w, img_h, 0,
1056+
resized, target_width, target_height, 0, STBIR_TYPE_UINT8,
1057+
3, STBIR_ALPHA_CHANNEL_NONE, 0,
1058+
STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
1059+
STBIR_FILTER_BOX, STBIR_FILTER_BOX,
1060+
STBIR_COLORSPACE_SRGB, nullptr);
1061+
free(buf);
1062+
buf = resized;
1063+
}
1064+
*out = {(uint32_t)target_width, (uint32_t)target_height, 3, buf};
1065+
return buf;
1066+
}
1067+
1068+
// Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst.
1069+
// Uses fork+execvp to avoid shell interpretation of dst.
1070+
static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) {
1071+
if (num_frames <= 0 || !frames || !frames[0].data) {
1072+
fprintf(stderr, "ffmpeg_mux: empty frames\n");
1073+
return 1;
1074+
}
1075+
int width = (int)frames[0].width;
1076+
int height = (int)frames[0].height;
1077+
int channels = (int)frames[0].channel;
1078+
const char* pix_fmt_in = (channels == 4) ? "rgba" : "rgb24";
1079+
1080+
char size_str[32];
1081+
char fps_str[32];
1082+
snprintf(size_str, sizeof(size_str), "%dx%d", width, height);
1083+
snprintf(fps_str, sizeof(fps_str), "%d", fps);
1084+
1085+
int pipefd[2];
1086+
if (pipe(pipefd) != 0) { perror("pipe"); return 1; }
1087+
1088+
pid_t pid = fork();
1089+
if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; }
1090+
1091+
if (pid == 0) {
1092+
// child
1093+
close(pipefd[1]);
1094+
if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); }
1095+
close(pipefd[0]);
1096+
std::vector<char*> argv = {
1097+
const_cast<char*>("ffmpeg"),
1098+
const_cast<char*>("-y"),
1099+
const_cast<char*>("-hide_banner"),
1100+
const_cast<char*>("-loglevel"), const_cast<char*>("warning"),
1101+
const_cast<char*>("-f"), const_cast<char*>("rawvideo"),
1102+
const_cast<char*>("-pix_fmt"), const_cast<char*>(pix_fmt_in),
1103+
const_cast<char*>("-s"), size_str,
1104+
const_cast<char*>("-framerate"), fps_str,
1105+
const_cast<char*>("-i"), const_cast<char*>("-"),
1106+
const_cast<char*>("-c:v"), const_cast<char*>("libx264"),
1107+
const_cast<char*>("-pix_fmt"), const_cast<char*>("yuv420p"),
1108+
const_cast<char*>("-movflags"), const_cast<char*>("+faststart"),
1109+
const_cast<char*>(dst),
1110+
nullptr
1111+
};
1112+
execvp(argv[0], argv.data());
1113+
perror("execvp ffmpeg");
1114+
_exit(127);
1115+
}
1116+
1117+
// parent
1118+
close(pipefd[0]);
1119+
1120+
// Ignore SIGPIPE so a dying ffmpeg surfaces via write() errno instead of killing us.
1121+
signal(SIGPIPE, SIG_IGN);
1122+
1123+
for (int i = 0; i < num_frames; i++) {
1124+
if (!frames[i].data) continue;
1125+
size_t frame_bytes = (size_t)frames[i].width * frames[i].height * frames[i].channel;
1126+
const uint8_t* p = frames[i].data;
1127+
size_t remaining = frame_bytes;
1128+
while (remaining > 0) {
1129+
ssize_t n = write(pipefd[1], p, remaining);
1130+
if (n < 0) {
1131+
if (errno == EINTR) continue;
1132+
perror("write frame to ffmpeg");
1133+
close(pipefd[1]);
1134+
int status;
1135+
waitpid(pid, &status, 0);
1136+
return 1;
1137+
}
1138+
p += n;
1139+
remaining -= (size_t)n;
1140+
}
1141+
}
1142+
close(pipefd[1]);
1143+
1144+
int status = 0;
1145+
while (waitpid(pid, &status, 0) < 0) {
1146+
if (errno != EINTR) { perror("waitpid"); return 1; }
1147+
}
1148+
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
1149+
fprintf(stderr, "ffmpeg exited with status %d\n", status);
1150+
return 1;
1151+
}
1152+
return 0;
1153+
}
1154+
1155+
/**
 * Generate a video with the loaded model and write it to `dst` as an MP4.
 *
 * Takes ownership of `p`: whenever `p` is non-null it is released with
 * std::free() before this function returns, on success and on failure alike.
 *
 * @param p          parameter block from sd_vid_gen_params_new(); null yields 1.
 * @param steps      sample step count applied to both sample-parameter sets.
 * @param dst        output path handed to ffmpeg; must be non-empty.
 * @param cfg_scale  text guidance scale applied to both sample-parameter sets.
 * @param fps        frame rate passed to ffmpeg.
 * @param init_image optional path of the first reference image (null/empty = none).
 * @param end_image  optional path of the last reference image (null/empty = none).
 * @return 0 on success, 1 on any failure.
 */
int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int fps, char *init_image, char *end_image) {
    if (!p) return 1;
    if (!dst || strlen(dst) == 0) {
        fprintf(stderr, "gen_video: dst is empty\n");
        std::free(p);
        return 1;
    }

    // Must outlive generate_video(): the params only hold a pointer into it.
    std::vector<int> skip_layers = {7, 8, 9};

    fprintf(stderr, "Generating video: %dx%d, frames=%d, fps=%d, steps=%d, cfg=%.2f\n",
            p->width, p->height, p->video_frames, fps, steps, cfg_scale);

    // Sample params (shared by both low and high-noise passes — MoE models use the high-noise
    // set during the first phase; single-model Wan2.1 ignores it. Same defaults for both is fine.)
    auto configure_sampling = [&](auto &sp) {
        sp.guidance.txt_cfg = cfg_scale;
        sp.guidance.slg.layers = skip_layers.data();
        sp.guidance.slg.layer_count = skip_layers.size();
        sp.sample_method = sample_method;
        sp.sample_steps = steps;
        sp.scheduler = scheduler;
        sp.flow_shift = flow_shift;
    };
    configure_sampling(p->sample_params);
    configure_sampling(p->high_noise_sample_params);

    // Load init/end reference images if provided (resized to output dims).
    uint8_t* init_buf = nullptr;
    uint8_t* end_buf = nullptr;
    sd_image_t init_img = {0, 0, 0, nullptr};
    sd_image_t end_img = {0, 0, 0, nullptr};
    if (init_image && strlen(init_image) > 0) {
        init_buf = load_and_resize_image(init_image, p->width, p->height, &init_img);
        if (!init_buf) {
            std::free(p);
            return 1;
        }
    }
    if (end_image && strlen(end_image) > 0) {
        end_buf = load_and_resize_image(end_image, p->width, p->height, &end_img);
        if (!end_buf) {
            free(init_buf);  // free(nullptr) is a no-op
            std::free(p);
            return 1;
        }
    }
    p->init_image = init_img;
    p->end_image = end_img;

    // Generate
    int num_frames_out = 0;
    sd_image_t* frames = generate_video(sd_c, p, &num_frames_out);
    std::free(p);

    if (!frames || num_frames_out <= 0) {
        fprintf(stderr, "generate_video produced no frames\n");
        // A non-null array with zero frames must still be released.
        free(frames);
        free(init_buf);
        free(end_buf);
        return 1;
    }

    fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst);

    const int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst);

    for (int i = 0; i < num_frames_out; i++) {
        free(frames[i].data);
    }
    free(frames);
    free(init_buf);
    free(end_buf);

    if (rc == 0) {
        fprintf(stderr, "gen_video done: %s\n", dst);
    }
    fflush(stderr);
    return rc;
}
1231+
9831232
int unload() {
9841233
free_sd_ctx(sd_c);
9851234
return 0;

backend/go/stablediffusion-ggml/gosd.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ type SDGGML struct {
2323
var (
2424
LoadModel func(model, model_apth string, options []uintptr, threads int32, diff int) int
2525
GenImage func(params uintptr, steps int, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []uintptr, refImagesCount int) int
26+
GenVideo func(params uintptr, steps int, dst string, cfgScale float32, fps int, initImage string, endImage string) int
2627

2728
TilingParamsSetEnabled func(params uintptr, enabled bool)
2829
TilingParamsSetTileSizes func(params uintptr, tileSizeX int, tileSizeY int)
@@ -34,6 +35,12 @@ var (
3435
ImgGenParamsSetDimensions func(params uintptr, width int, height int)
3536
ImgGenParamsSetSeed func(params uintptr, seed int64)
3637
ImgGenParamsGetVaeTilingParams func(params uintptr) uintptr
38+
39+
VidGenParamsNew func() uintptr
40+
VidGenParamsSetPrompts func(params uintptr, prompt string, negativePrompt string)
41+
VidGenParamsSetDimensions func(params uintptr, width int, height int)
42+
VidGenParamsSetSeed func(params uintptr, seed int64)
43+
VidGenParamsSetVideoFrames func(params uintptr, n int)
3744
)
3845

3946
// Copied from Purego internal/strings
@@ -153,3 +160,58 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
153160

154161
return nil
155162
}
163+
164+
func (sd *SDGGML) GenerateVideo(opts *pb.GenerateVideoRequest) error {
165+
dst := opts.Dst
166+
if dst == "" {
167+
return fmt.Errorf("dst is empty")
168+
}
169+
170+
width := int(opts.Width)
171+
height := int(opts.Height)
172+
if width == 0 {
173+
width = 512
174+
}
175+
if height == 0 {
176+
height = 512
177+
}
178+
179+
numFrames := int(opts.NumFrames)
180+
if numFrames <= 0 {
181+
numFrames = 16
182+
}
183+
184+
fps := int(opts.Fps)
185+
if fps <= 0 {
186+
fps = 16
187+
}
188+
189+
steps := int(opts.Step)
190+
if steps <= 0 {
191+
steps = 20
192+
}
193+
194+
cfg := opts.CfgScale
195+
if cfg == 0 {
196+
cfg = sd.cfgScale
197+
}
198+
if cfg == 0 {
199+
cfg = 5.0
200+
}
201+
202+
// sd_vid_gen_params_new allocates; gen_video frees it after the generation call.
203+
p := VidGenParamsNew()
204+
VidGenParamsSetPrompts(p, opts.Prompt, opts.NegativePrompt)
205+
VidGenParamsSetDimensions(p, width, height)
206+
VidGenParamsSetSeed(p, int64(opts.Seed))
207+
VidGenParamsSetVideoFrames(p, numFrames)
208+
209+
fmt.Fprintf(os.Stderr, "GenerateVideo: dst=%s size=%dx%d frames=%d fps=%d steps=%d cfg=%.2f\n",
210+
dst, width, height, numFrames, fps, steps, cfg)
211+
212+
ret := GenVideo(p, steps, dst, cfg, fps, opts.StartImage, opts.EndImage)
213+
if ret != 0 {
214+
return fmt.Errorf("video inference failed (code %d)", ret)
215+
}
216+
return nil
217+
}

backend/go/stablediffusion-ggml/gosd.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed);
1818

1919
int load_model(const char *model, char *model_path, char* options[], int threads, int diffusionModel);
2020
int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char* ref_images[], int ref_images_count);
21+
22+
sd_vid_gen_params_t* sd_vid_gen_params_new(void);
23+
void sd_vid_gen_params_set_prompts(sd_vid_gen_params_t *params, const char *prompt, const char *negative_prompt);
24+
void sd_vid_gen_params_set_dimensions(sd_vid_gen_params_t *params, int width, int height);
25+
void sd_vid_gen_params_set_seed(sd_vid_gen_params_t *params, int64_t seed);
26+
void sd_vid_gen_params_set_video_frames(sd_vid_gen_params_t *params, int n);
27+
int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int fps, char *init_image, char *end_image);
2128
#ifdef __cplusplus
2229
}
2330
#endif

backend/go/stablediffusion-ggml/main.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ func main() {
3232
libFuncs := []LibFuncs{
3333
{&LoadModel, "load_model"},
3434
{&GenImage, "gen_image"},
35+
{&GenVideo, "gen_video"},
3536
{&TilingParamsSetEnabled, "sd_tiling_params_set_enabled"},
3637
{&TilingParamsSetTileSizes, "sd_tiling_params_set_tile_sizes"},
3738
{&TilingParamsSetRelSizes, "sd_tiling_params_set_rel_sizes"},
@@ -42,6 +43,12 @@ func main() {
4243
{&ImgGenParamsSetDimensions, "sd_img_gen_params_set_dimensions"},
4344
{&ImgGenParamsSetSeed, "sd_img_gen_params_set_seed"},
4445
{&ImgGenParamsGetVaeTilingParams, "sd_img_gen_params_get_vae_tiling_params"},
46+
47+
{&VidGenParamsNew, "sd_vid_gen_params_new"},
48+
{&VidGenParamsSetPrompts, "sd_vid_gen_params_set_prompts"},
49+
{&VidGenParamsSetDimensions, "sd_vid_gen_params_set_dimensions"},
50+
{&VidGenParamsSetSeed, "sd_vid_gen_params_set_seed"},
51+
{&VidGenParamsSetVideoFrames, "sd_vid_gen_params_set_video_frames"},
4552
}
4653

4754
for _, lf := range libFuncs {

0 commit comments

Comments
 (0)