diff --git a/backend/go/stablediffusion-ggml/gosd.cpp b/backend/go/stablediffusion-ggml/gosd.cpp
index f010f73a6add..ce5febe77503 100644
--- a/backend/go/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/stablediffusion-ggml/gosd.cpp
@@ -26,6 +26,10 @@
 #include "stb_image_resize.h"
 #include
 #include
+#include <unistd.h>
+#include <sys/wait.h>
+#include <csignal>
+#include <cerrno>
@@ -980,6 +984,251 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
     return !ret;
 }
+// ---------------- Video generation ----------------
+
+sd_vid_gen_params_t* sd_vid_gen_params_new(void) {
+    sd_vid_gen_params_t *params = (sd_vid_gen_params_t *)std::malloc(sizeof(sd_vid_gen_params_t));
+    sd_vid_gen_params_init(params);
+    sd_sample_params_init(&params->sample_params);
+    sd_sample_params_init(&params->high_noise_sample_params);
+    sd_cache_params_init(&params->cache);
+    return params;
+}
+
+// Persistent storage for cleaned video prompts (kept alive for the duration of generation)
+static std::string cleaned_vid_prompt_storage;
+static std::string cleaned_vid_negative_prompt_storage;
+
+void sd_vid_gen_params_set_prompts(sd_vid_gen_params_t *params, const char *prompt, const char *negative_prompt) {
+    lora_vec.clear();
+    lora_strings.clear();
+
+    std::string prompt_str = prompt ? prompt : "";
+    std::string negative_prompt_str = negative_prompt ? negative_prompt : "";
+
+    const char* lora_dir_to_use = lora_dir_path.empty() ? nullptr : lora_dir_path.c_str();
+
+    auto [loras, cleaned_prompt] = parse_loras_from_prompt(prompt_str, lora_dir_to_use);
+    lora_vec = loras;
+    cleaned_vid_prompt_storage = cleaned_prompt;
+
+    auto [neg_loras, cleaned_negative] = parse_loras_from_prompt(negative_prompt_str, lora_dir_to_use);
+    cleaned_vid_negative_prompt_storage = cleaned_negative;
+
+    params->prompt = cleaned_vid_prompt_storage.c_str();
+    params->negative_prompt = cleaned_vid_negative_prompt_storage.c_str();
+    params->loras = lora_vec.empty() ? nullptr : lora_vec.data();
+    params->lora_count = static_cast<uint32_t>(lora_vec.size());
+}
+
+void sd_vid_gen_params_set_dimensions(sd_vid_gen_params_t *params, int width, int height) {
+    params->width = width;
+    params->height = height;
+}
+
+void sd_vid_gen_params_set_seed(sd_vid_gen_params_t *params, int64_t seed) {
+    params->seed = seed;
+}
+
+void sd_vid_gen_params_set_video_frames(sd_vid_gen_params_t *params, int n) {
+    params->video_frames = n;
+}
+
+// Load an image file into an sd_image_t, resizing to target dims if needed.
+// Returns a heap-allocated buffer the caller must free (or nullptr on failure).
+static uint8_t* load_and_resize_image(const char* path, int target_width, int target_height, sd_image_t* out) {
+    if (!path || strlen(path) == 0) {
+        *out = {0, 0, 0, nullptr};
+        return nullptr;
+    }
+    int c = 0, img_w = 0, img_h = 0;
+    uint8_t* buf = stbi_load(path, &img_w, &img_h, &c, 3);
+    if (!buf) {
+        fprintf(stderr, "Failed to load image from '%s'\n", path);
+        *out = {0, 0, 0, nullptr};
+        return nullptr;
+    }
+    if (img_w != target_width || img_h != target_height) {
+        fprintf(stderr, "Resizing image from %dx%d to %dx%d\n", img_w, img_h, target_width, target_height);
+        uint8_t* resized = (uint8_t*)malloc((size_t)target_width * target_height * 3);
+        if (!resized) { free(buf); *out = {0, 0, 0, nullptr}; return nullptr; }
+        stbir_resize(buf, img_w, img_h, 0,
+                     resized, target_width, target_height, 0, STBIR_TYPE_UINT8,
+                     3, STBIR_ALPHA_CHANNEL_NONE, 0,
+                     STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
+                     STBIR_FILTER_BOX, STBIR_FILTER_BOX,
+                     STBIR_COLORSPACE_SRGB, nullptr);
+        free(buf);
+        buf = resized;
+    }
+    *out = {(uint32_t)target_width, (uint32_t)target_height, 3, buf};
+    return buf;
+}
+
+// Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst.
+// Uses fork+execvp to avoid shell interpretation of dst.
+static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) {
+    if (num_frames <= 0 || !frames || !frames[0].data) {
+        fprintf(stderr, "ffmpeg_mux: empty frames\n");
+        return 1;
+    }
+    int width = (int)frames[0].width;
+    int height = (int)frames[0].height;
+    int channels = (int)frames[0].channel;
+    const char* pix_fmt_in = (channels == 4) ? "rgba" : "rgb24";
+
+    char size_str[32];
+    char fps_str[32];
+    snprintf(size_str, sizeof(size_str), "%dx%d", width, height);
+    snprintf(fps_str, sizeof(fps_str), "%d", fps);
+
+    int pipefd[2];
+    if (pipe(pipefd) != 0) { perror("pipe"); return 1; }
+
+    pid_t pid = fork();
+    if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; }
+
+    if (pid == 0) {
+        // child
+        close(pipefd[1]);
+        if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); }
+        close(pipefd[0]);
+        std::vector<char*> argv = {
+            const_cast<char*>("ffmpeg"),
+            const_cast<char*>("-y"),
+            const_cast<char*>("-hide_banner"),
+            const_cast<char*>("-loglevel"), const_cast<char*>("warning"),
+            const_cast<char*>("-f"), const_cast<char*>("rawvideo"),
+            const_cast<char*>("-pix_fmt"), const_cast<char*>(pix_fmt_in),
+            const_cast<char*>("-s"), size_str,
+            const_cast<char*>("-framerate"), fps_str,
+            const_cast<char*>("-i"), const_cast<char*>("-"),
+            const_cast<char*>("-c:v"), const_cast<char*>("libx264"),
+            const_cast<char*>("-pix_fmt"), const_cast<char*>("yuv420p"),
+            const_cast<char*>("-movflags"), const_cast<char*>("+faststart"),
+            const_cast<char*>(dst),
+            nullptr
+        };
+        execvp(argv[0], argv.data());
+        perror("execvp ffmpeg");
+        _exit(127);
+    }
+
+    // parent
+    close(pipefd[0]);
+
+    // Ignore SIGPIPE so a dying ffmpeg surfaces via write() errno instead of killing us.
+    signal(SIGPIPE, SIG_IGN);
+
+    for (int i = 0; i < num_frames; i++) {
+        if (!frames[i].data) continue;
+        size_t frame_bytes = (size_t)frames[i].width * frames[i].height * frames[i].channel;
+        const uint8_t* p = frames[i].data;
+        size_t remaining = frame_bytes;
+        while (remaining > 0) {
+            ssize_t n = write(pipefd[1], p, remaining);
+            if (n < 0) {
+                if (errno == EINTR) continue;
+                perror("write frame to ffmpeg");
+                close(pipefd[1]);
+                int status;
+                waitpid(pid, &status, 0);
+                return 1;
+            }
+            p += n;
+            remaining -= (size_t)n;
+        }
+    }
+    close(pipefd[1]);
+
+    int status = 0;
+    while (waitpid(pid, &status, 0) < 0) {
+        if (errno != EINTR) { perror("waitpid"); return 1; }
+    }
+    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+        fprintf(stderr, "ffmpeg exited with status %d\n", status);
+        return 1;
+    }
+    return 0;
+}
+
+int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int fps, char *init_image, char *end_image) {
+    if (!p) return 1;
+    if (!dst || strlen(dst) == 0) {
+        fprintf(stderr, "gen_video: dst is empty\n");
+        std::free(p);
+        return 1;
+    }
+
+    std::vector<int> skip_layers = {7, 8, 9};
+
+    fprintf(stderr, "Generating video: %dx%d, frames=%d, fps=%d, steps=%d, cfg=%.2f\n",
+            p->width, p->height, p->video_frames, fps, steps, cfg_scale);
+
+    // Sample params (shared by both low and high-noise passes — MoE models use the high-noise
+    // set during the first phase; single-model Wan2.1 ignores it. Same defaults for both is fine.)
+ p->sample_params.guidance.txt_cfg = cfg_scale; + p->sample_params.guidance.slg.layers = skip_layers.data(); + p->sample_params.guidance.slg.layer_count = skip_layers.size(); + p->sample_params.sample_method = sample_method; + p->sample_params.sample_steps = steps; + p->sample_params.scheduler = scheduler; + p->sample_params.flow_shift = flow_shift; + + p->high_noise_sample_params.guidance.txt_cfg = cfg_scale; + p->high_noise_sample_params.guidance.slg.layers = skip_layers.data(); + p->high_noise_sample_params.guidance.slg.layer_count = skip_layers.size(); + p->high_noise_sample_params.sample_method = sample_method; + p->high_noise_sample_params.sample_steps = steps; + p->high_noise_sample_params.scheduler = scheduler; + p->high_noise_sample_params.flow_shift = flow_shift; + + // Load init/end reference images if provided (resized to output dims). + uint8_t* init_buf = nullptr; + uint8_t* end_buf = nullptr; + sd_image_t init_img = {0, 0, 0, nullptr}; + sd_image_t end_img = {0, 0, 0, nullptr}; + if (init_image && strlen(init_image) > 0) { + init_buf = load_and_resize_image(init_image, p->width, p->height, &init_img); + if (!init_buf) { std::free(p); return 1; } + } + if (end_image && strlen(end_image) > 0) { + end_buf = load_and_resize_image(end_image, p->width, p->height, &end_img); + if (!end_buf) { if (init_buf) free(init_buf); std::free(p); return 1; } + } + p->init_image = init_img; + p->end_image = end_img; + + // Generate + int num_frames_out = 0; + sd_image_t* frames = generate_video(sd_c, p, &num_frames_out); + std::free(p); + + if (!frames || num_frames_out == 0) { + fprintf(stderr, "generate_video produced no frames\n"); + if (init_buf) free(init_buf); + if (end_buf) free(end_buf); + return 1; + } + + fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst); + + int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst); + + for (int i = 0; i < num_frames_out; i++) { + if (frames[i].data) free(frames[i].data); + } + 
free(frames); + if (init_buf) free(init_buf); + if (end_buf) free(end_buf); + + if (rc == 0) { + fprintf(stderr, "gen_video done: %s\n", dst); + } + fflush(stderr); + return rc; +} + int unload() { free_sd_ctx(sd_c); return 0; diff --git a/backend/go/stablediffusion-ggml/gosd.go b/backend/go/stablediffusion-ggml/gosd.go index 205f3f2d17c0..219b78470711 100644 --- a/backend/go/stablediffusion-ggml/gosd.go +++ b/backend/go/stablediffusion-ggml/gosd.go @@ -23,6 +23,7 @@ type SDGGML struct { var ( LoadModel func(model, model_apth string, options []uintptr, threads int32, diff int) int GenImage func(params uintptr, steps int, dst string, cfgScale float32, srcImage string, strength float32, maskImage string, refImages []uintptr, refImagesCount int) int + GenVideo func(params uintptr, steps int, dst string, cfgScale float32, fps int, initImage string, endImage string) int TilingParamsSetEnabled func(params uintptr, enabled bool) TilingParamsSetTileSizes func(params uintptr, tileSizeX int, tileSizeY int) @@ -34,6 +35,12 @@ var ( ImgGenParamsSetDimensions func(params uintptr, width int, height int) ImgGenParamsSetSeed func(params uintptr, seed int64) ImgGenParamsGetVaeTilingParams func(params uintptr) uintptr + + VidGenParamsNew func() uintptr + VidGenParamsSetPrompts func(params uintptr, prompt string, negativePrompt string) + VidGenParamsSetDimensions func(params uintptr, width int, height int) + VidGenParamsSetSeed func(params uintptr, seed int64) + VidGenParamsSetVideoFrames func(params uintptr, n int) ) // Copied from Purego internal/strings @@ -153,3 +160,58 @@ func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error { return nil } + +func (sd *SDGGML) GenerateVideo(opts *pb.GenerateVideoRequest) error { + dst := opts.Dst + if dst == "" { + return fmt.Errorf("dst is empty") + } + + width := int(opts.Width) + height := int(opts.Height) + if width == 0 { + width = 512 + } + if height == 0 { + height = 512 + } + + numFrames := int(opts.NumFrames) + if 
numFrames <= 0 { + numFrames = 16 + } + + fps := int(opts.Fps) + if fps <= 0 { + fps = 16 + } + + steps := int(opts.Step) + if steps <= 0 { + steps = 20 + } + + cfg := opts.CfgScale + if cfg == 0 { + cfg = sd.cfgScale + } + if cfg == 0 { + cfg = 5.0 + } + + // sd_vid_gen_params_new allocates; gen_video frees it after the generation call. + p := VidGenParamsNew() + VidGenParamsSetPrompts(p, opts.Prompt, opts.NegativePrompt) + VidGenParamsSetDimensions(p, width, height) + VidGenParamsSetSeed(p, int64(opts.Seed)) + VidGenParamsSetVideoFrames(p, numFrames) + + fmt.Fprintf(os.Stderr, "GenerateVideo: dst=%s size=%dx%d frames=%d fps=%d steps=%d cfg=%.2f\n", + dst, width, height, numFrames, fps, steps, cfg) + + ret := GenVideo(p, steps, dst, cfg, fps, opts.StartImage, opts.EndImage) + if ret != 0 { + return fmt.Errorf("video inference failed (code %d)", ret) + } + return nil +} diff --git a/backend/go/stablediffusion-ggml/gosd.h b/backend/go/stablediffusion-ggml/gosd.h index 8324a3ead4ea..31ce72ab7cc6 100644 --- a/backend/go/stablediffusion-ggml/gosd.h +++ b/backend/go/stablediffusion-ggml/gosd.h @@ -18,6 +18,13 @@ void sd_img_gen_params_set_seed(sd_img_gen_params_t *params, int64_t seed); int load_model(const char *model, char *model_path, char* options[], int threads, int diffusionModel); int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char* ref_images[], int ref_images_count); + +sd_vid_gen_params_t* sd_vid_gen_params_new(void); +void sd_vid_gen_params_set_prompts(sd_vid_gen_params_t *params, const char *prompt, const char *negative_prompt); +void sd_vid_gen_params_set_dimensions(sd_vid_gen_params_t *params, int width, int height); +void sd_vid_gen_params_set_seed(sd_vid_gen_params_t *params, int64_t seed); +void sd_vid_gen_params_set_video_frames(sd_vid_gen_params_t *params, int n); +int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int fps, char *init_image, char 
*end_image); #ifdef __cplusplus } #endif diff --git a/backend/go/stablediffusion-ggml/main.go b/backend/go/stablediffusion-ggml/main.go index 2181f576439f..998f2a5ab85b 100644 --- a/backend/go/stablediffusion-ggml/main.go +++ b/backend/go/stablediffusion-ggml/main.go @@ -32,6 +32,7 @@ func main() { libFuncs := []LibFuncs{ {&LoadModel, "load_model"}, {&GenImage, "gen_image"}, + {&GenVideo, "gen_video"}, {&TilingParamsSetEnabled, "sd_tiling_params_set_enabled"}, {&TilingParamsSetTileSizes, "sd_tiling_params_set_tile_sizes"}, {&TilingParamsSetRelSizes, "sd_tiling_params_set_rel_sizes"}, @@ -42,6 +43,12 @@ func main() { {&ImgGenParamsSetDimensions, "sd_img_gen_params_set_dimensions"}, {&ImgGenParamsSetSeed, "sd_img_gen_params_set_seed"}, {&ImgGenParamsGetVaeTilingParams, "sd_img_gen_params_get_vae_tiling_params"}, + + {&VidGenParamsNew, "sd_vid_gen_params_new"}, + {&VidGenParamsSetPrompts, "sd_vid_gen_params_set_prompts"}, + {&VidGenParamsSetDimensions, "sd_vid_gen_params_set_dimensions"}, + {&VidGenParamsSetSeed, "sd_vid_gen_params_set_seed"}, + {&VidGenParamsSetVideoFrames, "sd_vid_gen_params_set_video_frames"}, } for _, lf := range libFuncs { diff --git a/core/http/endpoints/localai/video.go b/core/http/endpoints/localai/video.go index 8a65ae5fd9e0..cfed9c204eaf 100644 --- a/core/http/endpoints/localai/video.go +++ b/core/http/endpoints/localai/video.go @@ -80,51 +80,65 @@ func VideoEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi return echo.ErrBadRequest } - src := "" - if input.StartImage != "" { - + // Stage a base64- or URL-provided image into a temp file so the + // backend can read it as a path. Used for both start_image and + // (optional) end_image. Returns the temp file path, or "" if the + // input is empty. Caller is responsible for the defer-cleanup. 
+ stageImage := func(ref string) (string, error) { + if ref == "" { + return "", nil + } var fileData []byte var err error - // check if input.File is an URL, if so download it and save it - // to a temporary file - if strings.HasPrefix(input.StartImage, "http://") || strings.HasPrefix(input.StartImage, "https://") { - out, err := downloadFile(input.StartImage) - if err != nil { - return fmt.Errorf("failed downloading file:%w", err) + if strings.HasPrefix(ref, "http://") || strings.HasPrefix(ref, "https://") { + out, derr := downloadFile(ref) + if derr != nil { + return "", fmt.Errorf("failed downloading file: %w", derr) } defer os.RemoveAll(out) - fileData, err = os.ReadFile(out) if err != nil { - return fmt.Errorf("failed reading file:%w", err) + return "", fmt.Errorf("failed reading file: %w", err) } - } else { - // base 64 decode the file and write it somewhere - // that we will cleanup - fileData, err = base64.StdEncoding.DecodeString(input.StartImage) + fileData, err = base64.StdEncoding.DecodeString(ref) if err != nil { - return err + return "", err } } - - // Create a temporary file outputFile, err := os.CreateTemp(appConfig.GeneratedContentDir, "b64") if err != nil { - return err + return "", err } - // write the base64 result writer := bufio.NewWriter(outputFile) - _, err = writer.Write(fileData) - if err != nil { + if _, err := writer.Write(fileData); err != nil { outputFile.Close() - return err + return "", err + } + if err := writer.Flush(); err != nil { + outputFile.Close() + return "", err } outputFile.Close() - src = outputFile.Name() + return outputFile.Name(), nil + } + + src, err := stageImage(input.StartImage) + if err != nil { + return err + } + if src != "" { defer os.RemoveAll(src) } + endSrc, err := stageImage(input.EndImage) + if err != nil { + return err + } + if endSrc != "" { + defer os.RemoveAll(endSrc) + } + xlog.Debug("Parameter Config", "config", config) switch config.Backend { @@ -184,7 +198,7 @@ func VideoEndpoint(cl 
*config.ModelConfigLoader, ml *model.ModelLoader, appConfi input.Prompt, input.NegativePrompt, src, - input.EndImage, + endSrc, output, input.NumFrames, input.FPS, diff --git a/gallery/index.yaml b/gallery/index.yaml index 712f7c4ff07e..5fab5dcc2ecb 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -15167,6 +15167,62 @@ - sd-3 - gpu url: "github:mudler/LocalAI/gallery/stablediffusion3.yaml@master" +- name: wan-2.1-t2v-1.3b-ggml + license: apache-2.0 + url: "github:mudler/LocalAI/gallery/wan-ggml.yaml@master" + description: | + Wan 2.1 T2V 1.3B — text-to-video diffusion model, GGUF-quantized for the + stable-diffusion.cpp backend. Generates short (33-frame) 832x480 clips + from a text prompt. Cheapest Wan variant, suitable for CPU-offloaded + inference with ~10 GB of usable RAM. + urls: + - https://huggingface.co/calcuis/wan-gguf + - https://huggingface.co/city96/umt5-xxl-encoder-gguf + tags: + - text-to-video + - wan + - video-generation + - cpu + - gpu + overrides: + parameters: + model: wan2.1-t2v-1.3B-Q8_0.gguf + files: + - filename: "wan2.1-t2v-1.3B-Q8_0.gguf" + uri: "huggingface://calcuis/wan-gguf/wan2.1-t2v-1.3B-Q8_0.gguf" + - filename: "wan_2.1_vae.safetensors" + uri: "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors" + - filename: "umt5-xxl-encoder-Q8_0.gguf" + uri: "huggingface://city96/umt5-xxl-encoder-gguf/umt5-xxl-encoder-Q8_0.gguf" +- name: wan-2.1-i2v-14b-480p-ggml + license: apache-2.0 + url: "github:mudler/LocalAI/gallery/wan-ggml.yaml@master" + description: | + Wan 2.1 I2V 14B 480P — image-to-video diffusion, GGUF Q4 quantization. + Animates a reference image into a 33-frame 480p clip. Requires more + RAM than the 1.3B T2V variant; CPU offload enabled by default. 
+ urls: + - https://huggingface.co/city96/Wan2.1-I2V-14B-480P-gguf + tags: + - image-to-video + - wan + - video-generation + - cpu + - gpu + overrides: + parameters: + model: wan2.1-i2v-14b-480p-Q4_K_M.gguf + options: + - "clip_vision_path:clip_vision_h.safetensors" + files: + - filename: "wan2.1-i2v-14b-480p-Q4_K_M.gguf" + uri: "huggingface://city96/Wan2.1-I2V-14B-480P-gguf/wan2.1-i2v-14b-480p-Q4_K_M.gguf" + - filename: "wan_2.1_vae.safetensors" + uri: "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors" + - filename: "umt5-xxl-encoder-Q8_0.gguf" + uri: "huggingface://city96/umt5-xxl-encoder-gguf/umt5-xxl-encoder-Q8_0.gguf" + - filename: "clip_vision_h.safetensors" + uri: "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/clip_vision/clip_vision_h.safetensors" - name: sd-1.5-ggml icon: https://avatars.githubusercontent.com/u/37351293 license: creativeml-openrail-m diff --git a/gallery/wan-ggml.yaml b/gallery/wan-ggml.yaml new file mode 100644 index 000000000000..4000424748ff --- /dev/null +++ b/gallery/wan-ggml.yaml @@ -0,0 +1,19 @@ +--- +name: "wan-ggml" + +config_file: | + backend: stablediffusion-ggml + step: 20 + cfg_scale: 6.0 + options: + - "diffusion_model" + - "vae_decode_only:false" + - "sampler:euler" + - "scheduler:discrete" + - "flow_shift:3.0" + - "diffusion_flash_attn:true" + - "offload_params_to_cpu:true" + - "keep_vae_on_cpu:true" + - "keep_clip_on_cpu:true" + - "t5xxl_path:umt5-xxl-encoder-Q8_0.gguf" + - "vae_path:wan_2.1_vae.safetensors"