|
26 | 26 | #include "stb_image_resize.h" |
27 | 27 | #include <stdlib.h> |
28 | 28 | #include <regex> |
| 29 | +#include <errno.h> |
| 30 | +#include <signal.h> |
| 31 | +#include <unistd.h> |
| 32 | +#include <sys/wait.h> |
29 | 33 |
|
30 | 34 |
|
31 | 35 |
|
@@ -980,6 +984,251 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha |
980 | 984 | return !ret; |
981 | 985 | } |
982 | 986 |
|
| 987 | +// ---------------- Video generation ---------------- |
| 988 | + |
| 989 | +sd_vid_gen_params_t* sd_vid_gen_params_new(void) { |
| 990 | + sd_vid_gen_params_t *params = (sd_vid_gen_params_t *)std::malloc(sizeof(sd_vid_gen_params_t)); |
| 991 | + sd_vid_gen_params_init(params); |
| 992 | + sd_sample_params_init(¶ms->sample_params); |
| 993 | + sd_sample_params_init(¶ms->high_noise_sample_params); |
| 994 | + sd_cache_params_init(¶ms->cache); |
| 995 | + return params; |
| 996 | +} |
| 997 | + |
| 998 | +// Persistent storage for cleaned video prompts (kept alive for the duration of generation) |
| 999 | +static std::string cleaned_vid_prompt_storage; |
| 1000 | +static std::string cleaned_vid_negative_prompt_storage; |
| 1001 | + |
| 1002 | +void sd_vid_gen_params_set_prompts(sd_vid_gen_params_t *params, const char *prompt, const char *negative_prompt) { |
| 1003 | + lora_vec.clear(); |
| 1004 | + lora_strings.clear(); |
| 1005 | + |
| 1006 | + std::string prompt_str = prompt ? prompt : ""; |
| 1007 | + std::string negative_prompt_str = negative_prompt ? negative_prompt : ""; |
| 1008 | + |
| 1009 | + const char* lora_dir_to_use = lora_dir_path.empty() ? nullptr : lora_dir_path.c_str(); |
| 1010 | + |
| 1011 | + auto [loras, cleaned_prompt] = parse_loras_from_prompt(prompt_str, lora_dir_to_use); |
| 1012 | + lora_vec = loras; |
| 1013 | + cleaned_vid_prompt_storage = cleaned_prompt; |
| 1014 | + |
| 1015 | + auto [neg_loras, cleaned_negative] = parse_loras_from_prompt(negative_prompt_str, lora_dir_to_use); |
| 1016 | + cleaned_vid_negative_prompt_storage = cleaned_negative; |
| 1017 | + |
| 1018 | + params->prompt = cleaned_vid_prompt_storage.c_str(); |
| 1019 | + params->negative_prompt = cleaned_vid_negative_prompt_storage.c_str(); |
| 1020 | + params->loras = lora_vec.empty() ? nullptr : lora_vec.data(); |
| 1021 | + params->lora_count = static_cast<uint32_t>(lora_vec.size()); |
| 1022 | +} |
| 1023 | + |
| 1024 | +void sd_vid_gen_params_set_dimensions(sd_vid_gen_params_t *params, int width, int height) { |
| 1025 | + params->width = width; |
| 1026 | + params->height = height; |
| 1027 | +} |
| 1028 | + |
| 1029 | +void sd_vid_gen_params_set_seed(sd_vid_gen_params_t *params, int64_t seed) { |
| 1030 | + params->seed = seed; |
| 1031 | +} |
| 1032 | + |
| 1033 | +void sd_vid_gen_params_set_video_frames(sd_vid_gen_params_t *params, int n) { |
| 1034 | + params->video_frames = n; |
| 1035 | +} |
| 1036 | + |
| 1037 | +// Load an image file into an sd_image_t, resizing to target dims if needed. |
| 1038 | +// Returns a heap-allocated buffer the caller must free (or nullptr on failure). |
| 1039 | +static uint8_t* load_and_resize_image(const char* path, int target_width, int target_height, sd_image_t* out) { |
| 1040 | + if (!path || strlen(path) == 0) { |
| 1041 | + *out = {0, 0, 0, nullptr}; |
| 1042 | + return nullptr; |
| 1043 | + } |
| 1044 | + int c = 0, img_w = 0, img_h = 0; |
| 1045 | + uint8_t* buf = stbi_load(path, &img_w, &img_h, &c, 3); |
| 1046 | + if (!buf) { |
| 1047 | + fprintf(stderr, "Failed to load image from '%s'\n", path); |
| 1048 | + *out = {0, 0, 0, nullptr}; |
| 1049 | + return nullptr; |
| 1050 | + } |
| 1051 | + if (img_w != target_width || img_h != target_height) { |
| 1052 | + fprintf(stderr, "Resizing image from %dx%d to %dx%d\n", img_w, img_h, target_width, target_height); |
| 1053 | + uint8_t* resized = (uint8_t*)malloc((size_t)target_width * target_height * 3); |
| 1054 | + if (!resized) { free(buf); *out = {0, 0, 0, nullptr}; return nullptr; } |
| 1055 | + stbir_resize(buf, img_w, img_h, 0, |
| 1056 | + resized, target_width, target_height, 0, STBIR_TYPE_UINT8, |
| 1057 | + 3, STBIR_ALPHA_CHANNEL_NONE, 0, |
| 1058 | + STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, |
| 1059 | + STBIR_FILTER_BOX, STBIR_FILTER_BOX, |
| 1060 | + STBIR_COLORSPACE_SRGB, nullptr); |
| 1061 | + free(buf); |
| 1062 | + buf = resized; |
| 1063 | + } |
| 1064 | + *out = {(uint32_t)target_width, (uint32_t)target_height, 3, buf}; |
| 1065 | + return buf; |
| 1066 | +} |
| 1067 | + |
| 1068 | +// Pipe raw RGB/RGBA frames to ffmpeg stdin and let it produce an MP4 at dst. |
| 1069 | +// Uses fork+execvp to avoid shell interpretation of dst. |
| 1070 | +static int ffmpeg_mux_raw_to_mp4(sd_image_t* frames, int num_frames, int fps, const char* dst) { |
| 1071 | + if (num_frames <= 0 || !frames || !frames[0].data) { |
| 1072 | + fprintf(stderr, "ffmpeg_mux: empty frames\n"); |
| 1073 | + return 1; |
| 1074 | + } |
| 1075 | + int width = (int)frames[0].width; |
| 1076 | + int height = (int)frames[0].height; |
| 1077 | + int channels = (int)frames[0].channel; |
| 1078 | + const char* pix_fmt_in = (channels == 4) ? "rgba" : "rgb24"; |
| 1079 | + |
| 1080 | + char size_str[32]; |
| 1081 | + char fps_str[32]; |
| 1082 | + snprintf(size_str, sizeof(size_str), "%dx%d", width, height); |
| 1083 | + snprintf(fps_str, sizeof(fps_str), "%d", fps); |
| 1084 | + |
| 1085 | + int pipefd[2]; |
| 1086 | + if (pipe(pipefd) != 0) { perror("pipe"); return 1; } |
| 1087 | + |
| 1088 | + pid_t pid = fork(); |
| 1089 | + if (pid < 0) { perror("fork"); close(pipefd[0]); close(pipefd[1]); return 1; } |
| 1090 | + |
| 1091 | + if (pid == 0) { |
| 1092 | + // child |
| 1093 | + close(pipefd[1]); |
| 1094 | + if (dup2(pipefd[0], STDIN_FILENO) < 0) { perror("dup2"); _exit(127); } |
| 1095 | + close(pipefd[0]); |
| 1096 | + std::vector<char*> argv = { |
| 1097 | + const_cast<char*>("ffmpeg"), |
| 1098 | + const_cast<char*>("-y"), |
| 1099 | + const_cast<char*>("-hide_banner"), |
| 1100 | + const_cast<char*>("-loglevel"), const_cast<char*>("warning"), |
| 1101 | + const_cast<char*>("-f"), const_cast<char*>("rawvideo"), |
| 1102 | + const_cast<char*>("-pix_fmt"), const_cast<char*>(pix_fmt_in), |
| 1103 | + const_cast<char*>("-s"), size_str, |
| 1104 | + const_cast<char*>("-framerate"), fps_str, |
| 1105 | + const_cast<char*>("-i"), const_cast<char*>("-"), |
| 1106 | + const_cast<char*>("-c:v"), const_cast<char*>("libx264"), |
| 1107 | + const_cast<char*>("-pix_fmt"), const_cast<char*>("yuv420p"), |
| 1108 | + const_cast<char*>("-movflags"), const_cast<char*>("+faststart"), |
| 1109 | + const_cast<char*>(dst), |
| 1110 | + nullptr |
| 1111 | + }; |
| 1112 | + execvp(argv[0], argv.data()); |
| 1113 | + perror("execvp ffmpeg"); |
| 1114 | + _exit(127); |
| 1115 | + } |
| 1116 | + |
| 1117 | + // parent |
| 1118 | + close(pipefd[0]); |
| 1119 | + |
| 1120 | + // Ignore SIGPIPE so a dying ffmpeg surfaces via write() errno instead of killing us. |
| 1121 | + signal(SIGPIPE, SIG_IGN); |
| 1122 | + |
| 1123 | + for (int i = 0; i < num_frames; i++) { |
| 1124 | + if (!frames[i].data) continue; |
| 1125 | + size_t frame_bytes = (size_t)frames[i].width * frames[i].height * frames[i].channel; |
| 1126 | + const uint8_t* p = frames[i].data; |
| 1127 | + size_t remaining = frame_bytes; |
| 1128 | + while (remaining > 0) { |
| 1129 | + ssize_t n = write(pipefd[1], p, remaining); |
| 1130 | + if (n < 0) { |
| 1131 | + if (errno == EINTR) continue; |
| 1132 | + perror("write frame to ffmpeg"); |
| 1133 | + close(pipefd[1]); |
| 1134 | + int status; |
| 1135 | + waitpid(pid, &status, 0); |
| 1136 | + return 1; |
| 1137 | + } |
| 1138 | + p += n; |
| 1139 | + remaining -= (size_t)n; |
| 1140 | + } |
| 1141 | + } |
| 1142 | + close(pipefd[1]); |
| 1143 | + |
| 1144 | + int status = 0; |
| 1145 | + while (waitpid(pid, &status, 0) < 0) { |
| 1146 | + if (errno != EINTR) { perror("waitpid"); return 1; } |
| 1147 | + } |
| 1148 | + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { |
| 1149 | + fprintf(stderr, "ffmpeg exited with status %d\n", status); |
| 1150 | + return 1; |
| 1151 | + } |
| 1152 | + return 0; |
| 1153 | +} |
| 1154 | + |
| 1155 | +int gen_video(sd_vid_gen_params_t *p, int steps, char *dst, float cfg_scale, int fps, char *init_image, char *end_image) { |
| 1156 | + if (!p) return 1; |
| 1157 | + if (!dst || strlen(dst) == 0) { |
| 1158 | + fprintf(stderr, "gen_video: dst is empty\n"); |
| 1159 | + std::free(p); |
| 1160 | + return 1; |
| 1161 | + } |
| 1162 | + |
| 1163 | + std::vector<int> skip_layers = {7, 8, 9}; |
| 1164 | + |
| 1165 | + fprintf(stderr, "Generating video: %dx%d, frames=%d, fps=%d, steps=%d, cfg=%.2f\n", |
| 1166 | + p->width, p->height, p->video_frames, fps, steps, cfg_scale); |
| 1167 | + |
| 1168 | + // Sample params (shared by both low and high-noise passes — MoE models use the high-noise |
| 1169 | + // set during the first phase; single-model Wan2.1 ignores it. Same defaults for both is fine.) |
| 1170 | + p->sample_params.guidance.txt_cfg = cfg_scale; |
| 1171 | + p->sample_params.guidance.slg.layers = skip_layers.data(); |
| 1172 | + p->sample_params.guidance.slg.layer_count = skip_layers.size(); |
| 1173 | + p->sample_params.sample_method = sample_method; |
| 1174 | + p->sample_params.sample_steps = steps; |
| 1175 | + p->sample_params.scheduler = scheduler; |
| 1176 | + p->sample_params.flow_shift = flow_shift; |
| 1177 | + |
| 1178 | + p->high_noise_sample_params.guidance.txt_cfg = cfg_scale; |
| 1179 | + p->high_noise_sample_params.guidance.slg.layers = skip_layers.data(); |
| 1180 | + p->high_noise_sample_params.guidance.slg.layer_count = skip_layers.size(); |
| 1181 | + p->high_noise_sample_params.sample_method = sample_method; |
| 1182 | + p->high_noise_sample_params.sample_steps = steps; |
| 1183 | + p->high_noise_sample_params.scheduler = scheduler; |
| 1184 | + p->high_noise_sample_params.flow_shift = flow_shift; |
| 1185 | + |
| 1186 | + // Load init/end reference images if provided (resized to output dims). |
| 1187 | + uint8_t* init_buf = nullptr; |
| 1188 | + uint8_t* end_buf = nullptr; |
| 1189 | + sd_image_t init_img = {0, 0, 0, nullptr}; |
| 1190 | + sd_image_t end_img = {0, 0, 0, nullptr}; |
| 1191 | + if (init_image && strlen(init_image) > 0) { |
| 1192 | + init_buf = load_and_resize_image(init_image, p->width, p->height, &init_img); |
| 1193 | + if (!init_buf) { std::free(p); return 1; } |
| 1194 | + } |
| 1195 | + if (end_image && strlen(end_image) > 0) { |
| 1196 | + end_buf = load_and_resize_image(end_image, p->width, p->height, &end_img); |
| 1197 | + if (!end_buf) { if (init_buf) free(init_buf); std::free(p); return 1; } |
| 1198 | + } |
| 1199 | + p->init_image = init_img; |
| 1200 | + p->end_image = end_img; |
| 1201 | + |
| 1202 | + // Generate |
| 1203 | + int num_frames_out = 0; |
| 1204 | + sd_image_t* frames = generate_video(sd_c, p, &num_frames_out); |
| 1205 | + std::free(p); |
| 1206 | + |
| 1207 | + if (!frames || num_frames_out == 0) { |
| 1208 | + fprintf(stderr, "generate_video produced no frames\n"); |
| 1209 | + if (init_buf) free(init_buf); |
| 1210 | + if (end_buf) free(end_buf); |
| 1211 | + return 1; |
| 1212 | + } |
| 1213 | + |
| 1214 | + fprintf(stderr, "Generated %d frames, muxing to %s via ffmpeg\n", num_frames_out, dst); |
| 1215 | + |
| 1216 | + int rc = ffmpeg_mux_raw_to_mp4(frames, num_frames_out, fps, dst); |
| 1217 | + |
| 1218 | + for (int i = 0; i < num_frames_out; i++) { |
| 1219 | + if (frames[i].data) free(frames[i].data); |
| 1220 | + } |
| 1221 | + free(frames); |
| 1222 | + if (init_buf) free(init_buf); |
| 1223 | + if (end_buf) free(end_buf); |
| 1224 | + |
| 1225 | + if (rc == 0) { |
| 1226 | + fprintf(stderr, "gen_video done: %s\n", dst); |
| 1227 | + } |
| 1228 | + fflush(stderr); |
| 1229 | + return rc; |
| 1230 | +} |
| 1231 | + |
983 | 1232 | int unload() { |
984 | 1233 | free_sd_ctx(sd_c); |
985 | 1234 | return 0; |
|
0 commit comments