Skip to content

Commit 8f83d6c

Browse files
authored
mtmd : add video input support (ggml-org#24269)
* wip
* ok: lazy bitmap API
* remember to free lazy text
* wip
* add mtmd_helper_video
* support video input on server (base64 input)
* add MTMD_VIDEO config
* add timestamp
* update CLI
* cli: allow auto-completion for video
* add --video arg
* fix build
* update docs
* rename as suggested
1 parent c2b1518 commit 8f83d6c

16 files changed

Lines changed: 807 additions & 77 deletions

common/arg.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2221,8 +2221,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
22212221
}
22222222
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
22232223
add_opt(common_arg(
2224-
{"--image", "--audio"}, "FILE",
2225-
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
2224+
{"--image", "--audio", "--video"}, "FILE",
2225+
"path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files\n",
22262226
[](common_params & params, const std::string & value) {
22272227
for (const auto & item : parse_csv_row(value)) {
22282228
params.image.emplace_back(item);

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,7 @@ struct common_params {
571571
struct common_params_model mmproj;
572572
bool mmproj_use_gpu = true; // use GPU for multimodal model
573573
bool no_mmproj = false; // explicitly disable multimodal model
574-
std::vector<std::string> image; // path to image file(s)
574+
std::vector<std::string> image; // path to image file(s) ; TODO: change the name to "media"
575575
int image_min_tokens = -1;
576576
int image_max_tokens = -1;
577577

tests/test-mtmd-c-api.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <assert.h>
33

44
#include "mtmd.h"
5+
#include "mtmd-helper.h"
56

67
int main(void) {
78
printf("\n\nTesting libmtmd C API...\n");
@@ -17,6 +18,11 @@ int main(void) {
1718
return 1;
1819
}
1920

21+
// simple test for the helper
22+
size_t n_tokens_total = mtmd_helper_get_n_tokens(chunks);
23+
printf("Total tokens in chunks: %zu\n", n_tokens_total);
24+
assert(n_tokens_total > 0);
25+
2026
size_t n_chunks = mtmd_input_chunks_size(chunks);
2127
printf("Number of chunks: %zu\n", n_chunks);
2228
assert(n_chunks > 0);

tools/cli/cli.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,14 +235,15 @@ struct cli_context {
235235
};
236236

237237
// TODO?: Make this reusable, enums, docs
238-
static const std::array<std::string_view, 7> cmds = {
238+
static const std::array<std::string_view, 8> cmds = {
239239
"/audio ",
240240
"/clear",
241241
"/exit",
242242
"/glob ",
243243
"/image ",
244244
"/read ",
245245
"/regen",
246+
"/video ",
246247
};
247248

248249
static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std::string_view line, size_t cursor_byte_pos) {
@@ -457,6 +458,9 @@ int llama_cli(int argc, char ** argv) {
457458
if (inf.has_inp_audio) {
458459
console::log(" /audio <file> add an audio file\n");
459460
}
461+
if (inf.has_inp_video) {
462+
console::log(" /video <file> add a video file\n");
463+
}
460464
console::log("\n");
461465

462466
// interactive loop
@@ -553,7 +557,8 @@ int llama_cli(int argc, char ** argv) {
553557
continue;
554558
} else if (
555559
(string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
556-
(string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) {
560+
(string_starts_with(buffer, "/audio ") && inf.has_inp_audio) ||
561+
(string_starts_with(buffer, "/video ") && inf.has_inp_video)) {
557562
// just in case (bad copy-paste for example), we strip all trailing/leading spaces
558563
std::string fname = string_strip(buffer.substr(7));
559564
std::string marker = ctx_cli.load_input_file(fname, true);

tools/mtmd/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# mtmd
22

3+
set(MTMD_VIDEO ON CACHE BOOL "enable video support in mtmd (requires ffmpeg binary in PATH)")
4+
# TODO: add MTMD_VIDEO_METHOD in the future to select between ffmpeg and other backends
5+
36
find_package(Threads REQUIRED)
47

58
add_library(mtmd
@@ -63,6 +66,10 @@ target_include_directories(mtmd PRIVATE ../..)
6366
target_include_directories(mtmd PRIVATE ../../vendor)
6467
target_compile_features (mtmd PRIVATE cxx_std_17)
6568

69+
if (MTMD_VIDEO)
70+
target_compile_definitions(mtmd PRIVATE MTMD_VIDEO)
71+
endif()
72+
6673
if (BUILD_SHARED_LIBS)
6774
set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
6875
target_compile_definitions(mtmd PRIVATE LLAMA_BUILD)

tools/mtmd/mtmd-cli.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ struct mtmd_cli_context {
7777
int n_batch;
7878

7979
mtmd::bitmaps bitmaps;
80+
std::vector<mtmd_helper::video_ptr> videos;
8081

8182
// chat template
8283
common_chat_templates_ptr tmpls;
@@ -166,11 +167,14 @@ struct mtmd_cli_context {
166167
}
167168

168169
bool load_media(const std::string & fname) {
169-
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false));
170-
if (!bmp.ptr) {
170+
auto res = mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str(), false);
171+
if (!res.bitmap) {
171172
return false;
172173
}
173-
bitmaps.entries.push_back(std::move(bmp));
174+
bitmaps.entries.emplace_back(res.bitmap);
175+
if (res.video_ctx) {
176+
videos.emplace_back(res.video_ctx);
177+
}
174178
return true;
175179
}
176180
};
@@ -253,6 +257,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
253257
}
254258

255259
ctx.bitmaps.entries.clear();
260+
ctx.videos.clear();
256261

257262
llama_pos new_n_past;
258263
if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
@@ -373,6 +378,9 @@ int main(int argc, char ** argv) {
373378
if (mtmd_support_audio(ctx.ctx_vision.get())) {
374379
LOG("\n /audio <path> load an audio");
375380
}
381+
if (mtmd_helper_support_video(ctx.ctx_vision.get())) {
382+
LOG("\n /video <path> load a video");
383+
}
376384
LOG("\n /clear clear the chat history");
377385
LOG("\n /quit or /exit exit the program");
378386
LOG("\n");
@@ -407,14 +415,15 @@ int main(int argc, char ** argv) {
407415
g_is_generating = true;
408416
bool is_image = line == "/image" || line.find("/image ") == 0;
409417
bool is_audio = line == "/audio" || line.find("/audio ") == 0;
410-
if (is_image || is_audio) {
418+
bool is_video = line == "/video" || line.find("/video ") == 0;
419+
if (is_image || is_audio || is_video) {
411420
if (line.size() < 8) {
412421
LOG_ERR("ERR: Missing media filename\n");
413422
continue;
414423
}
415424
std::string media_path = line.substr(7);
416425
if (ctx.load_media(media_path)) {
417-
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
426+
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : is_audio ? "audio" : "video");
418427
content += mtmd_default_marker();
419428
}
420429
// else, error is already printed by libmtmd

0 commit comments

Comments
 (0)