@@ -77,6 +77,7 @@ struct mtmd_cli_context {
7777 int n_batch;
7878
7979 mtmd::bitmaps bitmaps;
80+ std::vector<mtmd_helper::video_ptr> videos;
8081
8182 // chat template
8283 common_chat_templates_ptr tmpls;
@@ -166,11 +167,14 @@ struct mtmd_cli_context {
166167 }
167168
// load_media: asks mtmd_helper_bitmap_init_from_file to load `fname` using ctx_vision.
// Returns false when the helper produced no bitmap, true once the result has been stored.
// NOTE(review): this span is a diff hunk; lines whose number is followed by '-' appear to be
// the pre-change code and lines followed by '+' the post-change code — confirm against the patch.
168169 bool load_media (const std::string & fname) {
// pre-change side: the helper result was wrapped directly in an mtmd::bitmap and checked via its ptr member
169- mtmd::bitmap bmp ( mtmd_helper_bitmap_init_from_file (ctx_vision.get (), fname.c_str (), false ) );
170- if (!bmp. ptr ) {
// post-change side: the helper result is kept in `res`, which is read through its bitmap and video_ctx members
170+ auto res = mtmd_helper_bitmap_init_from_file (ctx_vision.get (), fname.c_str (), false );
171+ if (!res. bitmap ) {
171172 return false ;
172173 }
173- bitmaps.entries .push_back (std::move (bmp));
// a new entry is constructed in place from res.bitmap
// presumably the entry takes ownership of that handle — TODO confirm against the mtmd::bitmap constructor
174+ bitmaps.entries .emplace_back (res.bitmap );
// video_ctx is only stored when it is set; `videos` is the std::vector<mtmd_helper::video_ptr> member of this struct
175+ if (res.video_ctx ) {
176+ videos.emplace_back (res.video_ctx );
177+ }
174178 return true ;
175179 }
176180};
@@ -253,6 +257,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
253257 }
254258
255259 ctx.bitmaps .entries .clear ();
260+ ctx.videos .clear ();
256261
257262 llama_pos new_n_past;
258263 if (mtmd_helper_eval_chunks (ctx.ctx_vision .get (),
@@ -373,6 +378,9 @@ int main(int argc, char ** argv) {
373378 if (mtmd_support_audio (ctx.ctx_vision .get ())) {
374379 LOG (" \n /audio <path> load an audio" );
375380 }
381+ if (mtmd_helper_support_video (ctx.ctx_vision .get ())) {
382+ LOG (" \n /video <path> load a video" );
383+ }
376384 LOG (" \n /clear clear the chat history" );
377385 LOG (" \n /quit or /exit exit the program" );
378386 LOG (" \n " );
@@ -407,14 +415,15 @@ int main(int argc, char ** argv) {
407415 g_is_generating = true ;
408416 bool is_image = line == " /image" || line.find (" /image " ) == 0 ;
409417 bool is_audio = line == " /audio" || line.find (" /audio " ) == 0 ;
410- if (is_image || is_audio) {
418+ bool is_video = line == " /video" || line.find (" /video " ) == 0 ;
419+ if (is_image || is_audio || is_video) {
411420 if (line.size () < 8 ) {
412421 LOG_ERR (" ERR: Missing media filename\n " );
413422 continue ;
414423 }
415424 std::string media_path = line.substr (7 );
416425 if (ctx.load_media (media_path)) {
417- LOG (" %s %s loaded\n " , media_path.c_str (), is_image ? " image" : " audio" );
426+ LOG (" %s %s loaded\n " , media_path.c_str (), is_image ? " image" : is_audio ? " audio" : " video " );
418427 content += mtmd_default_marker ();
419428 }
420429 // else, error is already printed by libmtmd
0 commit comments