@@ -122,6 +122,8 @@ struct RCLIEngine {
122122 bool using_metalrt_vlm = false ; // true when VLM is running on MetalRT backend
123123 void * metalrt_vision_handle = nullptr ; // opaque handle from metalrt_vision_create()
124124 std::string last_vlm_response;
125+ std::string vlm_backend_name; // "llama.cpp (Metal GPU)" or "MetalRT"
126+ std::string vlm_model_name; // e.g. "Qwen3 VL 2B"
125127
126128 // MetalRT VLM stats (filled after each analyze call)
127129 struct {
@@ -3084,6 +3086,8 @@ static int vlm_init_locked(RCLIEngine* engine) {
30843086 engine->vlm_initialized = true ;
30853087 const char * mname = mrt_loader.vision_model_name
30863088 ? mrt_loader.vision_model_name (handle) : " Qwen3-VL-2B" ;
3089+ engine->vlm_backend_name = " MetalRT" ;
3090+ engine->vlm_model_name = mname;
30873091 LOG_INFO (" VLM" , " MetalRT VLM engine ready (%s)" , mname);
30883092 return 0 ;
30893093 }
@@ -3094,6 +3098,11 @@ static int vlm_init_locked(RCLIEngine* engine) {
30943098 }
30953099
30963100 // --- Fallback: llama.cpp VLM backend ---
3101+ // MetalRT dylib either not loaded, or doesn't export vision symbols yet.
3102+ // Use llama.cpp with GGUF models — still runs on Metal GPU via ggml-metal.
3103+ if (mrt_loader.is_loaded () && !mrt_loader.has_vision ()) {
3104+ LOG_INFO (" VLM" , " MetalRT engine active but VLM not yet supported in dylib — using llama.cpp for vision" );
3105+ }
30973106
30983107 // Find or download VLM model
30993108 auto vlm_models = rcli::all_vlm_models ();
@@ -3142,7 +3151,10 @@ static int vlm_init_locked(RCLIEngine* engine) {
31423151 }
31433152
31443153 engine->vlm_initialized = true ;
3145- LOG_INFO (" VLM" , " VLM engine ready (%s)" , model_def.name .c_str ());
3154+ engine->using_metalrt_vlm = false ;
3155+ engine->vlm_backend_name = " llama.cpp (Metal GPU)" ;
3156+ engine->vlm_model_name = model_def.name ;
3157+ LOG_INFO (" VLM" , " VLM engine ready — %s via llama.cpp (Metal GPU)" , model_def.name .c_str ());
31463158 return 0 ;
31473159}
31483160
@@ -3230,6 +3242,18 @@ int rcli_vlm_is_ready(RCLIHandle handle) {
32303242 return engine->vlm_initialized ? 1 : 0 ;
32313243}
32323244
// Returns the engine's `vlm_backend_name` as a NUL-terminated C string.
//
// @param handle  Opaque engine handle; it is cast to RCLIEngine* without further checks.
// @return        For a null handle, the string literal on the guard line below.
//                Otherwise a pointer into the std::string's own buffer (via c_str()),
//                so the caller must not free it and it stays valid only until that
//                string is next modified or destroyed (e.g. it is cleared in
//                rcli_vlm_exit and reassigned in vlm_init_locked).
//
// NOTE(review): this accessor does not take engine->mutex, unlike rcli_vlm_exit /
// rcli_vlm_analyze_stream — confirm callers cannot race with VLM init/exit.
// NOTE(review): in this diff rendering string literals show an extra space after the
// opening quote; presumably the real null-handle return value is an empty string —
// confirm against the actual source file.
3245+ const char * rcli_vlm_backend_name (RCLIHandle handle) {
3246+ if (!handle) return " " ;
3247+ auto * engine = static_cast <RCLIEngine*>(handle);
3248+ return engine->vlm_backend_name .c_str ();
3249+ }
3250+
// Returns the engine's `vlm_model_name` as a NUL-terminated C string.
//
// @param handle  Opaque engine handle; it is cast to RCLIEngine* without further checks.
// @return        For a null handle, the string literal on the guard line below.
//                Otherwise a pointer into the std::string's own buffer (via c_str()),
//                so the caller must not free it and it stays valid only until that
//                string is next modified or destroyed (e.g. it is cleared in
//                rcli_vlm_exit and reassigned in vlm_init_locked).
//
// NOTE(review): this accessor does not take engine->mutex, unlike rcli_vlm_exit /
// rcli_vlm_analyze_stream — confirm callers cannot race with VLM init/exit.
// NOTE(review): in this diff rendering string literals show an extra space after the
// opening quote; presumably the real null-handle return value is an empty string —
// confirm against the actual source file.
3251+ const char * rcli_vlm_model_name (RCLIHandle handle) {
3252+ if (!handle) return " " ;
3253+ auto * engine = static_cast <RCLIEngine*>(handle);
3254+ return engine->vlm_model_name .c_str ();
3255+ }
3256+
32333257int rcli_vlm_get_stats (RCLIHandle handle, RCLIVlmStats* out_stats) {
32343258 if (!handle || !out_stats) return -1 ;
32353259 auto * engine = static_cast <RCLIEngine*>(handle);
@@ -3312,16 +3336,24 @@ int rcli_vlm_exit(RCLIHandle handle) {
33123336 auto * engine = static_cast <RCLIEngine*>(handle);
33133337 std::lock_guard<std::mutex> lock (engine->mutex );
33143338
3315- // Unload VLM
3339+ // Unload MetalRT VLM if active
33163340 if (engine->metalrt_vision_handle ) {
33173341 auto & loader = rastack::MetalRTLoader::instance ();
33183342 if (loader.vision_destroy )
33193343 loader.vision_destroy (engine->metalrt_vision_handle );
33203344 engine->metalrt_vision_handle = nullptr ;
33213345 }
3346+
3347+ // Shutdown llama.cpp VLM if it was the active backend
3348+ if (!engine->using_metalrt_vlm && engine->vlm_engine .is_initialized ()) {
3349+ engine->vlm_engine .shutdown ();
3350+ }
3351+
33223352 engine->using_metalrt_vlm = false ;
33233353 engine->vlm_initialized = false ;
3324- LOG_INFO (" VLM" , " MetalRT VLM unloaded" );
3354+ engine->vlm_backend_name .clear ();
3355+ engine->vlm_model_name .clear ();
3356+ LOG_INFO (" VLM" , " VLM unloaded" );
33253357
33263358 // Reload MetalRT LLM
33273359 if (engine->pipeline .using_metalrt ()) {
@@ -3351,67 +3383,96 @@ int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path,
33513383 auto * engine = static_cast <RCLIEngine*>(handle);
33523384 std::lock_guard<std::mutex> lock (engine->mutex );
33533385
3354- if (!engine->using_metalrt_vlm || !engine->metalrt_vision_handle ) {
3355- LOG_ERROR (" VLM" , " VLM not loaded. Call rcli_vlm_enter() first." );
3356- return -1 ;
3386+ // Lazy-init VLM if not yet loaded
3387+ if (!engine->vlm_initialized ) {
3388+ if (vlm_init_locked (engine) != 0 ) {
3389+ LOG_ERROR (" VLM" , " Failed to initialize VLM engine for streaming" );
3390+ return -1 ;
3391+ }
33573392 }
33583393
33593394 std::string text_prompt = (prompt && prompt[0 ])
33603395 ? std::string (prompt) : " Describe this image in detail." ;
33613396
3362- auto & loader = rastack::MetalRTLoader::instance ();
3363- std::string accumulated;
3397+ if (engine->using_metalrt_vlm && engine->metalrt_vision_handle ) {
3398+ // --- MetalRT VLM streaming path ---
3399+ auto & loader = rastack::MetalRTLoader::instance ();
3400+ std::string accumulated;
33643401
3365- struct StreamCtx {
3366- RCLIEventCallback cb;
3367- void * ud;
3368- std::string* accum;
3369- };
3370- StreamCtx sctx{callback, user_data, &accumulated};
3402+ struct StreamCtx {
3403+ RCLIEventCallback cb;
3404+ void * ud;
3405+ std::string* accum;
3406+ };
3407+ StreamCtx sctx{callback, user_data, &accumulated};
33713408
3372- rastack::MetalRTStreamCb stream_cb = [](const char * piece, void * ud) -> bool {
3373- auto * ctx = static_cast <StreamCtx*>(ud);
3374- // Skip special tokens
3375- if (std::strstr (piece, " <|im_end|>" ) || std::strstr (piece, " <|im_start|>" ))
3409+ rastack::MetalRTStreamCb stream_cb = [](const char * piece, void * ud) -> bool {
3410+ auto * ctx = static_cast <StreamCtx*>(ud);
3411+ if (std::strstr (piece, " <|im_end|>" ) || std::strstr (piece, " <|im_start|>" ))
3412+ return true ;
3413+ ctx->accum ->append (piece);
3414+ if (ctx->cb ) ctx->cb (" token" , piece, ctx->ud );
33763415 return true ;
3377- ctx->accum ->append (piece);
3378- if (ctx->cb ) ctx->cb (" token" , piece, ctx->ud );
3379- return true ;
3380- };
3416+ };
33813417
3382- rastack::MetalRTLoader::MetalRTVisionOptions opts{};
3383- opts.max_tokens = 512 ;
3384- opts.top_k = 40 ;
3385- opts.temperature = 0 .0f ;
3386- opts.think = false ;
3418+ rastack::MetalRTLoader::MetalRTVisionOptions opts{};
3419+ opts.max_tokens = 512 ;
3420+ opts.top_k = 40 ;
3421+ opts.temperature = 0 .0f ;
3422+ opts.think = false ;
33873423
3388- rastack::MetalRTLoader::MetalRTVisionResult vr;
3389- {
3390- std::lock_guard<std::mutex> gpu_lock (loader.gpu_mutex ());
3391- vr = loader.vision_analyze_stream (engine->metalrt_vision_handle ,
3392- image_path, text_prompt.c_str (),
3393- stream_cb, &sctx, &opts);
3394- }
3424+ rastack::MetalRTLoader::MetalRTVisionResult vr;
3425+ {
3426+ std::lock_guard<std::mutex> gpu_lock (loader.gpu_mutex ());
3427+ vr = loader.vision_analyze_stream (engine->metalrt_vision_handle ,
3428+ image_path, text_prompt.c_str (),
3429+ stream_cb, &sctx, &opts);
3430+ }
33953431
3396- // Store stats
3397- engine->metalrt_vlm_stats .vision_encode_ms = vr.vision_encode_ms ;
3398- engine->metalrt_vlm_stats .prefill_ms = vr.prefill_ms ;
3399- engine->metalrt_vlm_stats .decode_ms = vr.decode_ms ;
3400- engine->metalrt_vlm_stats .tps = vr.tps ;
3401- engine->metalrt_vlm_stats .prompt_tokens = vr.prompt_tokens ;
3402- engine->metalrt_vlm_stats .generated_tokens = vr.generated_tokens ;
3432+ engine->metalrt_vlm_stats .vision_encode_ms = vr.vision_encode_ms ;
3433+ engine->metalrt_vlm_stats .prefill_ms = vr.prefill_ms ;
3434+ engine->metalrt_vlm_stats .decode_ms = vr.decode_ms ;
3435+ engine->metalrt_vlm_stats .tps = vr.tps ;
3436+ engine->metalrt_vlm_stats .prompt_tokens = vr.prompt_tokens ;
3437+ engine->metalrt_vlm_stats .generated_tokens = vr.generated_tokens ;
34033438
3404- std::string result = vr.response ? std::string (vr.response ) : accumulated;
3405- if (loader.vision_free_result ) loader.vision_free_result (vr);
3406- engine->last_vlm_response = result.empty () ? " Error: Failed to analyze image." : result;
3439+ std::string result = vr.response ? std::string (vr.response ) : accumulated;
3440+ if (loader.vision_free_result ) loader.vision_free_result (vr);
3441+ engine->last_vlm_response = result.empty () ? " Error: Failed to analyze image." : result;
34073442
3408- if (callback) {
3409- callback (" response" , engine->last_vlm_response .c_str (), user_data);
3410- char stats_buf[256 ];
3411- snprintf (stats_buf, sizeof (stats_buf),
3412- " {\" tps\" :%.1f,\" tokens\" :%d,\" vision_encode_ms\" :%.1f}" ,
3413- vr.tps , vr.generated_tokens , vr.vision_encode_ms );
3414- callback (" stats" , stats_buf, user_data);
3443+ if (callback) {
3444+ callback (" response" , engine->last_vlm_response .c_str (), user_data);
3445+ char stats_buf[256 ];
3446+ snprintf (stats_buf, sizeof (stats_buf),
3447+ " {\" tps\" :%.1f,\" tokens\" :%d,\" vision_encode_ms\" :%.1f}" ,
3448+ vr.tps , vr.generated_tokens , vr.vision_encode_ms );
3449+ callback (" stats" , stats_buf, user_data);
3450+ }
3451+ } else {
3452+ // --- llama.cpp VLM streaming path ---
3453+ rastack::TokenCallback token_cb = nullptr ;
3454+ if (callback) {
3455+ token_cb = [callback, user_data](const rastack::TokenOutput& tok) {
3456+ if (!tok.text .empty ()) {
3457+ callback (" token" , tok.text .c_str (), user_data);
3458+ }
3459+ };
3460+ }
3461+
3462+ std::string result = engine->vlm_engine .analyze_image (
3463+ std::string (image_path), text_prompt, token_cb);
3464+
3465+ engine->last_vlm_response = result.empty () ? " Error: Failed to analyze image." : result;
3466+
3467+ if (callback) {
3468+ callback (" response" , engine->last_vlm_response .c_str (), user_data);
3469+ auto & s = engine->vlm_engine .last_stats ();
3470+ char stats_buf[256 ];
3471+ snprintf (stats_buf, sizeof (stats_buf),
3472+ " {\" tps\" :%.1f,\" tokens\" :%lld,\" vision_encode_ms\" :%.1f}" ,
3473+ s.gen_tps (), s.generated_tokens , s.image_encode_us / 1000.0 );
3474+ callback (" stats" , stats_buf, user_data);
3475+ }
34153476 }
34163477
34173478 return engine->last_vlm_response .find (" Error:" ) == 0 ? -1 : 0 ;
0 commit comments