Skip to content

Commit 8a3e514

Browse files
Fixed VLM to fall back to llama.cpp; llama.cpp VLM is working, MetalRT VLM still to do
1 parent 14908e3 commit 8a3e514

5 files changed

Lines changed: 186 additions & 68 deletions

File tree

src/api/rcli_api.cpp

Lines changed: 112 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ struct RCLIEngine {
122122
bool using_metalrt_vlm = false; // true when VLM is running on MetalRT backend
123123
void* metalrt_vision_handle = nullptr; // opaque handle from metalrt_vision_create()
124124
std::string last_vlm_response;
125+
std::string vlm_backend_name; // "llama.cpp (Metal GPU)" or "MetalRT"
126+
std::string vlm_model_name; // e.g. "Qwen3 VL 2B"
125127

126128
// MetalRT VLM stats (filled after each analyze call)
127129
struct {
@@ -3084,6 +3086,8 @@ static int vlm_init_locked(RCLIEngine* engine) {
30843086
engine->vlm_initialized = true;
30853087
const char* mname = mrt_loader.vision_model_name
30863088
? mrt_loader.vision_model_name(handle) : "Qwen3-VL-2B";
3089+
engine->vlm_backend_name = "MetalRT";
3090+
engine->vlm_model_name = mname;
30873091
LOG_INFO("VLM", "MetalRT VLM engine ready (%s)", mname);
30883092
return 0;
30893093
}
@@ -3094,6 +3098,11 @@ static int vlm_init_locked(RCLIEngine* engine) {
30943098
}
30953099

30963100
// --- Fallback: llama.cpp VLM backend ---
3101+
// MetalRT dylib either not loaded, or doesn't export vision symbols yet.
3102+
// Use llama.cpp with GGUF models — still runs on Metal GPU via ggml-metal.
3103+
if (mrt_loader.is_loaded() && !mrt_loader.has_vision()) {
3104+
LOG_INFO("VLM", "MetalRT engine active but VLM not yet supported in dylib — using llama.cpp for vision");
3105+
}
30973106

30983107
// Find or download VLM model
30993108
auto vlm_models = rcli::all_vlm_models();
@@ -3142,7 +3151,10 @@ static int vlm_init_locked(RCLIEngine* engine) {
31423151
}
31433152

31443153
engine->vlm_initialized = true;
3145-
LOG_INFO("VLM", "VLM engine ready (%s)", model_def.name.c_str());
3154+
engine->using_metalrt_vlm = false;
3155+
engine->vlm_backend_name = "llama.cpp (Metal GPU)";
3156+
engine->vlm_model_name = model_def.name;
3157+
LOG_INFO("VLM", "VLM engine ready — %s via llama.cpp (Metal GPU)", model_def.name.c_str());
31463158
return 0;
31473159
}
31483160

@@ -3230,6 +3242,18 @@ int rcli_vlm_is_ready(RCLIHandle handle) {
32303242
return engine->vlm_initialized ? 1 : 0;
32313243
}
32323244

3245+
// Returns the backend label stored in engine->vlm_backend_name, or "" when
// `handle` is null. The result is the c_str() of the engine's own std::string
// member, so the caller does not own it and it reflects whatever that member
// currently holds. No engine mutex is taken in this accessor.
const char* rcli_vlm_backend_name(RCLIHandle handle) {
3246+
if (!handle) return "";
3247+
auto* engine = static_cast<RCLIEngine*>(handle);
3248+
return engine->vlm_backend_name.c_str();
3249+
}
3250+
3251+
// Returns the model label stored in engine->vlm_model_name, or "" when
// `handle` is null. The result is the c_str() of the engine's own std::string
// member, so the caller does not own it and it reflects whatever that member
// currently holds. No engine mutex is taken in this accessor.
const char* rcli_vlm_model_name(RCLIHandle handle) {
3252+
if (!handle) return "";
3253+
auto* engine = static_cast<RCLIEngine*>(handle);
3254+
return engine->vlm_model_name.c_str();
3255+
}
3256+
32333257
int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats) {
32343258
if (!handle || !out_stats) return -1;
32353259
auto* engine = static_cast<RCLIEngine*>(handle);
@@ -3312,16 +3336,24 @@ int rcli_vlm_exit(RCLIHandle handle) {
33123336
auto* engine = static_cast<RCLIEngine*>(handle);
33133337
std::lock_guard<std::mutex> lock(engine->mutex);
33143338

3315-
// Unload VLM
3339+
// Unload MetalRT VLM if active
33163340
if (engine->metalrt_vision_handle) {
33173341
auto& loader = rastack::MetalRTLoader::instance();
33183342
if (loader.vision_destroy)
33193343
loader.vision_destroy(engine->metalrt_vision_handle);
33203344
engine->metalrt_vision_handle = nullptr;
33213345
}
3346+
3347+
// Shutdown llama.cpp VLM if it was the active backend
3348+
if (!engine->using_metalrt_vlm && engine->vlm_engine.is_initialized()) {
3349+
engine->vlm_engine.shutdown();
3350+
}
3351+
33223352
engine->using_metalrt_vlm = false;
33233353
engine->vlm_initialized = false;
3324-
LOG_INFO("VLM", "MetalRT VLM unloaded");
3354+
engine->vlm_backend_name.clear();
3355+
engine->vlm_model_name.clear();
3356+
LOG_INFO("VLM", "VLM unloaded");
33253357

33263358
// Reload MetalRT LLM
33273359
if (engine->pipeline.using_metalrt()) {
@@ -3351,67 +3383,96 @@ int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path,
33513383
auto* engine = static_cast<RCLIEngine*>(handle);
33523384
std::lock_guard<std::mutex> lock(engine->mutex);
33533385

3354-
if (!engine->using_metalrt_vlm || !engine->metalrt_vision_handle) {
3355-
LOG_ERROR("VLM", "VLM not loaded. Call rcli_vlm_enter() first.");
3356-
return -1;
3386+
// Lazy-init VLM if not yet loaded
3387+
if (!engine->vlm_initialized) {
3388+
if (vlm_init_locked(engine) != 0) {
3389+
LOG_ERROR("VLM", "Failed to initialize VLM engine for streaming");
3390+
return -1;
3391+
}
33573392
}
33583393

33593394
std::string text_prompt = (prompt && prompt[0])
33603395
? std::string(prompt) : "Describe this image in detail.";
33613396

3362-
auto& loader = rastack::MetalRTLoader::instance();
3363-
std::string accumulated;
3397+
if (engine->using_metalrt_vlm && engine->metalrt_vision_handle) {
3398+
// --- MetalRT VLM streaming path ---
3399+
auto& loader = rastack::MetalRTLoader::instance();
3400+
std::string accumulated;
33643401

3365-
struct StreamCtx {
3366-
RCLIEventCallback cb;
3367-
void* ud;
3368-
std::string* accum;
3369-
};
3370-
StreamCtx sctx{callback, user_data, &accumulated};
3402+
struct StreamCtx {
3403+
RCLIEventCallback cb;
3404+
void* ud;
3405+
std::string* accum;
3406+
};
3407+
StreamCtx sctx{callback, user_data, &accumulated};
33713408

3372-
rastack::MetalRTStreamCb stream_cb = [](const char* piece, void* ud) -> bool {
3373-
auto* ctx = static_cast<StreamCtx*>(ud);
3374-
// Skip special tokens
3375-
if (std::strstr(piece, "<|im_end|>") || std::strstr(piece, "<|im_start|>"))
3409+
rastack::MetalRTStreamCb stream_cb = [](const char* piece, void* ud) -> bool {
3410+
auto* ctx = static_cast<StreamCtx*>(ud);
3411+
if (std::strstr(piece, "<|im_end|>") || std::strstr(piece, "<|im_start|>"))
3412+
return true;
3413+
ctx->accum->append(piece);
3414+
if (ctx->cb) ctx->cb("token", piece, ctx->ud);
33763415
return true;
3377-
ctx->accum->append(piece);
3378-
if (ctx->cb) ctx->cb("token", piece, ctx->ud);
3379-
return true;
3380-
};
3416+
};
33813417

3382-
rastack::MetalRTLoader::MetalRTVisionOptions opts{};
3383-
opts.max_tokens = 512;
3384-
opts.top_k = 40;
3385-
opts.temperature = 0.0f;
3386-
opts.think = false;
3418+
rastack::MetalRTLoader::MetalRTVisionOptions opts{};
3419+
opts.max_tokens = 512;
3420+
opts.top_k = 40;
3421+
opts.temperature = 0.0f;
3422+
opts.think = false;
33873423

3388-
rastack::MetalRTLoader::MetalRTVisionResult vr;
3389-
{
3390-
std::lock_guard<std::mutex> gpu_lock(loader.gpu_mutex());
3391-
vr = loader.vision_analyze_stream(engine->metalrt_vision_handle,
3392-
image_path, text_prompt.c_str(),
3393-
stream_cb, &sctx, &opts);
3394-
}
3424+
rastack::MetalRTLoader::MetalRTVisionResult vr;
3425+
{
3426+
std::lock_guard<std::mutex> gpu_lock(loader.gpu_mutex());
3427+
vr = loader.vision_analyze_stream(engine->metalrt_vision_handle,
3428+
image_path, text_prompt.c_str(),
3429+
stream_cb, &sctx, &opts);
3430+
}
33953431

3396-
// Store stats
3397-
engine->metalrt_vlm_stats.vision_encode_ms = vr.vision_encode_ms;
3398-
engine->metalrt_vlm_stats.prefill_ms = vr.prefill_ms;
3399-
engine->metalrt_vlm_stats.decode_ms = vr.decode_ms;
3400-
engine->metalrt_vlm_stats.tps = vr.tps;
3401-
engine->metalrt_vlm_stats.prompt_tokens = vr.prompt_tokens;
3402-
engine->metalrt_vlm_stats.generated_tokens = vr.generated_tokens;
3432+
engine->metalrt_vlm_stats.vision_encode_ms = vr.vision_encode_ms;
3433+
engine->metalrt_vlm_stats.prefill_ms = vr.prefill_ms;
3434+
engine->metalrt_vlm_stats.decode_ms = vr.decode_ms;
3435+
engine->metalrt_vlm_stats.tps = vr.tps;
3436+
engine->metalrt_vlm_stats.prompt_tokens = vr.prompt_tokens;
3437+
engine->metalrt_vlm_stats.generated_tokens = vr.generated_tokens;
34033438

3404-
std::string result = vr.response ? std::string(vr.response) : accumulated;
3405-
if (loader.vision_free_result) loader.vision_free_result(vr);
3406-
engine->last_vlm_response = result.empty() ? "Error: Failed to analyze image." : result;
3439+
std::string result = vr.response ? std::string(vr.response) : accumulated;
3440+
if (loader.vision_free_result) loader.vision_free_result(vr);
3441+
engine->last_vlm_response = result.empty() ? "Error: Failed to analyze image." : result;
34073442

3408-
if (callback) {
3409-
callback("response", engine->last_vlm_response.c_str(), user_data);
3410-
char stats_buf[256];
3411-
snprintf(stats_buf, sizeof(stats_buf),
3412-
"{\"tps\":%.1f,\"tokens\":%d,\"vision_encode_ms\":%.1f}",
3413-
vr.tps, vr.generated_tokens, vr.vision_encode_ms);
3414-
callback("stats", stats_buf, user_data);
3443+
if (callback) {
3444+
callback("response", engine->last_vlm_response.c_str(), user_data);
3445+
char stats_buf[256];
3446+
snprintf(stats_buf, sizeof(stats_buf),
3447+
"{\"tps\":%.1f,\"tokens\":%d,\"vision_encode_ms\":%.1f}",
3448+
vr.tps, vr.generated_tokens, vr.vision_encode_ms);
3449+
callback("stats", stats_buf, user_data);
3450+
}
3451+
} else {
3452+
// --- llama.cpp VLM streaming path ---
3453+
rastack::TokenCallback token_cb = nullptr;
3454+
if (callback) {
3455+
token_cb = [callback, user_data](const rastack::TokenOutput& tok) {
3456+
if (!tok.text.empty()) {
3457+
callback("token", tok.text.c_str(), user_data);
3458+
}
3459+
};
3460+
}
3461+
3462+
std::string result = engine->vlm_engine.analyze_image(
3463+
std::string(image_path), text_prompt, token_cb);
3464+
3465+
engine->last_vlm_response = result.empty() ? "Error: Failed to analyze image." : result;
3466+
3467+
if (callback) {
3468+
callback("response", engine->last_vlm_response.c_str(), user_data);
3469+
auto& s = engine->vlm_engine.last_stats();
3470+
char stats_buf[256];
3471+
snprintf(stats_buf, sizeof(stats_buf),
3472+
"{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}",
3473+
s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0);
3474+
callback("stats", stats_buf, user_data);
3475+
}
34153476
}
34163477

34173478
return engine->last_vlm_response.find("Error:") == 0 ? -1 : 0;

src/api/rcli_api.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,14 @@ const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const ch
279279
// Returns 1 if ready, 0 if not.
280280
int rcli_vlm_is_ready(RCLIHandle handle);
281281

282+
// Get the name of the active VLM backend (e.g. "llama.cpp (Metal GPU)" or "MetalRT").
283+
// Returns "" if VLM is not initialized.
284+
const char* rcli_vlm_backend_name(RCLIHandle handle);
285+
286+
// Get the name of the active VLM model (e.g. "Qwen3 VL 2B Instruct").
287+
// Returns "" if VLM is not initialized.
288+
const char* rcli_vlm_model_name(RCLIHandle handle);
289+
282290
// VLM performance stats from the last analysis call.
283291
typedef struct {
284292
double gen_tok_per_sec; // Generation tokens/second

src/cli/main.cpp

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -491,12 +491,20 @@ static int cmd_vlm(const Args& args) {
491491
return 1;
492492
}
493493

494+
// Show which VLM backend is active
495+
const char* backend = rcli_vlm_backend_name(g_engine);
496+
const char* model = rcli_vlm_model_name(g_engine);
497+
if (backend && backend[0]) {
498+
fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n",
499+
color::dim, color::reset, color::bold, model,
500+
color::reset, color::dim, backend, color::reset);
501+
}
502+
494503
fprintf(stderr, "%sAnalyzing image: %s%s\n", color::dim, image_path.c_str(), color::reset);
495504

496505
const char* response = rcli_vlm_analyze(g_engine, image_path.c_str(), prompt.c_str());
497506
if (response && response[0]) {
498507
fprintf(stdout, "%s\n", response);
499-
// Print performance stats
500508
RCLIVlmStats stats;
501509
if (rcli_vlm_get_stats(g_engine, &stats) == 0) {
502510
fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n",
@@ -543,21 +551,27 @@ static int cmd_camera(const Args& args) {
543551
return 1;
544552
}
545553

554+
const char* backend = rcli_vlm_backend_name(g_engine);
555+
const char* model = rcli_vlm_model_name(g_engine);
556+
if (backend && backend[0]) {
557+
fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n",
558+
color::dim, color::reset, color::bold, model,
559+
color::reset, color::dim, backend, color::reset);
560+
}
561+
546562
const char* response = rcli_vlm_analyze(g_engine, photo_path.c_str(), prompt.c_str());
547563
if (response && response[0]) {
548564
fprintf(stdout, "%s\n", response);
549565
if (!args.no_speak) {
550566
rcli_init(g_engine, args.models_dir.c_str(), args.gpu_layers);
551567
rcli_speak(g_engine, response);
552568
}
553-
// Print performance stats
554569
RCLIVlmStats stats;
555570
if (rcli_vlm_get_stats(g_engine, &stats) == 0) {
556571
fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n",
557572
color::dim, stats.gen_tok_per_sec, stats.generated_tokens,
558573
stats.total_time_sec, stats.first_token_ms, color::reset);
559574
}
560-
// Open the captured photo in Preview so user can see what was captured
561575
{
562576
pid_t pid;
563577
const char* argv[] = {"open", photo_path.c_str(), nullptr};
@@ -605,6 +619,14 @@ static int cmd_screen(const Args& args) {
605619
return 1;
606620
}
607621

622+
const char* backend = rcli_vlm_backend_name(g_engine);
623+
const char* model = rcli_vlm_model_name(g_engine);
624+
if (backend && backend[0]) {
625+
fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n",
626+
color::dim, color::reset, color::bold, model,
627+
color::reset, color::dim, backend, color::reset);
628+
}
629+
608630
const char* response = rcli_vlm_analyze(g_engine, screen_path.c_str(), prompt.c_str());
609631
if (response && response[0]) {
610632
fprintf(stdout, "%s\n", response);

src/cli/model_pickers.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -694,9 +694,12 @@ inline int cmd_info() {
694694

695695
auto vlm_all_info = rcli::all_vlm_models();
696696
auto [vlm_found, vlm_def] = rcli::find_installed_vlm(models_dir);
697-
std::string vlm_info = vlm_found
698-
? (vlm_def.name + " (llama.cpp + mtmd)")
699-
: "not installed — run: rcli models vlm";
697+
std::string vlm_info;
698+
if (vlm_found) {
699+
vlm_info = vlm_def.name + " (llama.cpp, Metal GPU)";
700+
} else {
701+
vlm_info = "not installed — run: rcli models vlm";
702+
}
700703

701704
fprintf(stdout,
702705
"\n%s%s RCLI%s %s%s%s\n\n"

0 commit comments

Comments
 (0)