Skip to content

Commit 55fbba0

Browse files
Merge pull request #25 from RunanywhereAI/VLM
Adding VLM support
2 parents 5ceffe3 + 6a37c95 commit 55fbba0

25 files changed

Lines changed: 2705 additions & 67 deletions

CMakeLists.txt

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE)
4646
set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE)
4747
add_subdirectory(deps/llama.cpp ${CMAKE_BINARY_DIR}/llama.cpp EXCLUDE_FROM_ALL)
4848

49+
# --- libmtmd (multimodal/vision support from llama.cpp) ---
50+
set(LLAMA_INSTALL_VERSION "0.0.0" CACHE STRING "" FORCE)
51+
add_subdirectory(deps/llama.cpp/tools/mtmd ${CMAKE_BINARY_DIR}/mtmd EXCLUDE_FROM_ALL)
52+
4953
# --- sherpa-onnx (STT + TTS + VAD) ---
5054
set(SHERPA_ONNX_ENABLE_C_API ON CACHE BOOL "Enable C API" FORCE)
5155
set(SHERPA_ONNX_ENABLE_BINARY OFF CACHE BOOL "" FORCE)
@@ -99,8 +103,11 @@ add_library(rcli STATIC
99103
src/engines/metalrt_engine.cpp
100104
src/engines/metalrt_stt_engine.cpp
101105
src/engines/metalrt_tts_engine.cpp
106+
src/engines/vlm_engine.cpp
102107
src/audio/audio_io.cpp
103108
src/audio/mic_permission.mm
109+
src/audio/camera_capture.mm
110+
src/audio/screen_capture.mm
104111
src/pipeline/orchestrator.cpp
105112
src/pipeline/sentence_detector.cpp
106113
src/tools/tool_engine.cpp
@@ -133,26 +140,33 @@ add_library(rcli STATIC
133140
src/api/rcli_api.cpp
134141
)
135142

136-
set_source_files_properties(src/audio/mic_permission.mm
143+
set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/screen_capture.mm
137144
PROPERTIES LANGUAGE CXX)
138145

139146
target_include_directories(rcli PUBLIC
140147
${CMAKE_CURRENT_SOURCE_DIR}/src
141148
${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/include
142149
${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/include
150+
${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/tools/mtmd
143151
${CMAKE_CURRENT_SOURCE_DIR}/deps/sherpa-onnx/sherpa-onnx/c-api
144152
${usearch_SOURCE_DIR}/include
145153
)
146154

147155
target_link_libraries(rcli PUBLIC
148156
llama
149157
ggml
158+
mtmd
150159
sherpa-onnx-c-api
151160
"-framework CoreAudio"
152161
"-framework AudioToolbox"
153162
"-framework AudioUnit"
154163
"-framework Foundation"
155164
"-framework AVFoundation"
165+
"-framework AppKit"
166+
"-framework CoreImage"
167+
"-framework CoreMedia"
168+
"-framework CoreVideo"
169+
"-framework CoreGraphics"
156170
"-framework IOKit"
157171
)
158172

@@ -186,6 +200,27 @@ target_compile_definitions(rcli_cli PRIVATE
186200
RCLI_VERSION="${PROJECT_VERSION}"
187201
)
188202

203+
# =============================================================================
204+
# rcli_overlay — standalone Cocoa helper for visual overlay window
205+
# =============================================================================
206+
add_executable(rcli_overlay
207+
src/audio/rcli_overlay.m
208+
)
209+
210+
set_source_files_properties(src/audio/rcli_overlay.m PROPERTIES LANGUAGE CXX)
211+
212+
target_compile_options(rcli_overlay PRIVATE -x objective-c++)
213+
214+
target_link_libraries(rcli_overlay PRIVATE
215+
"-framework AppKit"
216+
"-framework CoreGraphics"
217+
)
218+
219+
set_target_properties(rcli_overlay PROPERTIES
220+
OUTPUT_NAME "rcli_overlay"
221+
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
222+
)
223+
189224
# =============================================================================
190225
# rcli_test — test executable
191226
# =============================================================================

README.md

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
<a href="LICENSE"><img src="https://img.shields.io/badge/license-MIT-blue" alt="MIT"></a>
1010
</p>
1111

12-
**RCLI** is an on-device voice AI for macOS. A complete STT + LLM + TTS pipeline running natively on Apple Silicon — 38 macOS actions via voice, local RAG over your documents, sub-200ms end-to-end latency. No cloud, no API keys.
12+
**RCLI** is an on-device voice AI for macOS. A complete STT + LLM + TTS + VLM pipeline running natively on Apple Silicon — 40 macOS actions via voice, local RAG over your documents, on-device vision (camera & screen analysis), sub-200ms end-to-end latency. No cloud, no API keys.
1313

1414
Powered by [MetalRT](#metalrt-gpu-engine), a proprietary GPU inference engine built by [RunAnywhere, Inc.](https://runanywhere.ai) specifically for Apple Silicon.
1515

@@ -112,6 +112,9 @@ rcli # interactive TUI (push-to-talk + text)
112112
rcli listen # continuous voice mode
113113
rcli ask "open Safari" # one-shot command
114114
rcli ask "play some jazz on Spotify"
115+
rcli vlm photo.jpg "what's in this image?" # vision analysis
116+
rcli camera # live camera VLM
117+
rcli screen # screen capture VLM
115118
rcli metalrt # MetalRT GPU engine management
116119
rcli llamacpp # llama.cpp engine management
117120
```
@@ -149,7 +152,18 @@ A full STT + LLM + TTS pipeline running on Metal GPU with three concurrent threa
149152
- **Tool Calling** — LLM-native tool call formats (Qwen3, LFM2, etc.)
150153
- **Multi-turn Memory** — Sliding window conversation history with token-budget trimming
151154

152-
### 38 macOS Actions
155+
### Vision (VLM)
156+
157+
Analyze images, camera captures, and screen regions using on-device vision-language models. VLM runs on the llama.cpp engine via Metal GPU — no cloud.
158+
159+
- **Image Analysis** — `rcli vlm photo.jpg "describe this"` for single-image queries
160+
- **Camera** — Press **V** in the TUI or run `rcli camera` for live camera analysis
161+
- **Screen Capture** — Press **S** in the TUI or run `rcli screen` to analyze screen regions
162+
- **Models** — Qwen3 VL 2B, Liquid LFM2 VL 1.6B, SmolVLM 500M — download on demand via `rcli models vlm`
163+
164+
> **Note:** VLM is currently available on the llama.cpp engine. MetalRT VLM support is coming soon.
165+
166+
### 40 macOS Actions
153167

154168
Control your Mac by voice or text. The LLM routes intent to actions executed locally via AppleScript and shell commands.
155169

@@ -161,7 +175,7 @@ Control your Mac by voice or text. The LLM routes intent to actions executed loc
161175
| **System** | `open_app`, `quit_app`, `set_volume`, `toggle_dark_mode`, `screenshot`, `lock_screen` |
162176
| **Web** | `search_web`, `search_youtube`, `open_url`, `open_maps` |
163177

164-
Run `rcli actions` to see all 38, or toggle them on/off in the TUI Actions panel.
178+
Run `rcli actions` to see all 40, or toggle them on/off in the TUI Actions panel.
165179

166180
> **Tip:** If tool calling feels unreliable, press **X** in the TUI to clear the conversation and reset context. With small LLMs, accumulated context can degrade tool-calling accuracy — a fresh context often fixes it.
167181
@@ -181,7 +195,9 @@ A terminal dashboard with push-to-talk, live hardware monitoring, model manageme
181195
| Key | Action |
182196
|-----|--------|
183197
| **SPACE** | Push-to-talk |
184-
| **M** | Models — browse, download, hot-swap LLM/STT/TTS |
198+
| **V** | Camera — capture and analyze with VLM |
199+
| **S** | Screen — capture and analyze a screen region with VLM |
200+
| **M** | Models — browse, download, hot-swap LLM/STT/TTS/VLM |
185201
| **A** | Actions — browse, enable/disable macOS actions |
186202
| **R** | RAG — ingest documents |
187203
| **X** | Clear conversation and reset context |
@@ -207,18 +223,21 @@ MetalRT is distributed under a [proprietary license](https://github.com/Runanywh
207223

208224
## Supported Models
209225

210-
RCLI supports 20+ models across LLM, STT, TTS, VAD, and embeddings. All run locally on Apple Silicon. Use `rcli models` to browse, download, or switch.
226+
RCLI supports 20+ models across LLM, STT, TTS, VLM, VAD, and embeddings. All run locally on Apple Silicon. Use `rcli models` to browse, download, or switch.
211227

212228
**LLM:** LFM2 1.2B (default), LFM2 350M, LFM2.5 1.2B, LFM2 2.6B, Qwen3 0.6B, Qwen3.5 0.8B/2B/4B, Qwen3 4B
213229

214230
**STT:** Zipformer (streaming), Whisper base.en (offline, default), Parakeet TDT 0.6B (~1.9% WER)
215231

216232
**TTS:** Piper Lessac/Amy, KittenTTS Nano, Matcha LJSpeech, Kokoro English/Multi-lang
217233

218-
**Default install** (`rcli setup`): ~1GB — LFM2 1.2B + Whisper + Piper + Silero VAD + Snowflake embeddings.
234+
**VLM:** Qwen3 VL 2B, Liquid LFM2 VL 1.6B, SmolVLM 500M — on-demand download via `rcli models vlm` (llama.cpp engine only)
235+
236+
**Default install** (`rcli setup`): ~1GB — LFM2 1.2B + Whisper + Piper + Silero VAD + Snowflake embeddings. VLM models are downloaded on demand.
219237

220238
```bash
221239
rcli models # interactive model management
240+
rcli models vlm # download/manage VLM models
222241
rcli upgrade-llm # guided LLM upgrade
223242
rcli voices # browse and switch TTS voices
224243
rcli cleanup # remove unused models
@@ -247,10 +266,13 @@ All dependencies are vendored or CMake-fetched. Requires CMake 3.15+ and Apple C
247266
rcli Interactive TUI (push-to-talk + text + trace)
248267
rcli listen Continuous voice mode
249268
rcli ask <text> One-shot text command
269+
rcli vlm <image> [prompt] Analyze an image with VLM
270+
rcli camera [prompt] Live camera capture + VLM analysis
271+
rcli screen [prompt] Screen capture + VLM analysis
250272
rcli actions [name] List actions or show detail
251273
rcli rag ingest <dir> Index documents for RAG
252274
rcli rag query <text> Query indexed documents
253-
rcli models [llm|stt|tts] Manage AI models
275+
rcli models [llm|stt|tts|vlm] Manage AI models
254276
rcli voices Manage TTS voices
255277
rcli metalrt MetalRT GPU engine management
256278
rcli llamacpp llama.cpp engine management

0 commit comments

Comments
 (0)