Commit e9e55ea

Added multimodal (vision) support
1 parent c8f98ec commit e9e55ea

File tree

6 files changed: +653 -53 lines changed


API.md

Lines changed: 79 additions & 5 deletions
````diff
@@ -16,7 +16,7 @@ Returns the current version of the SQLite-AI extension.
 
 ```sql
 SELECT ai_version();
--- e.g., '0.9.0'
+-- e.g., '1.0.0'
 ```
 
 ---
````
````diff
@@ -608,17 +608,29 @@ SELECT llm_embed_generate('hello world', 'json_output=1');
 
 ---
 
-## `llm_text_generate(text TEXT, options TEXT)`
+## `llm_text_generate(text TEXT, [image1, image2, ...], options TEXT)`
 
 **Returns:** `TEXT`
 
 **Description:**
 Generates a full-text completion based on input, with optional configuration provided as a comma-separated list of key=value pairs.
 
-**Example:**
+When a vision model is loaded via `llm_vision_load()`, you can pass one or more images as additional arguments. Images can be file paths (TEXT) or raw image data (BLOB). Supported image formats: JPG, PNG, BMP, GIF.
+
+**Examples:**
 
 ```sql
+-- Text-only generation
 SELECT llm_text_generate('Once upon a time', 'n_predict=1024');
+
+-- Vision: describe an image
+SELECT llm_text_generate('Describe this image', './photos/cat.jpg');
+
+-- Vision: compare multiple images
+SELECT llm_text_generate('What is different between these images?', './img1.jpg', './img2.jpg');
+
+-- Vision: image from BLOB column
+SELECT llm_text_generate('What do you see?', image_data) FROM photos WHERE id = 1;
 ```
 
 ---
````
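The last example above reads image bytes from a BLOB column. A minimal sketch of how such bytes end up in that column, using only Python's built-in `sqlite3` module (the SQLite-AI extension is not loaded here, and the `photos` table plus the stand-in JPG bytes are illustrative):

```python
import sqlite3

# Hypothetical photos table mirroring the docs' example; the SQLite-AI
# extension itself is not loaded here -- this only shows how raw image
# bytes reach the BLOB column that llm_text_generate() would consume.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE photos (id INTEGER PRIMARY KEY, image_data BLOB)")

image_bytes = b"\xff\xd8\xff\xe0" + b"\x00" * 16  # stand-in for real JPG data
conn.execute("INSERT INTO photos (id, image_data) VALUES (1, ?)", (image_bytes,))

# With the extension loaded, the documented call would then read the BLOB:
#   SELECT llm_text_generate('What do you see?', image_data)
#   FROM photos WHERE id = 1;
(stored,) = conn.execute("SELECT image_data FROM photos WHERE id = 1").fetchone()
assert stored == image_bytes
```

Storing the raw bytes directly keeps the whole pipeline inside the database, so no file paths need to exist at query time.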
````diff
@@ -700,18 +712,27 @@ SELECT llm_chat_restore('b59e...');
 
 ---
 
-## `llm_chat_respond(text TEXT)`
+## `llm_chat_respond(text TEXT, [image1, image2, ...])`
 
 **Returns:** `TEXT`
 
 **Description:**
 Generates a context-aware reply using chat memory, returned as a single, complete response.
 For a streaming model reply, use the llm_chat virtual table.
 
-**Example:**
+When a vision model is loaded via `llm_vision_load()`, you can pass one or more images as additional arguments. Images can be file paths (TEXT) or raw image data (BLOB). Supported image formats: JPG, PNG, BMP, GIF.
+
+**Examples:**
 
 ```sql
+-- Text-only chat
 SELECT llm_chat_respond('What are the most visited cities in Italy?');
+
+-- Vision: ask about an image
+SELECT llm_chat_respond('What is in this photo?', './photos/landscape.jpg');
+
+-- Vision: multiple images
+SELECT llm_chat_respond('Compare these two charts', './chart1.png', './chart2.png');
 ```
 
 ---
````
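Because the image arguments are variadic, application code that binds parameters needs one placeholder per image. A small sketch of query construction on the caller side (the helper name is hypothetical, not part of the extension):

```python
def build_chat_respond_sql(n_images: int) -> str:
    """Hypothetical helper: build an llm_chat_respond() call with one
    bound parameter for the prompt plus one per image argument."""
    placeholders = ", ".join(["?"] * (n_images + 1))
    return f"SELECT llm_chat_respond({placeholders})"

# For a prompt plus two images:
sql = build_chat_respond_sql(2)
assert sql == "SELECT llm_chat_respond(?, ?, ?)"
```

The same pattern applies to `llm_text_generate()`, whose trailing options string would simply become one more bound parameter.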
````diff
@@ -735,6 +756,59 @@ SELECT llm_chat_system_prompt();
 
 ---
 
+## Vision Functions
+
+### `llm_vision_load(path TEXT, options TEXT)`
+
+**Returns:** `NULL`
+
+**Description:**
+Loads a multimodal projector (mmproj) model for vision capabilities. This requires a text model to already be loaded via `llm_model_load()`. The mmproj file is a separate GGUF file that contains the vision encoder and projector weights.
+
+Once loaded, vision capabilities are available through `llm_text_generate()` and `llm_chat_respond()` by passing image arguments.
+
+The following option keys are available:
+
+| Key                | Type                      | Default | Meaning                                                              |
+| ------------------ | ------------------------- | ------- | -------------------------------------------------------------------- |
+| `use_gpu`          | `1 or 0`                  | `1`     | Use GPU for vision encoding.                                         |
+| `n_threads`        | `number`                  | `4`     | Number of threads for vision processing.                             |
+| `warmup`           | `1 or 0`                  | `1`     | Run a warmup pass on load for faster first use.                      |
+| `flash_attn_type`  | `auto, disabled, enabled` | `auto`  | Controls Flash Attention for the vision encoder.                     |
+| `image_min_tokens` | `number`                  | `0`     | Minimum image tokens for dynamic resolution models (0 = from model). |
+| `image_max_tokens` | `number`                  | `0`     | Maximum image tokens for dynamic resolution models (0 = from model). |
+
+**Example:**
+
+```sql
+-- Load text model first
+SELECT llm_model_load('./models/Gemma-3-4B-IT-Q4_K_M.gguf', 'gpu_layers=99');
+SELECT llm_context_create_textgen();
+
+-- Load vision projector
+SELECT llm_vision_load('./models/mmproj-Gemma-3-4B-IT-f16.gguf');
+
+-- Now use vision with llm_text_generate or llm_chat_respond
+SELECT llm_text_generate('Describe this image', './photos/cat.jpg');
+```
+
+---
+
+### `llm_vision_free()`
+
+**Returns:** `NULL`
+
+**Description:**
+Unloads the current vision (mmproj) model and frees associated memory. The text model remains loaded.
+
+**Example:**
+
+```sql
+SELECT llm_vision_free();
+```
+
+---
+
 ## Audio Functions
 
 ### `audio_model_load(path TEXT, options TEXT)`
````
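As elsewhere in the API, the options argument to `llm_vision_load()` is a comma-separated list of key=value pairs. An illustrative sketch of what parsing that format against the documented keys could look like on the application side (the parser and its strictness are assumptions; the extension does its own parsing internally):

```python
# Defaults taken from the llm_vision_load() option table above.
VISION_DEFAULTS = {
    "use_gpu": "1",
    "n_threads": "4",
    "warmup": "1",
    "flash_attn_type": "auto",
    "image_min_tokens": "0",
    "image_max_tokens": "0",
}

def parse_options(options: str) -> dict:
    """Split 'k1=v1,k2=v2' into a dict, rejecting unknown keys."""
    parsed = dict(VISION_DEFAULTS)
    for pair in filter(None, options.split(",")):
        key, _, value = pair.partition("=")
        key = key.strip()
        if key not in parsed:
            raise ValueError(f"unknown option: {key}")
        parsed[key] = value.strip()
    return parsed

opts = parse_options("use_gpu=0,n_threads=8")
assert opts["use_gpu"] == "0" and opts["n_threads"] == "8"
assert opts["warmup"] == "1"  # untouched keys keep their defaults
```

Validating the string before calling into SQL can turn a silent misconfiguration into an early, explicit error.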

Makefile

Lines changed: 4 additions & 4 deletions
````diff
@@ -58,8 +58,8 @@ SKIP_UNITTEST ?= 0
 # Compiler and flags
 CC = gcc
 CXX = g++
-CFLAGS = -Wall -Wextra -Wno-unused-parameter -I$(SRC_DIR) -I$(BUILD_GGML)/include -I$(WHISPER_DIR)/include -I$(MINIAUDIO_DIR)
-LLAMA_OPTIONS = $(LLAMA) -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_COMMON=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF -DLLAMA_BUILD_SERVER=OFF -DGGML_RPC=OFF
+CFLAGS = -Wall -Wextra -Wno-unused-parameter -I$(SRC_DIR) -I$(BUILD_GGML)/include -I$(WHISPER_DIR)/include -I$(MINIAUDIO_DIR) -I$(LLAMA_DIR)/tools/mtmd
+LLAMA_OPTIONS = $(LLAMA) -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=OFF -DGGML_RPC=OFF
 WHISPER_OPTIONS = $(LLAMA) $(WHISPER) -DBUILD_SHARED_LIBS=OFF -DWHISPER_BUILD_EXAMPLES=OFF -DWHISPER_BUILD_TESTS=OFF -DWHISPER_BUILD_SERVER=OFF -DWHISPER_RPC=OFF -DWHISPER_USE_SYSTEM_GGML=ON
 MINIAUDIO_OPTIONS = $(MINIAUDIO) -DBUILD_SHARED_LIBS=OFF -DMINIAUDIO_BUILD_EXAMPLES=OFF -DMINIAUDIO_BUILD_TESTS=OFF
 # MinGW produces .a files without lib prefix, use -l:filename.a syntax
````
````diff
@@ -69,7 +69,7 @@ ifeq ($(PLATFORM),windows)
 else
 L = -l
 endif
-LLAMA_LDFLAGS = -L./$(BUILD_GGML)/lib -L./$(BUILD_LLAMA)/src -lllama $(L)ggml$(A) $(L)ggml-base$(A) $(L)ggml-cpu$(A)
+LLAMA_LDFLAGS = -L./$(BUILD_LLAMA)/tools/mtmd -L./$(BUILD_GGML)/lib -L./$(BUILD_LLAMA)/src -lmtmd -lllama $(L)ggml$(A) $(L)ggml-base$(A) $(L)ggml-cpu$(A)
 WHISPER_LDFLAGS = -L./$(BUILD_WHISPER)/src -lwhisper
 MINIAUDIO_LDFLAGS = -L./$(BUILD_MINIAUDIO) -lminiaudio -lminiaudio_channel_combiner_node -lminiaudio_channel_separator_node -lminiaudio_ltrim_node -lminiaudio_reverb_node -lminiaudio_vocoder_node
 LDFLAGS = $(LLAMA_LDFLAGS) $(WHISPER_LDFLAGS) $(MINIAUDIO_LDFLAGS)
````
````diff
@@ -85,7 +85,7 @@ SQLITE_TEST_SRC = tests/c/sqlite3.c
 # Files
 SRC_FILES = $(wildcard $(SRC_DIR)/*.c)
 OBJ_FILES = $(patsubst %.c, $(BUILD_DIR)/%.o, $(notdir $(SRC_FILES)))
-LLAMA_LIBS = $(BUILD_GGML)/libggml.a $(BUILD_GGML)/libggml-base.a $(BUILD_GGML)/libggml-cpu.a $(BUILD_LLAMA)/src/libllama.a
+LLAMA_LIBS = $(BUILD_LLAMA)/tools/mtmd/libmtmd.a $(BUILD_GGML)/libggml.a $(BUILD_GGML)/libggml-base.a $(BUILD_GGML)/libggml-cpu.a $(BUILD_LLAMA)/src/libllama.a
 WHISPER_LIBS = $(BUILD_WHISPER)/src/libwhisper.a
 MINIAUDIO_LIBS = $(BUILD_MINIAUDIO)/libminiaudio.a
 
````

README.md

Lines changed: 21 additions & 1 deletion
````diff
@@ -11,9 +11,10 @@
 * **Offline-First**: No server dependencies or internet connection required.
 * **Composable SQL Interface**: AI + relational logic in a single unified layer.
 * **Audio Transcription**: Speech-to-text via Whisper models (WAV, MP3, FLAC).
+* **Vision / Multimodal**: Analyze images via multimodal models (JPG, PNG, BMP, GIF).
 * **Supports any GGUF model**: available on Huggingface; Qwen, Gemma, Llama, DeepSeek and more
 
-SQLite-AI supports **text embedding generation** for search and classification, a **chat-like interface with history and token streaming**, **automatic context save and restore** across sessions, and **audio transcription** via Whisper models — making it ideal for building conversational agents, memory-aware assistants, and voice-enabled applications.
+SQLite-AI supports **text embedding generation** for search and classification, a **chat-like interface with history and token streaming**, **automatic context save and restore** across sessions, **audio transcription** via Whisper models, and **vision/multimodal** image understanding — making it ideal for building conversational agents, memory-aware assistants, and voice-enabled applications.
 
 ## Getting Started
 
````

````diff
@@ -82,6 +83,25 @@ SELECT audio_model_transcribe('./audio/speech.mp3', 'language=it,translate=1');
 SELECT audio_model_transcribe(audio_data) FROM recordings WHERE id = 1;
 ```
 
+### Vision / Multimodal
+
+```sql
+-- Load a multimodal model and its vision projector
+SELECT llm_model_load('./models/Gemma-3-4B-IT-Q4_K_M.gguf', 'gpu_layers=99');
+SELECT llm_context_create_textgen();
+SELECT llm_vision_load('./models/mmproj-Gemma-3-4B-IT-f16.gguf');
+
+-- Describe an image
+SELECT llm_text_generate('Describe this image', './photos/cat.jpg');
+
+-- Use vision in a chat conversation
+SELECT llm_context_create_chat();
+SELECT llm_chat_respond('What do you see in this photo?', './photos/landscape.jpg');
+
+-- Analyze multiple images
+SELECT llm_text_generate('Compare these two images', './img1.jpg', './img2.jpg');
+```
+
 ## Documentation
 
 For detailed information on all available functions, their parameters, and examples, refer to the [comprehensive API Reference](./API.md).
````
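Since only JPG, PNG, BMP, and GIF are accepted as image inputs, application code may want to sanity-check bytes before handing them to the extension. A sketch using the standard magic numbers for those formats (this pre-check is an application-side convenience I am assuming, not part of SQLite-AI):

```python
def sniff_image_format(data: bytes):
    """Best-effort check that a buffer looks like one of the image
    formats the extension accepts (JPG, PNG, BMP, GIF). Returns the
    format name or None. Magic numbers are the standard ones for
    each format."""
    if data.startswith(b"\xff\xd8\xff"):
        return "JPG"
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "PNG"
    if data.startswith(b"BM"):
        return "BMP"
    if data[:6] in (b"GIF87a", b"GIF89a"):
        return "GIF"
    return None

assert sniff_image_format(b"GIF89a" + b"\x00" * 10) == "GIF"
assert sniff_image_format(b"\x00\x01") is None
```

Rejecting unsupported buffers before the SQL call keeps format errors out of the model-inference path.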
