Commit 17ce70b

docs: write an instruction for using llm with vision capabilities
1 parent b936b86 commit 17ce70b

File tree

3 files changed (+131, -9 lines)

.cspell-wordlist.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,3 +1,4 @@
+multimodal
 swmansion
 executorch
 execu
```

docs/docs/03-hooks/01-natural-language-processing/useLLM.md

Lines changed: 81 additions & 9 deletions
@@ -481,14 +481,86 @@ The response should include JSON:
Depending on the selected model and the user's device, generation speed can exceed 60 tokens per second. The [`tokenCallback`](../../06-api-reference/classes/LLMModule.md#tokencallback) from [`LLMModule`](../../06-api-reference/classes/LLMModule.md), which is used under the hood, is invoked on every generated token; if it triggers re-renders, this can significantly degrade the app's performance. To alleviate this, we've implemented token batching. To configure it, call the [`configure`](../../06-api-reference/interfaces/LLMType.md#configure) method and pass a [`generationConfig`](../../06-api-reference/interfaces/LLMConfig.md#generationconfig); see [Configuring the Model](../../03-hooks/01-natural-language-processing/useLLM.md#configuring-the-model) for the available options. `countInterval` and `timeInterval` set the batch size and the maximum time between consecutive batches, respectively: a batch is emitted when either `timeInterval` elapses since the last batch or `countInterval` tokens have been generated. This keeps the UI smooth even if the model lags during generation. The defaults are 10 tokens and an 80 ms time interval (~12 batches per second).
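As a minimal illustrative sketch (assuming `countInterval` and `timeInterval` are the `generationConfig` field names, matching the defaults described above; check the linked reference for the exact shape):

```tsx
import { useLLM, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';

const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });

// Flush a batch of tokens when either 10 tokens have accumulated
// or 80 ms have passed since the last batch (the defaults,
// roughly 12 batches per second).
llm.configure({
  generationConfig: {
    countInterval: 10,
    timeInterval: 80,
  },
});
```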
## Vision-Language Models (VLM)

Some models support multimodal input — text and images together. To use them, pass a `capabilities` array when loading the model.

### Loading a VLM

```tsx
import { useLLM, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';

const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });
```

The `capabilities` field is already set on the model constant. You can also construct the model object explicitly:

```tsx
const llm = useLLM({
  model: {
    modelSource: '...',
    tokenizerSource: '...',
    tokenizerConfigSource: '...',
    capabilities: ['vision'],
  },
});
```

Passing `capabilities` unlocks the typed `media` argument on `sendMessage`.

### Sending a message with an image

```tsx
const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });

const send = () => {
  llm.sendMessage('What is in this image?', {
    imagePath: '/path/to/image.jpg',
  });
};

return (
  <View>
    <Button onPress={send} title="Send!" />
    <Text>{llm.response}</Text>
  </View>
);
```
The `imagePath` should be a local file path on the device.
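Image pickers and camera libraries often return `file://` URIs rather than bare paths. The helper below is a hypothetical sketch, not part of react-native-executorch: if the native side expects a plain path, the scheme can be stripped before the value is passed as `imagePath`.

```tsx
// Hypothetical helper (an assumption, not a library API): strip a
// `file://` scheme so only the local filesystem path remains.
const toLocalPath = (uri: string): string =>
  uri.startsWith('file://') ? uri.slice('file://'.length) : uri;
```

Whether the scheme actually needs stripping depends on the underlying native module; verify the behavior on your target platform.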
### Functional generation with images

You can also use `generate` directly, passing `imagePaths` as the third argument:

```tsx
const llm = useLLM({ model: LFM2_VL_1_6B_QUANTIZED });

const handleGenerate = async () => {
  const chat: Message[] = [
    {
      role: 'user',
      content: [
        { type: 'image' },
        { type: 'text', text: 'Describe this image.' },
      ],
    },
  ];

  const response = await llm.generate(chat, undefined, ['/path/to/image.jpg']);
  console.log(response);
};
```
## Available models

| Model Family                                                                                                 |      Sizes       | Quantized | Capabilities |
| ------------------------------------------------------------------------------------------------------------ | :--------------: | :-------: | :----------: |
| [Hammer 2.1](https://huggingface.co/software-mansion/react-native-executorch-hammer-2.1)                     |  0.5B, 1.5B, 3B  |    ✅     |      -       |
| [Qwen 2.5](https://huggingface.co/software-mansion/react-native-executorch-qwen-2.5)                         |  0.5B, 1.5B, 3B  |    ✅     |      -       |
| [Qwen 3](https://huggingface.co/software-mansion/react-native-executorch-qwen-3)                             |  0.6B, 1.7B, 4B  |    ✅     |      -       |
| [Phi 4 Mini](https://huggingface.co/software-mansion/react-native-executorch-phi-4-mini)                     |        4B        |    ✅     |      -       |
| [SmolLM 2](https://huggingface.co/software-mansion/react-native-executorch-smolLm-2)                         | 135M, 360M, 1.7B |    ✅     |      -       |
| [LLaMA 3.2](https://huggingface.co/software-mansion/react-native-executorch-llama-3.2)                       |      1B, 3B      |    ✅     |      -       |
| [LFM2.5-1.2B-Instruct](https://huggingface.co/software-mansion/react-native-executorch-lfm2.5-1.2B-instruct) |       1.2B       |    ✅     |      -       |
| [LFM2.5-VL-1.6B](https://huggingface.co/nklockiewicz/lfm2-vl-et)                                             |       1.6B       |    ✅     |    vision    |

docs/docs/04-typescript-api/01-natural-language-processing/LLMModule.md

Lines changed: 49 additions & 0 deletions
@@ -114,6 +114,55 @@ To configure model (i.e. change system prompt, load initial conversation history
- [`topp`](../../06-api-reference/interfaces/GenerationConfig.md#topp) - Samples only from the smallest set of tokens whose cumulative probability exceeds `topp`.
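For instance, a sketch of nucleus sampling configuration (assuming `llm` is a loaded `LLMModule` instance and that, as described above, these fields are passed under `generationConfig` via `configure`):

```typescript
llm.configure({
  generationConfig: {
    // Restrict sampling to the smallest token set whose cumulative
    // probability exceeds 0.9.
    topp: 0.9,
  },
});
```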
116116

117+
## Vision-Language Models (VLM)

Some models support multimodal input — text and images together. To use them, pass `capabilities` in the model object when calling [`load`](../../06-api-reference/classes/LLMModule.md#load):

```typescript
import { LLMModule, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';

const llm = new LLMModule({
  tokenCallback: (token) => console.log(token),
});

await llm.load(LFM2_VL_1_6B_QUANTIZED);
```

The `capabilities` field is already set on the model constant. You can also construct the model object explicitly:

```typescript
await llm.load({
  modelSource: '...',
  tokenizerSource: '...',
  tokenizerConfigSource: '...',
  capabilities: ['vision'],
});
```

Once loaded, pass `imagePath` to [`sendMessage`](../../06-api-reference/classes/LLMModule.md#sendmessage):

```typescript
const response = await llm.sendMessage('What is in this image?', {
  imagePath: '/path/to/image.jpg',
});
```

Or use [`generate`](../../06-api-reference/classes/LLMModule.md#generate) with `imagePaths` directly:

```typescript
const chat: Message[] = [
  {
    role: 'user',
    content: [
      { type: 'image' },
      { type: 'text', text: 'Describe this image.' },
    ],
  },
];

const response = await llm.generate(chat, undefined, ['/path/to/image.jpg']);
```
## Deleting the model from memory
To delete the model from memory, you can use the [`delete`](../../06-api-reference/classes/LLMModule.md#delete) method.
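A minimal lifecycle sketch (reusing the constructor and `load` call shown earlier; whether `delete` returns a promise is not specified here):

```typescript
const llm = new LLMModule({ tokenCallback: (token) => console.log(token) });
await llm.load(LFM2_VL_1_6B_QUANTIZED);

// ... run inference ...

// Free the model's memory once it is no longer needed,
// e.g. when the screen that uses it unmounts.
llm.delete();
```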
