Skip to content

Commit ce065d2

Browse files
NorbertKlockiewicz, claude, and chmjkb
authored
feat: implementation of multimodal runner (#892)
## Description

Adds vision/multimodal support to `useLLM`: load a VLM by passing `capabilities: ['vision']`, then use `sendMessage(text, { imagePath })` to send messages with images. Under the hood this introduces a pluggable encoder architecture (`IEncoder` / `VisionEncoder`), a dedicated `MultimodalRunner`, and a refactored `BaseLLMRunner` with cleaner ownership and shared state. It also exposes the `getVisualTokenCount()` JSI method for accurate token counting with images. No changes to the text-only path.

### Introduces a breaking change?

- [ ] Yes
- [x] No

### Type of change

- [ ] Bug fix (change which fixes an issue)
- [x] New feature (change which adds functionality)
- [x] Documentation update (improves or adds clarity to existing documentation)
- [x] Other (chores, tests, code style improvements etc.)

### Tested on

- [x] iOS
- [x] Android

### Testing instructions

Run the `llm` example app and select the `multimodal llm` screen. Select an image and prompt the model.

### Screenshots

<!-- Add screenshots here, if applicable -->

### Related issues

<!-- Link related issues here using #issue-number -->

### Checklist

- [x] I have performed a self-review of my code
- [x] I have commented my code, particularly in hard-to-understand areas
- [x] I have updated the documentation accordingly
- [x] My changes generate no new warnings

### Additional notes

<!-- Include any additional information, assumptions, or context that reviewers might need to understand this PR. -->

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-authored-by: Jakub Chmura <92989966+chmjkb@users.noreply.github.com>
1 parent 2cc7ba5 commit ce065d2

File tree

41 files changed

+2287
-621
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+2287
-621
lines changed

.cspell-wordlist.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
multimodal
12
swmansion
23
executorch
34
execu

apps/llm/app/_layout.tsx

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,14 @@ export default function _layout() {
8989
headerTitleStyle: { color: ColorPalette.primary },
9090
}}
9191
/>
92+
<Drawer.Screen
93+
name="multimodal_llm/index"
94+
options={{
95+
drawerLabel: 'Multimodal LLM (VLM)',
96+
title: 'Multimodal LLM',
97+
headerTitleStyle: { color: ColorPalette.primary },
98+
}}
99+
/>
92100
<Drawer.Screen
93101
name="index"
94102
options={{

apps/llm/app/index.tsx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ export default function Home() {
3535
>
3636
<Text style={styles.buttonText}>Voice Chat</Text>
3737
</TouchableOpacity>
38+
<TouchableOpacity
39+
style={styles.button}
40+
onPress={() => router.navigate('multimodal_llm/')}
41+
>
42+
<Text style={styles.buttonText}>Multimodal LLM (VLM)</Text>
43+
</TouchableOpacity>
3844
</View>
3945
</View>
4046
);
Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
import { useContext, useEffect, useRef, useState } from 'react';
2+
import {
3+
Image,
4+
Keyboard,
5+
KeyboardAvoidingView,
6+
Platform,
7+
StyleSheet,
8+
Text,
9+
TextInput,
10+
TouchableOpacity,
11+
TouchableWithoutFeedback,
12+
View,
13+
} from 'react-native';
14+
import { launchImageLibrary } from 'react-native-image-picker';
15+
import { useIsFocused } from '@react-navigation/native';
16+
import { useLLM, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';
17+
import SendIcon from '../../assets/icons/send_icon.svg';
18+
import PauseIcon from '../../assets/icons/pause_icon.svg';
19+
import ColorPalette from '../../colors';
20+
import Messages from '../../components/Messages';
21+
import Spinner from '../../components/Spinner';
22+
import { GeneratingContext } from '../../context';
23+
24+
/**
 * Route entry point for the multimodal chat screen.
 *
 * Renders `MultimodalLLMScreen` only while navigation reports this route as
 * focused and renders nothing otherwise, so the inner screen (including its
 * `useLLM` hook) is unmounted whenever the user navigates away.
 */
export default function MultimodalLLMScreenWrapper() {
  const screenHasFocus = useIsFocused();
  if (!screenHasFocus) {
    return null;
  }
  return <MultimodalLLMScreen />;
}
28+
29+
function MultimodalLLMScreen() {
30+
const [imageUri, setImageUri] = useState<string | null>(null);
31+
const [userInput, setUserInput] = useState('');
32+
const [isTextInputFocused, setIsTextInputFocused] = useState(false);
33+
const textInputRef = useRef<TextInput>(null);
34+
const { setGlobalGenerating } = useContext(GeneratingContext);
35+
36+
const vlm = useLLM({
37+
model: LFM2_VL_1_6B_QUANTIZED,
38+
});
39+
40+
useEffect(() => {
41+
setGlobalGenerating(vlm.isGenerating);
42+
}, [vlm.isGenerating, setGlobalGenerating]);
43+
44+
useEffect(() => {
45+
if (vlm.error) console.error('MultimodalLLM error:', vlm.error);
46+
}, [vlm.error]);
47+
48+
const pickImage = async () => {
49+
const result = await launchImageLibrary({ mediaType: 'photo' });
50+
if (result.assets && result.assets.length > 0) {
51+
const uri = result.assets[0]?.uri;
52+
if (uri) setImageUri(uri);
53+
}
54+
};
55+
56+
const sendMessage = async () => {
57+
if (!userInput.trim() || vlm.isGenerating) return;
58+
const text = userInput.trim();
59+
setUserInput('');
60+
textInputRef.current?.clear();
61+
Keyboard.dismiss();
62+
const currentImageUri = imageUri;
63+
setImageUri(null);
64+
try {
65+
await vlm.sendMessage(
66+
text,
67+
currentImageUri ? { imagePath: currentImageUri } : undefined
68+
);
69+
} catch (e) {
70+
console.error('Generation error:', e);
71+
}
72+
};
73+
74+
if (!vlm.isReady) {
75+
return (
76+
<Spinner
77+
visible={!vlm.isReady}
78+
textContent={
79+
vlm.error
80+
? `Error: ${vlm.error.message}`
81+
: `Loading model ${(vlm.downloadProgress * 100).toFixed(0)}%`
82+
}
83+
/>
84+
);
85+
}
86+
87+
return (
88+
<TouchableWithoutFeedback onPress={Keyboard.dismiss}>
89+
<KeyboardAvoidingView
90+
style={styles.container}
91+
collapsable={false}
92+
behavior={Platform.OS === 'ios' ? 'padding' : undefined}
93+
keyboardVerticalOffset={Platform.OS === 'ios' ? 120 : 40}
94+
>
95+
<View style={styles.container}>
96+
{vlm.messageHistory.length ? (
97+
<View style={styles.chatContainer}>
98+
<Messages
99+
chatHistory={vlm.messageHistory}
100+
llmResponse={vlm.response}
101+
isGenerating={vlm.isGenerating}
102+
deleteMessage={vlm.deleteMessage}
103+
/>
104+
</View>
105+
) : (
106+
<View style={styles.helloMessageContainer}>
107+
<Text style={styles.helloText}>Hello! 👋</Text>
108+
<Text style={styles.bottomHelloText}>
109+
Pick an image and ask me anything about it.
110+
</Text>
111+
</View>
112+
)}
113+
114+
{/* Image thumbnail strip */}
115+
{imageUri && (
116+
<TouchableOpacity
117+
style={styles.imageThumbnailContainer}
118+
onPress={pickImage}
119+
>
120+
<Image
121+
source={{ uri: imageUri }}
122+
style={styles.imageThumbnail}
123+
resizeMode="cover"
124+
/>
125+
<Text style={styles.imageThumbnailHint}>Tap to change</Text>
126+
</TouchableOpacity>
127+
)}
128+
129+
<View style={styles.bottomContainer}>
130+
{/* Image picker button */}
131+
<TouchableOpacity
132+
style={styles.imageButton}
133+
onPress={pickImage}
134+
disabled={vlm.isGenerating}
135+
>
136+
<Text style={styles.imageButtonText}>📷</Text>
137+
</TouchableOpacity>
138+
139+
<TextInput
140+
autoCorrect={false}
141+
ref={textInputRef}
142+
onFocus={() => setIsTextInputFocused(true)}
143+
onBlur={() => setIsTextInputFocused(false)}
144+
style={[
145+
styles.textInput,
146+
{
147+
borderColor: isTextInputFocused
148+
? ColorPalette.blueDark
149+
: ColorPalette.blueLight,
150+
},
151+
]}
152+
placeholder={imageUri ? 'Ask about the image…' : 'Your message'}
153+
placeholderTextColor="#C1C6E5"
154+
multiline
155+
onChangeText={setUserInput}
156+
/>
157+
158+
{userInput.trim() && !vlm.isGenerating && (
159+
<TouchableOpacity
160+
style={styles.sendChatTouchable}
161+
onPress={sendMessage}
162+
>
163+
<SendIcon height={24} width={24} padding={4} margin={8} />
164+
</TouchableOpacity>
165+
)}
166+
{vlm.isGenerating && (
167+
<TouchableOpacity
168+
style={styles.sendChatTouchable}
169+
onPress={vlm.interrupt}
170+
>
171+
<PauseIcon height={24} width={24} padding={4} margin={8} />
172+
</TouchableOpacity>
173+
)}
174+
</View>
175+
</View>
176+
</KeyboardAvoidingView>
177+
</TouchableWithoutFeedback>
178+
);
179+
}
180+
181+
const styles = StyleSheet.create({
182+
// Setup phase
183+
setupContainer: {
184+
flex: 1,
185+
padding: 24,
186+
backgroundColor: '#fff',
187+
justifyContent: 'center',
188+
},
189+
setupTitle: {
190+
fontSize: 20,
191+
fontFamily: 'medium',
192+
color: ColorPalette.primary,
193+
marginBottom: 8,
194+
},
195+
setupHint: {
196+
fontSize: 13,
197+
fontFamily: 'regular',
198+
color: ColorPalette.blueDark,
199+
marginBottom: 32,
200+
lineHeight: 18,
201+
},
202+
filePickerRow: {
203+
flexDirection: 'row',
204+
alignItems: 'center',
205+
borderWidth: 1,
206+
borderColor: ColorPalette.blueLight,
207+
borderRadius: 10,
208+
padding: 14,
209+
marginBottom: 12,
210+
backgroundColor: '#fafbff',
211+
},
212+
filePickerInfo: { flex: 1 },
213+
filePickerLabel: {
214+
fontSize: 12,
215+
fontFamily: 'medium',
216+
color: ColorPalette.blueDark,
217+
marginBottom: 2,
218+
},
219+
filePickerValue: { fontSize: 14, fontFamily: 'regular' },
220+
filePickerValueSet: { color: ColorPalette.primary },
221+
filePickerValueEmpty: { color: ColorPalette.blueLight },
222+
filePickerChevron: {
223+
fontSize: 24,
224+
color: ColorPalette.blueLight,
225+
marginLeft: 8,
226+
},
227+
loadButton: {
228+
marginTop: 16,
229+
backgroundColor: ColorPalette.strongPrimary,
230+
borderRadius: 10,
231+
padding: 14,
232+
alignItems: 'center',
233+
},
234+
loadButtonDisabled: { backgroundColor: ColorPalette.blueLight },
235+
loadButtonText: { color: '#fff', fontFamily: 'medium', fontSize: 15 },
236+
237+
// Chat phase
238+
container: { flex: 1 },
239+
chatContainer: { flex: 10, width: '100%' },
240+
helloMessageContainer: {
241+
flex: 10,
242+
width: '100%',
243+
alignItems: 'center',
244+
justifyContent: 'center',
245+
},
246+
helloText: {
247+
fontFamily: 'medium',
248+
fontSize: 30,
249+
color: ColorPalette.primary,
250+
},
251+
bottomHelloText: {
252+
fontFamily: 'regular',
253+
fontSize: 20,
254+
lineHeight: 28,
255+
textAlign: 'center',
256+
color: ColorPalette.primary,
257+
paddingHorizontal: 24,
258+
},
259+
imageThumbnailContainer: {
260+
flexDirection: 'row',
261+
alignItems: 'center',
262+
paddingHorizontal: 16,
263+
paddingVertical: 6,
264+
gap: 8,
265+
},
266+
imageThumbnail: {
267+
width: 48,
268+
height: 48,
269+
borderRadius: 8,
270+
borderWidth: 1,
271+
borderColor: ColorPalette.blueLight,
272+
},
273+
imageThumbnailHint: {
274+
fontSize: 12,
275+
fontFamily: 'regular',
276+
color: ColorPalette.blueDark,
277+
},
278+
bottomContainer: {
279+
height: 100,
280+
width: '100%',
281+
flexDirection: 'row',
282+
justifyContent: 'space-between',
283+
alignItems: 'center',
284+
paddingHorizontal: 16,
285+
},
286+
imageButton: {
287+
width: 40,
288+
height: 40,
289+
justifyContent: 'center',
290+
alignItems: 'center',
291+
marginRight: 4,
292+
},
293+
imageButtonText: { fontSize: 22 },
294+
textInput: {
295+
flex: 1,
296+
borderWidth: 1,
297+
borderRadius: 8,
298+
lineHeight: 19.6,
299+
fontFamily: 'regular',
300+
fontSize: 14,
301+
color: ColorPalette.primary,
302+
padding: 16,
303+
},
304+
sendChatTouchable: {
305+
height: '100%',
306+
width: 48,
307+
justifyContent: 'center',
308+
alignItems: 'flex-end',
309+
},
310+
});

0 commit comments

Comments
 (0)