Skip to content

Commit 75d95dc

Browse files
committed
gemma4 audio
1 parent 7e1c887 commit 75d95dc

21 files changed

Lines changed: 761 additions & 60 deletions

File tree

apps/llm/app/llm/index.tsx

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import {
1111
View,
1212
} from 'react-native';
1313
import SendIcon from '../../assets/icons/send_icon.svg';
14-
import { useLLM, LLAMA3_2_1B_SPINQUANT } from 'react-native-executorch';
14+
import { useLLM, QWEN3_0_6B_QUANTIZED } from 'react-native-executorch';
1515
import { ModelPicker } from '../../components/ModelPicker';
1616
import { LLM_MODELS, LLMModelSources } from '../../components/llmModels';
1717
import PauseIcon from '../../assets/icons/pause_icon.svg';
@@ -42,9 +42,8 @@ function LLMScreen() {
4242
const { bottom } = useSafeAreaInsets();
4343
const [isTextInputFocused, setIsTextInputFocused] = useState(false);
4444
const [userInput, setUserInput] = useState('');
45-
const [selectedModel, setSelectedModel] = useState<LLMModelSources>(
46-
LLAMA3_2_1B_SPINQUANT
47-
);
45+
const [selectedModel, setSelectedModel] =
46+
useState<LLMModelSources>(QWEN3_0_6B_QUANTIZED);
4847
const textInputRef = useRef<TextInput>(null);
4948
const { setGlobalGenerating } = useContext(GeneratingContext);
5049

apps/llm/app/multimodal_llm/index.tsx

Lines changed: 211 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@ import {
1212
View,
1313
} from 'react-native';
1414
import { launchImageLibrary } from 'react-native-image-picker';
15+
import {
16+
AudioManager,
17+
AudioRecorder,
18+
AudioContext,
19+
} from 'react-native-audio-api';
1520
import { useIsFocused } from '@react-navigation/native';
1621
import { useSafeAreaInsets } from 'react-native-safe-area-context';
1722
import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';
@@ -46,7 +51,15 @@ function MultimodalLLMScreen() {
4651
const textInputRef = useRef<TextInput>(null);
4752
const { setGlobalGenerating } = useContext(GeneratingContext);
4853

49-
// Added error state
54+
const [audioBuffer, setAudioBuffer] = useState<Float32Array | null>(null);
55+
const [audioLabel, setAudioLabel] = useState<string | null>(null);
56+
const [audioUrl, setAudioUrl] = useState('');
57+
const [isFetchingAudio, setIsFetchingAudio] = useState(false);
58+
const [isRecording, setIsRecording] = useState(false);
59+
const [hasMicPermission, setHasMicPermission] = useState(false);
60+
const recorder = useRef(new AudioRecorder());
61+
const recordChunks = useRef<Float32Array[]>([]);
62+
5063
const [error, setError] = useState<string | null>(null);
5164

5265
const vlm = useLLM({
@@ -68,6 +81,87 @@ function MultimodalLLMScreen() {
6881
if (vlm.error) setError(String(vlm.error));
6982
}, [vlm.error]);
7083

84+
// On mount: configure the iOS audio session for simultaneous playback and
// recording, then ask the user for microphone access. The result is mirrored
// into `hasMicPermission`, which `startRecording` checks before recording.
useEffect(() => {
  AudioManager.setAudioSessionOptions({
    iosCategory: 'playAndRecord',
    iosMode: 'spokenAudio',
    iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
  });

  // The permission prompt can outlive the screen; this flag keeps the async
  // continuation from touching state after the component has unmounted.
  let cancelled = false;

  const requestMicPermission = async () => {
    try {
      const status = await AudioManager.requestRecordingPermissions();
      if (!cancelled) {
        setHasMicPermission(status === 'Granted');
      }
    } catch (e) {
      // A rejected permission request is reported in the UI rather than being
      // left as an unhandled promise rejection.
      if (!cancelled) {
        setHasMicPermission(false);
        setError(e instanceof Error ? e.message : String(e));
      }
    }
  };
  requestMicPermission();

  return () => {
    cancelled = true;
  };
}, []);
95+
96+
/**
 * Decodes the audio behind the URL currently typed into the "Audio URL" input
 * and stages its first channel as the pending audio attachment.
 *
 * - No-op when the trimmed URL is empty.
 * - Toggles `isFetchingAudio` for the duration of the call.
 * - On success sets `audioBuffer` and a "<file name> · <seconds>s" label.
 * - On failure the error message is surfaced through `setError`.
 *
 * The AudioContext is created at 16 kHz and the duration in the label is
 * computed with that same rate.
 * NOTE(review): only channel 0 is used; multi-channel files are not downmixed.
 */
const loadAudioFromUrl = async () => {
  const url = audioUrl.trim();
  if (!url) return;

  const sampleRate = 16000;
  // Declared outside `try` so `finally` can release it on every exit path.
  let ctx: AudioContext | null = null;

  setIsFetchingAudio(true);
  try {
    ctx = new AudioContext({ sampleRate });
    const decoded = await ctx.decodeAudioData(url);

    // Copy the samples (as the recording path does) so the attachment does not
    // depend on memory owned by the decoded buffer once the context is closed.
    const pcm = new Float32Array(decoded.getChannelData(0));

    // Drop any query string / fragment before taking the last path segment so
    // the label shows "clip.mp3" rather than "clip.mp3?token=…".
    const path = url.split(/[?#]/, 1)[0] ?? url;
    const name = path.split('/').pop() || 'audio';

    setAudioBuffer(pcm);
    setAudioLabel(`${name} · ${(pcm.length / sampleRate).toFixed(1)}s`);
  } catch (e) {
    setError(e instanceof Error ? e.message : String(e));
  } finally {
    setIsFetchingAudio(false);
    // A fresh context is created per load, so release it here. Closing is
    // best-effort: a failure to close must not mask the decode result/error.
    try {
      await ctx?.close();
    } catch {
      // Intentionally ignored — nothing actionable for the user.
    }
  }
};
113+
114+
/**
 * Begins capturing microphone audio into `recordChunks`.
 *
 * Refuses to start (and reports an error) when microphone permission has not
 * been granted. Otherwise it discards previously collected chunks, registers
 * a mono 16 kHz audio callback on the recorder, activates the audio session
 * and starts the recorder. `isRecording` is flipped to true only once the
 * recorder reports a successful start; every failure is surfaced via
 * `setError`.
 */
const startRecording = async () => {
  if (!hasMicPermission) {
    setError('Microphone permission denied. Please enable it in Settings.');
    return;
  }

  // Start from a clean slate; chunks are appended by the callback below.
  recordChunks.current = [];

  const sampleRate = 16000;
  const captureOptions = {
    sampleRate,
    bufferLength: 0.1 * sampleRate,
    channelCount: 1,
  };
  recorder.current.onAudioReady(captureOptions, ({ buffer }) => {
    // Copy the channel data before storing it, one chunk per callback.
    const chunk = new Float32Array(buffer.getChannelData(0));
    recordChunks.current.push(chunk);
  });

  try {
    const sessionActive = await AudioManager.setAudioSessionActivity(true);
    if (!sessionActive) {
      setError('Cannot start audio session');
      return;
    }

    const startResult = recorder.current.start();
    if (startResult.status === 'error') {
      setError(`Recording problems: ${startResult.message}`);
      return;
    }

    setIsRecording(true);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    setError(message);
  }
};
143+
144+
/**
 * Stops the recorder and turns the collected chunks into a single PCM buffer
 * that becomes the pending audio attachment.
 *
 * - Always stops the recorder, clears `isRecording` and empties the chunk list.
 * - Releases the audio session that `startRecording` activated, so the app
 *   does not keep holding it after the recording ends.
 * - When nothing was captured, no attachment is created.
 * - The label duration assumes the 16 kHz rate requested in `startRecording`.
 */
const stopRecording = () => {
  recorder.current.stop();
  setIsRecording(false);

  // Hand the audio session back; a failure here is reported but does not
  // prevent the captured audio from being attached.
  AudioManager.setAudioSessionActivity(false).catch((e: unknown) => {
    setError(e instanceof Error ? e.message : String(e));
  });

  // Take ownership of the chunks and reset the shared list up front so it is
  // cleared on every path, including the empty-recording early return.
  const chunks = recordChunks.current;
  recordChunks.current = [];

  const total = chunks.reduce((n, c) => n + c.length, 0);
  if (total === 0) return;

  // Concatenate all chunks into one contiguous Float32Array.
  const pcm = new Float32Array(total);
  let offset = 0;
  for (const chunk of chunks) {
    pcm.set(chunk, offset);
    offset += chunk.length;
  }

  setAudioBuffer(pcm);
  setAudioLabel(`Recording · ${(pcm.length / 16000).toFixed(1)}s`);
};
159+
160+
// Removes the pending audio attachment: resets both the label shown in the
// attachment strip and the PCM samples that would be sent with the message.
const clearAudio = () => {
  setAudioLabel(null);
  setAudioBuffer(null);
};
164+
71165
const pickImage = async () => {
72166
try {
73167
const result = await launchImageLibrary({ mediaType: 'photo' });
@@ -88,12 +182,19 @@ function MultimodalLLMScreen() {
88182
textInputRef.current?.clear();
89183
Keyboard.dismiss();
90184
const currentImageUri = imageUri;
185+
const currentAudio = audioBuffer;
91186
setImageUri(null);
187+
setAudioBuffer(null);
188+
setAudioLabel(null);
92189
try {
93-
await vlm.sendMessage(
94-
text,
95-
currentImageUri ? { imagePath: currentImageUri } : undefined
96-
);
190+
const media =
191+
currentImageUri || currentAudio
192+
? {
193+
...(currentImageUri ? { imagePath: currentImageUri } : {}),
194+
...(currentAudio ? { audioBuffer: currentAudio } : {}),
195+
}
196+
: undefined;
197+
await vlm.sendMessage(text, media);
97198
} catch (e) {
98199
// Updated to set UI error instead of just console.error
99200
setError(e instanceof Error ? e.message : String(e));
@@ -159,6 +260,42 @@ function MultimodalLLMScreen() {
159260
</TouchableOpacity>
160261
)}
161262

263+
{/* Audio URL input */}
264+
<View style={styles.audioUrlRow}>
265+
<TextInput
266+
placeholder="Audio URL (mp3/wav/…)"
267+
placeholderTextColor="#C1C6E5"
268+
style={styles.audioUrlInput}
269+
value={audioUrl}
270+
onChangeText={setAudioUrl}
271+
autoCapitalize="none"
272+
autoCorrect={false}
273+
/>
274+
<TouchableOpacity
275+
style={[
276+
styles.audioUrlButton,
277+
(!audioUrl.trim() || isFetchingAudio || vlm.isGenerating) &&
278+
styles.disabled,
279+
]}
280+
onPress={loadAudioFromUrl}
281+
disabled={!audioUrl.trim() || isFetchingAudio || vlm.isGenerating}
282+
>
283+
<Text style={styles.audioUrlButtonText}>
284+
{isFetchingAudio ? '…' : 'Load'}
285+
</Text>
286+
</TouchableOpacity>
287+
</View>
288+
289+
{/* Audio attachment strip */}
290+
{audioLabel && (
291+
<View style={styles.audioAttachmentContainer}>
292+
<Text style={styles.audioAttachmentText}>🎵 {audioLabel}</Text>
293+
<TouchableOpacity onPress={clearAudio}>
294+
<Text style={styles.audioAttachmentClear}></Text>
295+
</TouchableOpacity>
296+
</View>
297+
)}
298+
162299
<StatsBar stats={stats} />
163300
<View
164301
style={[
@@ -178,6 +315,17 @@ function MultimodalLLMScreen() {
178315
<Text style={styles.imageButtonText}>📷</Text>
179316
</TouchableOpacity>
180317

318+
{/* Mic record / stop button */}
319+
<TouchableOpacity
320+
style={styles.imageButton}
321+
onPress={isRecording ? stopRecording : startRecording}
322+
disabled={vlm.isGenerating}
323+
>
324+
<Text style={styles.imageButtonText}>
325+
{isRecording ? '⏹️' : '🎤'}
326+
</Text>
327+
</TouchableOpacity>
328+
181329
<TextInput
182330
autoCorrect={false}
183331
ref={textInputRef}
@@ -319,6 +467,64 @@ const styles = StyleSheet.create({
319467
fontFamily: 'regular',
320468
color: ColorPalette.blueDark,
321469
},
470+
audioAttachmentContainer: {
471+
flexDirection: 'row',
472+
alignItems: 'center',
473+
justifyContent: 'space-between',
474+
paddingHorizontal: 16,
475+
paddingVertical: 8,
476+
marginHorizontal: 16,
477+
marginBottom: 4,
478+
borderRadius: 8,
479+
borderWidth: 1,
480+
borderColor: ColorPalette.blueLight,
481+
backgroundColor: '#fafbff',
482+
},
483+
audioAttachmentText: {
484+
fontSize: 13,
485+
fontFamily: 'regular',
486+
color: ColorPalette.blueDark,
487+
},
488+
audioAttachmentClear: {
489+
fontSize: 16,
490+
color: ColorPalette.blueDark,
491+
paddingHorizontal: 8,
492+
},
493+
audioUrlRow: {
494+
flexDirection: 'row',
495+
alignItems: 'center',
496+
marginHorizontal: 16,
497+
marginBottom: 4,
498+
},
499+
audioUrlInput: {
500+
flex: 1,
501+
padding: 10,
502+
borderTopLeftRadius: 8,
503+
borderBottomLeftRadius: 8,
504+
borderWidth: 1,
505+
borderColor: ColorPalette.blueLight,
506+
borderRightWidth: 0,
507+
fontFamily: 'regular',
508+
fontSize: 13,
509+
color: ColorPalette.primary,
510+
},
511+
audioUrlButton: {
512+
paddingVertical: 10,
513+
paddingHorizontal: 16,
514+
backgroundColor: ColorPalette.strongPrimary,
515+
borderTopRightRadius: 8,
516+
borderBottomRightRadius: 8,
517+
justifyContent: 'center',
518+
alignItems: 'center',
519+
},
520+
audioUrlButtonText: {
521+
color: '#fff',
522+
fontFamily: 'medium',
523+
fontSize: 13,
524+
},
525+
disabled: {
526+
opacity: 0.5,
527+
},
322528
bottomContainer: {
323529
height: 100,
324530
width: '100%',

apps/llm/components/llmModels.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ export const LLM_MODELS: ModelOption<LLMModelSources>[] = [
5757
{ label: 'Qwen3 0.6B', value: QWEN3_0_6B },
5858
{ label: 'Qwen3 0.6B Quantized', value: QWEN3_0_6B_QUANTIZED },
5959
{ label: 'Qwen3 1.7B', value: QWEN3_1_7B },
60+
{ label: 'Gemma4 e2b Quantized', value: GEMMA4_E2B_QUANTIZED },
6061
{ label: 'Qwen3 1.7B Quantized', value: QWEN3_1_7B_QUANTIZED },
6162
{ label: 'Qwen3 4B', value: QWEN3_4B },
6263
{ label: 'Qwen3 4B Quantized', value: QWEN3_4B_QUANTIZED },

packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,24 @@ inline std::vector<float> getValue<std::vector<float>>(const jsi::Value &val,
223223
return getArrayAsVector<float>(val, runtime);
224224
}
225225

226+
// JS side passes an Array<Float32Array> (one clip per element). Each inner
227+
// element is read as a typed-array span and copied into a std::vector<float>.
228+
template <>
229+
inline std::vector<std::vector<float>>
230+
getValue<std::vector<std::vector<float>>>(const jsi::Value &val,
231+
jsi::Runtime &runtime) {
232+
jsi::Array array = val.asObject(runtime).asArray(runtime);
233+
const size_t length = array.size(runtime);
234+
std::vector<std::vector<float>> result;
235+
result.reserve(length);
236+
for (size_t i = 0; i < length; ++i) {
237+
auto span =
238+
getTypedArrayAsSpan<float>(array.getValueAtIndex(runtime, i), runtime);
239+
result.emplace_back(span.begin(), span.end());
240+
}
241+
return result;
242+
}
243+
226244
template <>
227245
inline std::vector<int64_t>
228246
getValue<std::vector<int64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {

packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,11 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
166166
promiseHostFunction<&Model::generateMultimodal>,
167167
"generateMultimodal"));
168168

169+
addFunctions(JSI_EXPORT_FUNCTION(
170+
ModelHostObject<Model>,
171+
promiseHostFunction<&Model::generateMultimodalWithAudio>,
172+
"generateMultimodalWithAudio"));
173+
169174
addFunctions(JSI_EXPORT_FUNCTION(
170175
ModelHostObject<Model>,
171176
synchronousHostFunction<&Model::getVisualTokenCount>,

0 commit comments

Comments
 (0)