software-mansion
diff --git a/‎apps/llm/app/llm/index.tsx‎
Lines changed: 3 additions & 4 deletions b/‎apps/llm/app/llm/index.tsx‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎apps/llm/app/multimodal_llm/index.tsx‎
Lines changed: 211 additions & 5 deletions b/‎apps/llm/app/multimodal_llm/index.tsx‎
Lines changed: 211 additions & 5 deletions
diff --git a/‎apps/llm/components/llmModels.ts‎
Lines changed: 1 addition & 0 deletions b/‎apps/llm/components/llmModels.ts‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h‎
Lines changed: 18 additions & 0 deletions b/‎packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h‎
Lines changed: 5 additions & 0 deletions b/‎packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h‎
Lines changed: 5 additions & 0 deletions
@@ -11,7 +11,7 @@ import {
   View,
 } from 'react-native';
 import SendIcon from '../../assets/icons/send_icon.svg';
-import { useLLM, LLAMA3_2_1B_SPINQUANT } from 'react-native-executorch';
+import { useLLM, QWEN3_0_6B_QUANTIZED } from 'react-native-executorch';
 import { ModelPicker } from '../../components/ModelPicker';
 import { LLM_MODELS, LLMModelSources } from '../../components/llmModels';
 import PauseIcon from '../../assets/icons/pause_icon.svg';
@@ -42,9 +42,8 @@ function LLMScreen() {
   const { bottom } = useSafeAreaInsets();
   const [isTextInputFocused, setIsTextInputFocused] = useState(false);
   const [userInput, setUserInput] = useState('');
-  const [selectedModel, setSelectedModel] = useState<LLMModelSources>(
-    LLAMA3_2_1B_SPINQUANT
-  );
+  const [selectedModel, setSelectedModel] =
+    useState<LLMModelSources>(QWEN3_0_6B_QUANTIZED);
   const textInputRef = useRef<TextInput>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
 
@@ -12,6 +12,11 @@ import {
   View,
 } from 'react-native';
 import { launchImageLibrary } from 'react-native-image-picker';
+import {
+  AudioManager,
+  AudioRecorder,
+  AudioContext,
+} from 'react-native-audio-api';
 import { useIsFocused } from '@react-navigation/native';
 import { useSafeAreaInsets } from 'react-native-safe-area-context';
 import { useLLM, LFM2_5_VL_1_6B_QUANTIZED } from 'react-native-executorch';
@@ -46,7 +51,15 @@ function MultimodalLLMScreen() {
   const textInputRef = useRef<TextInput>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
-  // Added error state
+  const [audioBuffer, setAudioBuffer] = useState<Float32Array | null>(null);
+  const [audioLabel, setAudioLabel] = useState<string | null>(null);
+  const [audioUrl, setAudioUrl] = useState('');
+  const [isFetchingAudio, setIsFetchingAudio] = useState(false);
+  const [isRecording, setIsRecording] = useState(false);
+  const [hasMicPermission, setHasMicPermission] = useState(false);
+  const recorder = useRef(new AudioRecorder());
+  const recordChunks = useRef<Float32Array[]>([]);
+
   const [error, setError] = useState<string | null>(null);
 
   const vlm = useLLM({
@@ -68,6 +81,87 @@ function MultimodalLLMScreen() {
     if (vlm.error) setError(String(vlm.error));
   }, [vlm.error]);
 
+  // One-time audio setup on mount: configures the audio session (iOS
+  // 'playAndRecord' category, 'spokenAudio' mode) and then asks for
+  // microphone permission, recording the outcome in `hasMicPermission`.
+  useEffect(() => {
+    // Set by the cleanup below so a permission prompt that resolves after the
+    // screen is gone does not update state on an unmounted component.
+    let cancelled = false;
+    AudioManager.setAudioSessionOptions({
+      iosCategory: 'playAndRecord',
+      iosMode: 'spokenAudio',
+      iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
+    });
+    (async () => {
+      try {
+        const status = await AudioManager.requestRecordingPermissions();
+        if (!cancelled) setHasMicPermission(status === 'Granted');
+      } catch (e) {
+        // Without this catch a failed request would be an unhandled promise
+        // rejection; report it through the screen's error state instead.
+        if (!cancelled) setError(e instanceof Error ? e.message : String(e));
+      }
+    })();
+    return () => {
+      cancelled = true;
+    };
+  }, []);
+
+  /**
+   * Decodes the audio at the URL typed by the user and attaches the first
+   * channel of the result as the pending audio clip.
+   *
+   * Does nothing for a blank URL. Failures are reported through `setError`,
+   * and the "fetching" flag is reset whether decoding succeeds or fails.
+   */
+  const loadAudioFromUrl = async () => {
+    const url = audioUrl.trim();
+    if (!url) return;
+    // Single source of truth for both the decode context and the duration
+    // shown in the label.
+    const sampleRate = 16000;
+    setIsFetchingAudio(true);
+    let ctx: AudioContext | undefined;
+    try {
+      ctx = new AudioContext({ sampleRate });
+      const decoded = await ctx.decodeAudioData(url);
+      // Copy the samples so the attached clip owns its data and does not
+      // depend on the decoder's buffer once the context is closed below.
+      const pcm = new Float32Array(decoded.getChannelData(0));
+      // Drop any query string / fragment so the label shows only the file name.
+      const [path = url] = url.split(/[?#]/);
+      const name = path.split('/').pop() || 'audio';
+      setAudioBuffer(pcm);
+      setAudioLabel(`${name} · ${(pcm.length / sampleRate).toFixed(1)}s`);
+    } catch (e) {
+      setError(e instanceof Error ? e.message : String(e));
+    } finally {
+      setIsFetchingAudio(false);
+      // Release the context created for this decode so repeated loads do not
+      // accumulate live contexts. A failure to close is not actionable for the
+      // user, so it is deliberately ignored.
+      void ctx?.close().catch(() => {});
+    }
+  };
+
+  /**
+   * Starts capturing microphone audio into `recordChunks`.
+   *
+   * Bails out with a UI error when microphone permission is missing, when the
+   * audio session cannot be activated, or when the recorder reports an error
+   * on start. `isRecording` is only set once the recorder has started.
+   */
+  const startRecording = async () => {
+    if (!hasMicPermission) {
+      setError('Microphone permission denied. Please enable it in Settings.');
+      return;
+    }
+
+    const sampleRate = 16000;
+    // Start from an empty chunk list so a previous take cannot leak into
+    // this one.
+    recordChunks.current = [];
+    recorder.current.onAudioReady(
+      // bufferLength is a tenth of the sample rate — presumably ~100 ms of
+      // samples per callback; confirm against the recorder's documentation.
+      { sampleRate, bufferLength: 0.1 * sampleRate, channelCount: 1 },
+      (event) => {
+        // `new Float32Array(typedArray)` copies, so each stored chunk is
+        // independent of the buffer handed to this callback.
+        const samples = event.buffer.getChannelData(0);
+        recordChunks.current.push(new Float32Array(samples));
+      }
+    );
+
+    try {
+      const sessionActive = await AudioManager.setAudioSessionActivity(true);
+      if (!sessionActive) {
+        setError('Cannot start audio session');
+        return;
+      }
+
+      const startResult = recorder.current.start();
+      if (startResult.status === 'error') {
+        setError(`Recording problems: ${startResult.message}`);
+        return;
+      }
+
+      setIsRecording(true);
+    } catch (e) {
+      setError(e instanceof Error ? e.message : String(e));
+    }
+  };
+
+  /**
+   * Stops the recorder and, if any samples were captured, joins the recorded
+   * chunks into one Float32Array that becomes the pending audio clip.
+   *
+   * When nothing was captured, no clip or label is set.
+   */
+  const stopRecording = () => {
+    recorder.current.stop();
+    setIsRecording(false);
+
+    const chunks = recordChunks.current;
+    let sampleCount = 0;
+    for (const chunk of chunks) {
+      sampleCount += chunk.length;
+    }
+    if (sampleCount === 0) return;
+
+    // Lay the chunks end to end, threading the write offset through the fold.
+    const merged = new Float32Array(sampleCount);
+    chunks.reduce((offset, chunk) => {
+      merged.set(chunk, offset);
+      return offset + chunk.length;
+    }, 0);
+
+    recordChunks.current = [];
+    setAudioBuffer(merged);
+    // Duration assumes the 16 kHz rate requested when recording was started.
+    setAudioLabel(`Recording · ${(merged.length / 16000).toFixed(1)}s`);
+  };
+
+  // Detaches the pending audio clip by resetting both its label and its
+  // sample buffer to null.
+  const clearAudio = () => {
+    setAudioLabel(null);
+    setAudioBuffer(null);
+  };
+
   const pickImage = async () => {
     try {
       const result = await launchImageLibrary({ mediaType: 'photo' });
@@ -88,12 +182,19 @@ function MultimodalLLMScreen() {
     textInputRef.current?.clear();
     Keyboard.dismiss();
     const currentImageUri = imageUri;
+    const currentAudio = audioBuffer;
     setImageUri(null);
+    setAudioBuffer(null);
+    setAudioLabel(null);
     try {
-      await vlm.sendMessage(
-        text,
-        currentImageUri ? { imagePath: currentImageUri } : undefined
-      );
+      const media =
+        currentImageUri || currentAudio
+          ? {
+              ...(currentImageUri ? { imagePath: currentImageUri } : {}),
+              ...(currentAudio ? { audioBuffer: currentAudio } : {}),
+            }
+          : undefined;
+      await vlm.sendMessage(text, media);
     } catch (e) {
       // Updated to set UI error instead of just console.error
       setError(e instanceof Error ? e.message : String(e));
@@ -159,6 +260,42 @@ function MultimodalLLMScreen() {
             </TouchableOpacity>
           )}
 
+          {/* Audio URL input */}
+          <View style={styles.audioUrlRow}>
+            <TextInput
+              placeholder="Audio URL (mp3/wav/…)"
+              placeholderTextColor="#C1C6E5"
+              style={styles.audioUrlInput}
+              value={audioUrl}
+              onChangeText={setAudioUrl}
+              autoCapitalize="none"
+              autoCorrect={false}
+            />
+            <TouchableOpacity
+              style={[
+                styles.audioUrlButton,
+                (!audioUrl.trim() || isFetchingAudio || vlm.isGenerating) &&
+                  styles.disabled,
+              ]}
+              onPress={loadAudioFromUrl}
+              disabled={!audioUrl.trim() || isFetchingAudio || vlm.isGenerating}
+            >
+              <Text style={styles.audioUrlButtonText}>
+                {isFetchingAudio ? '…' : 'Load'}
+              </Text>
+            </TouchableOpacity>
+          </View>
+
+          {/* Audio attachment strip */}
+          {audioLabel && (
+            <View style={styles.audioAttachmentContainer}>
+              <Text style={styles.audioAttachmentText}>🎵 {audioLabel}</Text>
+              <TouchableOpacity onPress={clearAudio}>
+                <Text style={styles.audioAttachmentClear}>✕</Text>
+              </TouchableOpacity>
+            </View>
+          )}
+
           <StatsBar stats={stats} />
           <View
             style={[
@@ -178,6 +315,17 @@ function MultimodalLLMScreen() {
               <Text style={styles.imageButtonText}>📷</Text>
             </TouchableOpacity>
 
+            {/* Mic record / stop button */}
+            <TouchableOpacity
+              style={styles.imageButton}
+              onPress={isRecording ? stopRecording : startRecording}
+              disabled={vlm.isGenerating}
+            >
+              <Text style={styles.imageButtonText}>
+                {isRecording ? '⏹️' : '🎤'}
+              </Text>
+            </TouchableOpacity>
+
             <TextInput
               autoCorrect={false}
               ref={textInputRef}
@@ -319,6 +467,64 @@ const styles = StyleSheet.create({
     fontFamily: 'regular',
     color: ColorPalette.blueDark,
   },
+  audioAttachmentContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    justifyContent: 'space-between',
+    paddingHorizontal: 16,
+    paddingVertical: 8,
+    marginHorizontal: 16,
+    marginBottom: 4,
+    borderRadius: 8,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    backgroundColor: '#fafbff',
+  },
+  audioAttachmentText: {
+    fontSize: 13,
+    fontFamily: 'regular',
+    color: ColorPalette.blueDark,
+  },
+  audioAttachmentClear: {
+    fontSize: 16,
+    color: ColorPalette.blueDark,
+    paddingHorizontal: 8,
+  },
+  audioUrlRow: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginHorizontal: 16,
+    marginBottom: 4,
+  },
+  audioUrlInput: {
+    flex: 1,
+    padding: 10,
+    borderTopLeftRadius: 8,
+    borderBottomLeftRadius: 8,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    borderRightWidth: 0,
+    fontFamily: 'regular',
+    fontSize: 13,
+    color: ColorPalette.primary,
+  },
+  audioUrlButton: {
+    paddingVertical: 10,
+    paddingHorizontal: 16,
+    backgroundColor: ColorPalette.strongPrimary,
+    borderTopRightRadius: 8,
+    borderBottomRightRadius: 8,
+    justifyContent: 'center',
+    alignItems: 'center',
+  },
+  audioUrlButtonText: {
+    color: '#fff',
+    fontFamily: 'medium',
+    fontSize: 13,
+  },
+  disabled: {
+    opacity: 0.5,
+  },
   bottomContainer: {
     height: 100,
     width: '100%',
 
@@ -57,6 +57,7 @@ export const LLM_MODELS: ModelOption<LLMModelSources>[] = [
   { label: 'Qwen3 0.6B', value: QWEN3_0_6B },
   { label: 'Qwen3 0.6B Quantized', value: QWEN3_0_6B_QUANTIZED },
   { label: 'Qwen3 1.7B', value: QWEN3_1_7B },
+  { label: 'Gemma4 e2b Quantized', value: GEMMA4_E2B_QUANTIZED },
   { label: 'Qwen3 1.7B Quantized', value: QWEN3_1_7B_QUANTIZED },
   { label: 'Qwen3 4B', value: QWEN3_4B },
   { label: 'Qwen3 4B Quantized', value: QWEN3_4B_QUANTIZED },
 
@@ -223,6 +223,24 @@ inline std::vector<float> getValue<std::vector<float>>(const jsi::Value &val,
   return getArrayAsVector<float>(val, runtime);
 }
 
+// Converts a JS Array<Float32Array> (one clip per element) into a vector of
+// float vectors. Each element is viewed as a typed-array span and its samples
+// are copied into a freshly constructed std::vector<float>.
+template <>
+inline std::vector<std::vector<float>>
+getValue<std::vector<std::vector<float>>>(const jsi::Value &val,
+                                          jsi::Runtime &runtime) {
+  jsi::Array clips = val.asObject(runtime).asArray(runtime);
+  const size_t clipCount = clips.size(runtime);
+
+  std::vector<std::vector<float>> clipVectors;
+  // Reserve once up front so appending the clips does not reallocate.
+  clipVectors.reserve(clipCount);
+
+  for (size_t idx = 0; idx != clipCount; ++idx) {
+    auto samples =
+        getTypedArrayAsSpan<float>(clips.getValueAtIndex(runtime, idx), runtime);
+    clipVectors.emplace_back(samples.begin(), samples.end());
+  }
+  return clipVectors;
+}
+
 template <>
 inline std::vector<int64_t>
 getValue<std::vector<int64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
 
@@ -166,6 +166,11 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
                               promiseHostFunction<&Model::generateMultimodal>,
                               "generateMultimodal"));
 
+      addFunctions(JSI_EXPORT_FUNCTION(
+          ModelHostObject<Model>,
+          promiseHostFunction<&Model::generateMultimodalWithAudio>,
+          "generateMultimodalWithAudio"));
+
       addFunctions(JSI_EXPORT_FUNCTION(
           ModelHostObject<Model>,
           synchronousHostFunction<&Model::getVisualTokenCount>,