diff --git a/apps/computer-vision/app/classification/index.tsx b/apps/computer-vision/app/classification/index.tsx index c40bc5ec93..97dde1727d 100644 --- a/apps/computer-vision/app/classification/index.tsx +++ b/apps/computer-vision/app/classification/index.tsx @@ -1,6 +1,9 @@ import Spinner from '../../components/Spinner'; import { getImage } from '../../utils'; -import { useClassification, EFFICIENTNET_V2_S } from 'react-native-executorch'; +import { + useClassification, + EFFICIENTNET_V2_S_QUANTIZED, +} from 'react-native-executorch'; import { View, StyleSheet, Image, Text, ScrollView } from 'react-native'; import { BottomBar } from '../../components/BottomBar'; import React, { useContext, useEffect, useState } from 'react'; @@ -13,7 +16,7 @@ export default function ClassificationScreen() { ); const [imageUri, setImageUri] = useState(''); - const model = useClassification({ model: EFFICIENTNET_V2_S }); + const model = useClassification({ model: EFFICIENTNET_V2_S_QUANTIZED }); const { setGlobalGenerating } = useContext(GeneratingContext); useEffect(() => { setGlobalGenerating(model.isGenerating); diff --git a/apps/computer-vision/app/semantic_segmentation/index.tsx b/apps/computer-vision/app/semantic_segmentation/index.tsx index f0b3f0688c..5ecb22ea5f 100644 --- a/apps/computer-vision/app/semantic_segmentation/index.tsx +++ b/apps/computer-vision/app/semantic_segmentation/index.tsx @@ -2,7 +2,7 @@ import Spinner from '../../components/Spinner'; import { BottomBar } from '../../components/BottomBar'; import { getImage } from '../../utils'; import { - DEEPLAB_V3_RESNET50, + DEEPLAB_V3_MOBILENET_V3_LARGE_QUANTIZED, useSemanticSegmentation, } from 'react-native-executorch'; import { @@ -46,7 +46,7 @@ export default function SemanticSegmentationScreen() { const { setGlobalGenerating } = useContext(GeneratingContext); const { isReady, isGenerating, downloadProgress, forward } = useSemanticSegmentation({ - model: DEEPLAB_V3_RESNET50, + model: 
DEEPLAB_V3_MOBILENET_V3_LARGE_QUANTIZED, }); const [imageUri, setImageUri] = useState(''); const [imageSize, setImageSize] = useState({ width: 0, height: 0 }); diff --git a/apps/computer-vision/app/style_transfer/index.tsx b/apps/computer-vision/app/style_transfer/index.tsx index a1b3a7834d..dc6a0d4963 100644 --- a/apps/computer-vision/app/style_transfer/index.tsx +++ b/apps/computer-vision/app/style_transfer/index.tsx @@ -3,7 +3,7 @@ import { BottomBar } from '../../components/BottomBar'; import { getImage } from '../../utils'; import { useStyleTransfer, - STYLE_TRANSFER_CANDY, + STYLE_TRANSFER_CANDY_QUANTIZED, } from 'react-native-executorch'; import { View, StyleSheet, Image } from 'react-native'; import React, { useContext, useEffect, useState } from 'react'; @@ -11,7 +11,7 @@ import { GeneratingContext } from '../../context'; import ScreenWrapper from '../../ScreenWrapper'; export default function StyleTransferScreen() { - const model = useStyleTransfer({ model: STYLE_TRANSFER_CANDY }); + const model = useStyleTransfer({ model: STYLE_TRANSFER_CANDY_QUANTIZED }); const { setGlobalGenerating } = useContext(GeneratingContext); useEffect(() => { setGlobalGenerating(model.isGenerating); diff --git a/apps/text-embeddings/app/clip-embeddings/index.tsx b/apps/text-embeddings/app/clip-embeddings/index.tsx index 66ca348757..4ff3c895de 100644 --- a/apps/text-embeddings/app/clip-embeddings/index.tsx +++ b/apps/text-embeddings/app/clip-embeddings/index.tsx @@ -15,7 +15,7 @@ import { useTextEmbeddings, useImageEmbeddings, CLIP_VIT_BASE_PATCH32_TEXT, - CLIP_VIT_BASE_PATCH32_IMAGE, + CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, } from 'react-native-executorch'; import { launchImageLibrary } from 'react-native-image-picker'; import { useIsFocused } from '@react-navigation/native'; @@ -29,7 +29,9 @@ export default function ClipEmbeddingsScreenWrapper() { function ClipEmbeddingsScreen() { const textModel = useTextEmbeddings({ model: CLIP_VIT_BASE_PATCH32_TEXT }); - const imageModel = 
useImageEmbeddings({ model: CLIP_VIT_BASE_PATCH32_IMAGE }); + const imageModel = useImageEmbeddings({ + model: CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED, + }); const [inputSentence, setInputSentence] = useState(''); const [sentencesWithEmbeddings, setSentencesWithEmbeddings] = useState< diff --git a/docs/docs/02-benchmarks/inference-time.md b/docs/docs/02-benchmarks/inference-time.md index a1580169ac..7e43d7d8a6 100644 --- a/docs/docs/02-benchmarks/inference-time.md +++ b/docs/docs/02-benchmarks/inference-time.md @@ -3,29 +3,84 @@ title: Inference Time --- :::warning -Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. +Times presented in the tables are measured as consecutive runs of the model. +Initial run times may be up to 2x longer due to model loading and +initialization. ::: ## Classification -| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| ----------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| EFFICIENTNET_V2_S | 64 | 68 | 217 | 205 | 198 | +:::info +Inference times are measured directly from native C++ code, wrapping only the +model's forward pass, excluding input-dependent pre- and post-processing (e.g. +image resizing, normalization) and any overhead from the React Native runtime. +::: + +:::info +For this model, all input images, whether larger or smaller, are resized before +processing. Resizing is typically fast for small images but may be noticeably +slower for very large images, which can increase total time. 
+::: + +| Model / Device | iPhone 17 Pro [ms] | Google Pixel 10 [ms] | +| :------------------------------- | :----------------: | :------------------: | +| EFFICIENTNET_V2_S (XNNPACK FP32) | 70 | 100 | +| EFFICIENTNET_V2_S (XNNPACK INT8) | 22 | 38 | +| EFFICIENTNET_V2_S (Core ML FP32) | 12 | - | +| EFFICIENTNET_V2_S (Core ML FP16) | 5 | - | ## Object Detection -| Model | iPhone 17 Pro (XNNPACK) [ms] | iPhone 16 Pro (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| ------------------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 71 | 74 | 257 | 115 | 109 | +:::info +Inference times are measured directly from native C++ code, wrapping only the +model's forward pass, excluding input-dependent pre- and post-processing (e.g. +image resizing, normalization) and any overhead from the React Native runtime. +::: + +:::info +For this model, all input images, whether larger or smaller, are resized before +processing. Resizing is typically fast for small images but may be noticeably +slower for very large images, which can increase total time. 
+::: + +| Model / Device | iPhone 17 Pro [ms] | Google Pixel 10 [ms] | +| :-------------------------------------------- | :----------------: | :------------------: | +| SSDLITE_320_MOBILENET_V3_LARGE (XNNPACK FP32) | 20 | 18 | +| SSDLITE_320_MOBILENET_V3_LARGE (Core ML FP32) | 18 | - | +| SSDLITE_320_MOBILENET_V3_LARGE (Core ML FP16) | 8 | - | ## Style Transfer -| Model | iPhone 17 Pro (Core ML) [ms] | iPhone 16 Pro (Core ML) [ms] | iPhone SE 3 (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| ---------------------------- | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: | -| STYLE_TRANSFER_CANDY | 1400 | 1485 | 4255 | 2510 | 2355 | -| STYLE_TRANSFER_MOSAIC | 1400 | 1485 | 4255 | 2510 | 2355 | -| STYLE_TRANSFER_UDNIE | 1400 | 1485 | 4255 | 2510 | 2355 | -| STYLE_TRANSFER_RAIN_PRINCESS | 1400 | 1485 | 4255 | 2510 | 2355 | +:::info +Inference times are measured directly from native C++ code, wrapping only the +model's forward pass, excluding input-dependent pre- and post-processing (e.g. +image resizing, normalization) and any overhead from the React Native runtime. +::: + +:::info +For this model, all input images, whether larger or smaller, are resized before +processing. Resizing is typically fast for small images but may be noticeably +slower for very large images, which can increase total time. 
+::: + +| Model / Device | iPhone 17 Pro [ms] | Google Pixel 10 [ms] | +| :------------------------------------------ | :----------------: | :------------------: | +| STYLE_TRANSFER_CANDY (XNNPACK FP32) | 1192 | 1025 | +| STYLE_TRANSFER_CANDY (XNNPACK INT8) | 272 | 430 | +| STYLE_TRANSFER_CANDY (Core ML FP32) | 100 | - | +| STYLE_TRANSFER_CANDY (Core ML FP16) | 150 | - | +| STYLE_TRANSFER_MOSAIC (XNNPACK FP32) | 1192 | 1025 | +| STYLE_TRANSFER_MOSAIC (XNNPACK INT8) | 272 | 430 | +| STYLE_TRANSFER_MOSAIC (Core ML FP32) | 100 | - | +| STYLE_TRANSFER_MOSAIC (Core ML FP16) | 150 | - | +| STYLE_TRANSFER_UDNIE (XNNPACK FP32) | 1192 | 1025 | +| STYLE_TRANSFER_UDNIE (XNNPACK INT8) | 272 | 430 | +| STYLE_TRANSFER_UDNIE (Core ML FP32) | 100 | - | +| STYLE_TRANSFER_UDNIE (Core ML FP16) | 150 | - | +| STYLE_TRANSFER_RAIN_PRINCESS (XNNPACK FP32) | 1192 | 1025 | +| STYLE_TRANSFER_RAIN_PRINCESS (XNNPACK INT8) | 272 | 430 | +| STYLE_TRANSFER_RAIN_PRINCESS (Core ML FP32) | 100 | - | +| STYLE_TRANSFER_RAIN_PRINCESS (Core ML FP16) | 150 | - | ## OCR @@ -109,23 +164,51 @@ Benchmark times for text embeddings are highly dependent on the sentence length. ## Image Embeddings -| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] | -| --------------------------- | :--------------------------: | :-----------------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 18 | 55 | +:::info +Inference times are measured directly from native C++ code, wrapping only the +model's forward pass, excluding input-dependent pre- and post-processing (e.g. +image resizing, normalization) and any overhead from the React Native runtime. +::: :::info -Image embedding benchmark times are measured using 224×224 pixel images, as required by the model. All input images, whether larger or smaller, are resized to 224×224 before processing. Resizing is typically fast for small images but may be noticeably slower for very large images, which can increase total inference time. 
+For this model, all input images, whether larger or smaller, are resized before +processing. Resizing is typically fast for small images but may be noticeably +slower for very large images, which can increase total time. ::: +| Model / Device | iPhone 17 Pro [ms] | Google Pixel 10 [ms] | +| :----------------------------------------- | :----------------: | :------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE (XNNPACK FP32) | 14 | 68 | +| CLIP_VIT_BASE_PATCH32_IMAGE (XNNPACK INT8) | 11 | 31 | + ## Semantic Segmentation -:::warning -Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. +:::info +Inference times are measured directly from native C++ code, wrapping only the +model's forward pass, excluding input-dependent pre- and post-processing (e.g. +image resizing, normalization) and any overhead from the React Native runtime. +::: + +:::info +For this model, all input images, whether larger or smaller, are resized before +processing. Resizing is typically fast for small images but may be noticeably +slower for very large images, which can increase total time. 
::: -| Model | iPhone 16 Pro (Core ML) [ms] | iPhone 14 Pro Max (Core ML) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | -| ----------------- | ---------------------------- | -------------------------------- | --------------------------------- | -| DEELABV3_RESNET50 | 1000 | 670 | 700 | +| Model / Device | iPhone 17 Pro [ms] | Google Pixel 10 [ms] | +| :------------------------------------------- | :----------------: | :------------------: | +| DEEPLAB_V3_RESNET50 (XNNPACK FP32) | 2000 | 2200 | +| DEEPLAB_V3_RESNET50 (XNNPACK INT8) | 118 | 380 | +| DEEPLAB_V3_RESNET101 (XNNPACK FP32) | 2900 | 3300 | +| DEEPLAB_V3_RESNET101 (XNNPACK INT8) | 174 | 660 | +| DEEPLAB_V3_MOBILENET_V3_LARGE (XNNPACK FP32) | 131 | 153 | +| DEEPLAB_V3_MOBILENET_V3_LARGE (XNNPACK INT8) | 17 | 40 | +| LRASPP_MOBILENET_V3_LARGE (XNNPACK FP32) | 13 | 36 | +| LRASPP_MOBILENET_V3_LARGE (XNNPACK INT8) | 12 | 20 | +| FCN_RESNET50 (XNNPACK FP32) | 1800 | 2160 | +| FCN_RESNET50 (XNNPACK INT8) | 100 | 320 | +| FCN_RESNET101 (XNNPACK FP32) | 2600 | 3160 | +| FCN_RESNET101 (XNNPACK INT8) | 160 | 620 | ## Text to image diff --git a/docs/docs/02-benchmarks/memory-usage.md b/docs/docs/02-benchmarks/memory-usage.md index 2f921cb48e..0dc59f0fc3 100644 --- a/docs/docs/02-benchmarks/memory-usage.md +++ b/docs/docs/02-benchmarks/memory-usage.md @@ -2,45 +2,91 @@ title: Memory Usage --- +## Classification + :::info -All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 (Android). +Memory usage values represent the peak memory increase observed while the model was +loaded and actively running inference, relative to the baseline app memory +before model initialization. 
::: -## Classification - -| Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | -| ----------------- | :--------------------: | :----------------: | -| EFFICIENTNET_V2_S | 230 | 87 | +| Model / Device | iPhone 17 Pro [MB] | Google Pixel 10 [MB] | +| -------------------------------- | :----------------: | :------------------: | +| EFFICIENTNET_V2_S (XNNPACK FP32) | 101 | 122 | +| EFFICIENTNET_V2_S (XNNPACK INT8) | 62 | 78 | +| EFFICIENTNET_V2_S (Core ML FP32) | 101 | - | +| EFFICIENTNET_V2_S (Core ML FP16) | 87 | - | ## Object Detection -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| ------------------------------ | :--------------------: | :----------------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 164 | 132 | +:::info +Memory usage values represent the peak memory increase observed while the model was +loaded and actively running inference, relative to the baseline app memory +before model initialization. +::: + +| Model / Device | iPhone 17 Pro [MB] | Google Pixel 10 [MB] | +| --------------------------------------------- | :----------------: | :------------------: | +| SSDLITE_320_MOBILENET_V3_LARGE (XNNPACK FP32) | 94 | 104 | +| SSDLITE_320_MOBILENET_V3_LARGE (Core ML FP32) | 83 | - | +| SSDLITE_320_MOBILENET_V3_LARGE (Core ML FP16) | 62 | - | ## Style Transfer -| Model | Android (XNNPACK) [MB] | iOS (Core ML) [MB] | -| ---------------------------- | :--------------------: | :----------------: | -| STYLE_TRANSFER_CANDY | 1200 | 380 | -| STYLE_TRANSFER_MOSAIC | 1200 | 380 | -| STYLE_TRANSFER_UDNIE | 1200 | 380 | -| STYLE_TRANSFER_RAIN_PRINCESS | 1200 | 380 | +:::info +Memory usage values represent the peak memory increase observed while the model was +loaded and actively running inference, relative to the baseline app memory +before model initialization. 
+::: + +| Model / Device | iPhone 17 Pro [MB] | Google Pixel 10 [MB] | +| ------------------------------------------- | :----------------: | :------------------: | +| STYLE_TRANSFER_CANDY (XNNPACK FP32) | 1200 | 1200 | +| STYLE_TRANSFER_CANDY (XNNPACK INT8) | 800 | 800 | +| STYLE_TRANSFER_CANDY (Core ML FP32) | 400 | - | +| STYLE_TRANSFER_CANDY (Core ML FP16) | 380 | - | +| STYLE_TRANSFER_MOSAIC (XNNPACK FP32) | 1200 | 1200 | +| STYLE_TRANSFER_MOSAIC (XNNPACK INT8) | 800 | 800 | +| STYLE_TRANSFER_MOSAIC (Core ML FP32) | 400 | - | +| STYLE_TRANSFER_MOSAIC (Core ML FP16) | 380 | - | +| STYLE_TRANSFER_UDNIE (XNNPACK FP32) | 1200 | 1200 | +| STYLE_TRANSFER_UDNIE (XNNPACK INT8) | 800 | 800 | +| STYLE_TRANSFER_UDNIE (Core ML FP32) | 400 | - | +| STYLE_TRANSFER_UDNIE (Core ML FP16) | 380 | - | +| STYLE_TRANSFER_RAIN_PRINCESS (XNNPACK FP32) | 1200 | 1200 | +| STYLE_TRANSFER_RAIN_PRINCESS (XNNPACK INT8) | 800 | 800 | +| STYLE_TRANSFER_RAIN_PRINCESS (Core ML FP32) | 400 | - | +| STYLE_TRANSFER_RAIN_PRINCESS (Core ML FP16) | 380 | - | ## OCR +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 +(Android). +::: + | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------------ | :--------------------: | :----------------: | | Detector (CRAFT) + Recognizer (CRNN) | 1400 | 1320 | ## Vertical OCR +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 +(Android). +::: + | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------------------------------ | :--------------------: | :----------------: | | Detector (CRAFT) + Recognizer (CRNN) | 1000-1600 | 1000-1500 | ## LLMs +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 +(Android). 
+::: + | Model | Android (XNNPACK) [GB] | iOS (XNNPACK) [GB] | | --------------------- | :--------------------: | :----------------: | | LLAMA3_2_1B | 3.3 | 3.1 | @@ -52,12 +98,22 @@ All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 (A ## Speech to text +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 +(Android). +::: + | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------ | :--------------------: | :----------------: | | WHISPER_TINY | 410 | 375 | ## Text to speech +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 +(Android). +::: + | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | ------------- | :--------------------: | :----------------: | | KOKORO_SMALL | 820 | 820 | @@ -69,6 +125,11 @@ The reported memory usage values include the memory footprint of the Phonemis pa ## Text Embeddings +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 +(Android). +::: + | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | -------------------------- | :--------------------: | :----------------: | | ALL_MINILM_L6_V2 | 95 | 110 | @@ -79,14 +140,27 @@ The reported memory usage values include the memory footprint of the Phonemis pa ## Image Embeddings -| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | -| --------------------------- | :--------------------: | :----------------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 345 | 340 | +:::info +Memory usage values represent the peak memory increase observed while the model was +loaded and actively running inference, relative to the baseline app memory +before model initialization. 
+::: + +| Model / Device | iPhone 17 Pro [MB] | Google Pixel 10 [MB] | +| ------------------------------------------ | :----------------: | :------------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE (XNNPACK FP32) | 340 | 345 | ## Semantic Segmentation +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 +(Android). +::: + :::warning -Data presented in the following sections is based on inference with non-resized output. When resize is enabled, expect higher memory usage and inference time with higher resolutions. +Data presented in the following sections is based on inference with non-resized +output. When resize is enabled, expect higher memory usage and inference time +with higher resolutions. ::: | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | @@ -95,6 +169,11 @@ Data presented in the following sections is based on inference with non-resized ## Text to image +:::info +All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 +(Android). 
+::: + | Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | | --------------------- | ---------------------- | ------------------ | | BK_SDM_TINY_VPRED_256 | 2400 | 2400 | diff --git a/docs/docs/02-benchmarks/model-size.md b/docs/docs/02-benchmarks/model-size.md index 7fa1d7f384..8dcfbbf45a 100644 --- a/docs/docs/02-benchmarks/model-size.md +++ b/docs/docs/02-benchmarks/model-size.md @@ -4,24 +4,24 @@ title: Model Size ## Classification -| Model | XNNPACK [MB] | Core ML [MB] | -| ----------------- | :----------: | :----------: | -| EFFICIENTNET_V2_S | 85.6 | 43.9 | +| Model | XNNPACK FP32 [MB] | XNNPACK INT8 [MB] | Core ML FP32 [MB] | Core ML FP16 [MB] | +| :---------------- | :---------------: | :---------------: | :---------------: | :---------------: | +| EFFICIENTNET_V2_S | 85.7 | 22.9 | 86.5 | 43.9 | ## Object Detection -| Model | XNNPACK [MB] | -| ------------------------------ | :----------: | -| SSDLITE_320_MOBILENET_V3_LARGE | 13.9 | +| Model | XNNPACK FP32 [MB] | Core ML FP32 [MB] | Core ML FP16 [MB] | +| ------------------------------ | :---------------: | :---------------: | :---------------: | +| SSDLITE_320_MOBILENET_V3_LARGE | 13.9 | 15.6 | 8.46 | ## Style Transfer -| Model | XNNPACK [MB] | Core ML [MB] | -| ---------------------------- | :----------: | :----------: | -| STYLE_TRANSFER_CANDY | 6.78 | 5.22 | -| STYLE_TRANSFER_MOSAIC | 6.78 | 5.22 | -| STYLE_TRANSFER_UDNIE | 6.78 | 5.22 | -| STYLE_TRANSFER_RAIN_PRINCESS | 6.78 | 5.22 | +| Model | XNNPACK FP32 [MB] | XNNPACK INT8 [MB] | Core ML FP32 [MB] | Core ML FP16 [MB] | +| ---------------------------- | :---------------: | :---------------: | :---------------: | :---------------: | +| STYLE_TRANSFER_CANDY | 6.82 | 1.84 | 7.12 | 3.79 | +| STYLE_TRANSFER_MOSAIC | 6.82 | 1.84 | 7.12 | 3.79 | +| STYLE_TRANSFER_UDNIE | 6.82 | 1.84 | 7.12 | 3.79 | +| STYLE_TRANSFER_RAIN_PRINCESS | 6.82 | 1.84 | 7.12 | 3.79 | ## OCR @@ -82,15 +82,20 @@ title: Model Size ## Image Embeddings -| Model | XNNPACK [MB] | -| 
--------------------------- | :----------: | -| CLIP_VIT_BASE_PATCH32_IMAGE | 352 | +| Model | XNNPACK FP32 [MB] | XNNPACK INT8 [MB] | +| --------------------------- | :---------------: | :---------------: | +| CLIP_VIT_BASE_PATCH32_IMAGE | 352 | 96.4 | ## Semantic Segmentation -| Model | XNNPACK [MB] | -| ----------------- | ------------ | -| DEELABV3_RESNET50 | 168 | +| Model | XNNPACK FP32 [MB] | XNNPACK INT8 [MB] | +| ----------------------------- | :---------------: | :---------------: | +| DEEPLAB_V3_RESNET50 | 168 | 42.4 | +| DEEPLAB_V3_RESNET101 | 244 | 61.7 | +| DEEPLAB_V3_MOBILENET_V3_LARGE | 44.1 | 11.4 | +| LRASPP_MOBILENET_V3_LARGE | 12.9 | 3.53 | +| FCN_RESNET50 | 141 | 35.7 | +| FCN_RESNET101 | 217 | 55 | ## Text to image diff --git a/docs/docs/03-hooks/02-computer-vision/useClassification.md b/docs/docs/03-hooks/02-computer-vision/useClassification.md index f57b7b254e..e9c2eebfab 100644 --- a/docs/docs/03-hooks/02-computer-vision/useClassification.md +++ b/docs/docs/03-hooks/02-computer-vision/useClassification.md @@ -86,6 +86,6 @@ function App() { ## Supported models -| Model | Number of classes | Class list | -| ------------------------------------------------------------------------------------------------------ | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [efficientnet_v2_s](https://huggingface.co/software-mansion/react-native-executorch-efficientnet-v2-s) | 1000 | [ImageNet1k_v1](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/common/rnexecutorch/models/classification/Constants.h) | +| Model | Number of classes | Class list | Quantized | +| ------------------------------------------------------------------------------------------------------ | ----------------- | 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------: | +| [efficientnet_v2_s](https://huggingface.co/software-mansion/react-native-executorch-efficientnet-v2-s) | 1000 | [ImageNet1k_v1](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/common/rnexecutorch/models/classification/Constants.h) | Yes | diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index e19801cfd3..9c6cb6c016 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -374,18 +374,34 @@ export const LFM2_5_1_2B_INSTRUCT_QUANTIZED = { // Classification const EFFICIENTNET_V2_S_MODEL = Platform.OS === `ios` - ? `${URL_PREFIX}-efficientnet-v2-s/${VERSION_TAG}/coreml/efficientnet_v2_s_coreml_all.pte` - : `${URL_PREFIX}-efficientnet-v2-s/${VERSION_TAG}/xnnpack/efficientnet_v2_s_xnnpack.pte`; + ? `${URL_PREFIX}-efficientnet-v2-s/${NEXT_VERSION_TAG}/coreml/efficientnet_v2_s_coreml_fp32.pte` + : `${URL_PREFIX}-efficientnet-v2-s/${NEXT_VERSION_TAG}/xnnpack/efficientnet_v2_s_xnnpack_fp32.pte`; +const EFFICIENTNET_V2_S_QUANTIZED_MODEL = + Platform.OS === `ios` + ? 
`${URL_PREFIX}-efficientnet-v2-s/${NEXT_VERSION_TAG}/coreml/efficientnet_v2_s_coreml_fp16.pte` + : `${URL_PREFIX}-efficientnet-v2-s/${NEXT_VERSION_TAG}/xnnpack/efficientnet_v2_s_xnnpack_int8.pte`; /** * @category Models - Classification */ export const EFFICIENTNET_V2_S = { + modelName: 'efficientnet-v2-s', modelSource: EFFICIENTNET_V2_S_MODEL, }; +/** + * @category Models - Classification + */ +export const EFFICIENTNET_V2_S_QUANTIZED = { + modelName: 'efficientnet-v2-s-quantized', + modelSource: EFFICIENTNET_V2_S_QUANTIZED_MODEL, +}; + // Object detection -const SSDLITE_320_MOBILENET_V3_LARGE_MODEL = `${URL_PREFIX}-ssdlite320-mobilenet-v3-large/${VERSION_TAG}/ssdlite320-mobilenetv3-large.pte`; +const SSDLITE_320_MOBILENET_V3_LARGE_MODEL = + Platform.OS === 'ios' + ? `${URL_PREFIX}-ssdlite320-mobilenet-v3-large/${NEXT_VERSION_TAG}/coreml/ssdlite320_mobilenet_v3_large_coreml_fp16.pte` + : `${URL_PREFIX}-ssdlite320-mobilenet-v3-large/${NEXT_VERSION_TAG}/xnnpack/ssdlite320_mobilenet_v3_large_xnnpack_fp32.pte`; const RF_DETR_NANO_MODEL = `${URL_PREFIX}-rfdetr-nano-detector/${NEXT_VERSION_TAG}/rfdetr_detector.pte`; /** @@ -407,49 +423,101 @@ export const RF_DETR_NANO = { // Style transfer const STYLE_TRANSFER_CANDY_MODEL = Platform.OS === `ios` - ? `${URL_PREFIX}-style-transfer-candy/${VERSION_TAG}/coreml/style_transfer_candy_coreml.pte` - : `${URL_PREFIX}-style-transfer-candy/${VERSION_TAG}/xnnpack/style_transfer_candy_xnnpack.pte`; + ? `${URL_PREFIX}-style-transfer-candy/${NEXT_VERSION_TAG}/coreml/style_transfer_candy_coreml_fp32.pte` + : `${URL_PREFIX}-style-transfer-candy/${NEXT_VERSION_TAG}/xnnpack/style_transfer_candy_xnnpack_fp32.pte`; +const STYLE_TRANSFER_CANDY_QUANTIZED_MODEL = + Platform.OS === `ios` + ? 
`${URL_PREFIX}-style-transfer-candy/${NEXT_VERSION_TAG}/coreml/style_transfer_candy_coreml_fp16.pte` + : `${URL_PREFIX}-style-transfer-candy/${NEXT_VERSION_TAG}/xnnpack/style_transfer_candy_xnnpack_int8.pte`; const STYLE_TRANSFER_MOSAIC_MODEL = Platform.OS === `ios` - ? `${URL_PREFIX}-style-transfer-mosaic/${VERSION_TAG}/coreml/style_transfer_mosaic_coreml.pte` - : `${URL_PREFIX}-style-transfer-mosaic/${VERSION_TAG}/xnnpack/style_transfer_mosaic_xnnpack.pte`; + ? `${URL_PREFIX}-style-transfer-mosaic/${NEXT_VERSION_TAG}/coreml/style_transfer_mosaic_coreml_fp32.pte` + : `${URL_PREFIX}-style-transfer-mosaic/${NEXT_VERSION_TAG}/xnnpack/style_transfer_mosaic_xnnpack_fp32.pte`; +const STYLE_TRANSFER_MOSAIC_QUANTIZED_MODEL = + Platform.OS === `ios` + ? `${URL_PREFIX}-style-transfer-mosaic/${NEXT_VERSION_TAG}/coreml/style_transfer_mosaic_coreml_fp16.pte` + : `${URL_PREFIX}-style-transfer-mosaic/${NEXT_VERSION_TAG}/xnnpack/style_transfer_mosaic_xnnpack_int8.pte`; const STYLE_TRANSFER_RAIN_PRINCESS_MODEL = Platform.OS === `ios` - ? `${URL_PREFIX}-style-transfer-rain-princess/${VERSION_TAG}/coreml/style_transfer_rain_princess_coreml.pte` - : `${URL_PREFIX}-style-transfer-rain-princess/${VERSION_TAG}/xnnpack/style_transfer_rain_princess_xnnpack.pte`; + ? `${URL_PREFIX}-style-transfer-rain-princess/${NEXT_VERSION_TAG}/coreml/style_transfer_rain_princess_coreml_fp32.pte` + : `${URL_PREFIX}-style-transfer-rain-princess/${NEXT_VERSION_TAG}/xnnpack/style_transfer_rain_princess_xnnpack_fp32.pte`; +const STYLE_TRANSFER_RAIN_PRINCESS_QUANTIZED_MODEL = + Platform.OS === `ios` + ? `${URL_PREFIX}-style-transfer-rain-princess/${NEXT_VERSION_TAG}/coreml/style_transfer_rain_princess_coreml_fp16.pte` + : `${URL_PREFIX}-style-transfer-rain-princess/${NEXT_VERSION_TAG}/xnnpack/style_transfer_rain_princess_xnnpack_int8.pte`; const STYLE_TRANSFER_UDNIE_MODEL = Platform.OS === `ios` - ? 
`${URL_PREFIX}-style-transfer-udnie/${VERSION_TAG}/coreml/style_transfer_udnie_coreml.pte` - : `${URL_PREFIX}-style-transfer-udnie/${VERSION_TAG}/xnnpack/style_transfer_udnie_xnnpack.pte`; + ? `${URL_PREFIX}-style-transfer-udnie/${NEXT_VERSION_TAG}/coreml/style_transfer_udnie_coreml_fp32.pte` + : `${URL_PREFIX}-style-transfer-udnie/${NEXT_VERSION_TAG}/xnnpack/style_transfer_udnie_xnnpack_fp32.pte`; +const STYLE_TRANSFER_UDNIE_QUANTIZED_MODEL = + Platform.OS === `ios` + ? `${URL_PREFIX}-style-transfer-udnie/${NEXT_VERSION_TAG}/coreml/style_transfer_udnie_coreml_fp16.pte` + : `${URL_PREFIX}-style-transfer-udnie/${NEXT_VERSION_TAG}/xnnpack/style_transfer_udnie_xnnpack_int8.pte`; /** * @category Models - Style Transfer */ export const STYLE_TRANSFER_CANDY = { + modelName: 'style-transfer-candy', modelSource: STYLE_TRANSFER_CANDY_MODEL, }; +/** + * @category Models - Style Transfer + */ +export const STYLE_TRANSFER_CANDY_QUANTIZED = { + modelName: 'style-transfer-candy-quantized', + modelSource: STYLE_TRANSFER_CANDY_QUANTIZED_MODEL, +}; + /** * @category Models - Style Transfer */ export const STYLE_TRANSFER_MOSAIC = { + modelName: 'style-transfer-mosaic', modelSource: STYLE_TRANSFER_MOSAIC_MODEL, }; +/** + * @category Models - Style Transfer + */ +export const STYLE_TRANSFER_MOSAIC_QUANTIZED = { + modelName: 'style-transfer-mosaic-quantized', + modelSource: STYLE_TRANSFER_MOSAIC_QUANTIZED_MODEL, +}; + /** * @category Models - Style Transfer */ export const STYLE_TRANSFER_RAIN_PRINCESS = { + modelName: 'style-transfer-rain-princess', modelSource: STYLE_TRANSFER_RAIN_PRINCESS_MODEL, }; +/** + * @category Models - Style Transfer + */ +export const STYLE_TRANSFER_RAIN_PRINCESS_QUANTIZED = { + modelName: 'style-transfer-rain-princess-quantized', + modelSource: STYLE_TRANSFER_RAIN_PRINCESS_QUANTIZED_MODEL, +}; + /** * @category Models - Style Transfer */ export const STYLE_TRANSFER_UDNIE = { + modelName: 'style-transfer-udnie', modelSource: STYLE_TRANSFER_UDNIE_MODEL, }; 
+/** + * @category Models - Style Transfer + */ +export const STYLE_TRANSFER_UDNIE_QUANTIZED = { + modelName: 'style-transfer-udnie-quantized', + modelSource: STYLE_TRANSFER_UDNIE_QUANTIZED_MODEL, +}; + // S2T const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`; const WHISPER_TINY_EN_ENCODER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_encoder_xnnpack.pte`; @@ -668,15 +736,25 @@ export const SELFIE_SEGMENTATION = { } as const; // Image Embeddings -const CLIP_VIT_BASE_PATCH32_IMAGE_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${VERSION_TAG}/clip-vit-base-patch32-vision_xnnpack.pte`; +const CLIP_VIT_BASE_PATCH32_IMAGE_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${NEXT_VERSION_TAG}/xnnpack/clip_vit_base_patch32_vision_xnnpack_fp32.pte`; +const CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${NEXT_VERSION_TAG}/xnnpack/clip_vit_base_patch32_vision_xnnpack_int8.pte`; /** * @category Models - Image Embeddings */ export const CLIP_VIT_BASE_PATCH32_IMAGE = { + modelName: 'clip-vit-base-patch32-image', modelSource: CLIP_VIT_BASE_PATCH32_IMAGE_MODEL, }; +/** + * @category Models - Image Embeddings + */ +export const CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED = { + modelName: 'clip-vit-base-patch32-image-quantized', + modelSource: CLIP_VIT_BASE_PATCH32_IMAGE_QUANTIZED_MODEL, +}; + // Text Embeddings const ALL_MINILM_L6_V2_MODEL = `${URL_PREFIX}-all-MiniLM-L6-v2/${VERSION_TAG}/all-MiniLM-L6-v2_xnnpack.pte`; const ALL_MINILM_L6_V2_TOKENIZER = `${URL_PREFIX}-all-MiniLM-L6-v2/${VERSION_TAG}/tokenizer.json`; @@ -686,8 +764,8 @@ const MULTI_QA_MINILM_L6_COS_V1_MODEL = `${URL_PREFIX}-multi-qa-MiniLM-L6-cos-v1 const MULTI_QA_MINILM_L6_COS_V1_TOKENIZER = `${URL_PREFIX}-multi-qa-MiniLM-L6-cos-v1/${VERSION_TAG}/tokenizer.json`; const MULTI_QA_MPNET_BASE_DOT_V1_MODEL = `${URL_PREFIX}-multi-qa-mpnet-base-dot-v1/${VERSION_TAG}/multi-qa-mpnet-base-dot-v1_xnnpack.pte`; const 
MULTI_QA_MPNET_BASE_DOT_V1_TOKENIZER = `${URL_PREFIX}-multi-qa-mpnet-base-dot-v1/${VERSION_TAG}/tokenizer.json`; -const CLIP_VIT_BASE_PATCH32_TEXT_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${VERSION_TAG}/clip-vit-base-patch32-text_xnnpack.pte`; -const CLIP_VIT_BASE_PATCH32_TEXT_TOKENIZER = `${URL_PREFIX}-clip-vit-base-patch32/${VERSION_TAG}/tokenizer.json`; +const CLIP_VIT_BASE_PATCH32_TEXT_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${NEXT_VERSION_TAG}/xnnpack/clip_vit_base_patch32_text_xnnpack_fp32.pte`; +const CLIP_VIT_BASE_PATCH32_TEXT_TOKENIZER = `${URL_PREFIX}-clip-vit-base-patch32/${NEXT_VERSION_TAG}/tokenizer.json`; /** * @category Models - Text Embeddings @@ -725,6 +803,7 @@ export const MULTI_QA_MPNET_BASE_DOT_V1 = { * @category Models - Text Embeddings */ export const CLIP_VIT_BASE_PATCH32_TEXT = { + modelName: 'clip-vit-base-patch32-text', modelSource: CLIP_VIT_BASE_PATCH32_TEXT_MODEL, tokenizerSource: CLIP_VIT_BASE_PATCH32_TEXT_TOKENIZER, };